Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions accelforge/frontend/arch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"Comparison",
"Component",
"Compute",
"ComputeAction",
"Container",
"Fork",
"Array",
Expand Down
47 changes: 42 additions & 5 deletions accelforge/frontend/arch/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,30 @@ def __call__(self, field, value, evaluated, symbol_table):
return super()._eval_expressions(*args, **kwargs, post_calls=(MyPostCall(),))


class ComputeAction(Action):
    op_kind: str = "mac"
    """ The semantic category of operation this action models (e.g., "mul", "add",
    "mac", "max"). Einsums declare `map_op`/`reduce_op`; the analysis derives an
    op_profile from those and binds each op_kind to the Compute action that declares
    it. The default "mac" preserves legacy single-action behavior. """

    fuses: EvalableList[str] = []
    """ op_kinds this action coalesces into a single fire. When an einsum's op_profile
    contains all listed op_kinds with EQUAL counts, those entries collapse into one
    entry keyed under this action's `op_kind` with the shared count -- e.g. a fused MAC
    pairs (mul, add) into one charge per iteration. Left unset, an `op_kind="mac"`
    action defaults to fusing `[mul, add]` (so legacy single-MAC arches need no
    change); any other op_kind defaults to no fusion. See `effective_fuses`. """

    @property
    def effective_fuses(self) -> list[str]:
        """Return the op_kinds this action fuses, applying the default when unset.

        A non-empty `fuses` is returned as a fresh plain list. An empty `fuses`
        falls back to `["mul", "add"]` when `op_kind` is `"mac"` and to `[]`
        for every other op_kind. Because the check is a truthiness test, an
        explicitly empty `fuses` is indistinguishable from an unset one.
        """
        if not self.fuses:
            # Nothing declared: only a "mac" action gets the implicit pair.
            if self.op_kind == "mac":
                return ["mul", "add"]
            return []
        return list(self.fuses)


_COMPONENT_MODEL_CACHE: dict[tuple, "Component"] = {}


Expand Down Expand Up @@ -888,7 +912,7 @@ def _copy_for_component_modeling(self) -> Self:

# Default action list for `Compute` components: a single action named
# "compute" that declares op_kind "mac". Only the `ComputeAction` entry is
# kept; listing the plain `Action(name="compute")` next to it would define two
# actions with the same name.
COMPUTE_ACTIONS = EvalableList(
    [
        ComputeAction(name="compute", op_kind="mac"),
    ]
)

Expand Down Expand Up @@ -1279,17 +1303,30 @@ def _render_node_color(self) -> str:


class Compute(Component, Leaf):
actions: EvalableList[Action] = COMPUTE_ACTIONS
""" The actions that this `Compute` can perform. """
actions: EvalableList[ComputeAction] = COMPUTE_ACTIONS
""" The actions that this `Compute` can perform. Each `ComputeAction` declares an
`op_kind` that einsums bind to via their `map_op`/`reduce_op`. """

skip_initial_output_write: bool = True
"""
If False, the initial value of output tensors will be fetched from above and used to
initialize outputs. If True, this initial fetch and fill is skipped.
"""

def model_post_init(self, __context__=None) -> None:
self._update_actions(COMPUTE_ACTIONS)
def action_for_op_kind(self, op_kind: str) -> ComputeAction:
    """Return the first action on this Compute whose `op_kind` matches.

    Actions are scanned in declaration order. Actions that carry no
    `op_kind` attribute are tolerated and never match a string op_kind.

    Args:
        op_kind: The op_kind to look up (e.g. "mac", "mul", "add").

    Returns:
        The first entry of `self.actions` whose `op_kind` equals `op_kind`.

    Raises:
        EvaluationError: If no action declares this op_kind. The message
            lists the op_kinds that are declared.
    """
    for action in self.actions:
        if getattr(action, "op_kind", None) == op_kind:
            return action

    # Leave out actions without an op_kind: sorting None together with strings
    # raises TypeError, which would hide the EvaluationError below.
    declared = sorted(
        {
            kind
            for kind in (getattr(a, "op_kind", None) for a in self.actions)
            if kind is not None
        }
    )
    raise EvaluationError(
        f"Compute component {self.name!r} has no action with op_kind "
        f"{op_kind!r}. Declared op_kinds: {declared}.",
        source_field=f"{self.name}.actions",
    )

def _render_node_shape(self) -> str:
    # Always returns the constant shape name "ellipse".
    # NOTE(review): presumably consumed by the graph-rendering code; confirm with callers.
    return "ellipse"
Expand Down
36 changes: 36 additions & 0 deletions accelforge/frontend/workload.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,18 @@ class Einsum(EvalableModel):
and directly place them at the location of the output tensor(s) without any
computation. If the destination tensor is at the same location, then this is a
no-op."""
map_op: str = "mul"
""" Binary operator applied to paired input-tensor values at each iteration-space
point (e.g. "mul", "add", "max", "square"). Combined with `reduce_op` to derive the
einsum's op_profile: the map and reduce ops are each charged once per
iteration-space point. The default "mul" + "add" describes a standard
sum-of-products; on an arch that declares a fused-MAC compute action this pair
collapses into one MAC (see `ComputeAction.fuses`), so legacy arches stay
bit-identical. """
reduce_op: str = "add"
""" Operator that folds mapped values into the output tensor across the reduction
ranks (e.g. "add", "max"). Charged once per iteration-space point alongside
`map_op`. Ignored for copy operations. """
renames: RenameList[Rename] = RenameList()
""" Renames of the Einsum. Renames here can be used to rename rank variables or
tensors. When this Einsum is executed on an architecture, the architecture can use
Expand Down Expand Up @@ -582,6 +594,30 @@ def tensor2irrelevant_rank_variables(
for t in self.tensor_accesses
}

def effective_op_profile(self) -> dict[str, int]:
    """Return the op counts per iteration-space point, keyed by op_kind.

    The profile is built from `map_op` and `reduce_op` only:

    * A copy operation performs no ops, so the profile is empty.
    * Otherwise the map op and the reduce op are each counted once.
    * If both resolve to the same op_kind, they share one entry with count 2.

    A `map_op` of "square" is counted as "mul", so the profile never
    contains a "square" key coming from `map_op`. These are the raw ops; no
    fusion into other op_kinds (such as "mac") is performed here.
    """
    if self.is_copy_operation:
        return {}

    # Squaring is charged as a multiply.
    map_kind = self.map_op
    if map_kind == "square":
        map_kind = "mul"

    reduce_kind = self.reduce_op
    if map_kind != reduce_kind:
        return {map_kind: 1, reduce_kind: 1}
    return {map_kind: 2}

def _to_formatted_string(self, compress: bool = False) -> str:
"""
Returns a string representation of this Einsum for use in a Pydot graph.
Expand Down
26 changes: 20 additions & 6 deletions accelforge/model/_looptree/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,22 @@ def gather_actions(
actions[key].total += accesses.net_total_write_actions()
actions[key].max_per_unit += accesses.net_max_per_unit_write_actions()

# `ops.total_ops` is a per-op-kind dict ({op_kind: count}). Emit one action
# key per (level, op_kind), where the action *name* is resolved from the
# Compute's ComputeAction whose op_kind matches. This is what lets the
# downstream `compute_energy_from_actions` look up energy via
# `component.actions[key.action].energy`. With the legacy single-action
# arch ({op_kind: "mac"}, name: "compute") and the default einsum profile
# ({"mac": 1}), this collapses to exactly one ("compute") key per level,
# bit-identical to prior behavior.
for compute, ops in looptree_results.compute_stats.items():
key = compute_keyer(compute, "compute")
if key not in actions:
actions[key] = ActionCount.default()
actions[key].total += ops.total_ops
actions[key].max_per_unit += ops.max_per_unit_ops
for op_kind, total in ops.total_ops.items():
action_name = _resolve_compute_action_name(spec, compute.level, op_kind)
key = compute_keyer(compute, action_name)
if key not in actions:
actions[key] = ActionCount.default()
actions[key].total += total
actions[key].max_per_unit += ops.max_per_unit_ops.get(op_kind, 0)

for network, stats in looptree_results.network_stats.items():
key = network_keyer(network, "hops")
Expand All @@ -70,7 +80,6 @@ def gather_actions(

return actions


def _apply_actions_scale(actions, spec):
components = {}
for key, count in actions.items():
Expand All @@ -80,6 +89,11 @@ def _apply_actions_scale(actions, spec):
count.total *= scale
count.max_per_unit *= scale

def _resolve_compute_action_name(spec: Spec, level: str, op_kind: str) -> str:
    """Return the name of the action at compute `level` that handles `op_kind`.

    The component is looked up with `spec.arch.find(level)` and asked for the
    matching action through `action_for_op_kind`. Any error raised by either
    call propagates unchanged.
    """
    return spec.arch.find(level).action_for_op_kind(op_kind).name

def _get_buffet_keyer(verbose, use_name, bindings):
if not verbose:
Expand Down
9 changes: 8 additions & 1 deletion accelforge/model/_looptree/latency/latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,17 @@ def calculate_compute_latency(reuse_analysis_results, mapping, workload):

def compute_summarized_latency(compute_stats, mapping, workload):
    """Return the longest compute latency across all `compute_stats` entries.

    Each entry's `max_latency` is a dict keyed by op_kind. The values are
    summed within an entry, and the entries are then combined with
    `max_nonzero` (sum-then-max). `mapping` and `workload` are accepted for
    interface compatibility and are not read here.
    """
    # TODO: this is only for single-Einsum!!!
    # Summing across op_kinds matches the Compute's default
    # total_latency = sum(*action2latency.values()). The max across entries
    # mirrors max_nonzero(comp_latency, ...) in get_latency(), keeping this
    # code path consistent with the per-component path in latency/memory.py.
    longest_compute_latency = 0
    for stats in compute_stats.values():
        # Pass the summed scalar, not the per-op-kind dict, to max_nonzero.
        summed_latency = sum(stats.max_latency.values(), 0)
        longest_compute_latency = max_nonzero(
            longest_compute_latency, summed_latency
        )
    return longest_compute_latency

Expand Down
23 changes: 19 additions & 4 deletions accelforge/model/_looptree/latency/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,25 @@ def component_latency(
f"Component {component} is not a TensorHolder or Compute"
)

longest_compute_latency = Max(
0, *[s.max_latency for s in looptree_results.compute_stats.values()]
)
component_to_actions[compute_obj.name]["compute"] = longest_compute_latency
# `max_latency` is now a per-op-kind dict ({op_kind: cycles-per-iter-of-worst-iter}).
# For each op_kind, take the max across compute_stats entries (different
# (einsum, compute-level) keys) and inject it under the action's name,
# where action_name is the ComputeAction on this Compute whose op_kind matches.
# The Compute's `total_latency` expression (default sum(*action2latency.values()))
# then turns the per-kind counts into per-kind latency contributions and combines
# them. This implements sum-then-max: sum across op_kinds within a Compute
# (via total_latency), max across compute levels (via the per-kind max here
# and the Max(...) over component latencies at the get_latency layer).
per_kind_max_latency: dict[str, float] = {}
for s in looptree_results.compute_stats.values():
for op_kind, val in s.max_latency.items():
if op_kind in per_kind_max_latency:
per_kind_max_latency[op_kind] = Max(per_kind_max_latency[op_kind], val)
else:
per_kind_max_latency[op_kind] = val
for op_kind, count in per_kind_max_latency.items():
action = compute_obj.action_for_op_kind(op_kind)
component_to_actions[compute_obj.name][action.name] = count

new_component_to_actions: dict[str, list] = {}
for component, action_counts in component_to_actions.items():
Expand Down
60 changes: 42 additions & 18 deletions accelforge/model/_looptree/reuse/symbolic/_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,13 +175,39 @@ def blank(cls):
stats.n_loops_above = None # Inherit from whoever is added to this
return stats

def _scale_op_dict(d: dict[str, Any], factor: Any) -> dict[str, Any]:
"""Multiply every per-op-kind value by `factor`. Identity at factor==1."""
if factor == 1:
return dict(d)
if isinstance(factor, float) and factor == int(factor):
factor = int(factor)
return {k: v * factor for k, v in d.items()}


def _sum_op_dicts(a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]:
"""Per-op-kind sum. Keys present in only one operand are kept as-is."""
out = dict(a)
for k, v in b.items():
out[k] = out[k] + v if k in out else v
return out


def _max_op_dicts(a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]:
    """Combine two per-op-kind dicts key by key using `max_nonzero`.

    Keys found in both operands hold `max_nonzero(a[key], b[key])`. Keys
    found in only one operand keep that operand's value. Neither input is
    modified; a new dict is returned.
    """
    combined = dict(a)
    for op_kind, value in b.items():
        if op_kind in combined:
            combined[op_kind] = max_nonzero(combined[op_kind], value)
        else:
            combined[op_kind] = value
    return combined

@dataclass
class ComputeStats:
total_ops: Any = field(default=0)
max_per_unit_ops: Any = field(default=0)
# Per-op-kind counts. Keys are op_kind strings (e.g. "mul", "add", "mac")
# matching the ComputeAction.op_kind values declared on the arch's Compute
# component. An empty dict means no contribution.
total_ops: dict[str, Any] = field(default_factory=dict)
max_per_unit_ops: dict[str, Any] = field(default_factory=dict)
# "max" below refers to the longest latency of any iteration
max_latency: Any = field(default=0)
max_latency: dict[str, Any] = field(default_factory=dict)
# Mapping from the loop-index (0 at top) to the latency of the first
# iteration of that loop. "Max" because we may have loops above that and we
# will take the maximum of the firsts.
Expand All @@ -193,9 +219,9 @@ def repeat_temporal(self, factor: int) -> "ComputeStats":
return new
if type(factor) is float and factor == int(factor):
factor = int(factor)
new.total_ops = new.total_ops * factor
new.max_per_unit_ops = new.max_per_unit_ops * factor
new.max_latency = new.max_latency * factor
new.total_ops = _scale_op_dict(new.total_ops, factor)
new.max_per_unit_ops = _scale_op_dict(new.max_per_unit_ops, factor)
new.max_latency = _scale_op_dict(new.max_latency, factor)
# NOTE: max_first_latency does not change
return new

Expand All @@ -205,14 +231,14 @@ def repeat_spatial(self, factor: int) -> "ComputeStats":
return new
if type(factor) is float and factor == int(factor):
factor = int(factor)
new.total_ops = new.total_ops * factor
new.total_ops = _scale_op_dict(new.total_ops, factor)
return new

def __add__(self, other: "ComputeStats") -> "ComputeStats":
new = copy.copy(self)
new.total_ops += other.total_ops
new.max_per_unit_ops += other.max_per_unit_ops
new.max_latency += other.max_latency
new.total_ops = _sum_op_dicts(new.total_ops, other.total_ops)
new.max_per_unit_ops = _sum_op_dicts(new.max_per_unit_ops, other.max_per_unit_ops)
new.max_latency = _sum_op_dicts(new.max_latency, other.max_latency)
# max_first_latency is only ever updated across loops ABOVE the loop
# for which we calculated that first latency, so we should MAX
new.max_first_latency = max_dict(
Expand All @@ -221,21 +247,19 @@ def __add__(self, other: "ComputeStats") -> "ComputeStats":
return new

def combine_temporal(self, other: "ComputeStats"):
self.total_ops += other.total_ops
self.max_per_unit_ops += other.max_per_unit_ops
self.max_latency += other.max_latency
self.total_ops = _sum_op_dicts(self.total_ops, other.total_ops)
self.max_per_unit_ops = _sum_op_dicts(self.max_per_unit_ops, other.max_per_unit_ops)
self.max_latency = _sum_op_dicts(self.max_latency, other.max_latency)
# max_first_latency is only ever updated across loops ABOVE the loop
# for which we calculated that first latency, so we should MAX
self.max_first_latency = max_dict(
self.max_first_latency, other.max_first_latency
) # FIRST LATENCY

def combine_spatial(self, other: "ComputeStats"):
self.total_ops += other.total_ops
self.max_per_unit_ops = max_nonzero(
self.max_per_unit_ops, other.max_per_unit_ops
)
self.max_latency = max_nonzero(self.max_latency, other.max_latency)
self.total_ops = _sum_op_dicts(self.total_ops, other.total_ops)
self.max_per_unit_ops = _max_op_dicts(self.max_per_unit_ops, other.max_per_unit_ops)
self.max_latency = _max_op_dicts(self.max_latency, other.max_latency)
# max_first_latency is only ever updated across loops ABOVE the loop
# for which we calculated that first latency, so we should MAX
self.max_first_latency = max_dict(
Expand Down
Loading