Accelergy-Project · cg592 · Jun 12, 2026
diff --git a/accelforge/frontend/arch/__init__.py b/accelforge/frontend/arch/__init__.py
@@ -27,6 +27,7 @@
     "Comparison",
     "Component",
     "Compute",
+    "ComputeAction",
     "Container",
     "Fork",
     "Array",

diff --git a/accelforge/frontend/arch/components.py b/accelforge/frontend/arch/components.py
@@ -225,6 +225,30 @@ def __call__(self, field, value, evaluated, symbol_table):
         return super()._eval_expressions(*args, **kwargs, post_calls=(MyPostCall(),))
 
 
+class ComputeAction(Action):
+    op_kind: str = "mac"
+    """ The semantic category of operation this action models (e.g., "mul", "add",
+    "mac", "max"). Einsums declare `map_op`/`reduce_op`; the analysis derives an
+    op_profile from those and binds each op_kind to the Compute action that declares
+    it. The default "mac" preserves legacy single-action behavior. """
+
+    fuses: EvalableList[str] = []
+    """ op_kinds this action coalesces into a single fire. When an einsum's op_profile
+    contains all listed op_kinds with EQUAL counts, those entries collapse into one
+    entry keyed under this action's `op_kind` with the shared count -- e.g. a fused MAC
+    pairs (mul, add) into one charge per iteration. Left unset, an `op_kind="mac"`
+    action defaults to fusing `[mul, add]` (so legacy single-MAC arches need no
+    change); any other op_kind defaults to no fusion. See `effective_fuses`. """
+
+    @property
+    def effective_fuses(self) -> list[str]:
+        """Resolve the op_kinds this action fuses: a non-empty `fuses` wins;
+        otherwise a `mac` action falls back to `[mul, add]` and others to `[]`."""
+        if not self.fuses:
+            return ["mul", "add"] if self.op_kind == "mac" else []
+        return list(self.fuses)
+
+
 _COMPONENT_MODEL_CACHE: dict[tuple, "Component"] = {}
 
 
@@ -888,7 +912,7 @@ def _copy_for_component_modeling(self) -> Self:
 
 COMPUTE_ACTIONS = EvalableList(
     [
-        Action(name="compute"),
+        ComputeAction(name="compute", op_kind="mac"),
     ]
 )
 
@@ -1279,17 +1303,30 @@ def _render_node_color(self) -> str:
 
 
 class Compute(Component, Leaf):
-    actions: EvalableList[Action] = COMPUTE_ACTIONS
-    """ The actions that this `Compute` can perform. """
+    actions: EvalableList[ComputeAction] = COMPUTE_ACTIONS
+    """ The actions that this `Compute` can perform. Each `ComputeAction` declares an
+    `op_kind` that einsums bind to via their `map_op`/`reduce_op`. """
 
     skip_initial_output_write: bool = True
     """
     If False, the initial value of output tensors will be fetched from above and used to
     initalize outputs. If True, this initial fetch and fill is skipped.
     """
 
-    def model_post_init(self, __context__=None) -> None:
-        self._update_actions(COMPUTE_ACTIONS)
+    def action_for_op_kind(self, op_kind: str) -> ComputeAction:
+        """Return the first `ComputeAction` on this Compute whose `op_kind` matches.
+
+        Raises EvaluationError (listing the declared op_kinds) if none matches."""
+        for action in self.actions:
+            if getattr(action, "op_kind", None) == op_kind:
+                return action
+        kinds = {getattr(a, "op_kind", None) for a in self.actions}
+        declared = sorted(kinds - {None})  # None vs str is unorderable
+        raise EvaluationError(
+            f"Compute component {self.name!r} has no action with op_kind "
+            f"{op_kind!r}. Declared op_kinds: {declared}.",
+            source_field=f"{self.name}.actions",
+        )
 
     def _render_node_shape(self) -> str:
         return "ellipse"

diff --git a/accelforge/frontend/workload.py b/accelforge/frontend/workload.py
@@ -456,6 +456,18 @@ class Einsum(EvalableModel):
     and directly place them at the location of the output tensor(s) without any
     computation. If the destination tensor is at the same location, then this is a
     no-op."""
+    map_op: str = "mul"
+    """ Binary operator applied to paired input-tensor values at each iteration-space
+    point (e.g. "mul", "add", "max", "square"). Combined with `reduce_op` to derive the
+    einsum's op_profile: the map and reduce ops are each charged once per
+    iteration-space point. The default "mul" + "add" describes a standard
+    sum-of-products; on an arch that declares a fused-MAC compute action this pair
+    collapses into one MAC (see `ComputeAction.fuses`), so legacy arches stay
+    bit-identical. """
+    reduce_op: str = "add"
+    """ Operator that folds mapped values into the output tensor across the reduction
+    ranks (e.g. "add", "max"). Charged once per iteration-space point alongside
+    `map_op`. Ignored for copy operations. """
     renames: RenameList[Rename] = RenameList()
     """ Renames of the Einsum. Renames here can be used to rename rank variables or
     tensors. When this Einsum is executed on an architecture, the architecture can use
@@ -582,6 +594,30 @@ def tensor2irrelevant_rank_variables(
             for t in self.tensor_accesses
         }
 
+    def effective_op_profile(self) -> dict[str, int]:
+        """Return this einsum's raw op counts per iteration-space point.
+
+        The result maps op_kind -> count and is built from `map_op` and
+        `reduce_op`:
+
+        - a copy operation yields an empty dict;
+        - distinct map and reduce kinds yield one entry each with count 1;
+        - identical map and reduce kinds yield a single entry with count 2.
+
+        A `map_op` of "square" is counted as "mul" (x*x runs on any
+        multiplier), so an arch action declared with `op_kind="square"`
+        would not be bound to it.
+
+        No fusion happens here: collapsing e.g. `{mul: N, add: N}` into
+        `{mac: N}` is left to downstream code via `ComputeAction.fuses`.
+        """
+        if self.is_copy_operation:
+            return {}
+        map_kind = "mul" if self.map_op == "square" else self.map_op
+        if map_kind != self.reduce_op:
+            return {map_kind: 1, self.reduce_op: 1}
+        return {map_kind: 2}
+
     def _to_formatted_string(self, compress: bool = False) -> str:
         """
         Returns a string representation of this Einsum for use in a Pydot graph.

diff --git a/accelforge/model/_looptree/energy.py b/accelforge/model/_looptree/energy.py
@@ -52,12 +52,22 @@ def gather_actions(
         actions[key].total += accesses.net_total_write_actions()
         actions[key].max_per_unit += accesses.net_max_per_unit_write_actions()
 
+    # `ops.total_ops` is a per-op-kind dict ({op_kind: count}). Emit one action
+    # key per (level, op_kind), where the action *name* is resolved from the
+    # Compute's ComputeAction whose op_kind matches. This is what lets the
+    # downstream `compute_energy_from_actions` look up energy via
+    # `component.actions[key.action].energy`. With the legacy single-action
+    # arch ({op_kind: "mac"}, name: "compute") and the default einsum profile
+    # (raw mul + add, arriving here already fused into one "mac" entry), this
+    # collapses to exactly one ("compute") key per level, as in prior behavior.
     for compute, ops in looptree_results.compute_stats.items():
-        key = compute_keyer(compute, "compute")
-        if key not in actions:
-            actions[key] = ActionCount.default()
-        actions[key].total += ops.total_ops
-        actions[key].max_per_unit += ops.max_per_unit_ops
+        for op_kind, total in ops.total_ops.items():
+            action_name = _resolve_compute_action_name(spec, compute.level, op_kind)
+            key = compute_keyer(compute, action_name)
+            if key not in actions:
+                actions[key] = ActionCount.default()
+            actions[key].total += total
+            actions[key].max_per_unit += ops.max_per_unit_ops.get(op_kind, 0)
 
     for network, stats in looptree_results.network_stats.items():
         key = network_keyer(network, "hops")
@@ -70,7 +80,6 @@ def gather_actions(
 
     return actions
 
-
 def _apply_actions_scale(actions, spec):
     components = {}
     for key, count in actions.items():
@@ -80,6 +89,11 @@ def _apply_actions_scale(actions, spec):
         count.total *= scale
         count.max_per_unit *= scale
 
+def _resolve_compute_action_name(spec: Spec, level: str, op_kind: str) -> str:
+    """Return the name of the action at compute `level` that handles `op_kind`."""
+    compute_component = spec.arch.find(level)
+    matching_action = compute_component.action_for_op_kind(op_kind)
+    return matching_action.name
 
 def _get_buffet_keyer(verbose, use_name, bindings):
     if not verbose:

diff --git a/accelforge/model/_looptree/latency/latency.py b/accelforge/model/_looptree/latency/latency.py
@@ -47,10 +47,17 @@ def calculate_compute_latency(reuse_analysis_results, mapping, workload):
 
 def compute_summarized_latency(compute_stats, mapping, workload):
     # TODO: this is only for single-Einsum!!!
+    # `stats.max_latency` is a dict[op_kind, cycles]. Sum across op_kinds
+    # within a ComputeStats entry (matching the Compute's default
+    # total_latency = sum(*action2latency.values())), then take the max
+    # across entries -- i.e., sum-then-max. The cross-stats max here mirrors
+    # max_nonzero(comp_latency, ...) in get_latency(), keeping this code path
+    # consistent with the per-component path in latency/memory.py.
     longest_compute_latency = 0
     for stats in compute_stats.values():
+        per_iter_latency = sum(stats.max_latency.values(), 0)
         longest_compute_latency = max_nonzero(
-            longest_compute_latency, stats.max_latency
+            longest_compute_latency, per_iter_latency
         )
     return longest_compute_latency
 

diff --git a/accelforge/model/_looptree/latency/memory.py b/accelforge/model/_looptree/latency/memory.py
@@ -103,10 +103,25 @@ def component_latency(
                 f"Component {component} is not a TensorHolder or Compute"
             )
 
-    longest_compute_latency = Max(
-        0, *[s.max_latency for s in looptree_results.compute_stats.values()]
-    )
-    component_to_actions[compute_obj.name]["compute"] = longest_compute_latency
+    # `max_latency` is now a per-op-kind dict ({op_kind: cycles-per-iter-of-worst-iter}).
+    # For each op_kind, take the max across compute_stats entries (different
+    # (einsum, compute-level) keys) and inject it under the action's name,
+    # where action_name is the ComputeAction on this Compute whose op_kind matches.
+    # The Compute's `total_latency` expression (default sum(*action2latency.values()))
+    # then turns the per-kind counts into per-kind latency contributions and combines
+    # them. Order matters: the max across entries is taken per op_kind *before*
+    # total_latency combines the kinds, so this equals a true sum-then-max only when
+    # one entry is the worst for every op_kind (e.g. a single compute_stats entry).
+    per_kind_max_latency: dict[str, float] = {}
+    for s in looptree_results.compute_stats.values():
+        for op_kind, val in s.max_latency.items():
+            if op_kind in per_kind_max_latency:
+                per_kind_max_latency[op_kind] = Max(per_kind_max_latency[op_kind], val)
+            else:
+                per_kind_max_latency[op_kind] = val
+    for op_kind, count in per_kind_max_latency.items():
+        action = compute_obj.action_for_op_kind(op_kind)
+        component_to_actions[compute_obj.name][action.name] = count
 
     new_component_to_actions: dict[str, list] = {}
     for component, action_counts in component_to_actions.items():

diff --git a/accelforge/model/_looptree/reuse/symbolic/_stats.py b/accelforge/model/_looptree/reuse/symbolic/_stats.py
@@ -175,13 +175,39 @@ def blank(cls):
         stats.n_loops_above = None  # Inherit from whoever is added to this
         return stats
 
+def _scale_op_dict(d: dict[str, Any], factor: Any) -> dict[str, Any]:
+    """Scale every per-op-kind value by `factor`; factor==1 returns a plain copy."""
+    if factor == 1:
+        return dict(d)
+    if isinstance(factor, float) and factor.is_integer():  # inf/nan stay floats
+        factor = int(factor)
+    return {k: v * factor for k, v in d.items()}
+
+
+def _sum_op_dicts(a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]:
+    """Add two per-op-kind dicts key by key into a new dict.
+
+    A key present in only one operand keeps that operand's value."""
+    summed_b = {k: a[k] + v if k in a else v for k, v in b.items()}
+    return {**a, **summed_b}
+
+
+def _max_op_dicts(a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]:
+    """Combine two per-op-kind dicts with `max_nonzero` on shared keys.
+
+    A key present in only one operand keeps that operand's value."""
+    maxed_b = {k: max_nonzero(a[k], v) if k in a else v for k, v in b.items()}
+    return {**a, **maxed_b}
 
 @dataclass
 class ComputeStats:
-    total_ops: Any = field(default=0)
-    max_per_unit_ops: Any = field(default=0)
+    # Per-op-kind counts. Keys are op_kind strings (e.g. "mul", "add", "mac")
+    # matching the ComputeAction.op_kind values declared on the arch's Compute
+    # component. An empty dict means no contribution.
+    total_ops: dict[str, Any] = field(default_factory=dict)
+    max_per_unit_ops: dict[str, Any] = field(default_factory=dict)
     # "max" below refers to the longest latency of any iteration
-    max_latency: Any = field(default=0)
+    max_latency: dict[str, Any] = field(default_factory=dict)
     # Mapping from the loop-index (0 at top) to the latency of the first
     # iteration of that loop. "Max" because we may have loops above that and we
     # will take the maximum of the firsts.
@@ -193,9 +219,9 @@ def repeat_temporal(self, factor: int) -> "ComputeStats":
             return new
         if type(factor) is float and factor == int(factor):
             factor = int(factor)
-        new.total_ops = new.total_ops * factor
-        new.max_per_unit_ops = new.max_per_unit_ops * factor
-        new.max_latency = new.max_latency * factor
+        new.total_ops = _scale_op_dict(new.total_ops, factor)
+        new.max_per_unit_ops = _scale_op_dict(new.max_per_unit_ops, factor)
+        new.max_latency = _scale_op_dict(new.max_latency, factor)
         # NOTE: max_first_latency does not change
         return new
 
@@ -205,14 +231,14 @@ def repeat_spatial(self, factor: int) -> "ComputeStats":
             return new
         if type(factor) is float and factor == int(factor):
             factor = int(factor)
-        new.total_ops = new.total_ops * factor
+        new.total_ops = _scale_op_dict(new.total_ops, factor)
         return new
 
     def __add__(self, other: "ComputeStats") -> "ComputeStats":
         new = copy.copy(self)
-        new.total_ops += other.total_ops
-        new.max_per_unit_ops += other.max_per_unit_ops
-        new.max_latency += other.max_latency
+        new.total_ops = _sum_op_dicts(new.total_ops, other.total_ops)
+        new.max_per_unit_ops = _sum_op_dicts(new.max_per_unit_ops, other.max_per_unit_ops)
+        new.max_latency = _sum_op_dicts(new.max_latency, other.max_latency)
         # max_first_latency is only ever updated across loops ABOVE the loop
         # for which we calculated that first latency, so we should MAX
         new.max_first_latency = max_dict(
@@ -221,21 +247,19 @@ def __add__(self, other: "ComputeStats") -> "ComputeStats":
         return new
 
     def combine_temporal(self, other: "ComputeStats"):
-        self.total_ops += other.total_ops
-        self.max_per_unit_ops += other.max_per_unit_ops
-        self.max_latency += other.max_latency
+        self.total_ops = _sum_op_dicts(self.total_ops, other.total_ops)
+        self.max_per_unit_ops = _sum_op_dicts(self.max_per_unit_ops, other.max_per_unit_ops)
+        self.max_latency = _sum_op_dicts(self.max_latency, other.max_latency)
         # max_first_latency is only ever updated across loops ABOVE the loop
         # for which we calculated that first latency, so we should MAX
         self.max_first_latency = max_dict(
             self.max_first_latency, other.max_first_latency
         )  # FIRST LATENCY
 
     def combine_spatial(self, other: "ComputeStats"):
-        self.total_ops += other.total_ops
-        self.max_per_unit_ops = max_nonzero(
-            self.max_per_unit_ops, other.max_per_unit_ops
-        )
-        self.max_latency = max_nonzero(self.max_latency, other.max_latency)
+        self.total_ops = _sum_op_dicts(self.total_ops, other.total_ops)
+        self.max_per_unit_ops = _max_op_dicts(self.max_per_unit_ops, other.max_per_unit_ops)
+        self.max_latency = _max_op_dicts(self.max_latency, other.max_latency)
         # max_first_latency is only ever updated across loops ABOVE the loop
         # for which we calculated that first latency, so we should MAX
         self.max_first_latency = max_dict(