diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py
index a12460e3..0de76246 100644
--- a/AsmParser/tog_generator.py
+++ b/AsmParser/tog_generator.py
@@ -1,3 +1,9 @@
+# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds
+# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by
+# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py +
+# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the
+# current pipeline does not break; to be retired once the trace pipeline (P3+)
+# stabilizes. See docs/design/togsim_cpp_trace.md.
 import os
 import sys
 import importlib.util
diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py
index 492133a3..e573d1a5 100644
--- a/PyTorchSimFrontend/extension_codecache.py
+++ b/PyTorchSimFrontend/extension_codecache.py
@@ -241,8 +241,19 @@ def load(cls, source_code,
             # Run cyclesim
             cyclesim = CycleSimulator()
             cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode)
+            # Snapshot for the P3-trace hook below: generate_tile_graph consumes
+            # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty.
+            cycle_list_for_trace = list(cycle_list)
 
             # Create TOG
+            # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog ->
+            # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser --
+            # is being superseded by the C++ trace pipeline (build_skeleton +
+            # lower_to_emitc -> compiled .so, + the cycle_table sidecar). The
+            # per-tile cycle_list / x_offset / w_offset computed here are exactly
+            # what cycle_table.build_cycle_table will reuse, so both paths stay
+            # cycle-consistent during the transition. Kept live (pipeline must not
+            # break); to be retired once the trace pipeline (P3+) stabilizes.
             w_offset, x_offset = vectorlane_size, vectorlane_size
             if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size:
                 x_offset = kwargs['loop_size'][-3]
@@ -258,6 +269,36 @@ def load(cls, source_code,
                 w_offset=w_offset, # FIXME.
                 vector_lane=vectorlane_size
             )
+
+            # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the
+            # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This
+            # is the default simulation path (the C++ TOG); the legacy ONNX TOG is the
+            # opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the .so is unused
+            # so skip emitting it. Best-effort: never breaks the compile.
+            if os.environ.get("TORCHSIM_LEGACY_TOG") != "1":
+                try:
+                    import mlir.ir as ir
+                    from PyTorchSimFrontend.mlir.passes import (
+                        build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e)
+                    pv = sample_mlir_path + "_postvcix.mlir"
+                    _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True
+                    with _ctx:
+                        _mod = ir.Module.parse(open(pv).read(), _ctx)
+                        _bs.build_skeleton(_mod)
+                        _ntiles = len(_ct._compute_types(_mod))
+                        # align lengths: gem5 gives one numCycles per compute node;
+                        # pad with the last value / truncate if it disagrees.
+                        _cl = list(cycle_list_for_trace)
+                        if _cl and len(_cl) != _ntiles:
+                            _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles]
+                        logger.info(f"[P3-trace] cycle_list={cycle_list_for_trace} -> {_cl} "
+                                    f"(#tiles={_ntiles}, x_off={x_offset}, w_off={w_offset})")
+                        _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset)
+                    _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv"))
+                    _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so"))
+                    logger.info(f"[P3-trace] wrote trace.so + trace_cycles.tsv in {write_path}")
+                except Exception as e:
+                    logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}")
         return key
 
 class CustomAsyncCompile(AsyncCompile):
diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
index 725e0dc6..1ee62f36 100644
--- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
+++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py
@@ -1120,9 +1120,13 @@ def codegen_nodes(self, nodes, kernel_name):
         src_code, meta_code = super().codegen_nodes(nodes, kernel_name)
         self._prepare_simulator_headers(src_code)
         if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode:
-            optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2]
+            # Use temporaries: autotune returns [None, None, None] when it cannot
+            # autotune (e.g. a size-1 pointwise kernel with ranges == [1]), and
+            # unpacking into meta_code would clobber the valid arg_attributes that
+            # the fall-through below returns.
+            optimal_src_code, optimal_meta_code = self.autotune(nodes, kernel_name)[:2]
             if optimal_src_code is not None:
-                return optimal_src_code, meta_code
+                return optimal_src_code, optimal_meta_code
         return src_code, meta_code
 
     def _prepare_simulator_headers(self, src_code):
diff --git a/PyTorchSimFrontend/mlir/passes/_mlir_util.py b/PyTorchSimFrontend/mlir/passes/_mlir_util.py
new file mode 100644
index 00000000..e39f9d6f
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/_mlir_util.py
@@ -0,0 +1,87 @@
+"""Small, dependency-light helpers shared across the MLIR passes.
+
+Every pass had its own copy of the same op-walk generator (named variously
+`_iter_ops` / `_walk` / `_walk_ops`) and the same one-line attribute builders
+(`_i32` / `_i64` / ...). This module is the single source for both.
+
+Import-safety: `walk_ops` is pure block/op attribute access and needs no MLIR
+bindings, so this module does NOT import `mlir.ir` at top level -- some passes
+(e.g. lower_vlane_idx, decompose_transfer) are deliberately importable without
+the bindings present and only touch `mlir.ir` inside their run functions. The
+attribute builders therefore import `mlir.ir` lazily; they require an active
+MLIR context (the caller's `with ctx:`), exactly as the per-pass copies did.
+"""
+
+
+def walk_ops(block):
+    """Yield every op under `block` in program order, recursing into regions.
+
+    Snapshots each block's operation list, so a caller may erase ops while
+    iterating (the strictest of the former copies; a superset of the rest)."""
+    for op in list(block.operations):
+        yield op
+        for region in op.operation.regions:
+            for b in region.blocks:
+                yield from walk_ops(b)
+
+
+def _ir():
+    import mlir.ir as ir
+    return ir
+
+
+def i32(v):
+    """`i32` IntegerAttr for `v` (uses the active MLIR context)."""
+    ir = _ir()
+    return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), int(v))
+
+
+def i64(v):
+    """`i64` IntegerAttr for `v`."""
+    ir = _ir()
+    return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), int(v))
+
+
+def i64_array(vals):
+    """ArrayAttr of `i64` IntegerAttrs for `vals`."""
+    ir = _ir()
+    i = ir.IntegerType.get_signless(64)
+    return ir.ArrayAttr.get([ir.IntegerAttr.get(i, int(v)) for v in vals])
+
+
+def str_attr(v):
+    """StringAttr of `str(v)`."""
+    ir = _ir()
+    return ir.StringAttr.get(str(v))
+
+
+# ---------------------------------------------------------------------------
+# attribute readers -- accept an OpView or an Operation; `default` is returned
+# when `key` is absent (callers that want the strict "must be present" behaviour
+# simply never pass an absent key).
+# ---------------------------------------------------------------------------
+def _attrs(op):
+    return getattr(op, "operation", op).attributes
+
+
+def attr_int(op, key, default=None):
+    """Integer value of `op`'s `key` attribute, or `default` if absent."""
+    ir = _ir()
+    a = _attrs(op)
+    return ir.IntegerAttr(a[key]).value if key in a else default
+
+
+def attr_bool(op, key, default=False):
+    """Bool value of `op`'s `key` attribute, or `default` if absent."""
+    ir = _ir()
+    a = _attrs(op)
+    return bool(ir.BoolAttr(a[key]).value) if key in a else default
+
+
+def attr_i64_array(op, key, default=None):
+    """`op`'s `key` ArrayAttr of integers as a Python list, or `default` if
+    absent (pass `default=[]` for the "missing -> empty" convention)."""
+    ir = _ir()
+    a = _attrs(op)
+    return ([ir.IntegerAttr(x).value for x in ir.ArrayAttr(a[key])]
+            if key in a else default)
diff --git a/PyTorchSimFrontend/mlir/passes/build_skeleton.py b/PyTorchSimFrontend/mlir/passes/build_skeleton.py
new file mode 100644
index 00000000..df4c6046
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/build_skeleton.py
@@ -0,0 +1,512 @@
+"""build_skeleton pass (C2): reduce a kernel's post-vcix MLIR to the
+*skeleton + API* form, in place.
+
+The trace pipeline (docs/design/togsim_cpp_trace.md) compiles a kernel to a
+shape-parametric C++ trace producer. The producer is just the kernel's loop
+skeleton with the data computation replaced by calls to the event-based runtime
+API. This pass performs that reduction at the MLIR level:
+
+  * `memref.dma_start`  -> `togsim.dma(...) {tag_id, is_async, ...}` carrying the
+                            runtime tag index operand (`%tag[%idx]`).
+  * `memref.dma_wait`   -> `togsim.memory_barrier(tag_idx) {tag_id, write_bufs}`,
+                            the explicit async-DMA sync. It pairs with its dma by
+                            the RUNTIME tag slot (tag_id + the tag index), not a
+                            compile-time id: one static dma op runs once per loop
+                            iteration with a different `%tag[%idx]`, so only the
+                            runtime slot can pair iteration i's dma with its wait.
+  * each compute node   -> a single `togsim.compute {tile_id, compute_type}`
+  * everything else      -> removed by a use-based DCE, keeping the loops and the
+                            index/address arithmetic the survivors depend on.
+
+It reuses build_tog's traversal (`TogBuilder` / `_build`): loops, DMAs and
+compute blocks are already identified there, each with a back-pointer to its
+MLIR op(s), so this pass only adds the *rewrite*. Keeping a single traversal
+guarantees the skeleton and the legacy TOG see the same structure.
+
+Counterpart to `build_tog.build_tog_and_mutate`.
+
+The DCE is safe by construction: it never erases an op whose results still have
+uses, so at worst it leaves extra ops in the dump (visible for diagnosis) rather
+than producing invalid IR.
+
+Requires the MLIR Python bindings (importing `build_tog` pulls in `mlir.ir`).
+"""
+
+from . import togsim_ops as ts
+from ._mlir_util import walk_ops, i32, i64, i64_array, str_attr
+from .build_tog import (
+    ir,
+    TogBuilder,
+    _build,
+    _reset_ids,
+    _find_kernel,
+    _value_key,
+    TOGDMANode,
+    TOGDMAWaitNode,
+    _COMPUTE_TYPE_NAME,
+)
+
+#: Marker op names for the passes/__init__ fast-path (skip parsing if absent).
+MARKERS = ("memref.dma_start", "memref.dma_wait")
+
+#: Ops the DCE must never remove (loops, terminators, our API ops).
+_KEEP = {
+    "affine.for", "scf.for", "scf.while",
+    "affine.yield", "scf.yield", "func.return",
+    ts.DMA, ts.COMPUTE, ts.COMPUTE_BAR, ts.MEMORY_BAR,
+}
+
+
+def _kernel_block(module):
+    func_op = _find_kernel(module)
+    if func_op is None:
+        return None
+    return func_op.regions[0].blocks[0]
+
+
+# ---------------------------------------------------------------------------
+# op construction
+# ---------------------------------------------------------------------------
+def _arg_id_of(base_addr):
+    """Tensor func-arg ordinal from a build_tog base name ("arg3" -> 3); -1 if
+    it is not a plain block-arg base."""
+    s = str(base_addr)
+    return int(s[3:]) if s.startswith("arg") and s[3:].isdigit() else -1
+
+
+def _emit_dma(ctx, dma_node, tag_id, dram_index, tag_index, read_bufs, write_bufs):
+    """Insert a `togsim.dma` before the original `memref.dma_start`.
+
+    `tag_id` is the identity of this DMA's tag memref. An async DMA pairs with
+    its `togsim.memory_barrier` (the original dma_wait) by the RUNTIME tag slot
+    -- (tag_id, tag_index) -- not a compile-time identifier: one static dma op runs
+    once per loop iteration, each with a different runtime `%tag[%idx]` slot, so
+    only a runtime key can pair iteration i's dma with iteration i's wait.
+
+    `dram_index` is the original linear DRAM index Value (the `affine.apply`
+    result that indexed the tensor in the `memref.dma_start`) -- carried as an
+    operand so the DCE keeps the address arithmetic live and the C4 lowering can
+    compute the real `base_addr = base[arg_id] + index*elem` (P3, approach A).
+
+    `tag_index` is the original SRAM tag index Value (`%tag[%idx]`), carried as a
+    second operand: the runtime tag slot, used both to pair with the barrier and
+    for the double-buffer / SRAM-capacity (WAR) model.
+    Operand order: [dram_index, tag_index] (each omitted if absent)."""
+    op = dma_node.op
+    attrs = {
+        ts.ATTR_DIR: i32(ts.DIR_STORE if dma_node.is_write else ts.DIR_LOAD),
+        ts.ATTR_DIMS: i64_array(dma_node.tile_size),
+        ts.ATTR_STRIDES: i64_array(dma_node.tile_stride),
+        ts.ATTR_ELEM_BITS: i32(dma_node.element_size),
+        ts.ATTR_IS_ASYNC: ir.BoolAttr.get(bool(dma_node.is_async)),
+        ts.ATTR_TAG_ID: i32(tag_id),
+        ts.ATTR_ARG_ID: i32(_arg_id_of(dma_node.base_addr)),
+        "base": str_attr(dma_node.base_addr),
+        # SRAM spad this DMA touches (load writes it, store reads it) -- sec 10.
+        ts.ATTR_READ_BUFS: i64_array(read_bufs),
+        ts.ATTR_WRITE_BUFS: i64_array(write_bufs),
+    }
+    operands = [v for v in (dram_index, tag_index) if v is not None]
+    ir.Operation.create(
+        ts.DMA,
+        results=[],
+        operands=operands,
+        attributes=attrs,
+        loc=ir.Location.unknown(ctx),
+        ip=ir.InsertionPoint(op),
+    )
+
+
+def _emit_compute_bar(ctx, anchor_op):
+    """Insert a `togsim.compute_barrier` before `anchor_op` -- the fence that
+    drains in-flight async compute (the systolic-array matmuls) before a store
+    consumes their result (sec 10.7).
+
+    FIXME: this is the one barrier still synthesized here rather than read from
+    the IR. Like the async-load memory barrier (now mapped 1:1 from the explicit
+    dma_wait), the compute fence should eventually appear explicitly in the input
+    MLIR and be mapped through, not auto-inserted -- no surprising insertion."""
+    ir.Operation.create(
+        ts.COMPUTE_BAR, results=[], operands=[], attributes={},
+        loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(anchor_op))
+
+
+def _emit_memory_bar(ctx, anchor_op, tag_id, tag_index, write_bufs):
+    """Insert a `togsim.memory_barrier` before `anchor_op` -- the explicit
+    async-DMA sync that was the original `memref.dma_wait`. It pairs with its
+    async `togsim.dma` by the RUNTIME tag slot (tag_id + tag_index), and carries
+    the SRAM buffer that dma loaded so consumers gate on data-arrival, not on the
+    async dma's issue-complete."""
+    attrs = {
+        ts.ATTR_TAG_ID: i32(tag_id),
+        ts.ATTR_WRITE_BUFS: i64_array(write_bufs),
+    }
+    operands = [tag_index] if tag_index is not None else []
+    ir.Operation.create(
+        ts.MEMORY_BAR, results=[], operands=operands, attributes=attrs,
+        loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(anchor_op))
+
+
+def _flatten_add(expr):
+    """Top-level additive summands of an AffineExpr (`.lhs`/`.rhs` come back typed
+    as the base AffineExpr, so use the `isinstance`/cast pattern, not Python
+    isinstance)."""
+    if ir.AffineAddExpr.isinstance(expr):
+        a = ir.AffineAddExpr(expr)
+        return _flatten_add(a.lhs) + _flatten_add(a.rhs)
+    return [expr]
+
+
+def _neg_coeff_dim(summand):
+    """If `summand` is `dim * c` with a negative constant `c`, return that dim's
+    position; else None. lower_to_vcix tags each accumulation (reduction) loop var
+    with coefficient -1 in the dma_wait tag index -- a SENTINEL marking the
+    reduction axis, not an arithmetic offset (legacy TileGraphParser skips stride
+    -1 for the same reason)."""
+    if not ir.AffineMulExpr.isinstance(summand):
+        return None
+    mul = ir.AffineMulExpr(summand)
+    l, r = mul.lhs, mul.rhs
+    dim = l if ir.AffineDimExpr.isinstance(l) else (r if ir.AffineDimExpr.isinstance(r) else None)
+    con = l if ir.AffineConstantExpr.isinstance(l) else (r if ir.AffineConstantExpr.isinstance(r) else None)
+    if dim is None or con is None or ir.AffineConstantExpr(con).value >= 0:
+        return None
+    return ir.AffineDimExpr(dim).position
+
+
+def _strip_accum_terms(ctx, tag_index, anchor_op):
+    """Return a tag-index Value with the accumulation-marked (-1 coefficient) terms
+    dropped, so a memory_barrier waits on the SAME subtile slot its async load
+    wrote.
+
+    The wait tag index built by lower_to_vcix carries `-acc_iv` for each reduction
+    loop var; the matching load index (dma_fine_grained) is subtile-only. Without
+    this, at reduction iteration > 0 the producer EVALUATES `-acc_iv` to a negative
+    slot, so the recorded barrier slot diverges from the load slot and the runtime
+    tag pairing fails (TOGSim aborts with "Key does not exist in ... tag table").
+    Dropping the -1 terms mirrors legacy TileGraphParser.cc, which skips stride -1
+    and routes the reduction axis to a separate accum tag component; here the
+    per-iteration tag alloc (dma_fine_grained) already separates the reductions, so
+    the barrier only needs the subtile slot.
+
+    Falls through (returns `tag_index` unchanged) for anything that is not an
+    affine.apply whose single result carries such a term -- e.g. the single-tile
+    case, whose index has no reduction term."""
+    if tag_index is None:
+        return None
+    try:
+        apply_op = tag_index.owner
+        if apply_op.name != "affine.apply":
+            return tag_index
+        amap = ir.AffineMapAttr(apply_op.attributes["map"]).value
+    except Exception:
+        return tag_index
+    if amap.n_dims == 0 or amap.n_symbols != 0 or len(amap.results) != 1:
+        return tag_index
+    expr = amap.results[0]
+    dropped = sorted({p for p in (_neg_coeff_dim(s) for s in _flatten_add(expr))
+                      if p is not None})
+    if not dropped:
+        return tag_index
+    n = amap.n_dims
+    kept = [i for i in range(n) if i not in dropped]
+    new_pos = {old: i for i, old in enumerate(kept)}
+    # compose the original expr with a selector that sends each dropped dim to 0
+    # and renumbers the kept dims 0..k-1.
+    sel = [ir.AffineConstantExpr.get(0) if i in dropped
+           else ir.AffineDimExpr.get(new_pos[i]) for i in range(n)]
+    new_expr = expr.compose(ir.AffineMap.get(len(kept), 0, sel))
+    new_map = ir.AffineMap.get(len(kept), 0, [new_expr])
+    operands = list(apply_op.operands)
+    new_operands = [operands[i] for i in kept]
+    new_apply = ir.Operation.create(
+        "affine.apply",
+        results=[ir.IndexType.get(ctx)],
+        operands=new_operands,
+        attributes={"map": ir.AffineMapAttr.get(new_map)},
+        loc=ir.Location.unknown(ctx),
+        ip=ir.InsertionPoint(anchor_op),
+    )
+    return new_apply.results[0]
+
+
+def _emit_compute(ctx, compute_node, tile_id, read_bufs, write_bufs):
+    front = compute_node.operations[0]
+    attrs = {
+        ts.ATTR_TILE_ID: i64(tile_id),
+        # int code (0 vector / 1 matmul / 2 preload) consumed by the C4 lowering;
+        # maps directly to the Core compute-unit enum. Keep the readable name too.
+        ts.ATTR_COMPUTE_TYPE: i32(int(compute_node.compute_type)),
+        "compute_type_name": str_attr(_COMPUTE_TYPE_NAME[compute_node.compute_type]),
+        # SRAM buffer ids read/written (sec 10 dataflow); the bridge builds the
+        # dependency DAG by last-writer per buffer.
+        ts.ATTR_READ_BUFS: i64_array(read_bufs),
+        ts.ATTR_WRITE_BUFS: i64_array(write_bufs),
+    }
+    ir.Operation.create(
+        ts.COMPUTE,
+        results=[],
+        operands=[],
+        attributes=attrs,
+        loc=ir.Location.unknown(ctx),
+        ip=ir.InsertionPoint(front),
+    )
+
+
+# ---------------------------------------------------------------------------
+# DCE
+# ---------------------------------------------------------------------------
+def _has_nonempty_region(op):
+    for region in op.operation.regions:
+        for b in region.blocks:
+            if len(list(b.operations)) > 0:
+                return True
+    return False
+
+
+def _results_unused(op):
+    for r in op.operation.results:
+        if len(list(r.uses)) > 0:
+            return False
+    return True
+
+
+def _dce(block):
+    """Erase ops whose results are all unused, to a fixed point; ops named in
+    _KEEP and ops with a non-empty region are skipped. Live uses stay untouched."""
+    changed = True
+    while changed:
+        changed = False
+        victims = []
+        for op in walk_ops(block):
+            name = op.operation.name
+            if name in _KEEP:
+                continue
+            if _has_nonempty_region(op):
+                continue
+            if _results_unused(op):
+                victims.append(op)
+        for op in victims:
+            try:
+                op.operation.erase()
+                changed = True
+            except Exception:
+                # Still referenced via something we will erase next round; retry.
+                pass
+
+
+# ---------------------------------------------------------------------------
+# driver
+# ---------------------------------------------------------------------------
+def _collect_dma_nodes(builder):
+    """Map op-identity -> DMA/DMAWait node, by walking the built tree."""
+    by_op = {}
+    seen = set()
+
+    def visit(n):
+        if id(n) in seen:
+            return
+        seen.add(id(n))
+        if isinstance(n, (TOGDMANode, TOGDMAWaitNode)) and n.op is not None:
+            by_op[id(n.op.operation)] = n
+        for c in n.children:
+            visit(c)
+
+    for ln in builder.loop_nodes:
+        visit(ln)
+    return by_op
+
+
+class _BufferIds:
+    """Assigns each SRAM buffer name a stable small int id, shared by DMA and
+    compute so the bridge can match a reader to its buffer's writer (sec 10).
+    The virtual SA_WEIGHTS buffer (preload -> matmul) is numbered here too, on
+    first sight. `None` (a non-buffer base) is -1."""
+
+    def __init__(self):
+        self._ids = {}
+
+    def of(self, name):
+        if name is None:
+            return -1
+        return self._ids.setdefault(name, len(self._ids))
+
+
+class _TagIds:
+    """Identity of a DMA's tag memref -> stable small int, plus the SRAM buffer
+    that tag's async DMA loads. An async dma and its memory_barrier (the original
+    dma_wait) share a tag memref; this assigns it a tag_id (so the runtime can
+    pair them by the runtime tag slot) and remembers the loaded buffer so the
+    barrier can release it to consumers. Pairing is by tag, never a static id."""
+
+    def __init__(self):
+        self._ids = {}   # tag value-key -> tag_id
+        self._buf = {}   # tag value-key -> SRAM buffer id the dma loads
+
+    def bind(self, key, buf):
+        tag_id = self._ids.setdefault(key, len(self._ids))
+        self._buf[key] = buf
+        return tag_id
+
+    def lookup(self, key):
+        """(tag_id, buffer) for a tag memref, or None if no dma used it."""
+        if key not in self._ids:
+            return None
+        return self._ids[key], self._buf[key]
+
+
+def _emit_computes(ctx, builder, bufs):
+    """Step 1: each compute node -> one togsim.compute carrying its tile_id and
+    the ids of the SRAM buffers it reads/writes. Returns the count."""
+    from . import dep_analysis as dep  # lazy: dep_analysis imports build_skeleton
+    n = 0
+    for tile_id, cn in enumerate(builder.compute_nodes):
+        if not cn.operations:
+            continue
+        reads, writes = dep.compute_buffers(cn)
+        _emit_compute(ctx, cn, tile_id,
+                      sorted(bufs.of(b) for b in reads),
+                      sorted(bufs.of(b) for b in writes))
+        n += 1
+    return n
+
+
+def _emit_one_dma(ctx, op, node, builder, bufs, tags):
+    """Rewrite one memref.dma_start as togsim.dma. A load reads DRAM and writes
+    its SRAM spad; a store reads the spad and writes DRAM -- which sets the
+    read/write buffer that drives the dependency edge (sec 10). The tag memref is
+    bound to a tag_id (with its loaded buffer) so the paired memory_barrier finds
+    it by the runtime tag slot."""
+    from . import dep_analysis as dep  # lazy: dep_analysis imports build_skeleton
+    f = builder._dma_start_fields(op)
+    dram_indices = f["dst_indices"] if node.is_write else f["src_indices"]
+    dram_index = dram_indices[0] if dram_indices else None
+    tag_indices = f["tag_indices"]
+    tag_index = tag_indices[0] if tag_indices else None
+    # the spad is the SRAM side of the copy: dst for a load, src for a store.
+    spad_id = bufs.of(dep._global_of(f["src"] if node.is_write else f["dst"]))
+    read_bufs = [spad_id] if node.is_write else []
+    write_bufs = [] if node.is_write else [spad_id]
+    tag_id = tags.bind(_value_key(f["tag"]), spad_id)
+    if node.is_write:
+        _emit_compute_bar(ctx, op)   # FIXME(sec10.7): auto-inserted; should be explicit in the IR.
+    _emit_dma(ctx, node, tag_id, dram_index, tag_index, read_bufs, write_bufs)
+
+
+def _emit_one_wait(ctx, op, tags):
+    """Rewrite one memref.dma_wait as togsim.memory_barrier -- the explicit
+    async-DMA sync already in the IR. Paired with its dma by the tag memref
+    (tag_id) and the runtime tag index; carries the buffer the dma loaded.
+    Returns True iff emitted (a wait whose tag no dma used is dropped)."""
+    # operands: [tag, *tag_indices, num_elements]; never read the count as an index.
+    operands = list(op.operation.operands)
+    tag_index = operands[1] if len(operands) >= 3 else None
+    binding = tags.lookup(_value_key(operands[0]))
+    if binding is None:
+        return False
+    tag_id, buf = binding
+    # honor lower_to_vcix's -1 accumulation marker: strip the reduction terms so
+    # the barrier slot equals the subtile slot the paired async load wrote.
+    tag_index = _strip_accum_terms(ctx, tag_index, op)
+    _emit_memory_bar(ctx, op, tag_id, tag_index, [buf])
+    return True
+
+
+def _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs):
+    """Step 2: rewrite memref.dma_start -> togsim.dma and memref.dma_wait ->
+    togsim.memory_barrier in program order. An async dma and its barrier are
+    paired by the RUNTIME tag slot (tag_id + tag index), not a compile-time id:
+    one static dma op runs per loop iteration with a different `%tag[%idx]`, so
+    only the runtime slot can pair iteration i's dma with iteration i's wait.
+    Returns the original ops to erase and the (dma, wait) counts."""
+    tags = _TagIds()
+    originals = []
+    n_dma = n_wait = 0
+    for op in list(walk_ops(block)):
+        name = op.operation.name
+        if name == "memref.dma_start":
+            node = dma_by_op.get(id(op.operation))
+            if node is None:
+                continue
+            _emit_one_dma(ctx, op, node, builder, bufs, tags)
+            originals.append(op)
+            n_dma += 1
+        elif name == "memref.dma_wait":
+            if _emit_one_wait(ctx, op, tags):
+                n_wait += 1
+            originals.append(op)
+    return originals, n_dma, n_wait
+
+
+def build_skeleton(module):
+    """Reduce `func.func @kernel` in `module` to the skeleton+API form, in place.
+
+    Four steps: analyze the kernel into loop/compute/DMA nodes, emit a
+    togsim.compute per compute node, rewrite the DMAs/waits to togsim.dma /
+    togsim.memory_barrier, then DCE the leftover data computation. Returns a
+    short text report (counts), or "no @kernel found" if _kernel_block finds none."""
+    _reset_ids()
+    builder = TogBuilder()
+    _build(module, builder)  # populates loop/compute nodes + op back-pointers
+
+    block = _kernel_block(module)
+    if block is None:
+        return "no @kernel found"
+    ctx = module.context
+    dma_by_op = _collect_dma_nodes(builder)
+    bufs = _BufferIds()
+
+    n_compute = _emit_computes(ctx, builder, bufs)
+    originals, n_dma, n_wait = _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs)
+
+    # erase the now-replaced originals (result-less -> safe), then strip the
+    # leftover data computation.
+    for op in originals:
+        try:
+            op.operation.erase()
+        except Exception:
+            pass
+    _dce(block)
+
+    return ("skeleton: compute=%d dma=%d wait=%d (unpaired waits dropped)"
+            % (n_compute, n_dma, n_wait))
+
+
+def run(module, vectorlane=128):
+    """passes/__init__ pass protocol entry (vectorlane unused; kept for parity)."""
+    build_skeleton(module)
+
+
+def run_skeleton(in_path, out_path=None):
+    """Read post-vcix MLIR at `in_path`, reduce to skeleton+API, write it out.
+
+    The result goes to `out_path`, or back over `in_path` when `out_path` is
+    None. Returns build_skeleton's report. Requires the MLIR bindings."""
+    with open(in_path) as fh:  # read fully and close before any overwrite
+        text = fh.read()
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx:
+        module = ir.Module.parse(text, ctx)
+        report = build_skeleton(module)
+        with open(in_path if out_path is None else out_path, "w") as fh:
+            fh.write(str(module))
+    return report
+
+
+def main(argv):
+    import argparse
+
+    parser = argparse.ArgumentParser(prog="build_skeleton.py")
+    parser.add_argument("input")
+    parser.add_argument("--out", default=None)
+    args = parser.parse_args(argv[1:])
+    report = run_skeleton(args.input, args.out)
+    import sys
+    sys.stderr.write(report + "\n")
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main(sys.argv))
diff --git a/PyTorchSimFrontend/mlir/passes/cycle_table.py b/PyTorchSimFrontend/mlir/passes/cycle_table.py
new file mode 100644
index 00000000..40dd3459
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/cycle_table.py
@@ -0,0 +1,103 @@
+"""cycle_table (C3): the precomputed tile_id -> (cycle, overlapping_cycle) table
+the C++ trace pipeline looks up at runtime (docs/design/togsim_cpp_trace.md sec
+6, sec 9.8 task 4).
+
+A `togsim.compute(tile_id=...)` in the trace says *which* tile to compute, not
+how long it takes. Because tiles are fixed size, each tile's cost is invariant
+(only the trip count varies with shape), so it is sampled once and stored here,
+keyed by `tile_id`. Two numbers per tile, mirroring the legacy TOG:
+
+  * `cycle`            -- full compute latency, sampled by gem5 sample-mode
+                          (the existing measurement: `_rewrite_loop_steps` +
+                          `_insert_compute_markers` in build_tog, run through
+                          CycleSimulator -> the per-tile `cycle_list`).
+  * `overlapping_cycle` -- the portion that overlaps the previous instruction in
+                          the systolic pipeline; the timing core uses it as
+                          `finish = prev.finish + cycle - overlapped` (Core.cc).
+                          Derived exactly as the legacy path does
+                          (tog_generator.generate_tile_graph):
+                              type 0 (VectorCompute)  -> 0
+                              type 1 (MatmulCompute)  -> max(cycle - x_offset, 0)
+                              type 2 (MatmulPreload)  -> max(cycle - w_offset, 0)
+
+This module only *builds/serializes* the table from a cycle_list; obtaining the
+cycle_list reuses the existing sample-mode + gem5 path (wired in P3 task 5). The
+`tile_id` order matches build_skeleton's `compute_nodes` order, which matches the
+legacy TOG, so the same sampling keys both paths.
+
+Requires the MLIR Python bindings (to read the skeleton's togsim.compute ops).
+"""
+
+import json
+
+from . import togsim_ops as ts
+from ._mlir_util import walk_ops
+from .build_tog import (
+    ir,
+    VECTOR_COMPUTE,
+    MATMUL_COMPUTE,   # noqa: F401 (documents the type enum used by the formula)
+    MATMUL_PRELOAD,
+)
+
+
+def overlapping_cycle(cycle, compute_type, x_offset, w_offset):
+    """Return how many of `cycle`'s cycles overlap the previous instruction:
+    0 for vector tiles, otherwise `cycle - offset` clamped at 0."""
+    if compute_type > VECTOR_COMPUTE:            # matmul / preload tiles only
+        fill = int(w_offset if compute_type == MATMUL_PRELOAD else x_offset)
+        return max(int(cycle) - fill, 0)
+    return 0
+
+
+def _compute_types(skeleton_module):
+    """Collect every togsim.compute op's (tile_id, compute_type) and return the
+    compute_type ints ordered by tile_id (ties broken by compute_type)."""
+    def _int(operation, key):
+        return ir.IntegerAttr(operation.attributes[key]).value
+
+    computes = (op.operation for op in walk_ops(skeleton_module.body)
+                if op.operation.name == ts.COMPUTE)
+    # sorting the (tile_id, compute_type) tuples puts the types in tile_id order
+    pairs = sorted((_int(o, ts.ATTR_TILE_ID), _int(o, ts.ATTR_COMPUTE_TYPE))
+                   for o in computes)
+    return [ctype for _tid, ctype in pairs]
+
+
+def build_cycle_table(skeleton_module, cycle_list, x_offset, w_offset):
+    """Pair each measured cycle with its overlapping cycle, in tile_id order.
+
+    `cycle_list` holds one measured cycle count per compute tile, in the same
+    order as the skeleton's tile_ids; raises ValueError when the counts differ.
+    `x_offset`/`w_offset` are forwarded unchanged to `overlapping_cycle`."""
+    tile_types = _compute_types(skeleton_module)
+    n_cycles, n_tiles = len(cycle_list), len(tile_types)
+    if n_cycles != n_tiles:
+        raise ValueError("cycle_list (%d) does not match #compute tiles (%d)"
+                         % (n_cycles, n_tiles))
+    pairs = zip(cycle_list, tile_types)
+    return [(int(c), overlapping_cycle(c, ty, x_offset, w_offset)) for c, ty in pairs]
+
+
+def dump_cycle_table(table, path, x_offset=None, w_offset=None):
+    """Write `table` to `path` as JSON with keys "x_offset", "w_offset" and
+    "table" (each entry converted to a list); return `path`."""
+    with open(path, "w") as out:
+        rows = [list(entry) for entry in table]
+        payload = {"x_offset": x_offset, "w_offset": w_offset, "table": rows}
+        json.dump(payload, out)
+    return path
+
+
+def load_cycle_table(path):  # returns the decoded JSON object stored at `path`
+    with open(path) as fh:
+        return json.load(fh)
+
+
+def dump_cycle_table_tsv(table, path):
+    """Write one `cycle<TAB>overlapping` line per table entry (in table order)
+    to `path` and return `path`; both fields are formatted as integers so the
+    C++ `--cycle_table` loader needs no JSON parser."""
+    with open(path, "w") as out:
+        # generator: rows are formatted and handed to the file one entry at a time
+        out.writelines("%d\t%d\n" % (int(cyc), int(ovl)) for cyc, ovl in table)
+    return path
diff --git a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py
index c0e82b66..10b2edfb 100644
--- a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py
+++ b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py
@@ -32,13 +32,7 @@
 OP_NAME = "togsim.transfer"
 MARKERS = (OP_NAME,)
 
-
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
+from ._mlir_util import walk_ops
 
 
 def _int_array(attr):
@@ -92,7 +86,7 @@ def run(module, vectorlane=128, **_):
     targets = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for op in _iter_ops(b):
+            for op in walk_ops(b):
                 if op.operation.name == OP_NAME:
                     targets.append(op.operation)
 
diff --git a/PyTorchSimFrontend/mlir/passes/dep_analysis.py b/PyTorchSimFrontend/mlir/passes/dep_analysis.py
new file mode 100644
index 00000000..bc53bfc9
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/dep_analysis.py
@@ -0,0 +1,194 @@
+"""dep_analysis.py -- dependency-edge analysis for the C++ trace pipeline (P3, sec 10).
+
+The current TOG pass does NO dependency analysis (it emits a lexical loop tree +
+runtime tags). This module derives the producer->consumer edges that the explicit
+dataflow trace needs, from two sources available on the post-vcix IR (before
+build_skeleton collapses the compute regions):
+
+  1. SRAM access: each DMA/compute's read/write SRAM buffer(s), recovered by
+     following SSA (a vcix.iv's input vector -> its vector.transfer_read -> the
+     memref -> @global), and the DMA's spad operand. Edge: a reader depends on
+     the last node that wrote the same buffer.
+  2. vcix preload/matmul pairing: a matmul (vcix opcode 0) consumes the weights a
+     preceding preload (opcode 1) loaded into the systolic array -- an SA-internal
+     dependency NOT visible as a memref access, so it comes from the opcode order.
+
+This is a node-level analysis (one node per build_tog compute/DMA node); the loops
+replay the nodes, so loop-carried edges (the Y_spad accumulator) are materialized
+per iteration downstream. First cut: buffer granularity (slot-level value matching
+is a later refinement). Output is an edge list for validation / to drive emit.
+"""
+import sys
+import os
+
+from .build_tog import TogBuilder, ir, _reset_ids
+from . import build_skeleton as _bs
+
+
+def _global_of(memref_val):
+    """Resolve a memref SSA value to the name of the memref.get_global that
+    (transitively, via operand 0 of each defining op) produces it, else None."""
+    owner = memref_val.owner
+    defop = owner if isinstance(owner, ir.Operation) else getattr(owner, "operation", None)
+    if defop is None:
+        return None
+    if defop.name == "memref.get_global":
+        return str(defop.attributes["name"]).strip('@" ')
+    if not defop.operands:
+        return None
+    try:                      # view-like op (subview/cast): chase its source operand
+        return _global_of(defop.operands[0])
+    except Exception:
+        return None
+
+
+def _read_buffers_of_compute(cn):
+    """Names of the @globals a compute node reads: the source memref of every
+    vector.transfer_read in `cn.operations`, plus the source memref of the
+    vector.transfer_read that directly defines operand 0 of each vcix.iv.
+    Sources that do not resolve to a global are skipped."""
+    found = set()
+    for op in cn.operations:
+        reader = op if op.name == "vector.transfer_read" else None
+        if op.name == "vcix.iv" and list(op.operands):
+            # only a transfer_read that directly defines the input vector counts
+            owner = op.operands[0].owner
+            reader = owner if isinstance(owner, ir.Operation) else getattr(owner, "operation", None)
+            if reader is not None and reader.name != "vector.transfer_read":
+                reader = None
+        ok = reader is not None and list(reader.operands)
+        name = _global_of(reader.operands[0]) if ok else None
+        if name:
+            found.add(name)
+    return found
+
+
+def _write_buffers_of_compute(cn):
+    """Names of the @globals behind the memref operands of the node's store ops."""
+    store_ops = ("vector.transfer_write", "affine.vector_store", "vector.store")
+    found = set()
+    for op in cn.operations:
+        if op.name not in store_ops:
+            continue
+        for operand in op.operands:   # every memref-typed operand counts as a target
+            try:
+                name = _global_of(operand) if ir.MemRefType.isinstance(operand.type) else None
+            except Exception:         # best effort: skip operands that cannot be inspected
+                continue
+            if name:
+                found.add(name)
+    return found
+
+
+def _dma_buffer(builder, dma_node):
+    """@global name of the DMA's spad side: `src` for a write, otherwise `dst`."""
+    try:
+        fields = builder._dma_start_fields(dma_node.op)
+    except Exception:      # fields could not be decoded -> buffer unknown
+        return None
+    side = "src" if dma_node.is_write else "dst"
+    return _global_of(fields[side])
+
+
+# Virtual buffer for the systolic-array weight registers: a preload writes it,
+# the following matmul reads it. This folds the SA-internal preload->matmul
+# dependency (not a memref access) into the uniform "last-writer per buffer" rule.
+SA_WEIGHTS = "__SA_WEIGHTS__"
+
+
+def compute_buffers(cn):
+    """Return `(reads, writes)` buffer-name sets for one compute node, with the
+    virtual SA_WEIGHTS buffer added: written by compute_type 2, read by type 1."""
+    reads = set(_read_buffers_of_compute(cn))
+    writes = set(_write_buffers_of_compute(cn))
+    # 1 = MATMUL (consumes the preloaded weights), 2 = PRELOAD (loads them)
+    virtual = {1: reads, 2: writes}.get(cn.compute_type)
+    if virtual is not None:
+        virtual.add(SA_WEIGHTS)
+    return reads, writes
+
+
+def analyze(module):
+    """Return (nodes, edges): nodes are dicts sorted by program position; edges
+    are (consumer_idx, producer_idx, reason) tuples that index into nodes."""
+    _reset_ids()
+    builder = TogBuilder()
+    _bs._build(module, builder)
+
+    nodes = []
+    # DMA nodes only (the map also contains TOGDMAWaitNode; keep real DMAs).
+    dma_nodes = [dn for dn in dict.fromkeys(_bs._collect_dma_nodes(builder).values())
+                 if hasattr(dn, "is_write")]
+    for dn in dma_nodes:
+        buf = _dma_buffer(builder, dn)
+        nodes.append({
+            "kind": "STORE" if dn.is_write else "LOAD",
+            "buf": buf, "arg": str(dn.base_addr),
+            "reads": {buf} if (buf and dn.is_write) else set(),       # unresolved buffer -> no edge
+            "writes": {buf} if (buf and not dn.is_write) else set(),  # (never track a None key)
+            "node": dn,
+        })
+    for cn in builder.compute_nodes:
+        if not cn.operations:
+            continue
+        ct = {0: "VECTOR", 1: "MATMUL", 2: "PRELOAD"}.get(cn.compute_type, f"c{cn.compute_type}")
+        nodes.append({
+            "kind": ct,
+            "reads": _read_buffers_of_compute(cn),
+            "writes": _write_buffers_of_compute(cn),
+            "node": cn,
+            "compute_type": cn.compute_type,
+        })
+
+    # Order nodes by program position (last-writer needs program order: e.g. the
+    # store reads Y_spad written by the matmul, which lexically precedes it).
+    pos = {}
+    idx = [0]
+    def _index(op):
+        pos[op] = idx[0]; idx[0] += 1
+        for r in op.regions:
+            for b in r.blocks:
+                for o in b.operations:
+                    _index(o)
+    _index(module.operation)
+    def _key(n):
+        node = n["node"]
+        op = getattr(node, "op", None) or (node.operations[0] if getattr(node, "operations", None) else None)
+        return pos.get(op, 1 << 30)   # nodes without a known position sort last
+    nodes.sort(key=_key)
+
+    # Edges: (1) buffer last-writer, (2) preload->matmul.
+    edges = []
+    last_writer = {}  # buffer -> node idx
+    prev_preload = None
+    for i, n in enumerate(nodes):
+        for b in sorted(n["reads"]):
+            if b in last_writer:
+                edges.append((i, last_writer[b], f"reads {b}"))
+        if n["kind"] == "MATMUL" and prev_preload is not None:
+            edges.append((i, prev_preload, "uses preloaded weights (vcix op1->op0)"))
+        for b in n["writes"]:
+            last_writer[b] = i
+        if n["kind"] == "PRELOAD":
+            prev_preload = i
+    return nodes, edges
+
+
+def _main():  # CLI: dep_analysis.py <post-vcix.mlir> -> prints the nodes and the edges
+    with open(sys.argv[1]) as fh:   # close the input instead of leaking the handle
+        text = fh.read()
+    ctx = ir.Context(); ctx.allow_unregistered_dialects = True
+    with ctx:
+        module = ir.Module.parse(text, ctx)
+        nodes, edges = analyze(module)
+    print("=== nodes ===")
+    for i, n in enumerate(nodes):
+        r, w = (",".join(sorted(n[k])) or "-" for k in ("reads", "writes"))
+        print(f"  #{i:<2} {n['kind']:<8} reads[{r}] writes[{w}]")
+    print("=== edges (consumer -> producer) ===")
+    for c, p, why in edges:
+        print(f"  #{c} ({nodes[c]['kind']}) -> #{p} ({nodes[p]['kind']})   [{why}]")
+
+
+if __name__ == "__main__":  # run as a script: dump the analysis of sys.argv[1]
+    _main()
diff --git a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py
index 3f583ef2..f1872dca 100644
--- a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py
+++ b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py
@@ -30,6 +30,8 @@
 
 import mlir.ir as ir  # noqa: E402
 
+from ._mlir_util import walk_ops, attr_i64_array
+
 MARKERS = ("subtile_size",)   # only subtile DMAs are split
 
 MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3
@@ -54,12 +56,6 @@ def _const_int(value, default=-1):
         return default
 
 
-def _int_array_attr(op, key):
-    if key not in op.attributes:
-        return []
-    return [ir.IntegerAttr(a).value for a in ir.ArrayAttr(op.attributes[key])]
-
-
 def _is_block_arg(v):
     return isinstance(v, ir.BlockArgument)
 
@@ -106,13 +102,13 @@ def tile_shape(self):
         return list(mt.shape)
 
     def subtile_size(self):
-        return _int_array_attr(self.op, "subtile_size")
+        return attr_i64_array(self.op, "subtile_size", default=[])
 
     def sram_stride(self):
-        return _int_array_attr(self.op, "sram_stride")
+        return attr_i64_array(self.op, "sram_stride", default=[])
 
     def dram_stride(self):
-        return _int_array_attr(self.op, "dram_stride")
+        return attr_i64_array(self.op, "dram_stride", default=[])
 
     def is_async(self):
         a = self.op.attributes
@@ -244,6 +240,27 @@ def _const_index(v, ip):
                             ir.IntegerAttr.get(ir.IndexType.get(), v), ip=ip).result
 
 
+def _fresh_tag(dma):
+    """Give this DMA a fresh tag memref.alloc right BEFORE the (pre-split) coarse
+    dma_start, and rewire every use of the old tag -- the dma_start re-emitted
+    below AND its dma_wait -- to it. The coarse dma sits at the reduction-loop body
+    level (it has not been wrapped in a subtile load nest yet), so the alloc there
+    dominates both the load nest fine-grained is about to build and the sibling
+    wait nest. Each reduction iteration thus allocates its own tag -> successive
+    iterations are distinct (multi-tile-K / conv) and the per-iteration tag
+    semantics is in the IR, not reconstructed downstream. Old alloc becomes dead."""
+    old = dma.tag  # the tag Value currently recorded on the DMA wrapper
+    new_tag = ir.Operation.create("memref.alloc", results=[old.type],
+                                  operands=[], ip=ir.InsertionPoint(dma.op)).results[0]
+    old.replace_all_uses_with(new_tag)  # every user of the old tag now sees the new alloc
+    dma.tag = new_tag
+    # the old (func-entry, per-tensor unique) alloc is now dead -- erase it.
+    try:
+        old.owner.erase()
+    except Exception:  # best effort: if the erase fails, the old definition is left in place
+        pass
+
+
 # ---------------------------------------------------------------------------
 # Loop-nest construction
 # ---------------------------------------------------------------------------
@@ -293,20 +310,12 @@ def _reaches(value, target):
 # ---------------------------------------------------------------------------
 # Pass driver
 # ---------------------------------------------------------------------------
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
-
-
 def _run_func(func, vectorlane):
     from mlir.dialects import linalg
     # First matmul only.
     matmul = None
     dmas = []
-    for op in _iter_ops(func.regions[0].blocks[0]):
+    for op in walk_ops(func.regions[0].blocks[0]):
         name = op.operation.name
         if name == "linalg.matmul" and matmul is None:
             matmul = op
@@ -363,6 +372,12 @@ def _run_func(func, vectorlane):
     for d, f in enumerate(fuse["w_to_fused"]):
         bounds[f] = w_counts[d]
 
+    # Give each load a fresh per-iteration tag alloc just before its coarse dma
+    # (rewiring its dma_wait via the old tag's uses), so the tag is distinct per
+    # reduction iteration -- positioned to match the per-iteration tag semantics.
+    _fresh_tag(mvin_input)
+    _fresh_tag(mvin_weight)
+
     # Insert the fused nest at the weight DMA (the later of the two): both DMAs'
     # original DRAM base indices (src_idx[0], computed in the enclosing loops) must
     # dominate the nest. Codegen emits input before weight, matching the C++ pass
diff --git a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py
index f5b841bb..998a6db5 100644
--- a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py
+++ b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py
@@ -22,6 +22,8 @@
 WAIT_NAME = "memref.dma_wait"
 MARKERS = (OP_NAME, WAIT_NAME)
 
+from ._mlir_util import attr_i64_array
+
 # func7 instruction codes (CustomDMAAttribute.h)
 CONFIG, CONFIG2, CONFIG3, CONFIG4 = 0, 4, 5, 6
 MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3
@@ -124,8 +126,8 @@ def elem_addr_i64(memref_val, indices, mtype, elem_bytes):
         tile_shape = _subtile(op)
         if tile_shape is None:
             tile_shape = list(dst_ty.shape) if is_mvin else list(src_ty.shape)
-        dram_strides = _int_array(op, "dram_stride")
-        spad_strides = _int_array(op, "sram_stride")
+        dram_strides = attr_i64_array(op, "dram_stride")
+        spad_strides = attr_i64_array(op, "sram_stride")
         assert len(tile_shape) == len(dram_strides) == len(spad_strides), \
             f"shape/stride rank mismatch: {tile_shape} {dram_strides} {spad_strides}"
 
@@ -180,11 +182,6 @@ def _subtile(op):
     return [IntegerAttr(a).value for a in ArrayAttr(op.attributes["subtile_size"])]
 
 
-def _int_array(op, name):
-    from mlir.ir import ArrayAttr, IntegerAttr
-    return [IntegerAttr(a).value for a in ArrayAttr(op.attributes[name])]
-
-
 def _elem_bytes(elem_type):
     from mlir.ir import IntegerType, FloatType
     bits = (IntegerType(elem_type).width if IntegerType.isinstance(elem_type)
diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py
new file mode 100644
index 00000000..6ade7442
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py
@@ -0,0 +1,613 @@
+"""lower_to_emitc pass (C4): skeleton+API MLIR -> EmitC -> C++ -> trace `.so`.
+
+Second stage of the C++ trace pipeline (docs/design/togsim_cpp_trace.md, sec
+5-7). Takes the skeleton+API module from `build_skeleton` (loop nest +
+`togsim.*` ops) and produces an EmitC module whose single entry function
+
+    extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n)
+
+mirrors the loop skeleton, with every `togsim.*` op as an `emitc.call_opaque`
+to the matching `togsim_runtime.h` free function (`togsim_ops.EMITC_CALLEE`).
+`mlir-translate --mlir-to-cpp` renders it to C++, compiled to a `.so` that
+exports `togsim_kernel` and leaves `togsim_dma/wait/compute/signal` undefined for
+the TOGSim loader to resolve at `dlopen`.
+
+How the lowering is done -- it drives the *upstream* EmitC conversion passes and
+adds only the glue they cannot do:
+
+  1. (python) Rewrite the unregistered `togsim.*` ops to `emitc.call_opaque`.
+     Unregistered ops have no registered conversion patterns, so this must be a
+     custom rewrite (design sec 8). Also rewrite the kernel's signature to the
+     ABI form (drop the memref tensor args -- the trace producer never touches
+     tensor data; base addresses are deferred to P3) and drop the aux
+     globals / wrapper func.
+  2. (upstream passes, in-process PassManager)
+        func.func(lower-affine) -> convert-scf-to-emitc
+        -> convert-arith-to-emitc -> convert-func-to-emitc
+     This is the EmitC infrastructure: it lowers the affine/scf loop nest to
+     `emitc.for`, the index/arith (loop bounds, and in P3 the address
+     arithmetic) to EmitC, and the func to `emitc.func`.
+  3. (python) Two small fixups the passes leave behind in this LLVM 20 build:
+       * `convert-scf-to-emitc` emits `emitc.for` with `index`-typed bounds, so
+         `convert-arith-to-emitc` (which makes constants `!emitc.size_t`) leaves
+         `builtin.unrealized_conversion_cast` on the bounds that nothing folds
+         and `mlir-to-cpp` cannot print (design sec 8 "EmitC coverage" risk).
+         `_fold_for_bound_casts` rewrites those casts away.
+       * add the `extern "C"` specifier so `dlsym` finds the entry unmangled.
+
+Requires the MLIR Python bindings (incl. `mlir.passmanager`); the .cpp/.so
+steps additionally require `mlir-translate` (TORCHSIM_LLVM_PATH) and a host C++
+compiler.
+"""
+
+import os
+import subprocess
+
+from mlir.passmanager import PassManager
+
+from . import togsim_ops as ts
+from ._mlir_util import walk_ops, i32, i64, attr_int, attr_i64_array
+from .build_tog import ir, _find_kernel
+
+#: emitted entry symbol (== ts.ENTRY_SYMBOL == "togsim_kernel").
+ENTRY = ts.ENTRY_SYMBOL
+
+#: EmitC type of the opaque EmitCtx* threaded through every call.
+CTX_TYPE = '!emitc.ptr<!emitc.opaque<"EmitCtx">>'
+
+#: upstream EmitC conversion pipeline (the infrastructure this pass drives).
+_PIPELINE = ("builtin.module("
+             "func.func(lower-affine),"
+             "convert-scf-to-emitc,"
+             "convert-arith-to-emitc,"
+             "convert-func-to-emitc)")
+
+#: prepended to the mlir-to-cpp output; pulls in size_t/intN_t and the ABI.
+_PRELUDE = (
+    "#include <cstddef>\n"
+    "#include <cstdint>\n"
+    "using std::size_t;\n"
+    '#include "togsim_runtime.h"\n'
+)
+
+
+# ---------------------------------------------------------------------------
+# attribute builders / readers
+# ---------------------------------------------------------------------------
+def _idx(v):  # `int(v)` wrapped as an index-typed IntegerAttr
+    return ir.IntegerAttr.get(ir.IndexType.get(), int(v))
+
+
+def _opaque(ctx, text):  # parse `#emitc.opaque<"text">` in `ctx`; `text` is inserted unescaped
+    return ir.Attribute.parse('#emitc.opaque<"%s">' % text, ctx)
+
+
+def _arr(ctx, vals):
+    """Opaque attr spelling `vals` as the C compound literal
+    `(const int64_t[]){...}` (each value via int()), or `nullptr` when empty."""
+    items = [str(int(v)) for v in vals]
+    if items:
+        return _opaque(ctx, "(const int64_t[]){%s}" % ", ".join(items))
+    return _opaque(ctx, "nullptr")
+
+
+def _attr_bool(op, key):  # BoolAttr `key` of `op` as the int 1 / 0
+    return 1 if ir.BoolAttr(op.operation.attributes[key]).value else 0
+
+
+# ---------------------------------------------------------------------------
+# step 1: rewrite signature + togsim.* ops (the unregistered-op glue)
+# ---------------------------------------------------------------------------
+def _strip_aux(module):
+    """Erase every top-level memref.global and every func.func whose sym_name
+    is not "kernel"; the victims are collected first, then erased."""
+    def _doomed(op):
+        kind = op.operation.name
+        if kind == "memref.global":
+            return True
+        if kind != "func.func":
+            return False
+        return ir.StringAttr(op.operation.attributes["sym_name"]).value != "kernel"
+    for op in [o for o in module.body.operations if _doomed(o)]:
+        op.operation.erase()
+
+
+def _rewrite_signature(kernel, ctx):
+    """Swap @kernel's (unused) args for the ABI args (EmitCtx*, int64_t*
+    shape_args, int32_t n), rename it to ENTRY, and return the new ctx Value.
+    Raises ValueError if any existing argument still has uses."""
+    block = kernel.regions[0].blocks[0]
+    for arg in block.arguments:
+        if len(list(arg.uses)) > 0:
+            raise ValueError(
+                "kernel arg still used after build_skeleton; cannot drop it "
+                "(expected the DCE to have removed all tensor-data ops)")
+    # erase existing (memref) args high-to-low, then append the ABI args.
+    for i in reversed(range(len(block.arguments))):
+        block.erase_argument(i)
+    ptr = ir.Type.parse(CTX_TYPE, ctx)
+    i64ptr = ir.Type.parse("!emitc.ptr<i64>", ctx)
+    i32 = ir.IntegerType.get_signless(32)  # local; shadows the `i32` name imported from _mlir_util
+    loc = ir.Location.unknown(ctx)
+    block.add_argument(ptr, loc)
+    block.add_argument(i64ptr, loc)
+    block.add_argument(i32, loc)
+    kernel.operation.attributes["function_type"] = ir.TypeAttr.get(
+        ir.FunctionType.get([ptr, i64ptr, i32], []))  # keep the func type in sync with the block args
+    kernel.operation.attributes["sym_name"] = ir.StringAttr.get(ENTRY)
+    return block.arguments[0]  # the EmitCtx* argument added first above
+
+
+def _call(ctx, ctx_val, op, callee, arg_attrs):
+    """Create `emitc.call_opaque <callee>` right before `op`, with `ctx_val` as
+    its only operand. `args` is `[0 : index] + arg_attrs`: the leading entry
+    refers to operand 0, the remaining attrs are emitted as literal C args."""
+    call_args = ir.ArrayAttr.get([_idx(0)] + arg_attrs)
+    attrs = {"callee": ir.StringAttr.get(callee), "args": call_args}
+    ir.Operation.create("emitc.call_opaque", results=[], operands=[ctx_val],
+                        attributes=attrs, loc=ir.Location.unknown(ctx),
+                        ip=ir.InsertionPoint(op))
+
+
+def _innermost_outer_loop(block):
+    """Return the last `affine.for {outer_loop=true}` met in a pre-order walk of
+    `block`, or None when there is no such loop.
+
+    For a single nest of outer loops this is the deepest one (the
+    PARALLEL/ACCUMULATION boundary), because every later hit replaces the
+    earlier one."""
+    def _last_outer(blk, best):
+        # `best` carries the most recent hit through the recursion instead of
+        # being stored in an enclosing mutable cell.
+        for op in blk.operations:
+            if op.operation.name == "affine.for" and _is_outer(op):
+                best = op            # later / deeper hits replace earlier ones
+            for region in op.operation.regions:
+                for inner in region.blocks:
+                    best = _last_outer(inner, best)
+        return best
+
+    return _last_outer(block, None)
+
+
+def _is_outer(forop):  # truthy iff the op carries an `outer_loop` BoolAttr set to true
+    a = forop.operation.attributes
+    return "outer_loop" in a and ir.BoolAttr(a["outer_loop"]).value
+
+
+def _parallel_loop_chain(block):
+    """Follow the first `affine.for {outer_loop}` of each block, starting at
+    `block` and descending into that loop's body, and return the loops visited
+    outermost-first (one work-item's parallel indices).
+
+    Returns an empty list if `block` itself holds no such loop."""
+    chain, scope = [], block
+    while True:
+        # only the first outer loop directly inside the current block is followed
+        head = next((op for op in scope.operations
+                     if op.operation.name == "affine.for" and _is_outer(op)), None)
+        if head is None:
+            return chain
+        chain.append(head)
+        # continue the search inside the body block of the loop just found
+        scope = head.operation.regions[0].blocks[0]
+
+
+def _const_op(value):
+    """Return the Operation defining `value` when that op is an arith.constant
+    or emitc.constant; None for block arguments and any other defining op."""
+    defining = value.owner
+    if not isinstance(defining, ir.Block) and defining.name in ("arith.constant", "emitc.constant"):
+        return defining
+    return None
+
+
+def _outline_work_item(ctx, kernel, ctx_val):
+    """Outline the innermost parallel work-item body into a uniform
+    `togsim_kernel_tile(ctx, iv, n)` func, replacing it with a
+    `togsim_dispatch(ctx, togsim_kernel_tile, iv, n)` call (sec 9.3). The
+    work-item SCOPE becomes the function body; the runtime wrapper owns the
+    core-alloc + the TILE_BEGIN/TILE_END boundary (a decorator). One uniform tile
+    signature -> a single general dispatcher serves every kernel.
+
+    Runs after `_rewrite_togsim_ops`, so the moved body holds emitc.call_opaque
+    (not togsim.* ops). The only values captured from outside the body are ctx,
+    the enclosing parallel induction vars, and constants -- threaded via the iv
+    array (parallel IVs) / cloned (constants); anything else is unsupported
+    (dynamic shape -> P4)."""
+    kblk = kernel.regions[0].blocks[0]
+    chain = _parallel_loop_chain(kblk)
+    if chain:
+        L = chain[-1]
+        Lbody = L.operation.regions[0].blocks[0]
+        ivs = [c.operation.regions[0].blocks[0].arguments[0] for c in chain]
+    else:                       # no parallel loop -> the whole kernel body is one work-item
+        L = None
+        Lbody = kblk
+        ivs = []
+
+    i64 = ir.IntegerType.get_signless(64)
+    i32 = ir.IntegerType.get_signless(32)
+    idxty = ir.IndexType.get()
+    ctxty = ir.Type.parse(CTX_TYPE, ctx)
+    i64ptr = ir.Type.parse("!emitc.ptr<i64>", ctx)
+    loc = ir.Location.unknown(ctx)
+
+    # --- the outlined tile function (before the kernel so C defines it first) ---
+    tile = ir.Operation.create(
+        "func.func", results=[], regions=1,
+        attributes={
+            "function_type": ir.TypeAttr.get(ir.FunctionType.get([ctxty, i64ptr, i32], [])),
+            "sym_name": ir.StringAttr.get(ts.TILE_SYMBOL),
+            "sym_visibility": ir.StringAttr.get("private")},
+        loc=loc, ip=ir.InsertionPoint(kernel))
+    with loc:
+        tblk = tile.regions[0].blocks.append(ctxty, i64ptr, i32)
+    ctx2, iv2, _n2 = tblk.arguments
+    with ir.InsertionPoint(tblk):
+        tret = ir.Operation.create("func.return", results=[], operands=[], loc=loc)
+
+    # in the tile fn: recover each parallel index = index_cast(iv[k]).
+    idx_vals = []
+    with ir.InsertionPoint(tret):
+        for k in range(len(ivs)):
+            kc = ir.Operation.create("emitc.constant", results=[i64],
+                    attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0]
+            elem = ir.Operation.create("emitc.subscript", results=[i64],
+                    operands=[iv2, kc], loc=loc).results[0]
+            idx_vals.append(ir.Operation.create("arith.index_cast", results=[idxty],
+                    operands=[elem], loc=loc).results[0])
+
+    # move the work-item body into the tile fn (terminators stay behind).
+    for op in [o for o in Lbody.operations
+               if o.operation.name not in ("affine.yield", "func.return")]:
+        op.operation.move_before(tret)
+
+    # remap captures (Value `==` is identity): ctx -> ctx2, each parallel IV ->
+    # its index_cast, each external constant -> a clone inside the tile fn. A
+    # constant defined inside the tile fn (moved/read) is internal -> left alone.
+    caps = [(ctx_val, ctx2)] + list(zip(ivs, idx_vals))
+    internal_consts = []
+    def _collect_internal(block):
+        for op in block.operations:
+            c = _const_op(op.operation.results[0]) if len(op.operation.results) == 1 else None
+            if c is not None:
+                internal_consts.append(op.operation.results[0])
+            for rg in op.operation.regions:
+                for b in rg.blocks:
+                    _collect_internal(b)
+    _collect_internal(tblk)
+    const_clones = []
+    ext_consts = []
+    def _find_ext_consts(block):
+        for op in block.operations:
+            for opnd in op.operation.operands:
+                if _const_op(opnd) is None:
+                    continue
+                if any(opnd == ic for ic in internal_consts):
+                    continue
+                if any(opnd == e for e in ext_consts):
+                    continue
+                ext_consts.append(opnd)
+            for rg in op.operation.regions:
+                for b in rg.blocks:
+                    _find_ext_consts(b)
+    _find_ext_consts(tblk)
+    top = ir.InsertionPoint(tblk.operations[0])
+    for e in ext_consts:
+        c = _const_op(e)
+        clone = ir.Operation.create(c.name, results=[e.type],
+                    attributes={"value": c.attributes["value"]}, loc=loc, ip=top).results[0]
+        const_clones.append((e, clone))
+
+    allcaps = caps + const_clones
+    def _remap(block):
+        for op in block.operations:
+            for i in range(len(op.operation.operands)):
+                cur = op.operation.operands[i]
+                for orig, new in allcaps:
+                    if cur == orig:
+                        op.operation.operands[i] = new
+                        break
+            for rg in op.operation.regions:
+                for b in rg.blocks:
+                    _remap(b)
+    _remap(tblk)
+
+    # --- the dispatcher: marshal the IVs and hand the tile fn to togsim_dispatch ---
+    term = [o for o in Lbody.operations
+            if o.operation.name in ("affine.yield", "func.return")][0]
+    fn_ref = _opaque(ctx, ts.TILE_SYMBOL)   # function name -> verbatim pointer in C
+    with ir.InsertionPoint(term):
+        if ivs:
+            arrty = ir.Type.parse("!emitc.array<%dxi64>" % len(ivs), ctx)
+            arr = ir.Operation.create("emitc.variable", results=[arrty],
+                    attributes={"value": _opaque(ctx, "")}, loc=loc).results[0]
+            for k, iv in enumerate(ivs):
+                kc = ir.Operation.create("emitc.constant", results=[i64],
+                        attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0]
+                v64 = ir.Operation.create("arith.index_cast", results=[i64],
+                        operands=[iv], loc=loc).results[0]
+                sub = ir.Operation.create("emitc.subscript", results=[i64],
+                        operands=[arr, kc], loc=loc).results[0]
+                # emitc.assign operands are (lvalue dest, value).
+                ir.Operation.create("emitc.assign", results=[], operands=[sub, v64], loc=loc)
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=[ctx_val, arr],
+                attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE),
+                            "args": ir.ArrayAttr.get(
+                                [_idx(0), fn_ref, _idx(1), ir.IntegerAttr.get(i32, len(ivs))])},
+                loc=loc)
+        else:
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=[ctx_val],
+                attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE),
+                            "args": ir.ArrayAttr.get(
+                                [_idx(0), fn_ref, _opaque(ctx, "nullptr"), ir.IntegerAttr.get(i32, 0)])},
+                loc=loc)
+
+
+def _rewrite_togsim_ops(ctx, kernel, ctx_val):
+    """Replace each togsim.* op in `kernel`'s entry block (walked via walk_ops) with an
+    `emitc.call_opaque` to its ts.EMITC_CALLEE symbol, then erase the replaced ops."""
+    block = kernel.regions[0].blocks[0]
+    victims = []
+    for op in walk_ops(block):
+        name = op.operation.name
+        ipo = ir.InsertionPoint(op)
+        if name == ts.DMA:
+            dims = attr_i64_array(op, ts.ATTR_DIMS)
+            # Operands carried by build_skeleton: [dram_index, tag_index] (each optional).
+            # A present operand is passed as a call operand -- so convert-arith-to-emitc
+            # lowers its arithmetic into the producer (P3 approach A) -- and referenced
+            # from `args` by operand position; an absent one becomes a constant i64(0).
+            # offset: DRAM offset, the runtime adds the tensor base (NOTE(review): unit
+            # unclear -- described as both element offset and byte address; confirm).
+            # tag_slot: the SRAM tile slot (runtime: double-buffer / SRAM capacity).
+            ins = list(op.operation.operands)
+            dram_operand = ins[0] if len(ins) >= 1 else None
+            tag_operand = ins[1] if len(ins) >= 2 else None
+            operands = [ctx_val]
+            offset_arg = i64(0)
+            tag_arg = i64(0)
+            if dram_operand is not None:
+                operands.append(dram_operand)
+                offset_arg = _idx(len(operands) - 1)
+            if tag_operand is not None:
+                operands.append(tag_operand)
+                tag_arg = _idx(len(operands) - 1)
+            args = [_idx(0),
+                    i32(attr_int(op, ts.ATTR_DIR)),
+                    i32(attr_int(op, ts.ATTR_ARG_ID)),
+                    offset_arg,
+                    i32(len(dims)),
+                    _arr(ctx, dims),
+                    _arr(ctx, attr_i64_array(op, ts.ATTR_STRIDES)),
+                    i32(attr_int(op, ts.ATTR_ELEM_BITS)),
+                    i32(_attr_bool(op, ts.ATTR_IS_ASYNC)),
+                    i32(attr_int(op, ts.ATTR_TAG_ID)),
+                    tag_arg]
+            _rb = attr_i64_array(op, ts.ATTR_READ_BUFS)
+            _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS)
+            args += [_arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))]
+            # togsim_dma is void: the dma is paired with its barrier by the runtime
+            # (tag_id, tag_slot), not a returned handle.
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=operands,
+                attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.DMA]),
+                            "args": ir.ArrayAttr.get(args)},
+                loc=ir.Location.unknown(ctx), ip=ipo)
+            victims.append(op)
+        elif name == ts.MEMORY_BAR:
+            # explicit async-DMA sync (the original dma_wait) ->
+            # togsim_memory_barrier(ctx, tag_id, tag_slot, write_bufs). The tag
+            # index operand (if any) is the runtime tag slot.
+            ins = list(op.operation.operands)
+            operands = [ctx_val]
+            tag_arg = i64(0)
+            if ins:
+                operands.append(ins[0])
+                tag_arg = _idx(len(operands) - 1)
+            _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS)
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=operands,
+                attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.MEMORY_BAR]),
+                            "args": ir.ArrayAttr.get(
+                                [_idx(0), i32(attr_int(op, ts.ATTR_TAG_ID)), tag_arg,
+                                 _arr(ctx, _wb), i32(len(_wb))])},
+                loc=ir.Location.unknown(ctx), ip=ipo)
+            victims.append(op)
+        elif name == ts.COMPUTE:
+            # skeleton compute carries no dims (cost is keyed by tile_id) -> 0/null.
+            _rb = attr_i64_array(op, ts.ATTR_READ_BUFS)
+            _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS)
+            _call(ctx, ctx_val, op, ts.EMITC_CALLEE[ts.COMPUTE],
+                  [i64(attr_int(op, ts.ATTR_TILE_ID)),
+                   i32(attr_int(op, ts.ATTR_COMPUTE_TYPE)),
+                   i32(0), _opaque(ctx, "nullptr"),
+                   _arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))])
+            victims.append(op)
+        elif name == ts.COMPUTE_BAR:
+            # explicit compute fence -> togsim_compute_barrier(ctx) (sec 10.7).
+            ir.Operation.create(
+                "emitc.call_opaque", results=[], operands=[ctx_val],
+                attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.COMPUTE_BAR]),
+                            "args": ir.ArrayAttr.get([_idx(0)])},
+                loc=ir.Location.unknown(ctx), ip=ipo)
+            victims.append(op)
+    for op in victims:
+        op.operation.erase()
+
+
+# ---------------------------------------------------------------------------
+# step 3: post-conversion fixups
+# ---------------------------------------------------------------------------
+def _retype_for_to_size_t(module):
+    """Post-conversion fixup: give every `emitc.for` a `!emitc.size_t` induction
+    variable, then remove the `index`<->`!emitc.size_t`
+    `unrealized_conversion_cast` ops left by `convert-scf-to-emitc` /
+    `convert-arith-to-emitc` (mlir-to-cpp cannot print them; --reconcile cannot
+    fold them).
+
+    A cast is folded when both its source and result type are `index` or
+    `size_t`: all uses of the result are redirected to the source and the cast is
+    erased. This relies on every consumer here (`emitc.for` bounds,
+    `emitc.call_opaque` operands, `emitc` arith) accepting either type. The
+    module is modified in place; nothing is returned."""
+    index_ty = ir.IndexType.get()
+    size_ty = ir.Type.parse("!emitc.size_t", module.context)
+    bridged = (index_ty, size_ty)
+    # pass 1: retype the induction variable (entry-block argument 0) of each loop.
+    loops = [o for o in walk_ops(module.body) if o.operation.name == "emitc.for"]
+    for loop in loops:
+        loop.operation.regions[0].blocks[0].arguments[0].set_type(size_ty)
+    # pass 2: fold the bridging casts (incl. size_t->size_t identities left by
+    # pass 1) onto their source value, remembering them for erasure.
+    folded = []
+    for cast in list(walk_ops(module.body)):
+        if cast.operation.name != "builtin.unrealized_conversion_cast":
+            continue
+        result = cast.results[0]
+        source = list(cast.operation.operands)[0]
+        if source.type in bridged and result.type in bridged:
+            result.replace_all_uses_with(source)
+            folded.append(cast)
+    for cast in folded:
+        try:
+            cast.operation.erase()
+        except Exception:  # best-effort, as before: a failed erase is ignored
+            pass
+
+
+def _add_extern_c(module, ctx):
+    """Mark the top-level `emitc.func` named ENTRY as `extern "C"`; ValueError if absent."""
+    for func in (top.operation for top in module.body.operations):
+        if (func.name == "emitc.func"
+                and ir.StringAttr(func.attributes["sym_name"]).value == ENTRY):
+            func.attributes["specifiers"] = ir.ArrayAttr.get([ir.StringAttr.get('extern "C"')])
+            return
+    raise ValueError("emitc.func @%s not found after conversion" % ENTRY)
+
+
+# ---------------------------------------------------------------------------
+# driver
+# ---------------------------------------------------------------------------
+def lower_to_emitc(skeleton_module):
+    """Lower a skeleton+API module (in place) to an EmitC module with the `togsim_kernel`
+    entry function and return it. Raises ValueError if the module has no @kernel."""
+    ctx = skeleton_module.context
+    kernel = _find_kernel(skeleton_module)
+    if kernel is None:
+        raise ValueError("no @kernel found in skeleton module")
+    # Custom pre-conversion rewrites; ctx_val (from _rewrite_signature) feeds the next two.
+    _strip_aux(skeleton_module)
+    ctx_val = _rewrite_signature(kernel, ctx)
+    _rewrite_togsim_ops(ctx, kernel, ctx_val)         # togsim.* -> emitc.call_opaque
+    _outline_work_item(ctx, kernel, ctx_val)          # work-item body -> togsim_kernel_tile + dispatch
+    # Run the _PIPELINE pass pipeline over the whole module.
+    PassManager.parse(_PIPELINE, ctx).run(skeleton_module.operation)
+    # Post-conversion fixups: fold index<->size_t casts, mark the entry extern "C".
+    _retype_for_to_size_t(skeleton_module)
+    _add_extern_c(skeleton_module, ctx)
+    return skeleton_module
+
+
+# ---------------------------------------------------------------------------
+# C++ / .so backend
+# ---------------------------------------------------------------------------
+def _mlir_translate_bin():
+    """Path of `mlir-translate` under $TORCHSIM_LLVM_PATH (default `/usr/bin`)."""
+    return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"), "mlir-translate")
+
+
+def emitc_to_cpp(emitc_module, mlir_translate=None):
+    """Pipe the module text through `mlir-translate --mlir-to-cpp`; return prelude + C++.
+    Raises RuntimeError (carrying the tool's stderr) on a non-zero exit."""
+    tool = mlir_translate if mlir_translate else _mlir_translate_bin()
+    result = subprocess.run([tool, "--mlir-to-cpp"], input=str(emitc_module),
+                            capture_output=True, text=True)
+    if result.returncode:
+        raise RuntimeError("mlir-translate --mlir-to-cpp failed:\n" + result.stderr)
+    return _PRELUDE + result.stdout
+
+
+def compile_so(cpp_text, so_path, include_dir, cxx=None):
+    """Write `cpp_text` next to `so_path` (same stem, `.cpp`) and compile it into the
+    shared object `so_path` (returned); RuntimeError on compiler failure. `include_dir`
+    must hold togsim_runtime.h; togsim_* symbols stay undefined (resolved at dlopen)."""
+    compiler = cxx if cxx else os.environ.get("CXX", "g++")
+    source_path = os.path.splitext(so_path)[0] + ".cpp"
+    with open(source_path, "w") as src:
+        src.write(cpp_text)
+    flags = ["-shared", "-fPIC", "-std=gnu++17", "-O2", "-I", include_dir]
+    result = subprocess.run([compiler, *flags, source_path, "-o", so_path],
+                            capture_output=True, text=True)
+    if result.returncode:
+        raise RuntimeError("%s failed:\n%s" % (compiler, result.stderr))
+    return so_path
+
+
+def _default_include_dir():
+    """Return <root>/TOGSim/include: root is $TORCHSIM_DIR, else 4 dirname()s above __file__."""
+    here = os.path.abspath(__file__)
+    for _ in range(4):  # the file's directory, then three more levels up
+        here = os.path.dirname(here)
+    return os.path.join(os.environ.get("TORCHSIM_DIR") or here, "TOGSim", "include")
+
+
+def skeleton_to_so(skeleton_module, so_path, include_dir=None):
+    """Lower the skeleton module to EmitC, render it as C++ and compile `so_path`
+    (headers from `include_dir`, else the default). Returns the EmitC module text."""
+    emitc_module = lower_to_emitc(skeleton_module)
+    cpp_text = emitc_to_cpp(emitc_module)
+    compile_so(cpp_text, so_path, include_dir if include_dir else _default_include_dir())
+    return str(emitc_module)
+
+
+def build_trace_so(postvcix_path, so_path, include_dir=None):
+    """Full P2 path: post-vcix kernel .mlir -> trace `.so`; returns skeleton_to_so's result."""
+    from . import build_skeleton as bs
+    with open(postvcix_path) as fh:  # read up front so the handle is closed promptly
+        mlir_text = fh.read()
+    with ir.Context() as ctx:
+        ctx.allow_unregistered_dialects = True
+        module = ir.Module.parse(mlir_text, ctx)
+        bs.build_skeleton(module)
+        return skeleton_to_so(module, so_path, include_dir)
+
+
+def main(argv):
+    """CLI entry: post-vcix kernel .mlir -> trace `.so` (optionally dump MLIR / C++)."""
+    import argparse
+    import sys
+    parser = argparse.ArgumentParser(prog="lower_to_emitc.py")
+    parser.add_argument("input", help="post-vcix kernel .mlir")
+    parser.add_argument("--so", required=True, help="output .so path")
+    parser.add_argument("--include-dir", default=None,
+                        help="dir holding togsim_runtime.h (default: TOGSim/include)")
+    parser.add_argument("--emit-cpp", default=None, help="also write the generated C++ here")
+    parser.add_argument("--emit-mlir", default=None, help="also write the EmitC MLIR here")
+    args = parser.parse_args(argv[1:])
+    from . import build_skeleton as bs
+    with open(args.input) as fh:  # every file below is opened under `with` so it is closed
+        mlir_text = fh.read()
+    with ir.Context() as ctx:
+        ctx.allow_unregistered_dialects = True
+        module = ir.Module.parse(mlir_text, ctx)
+        bs.build_skeleton(module)
+        emitc = lower_to_emitc(module)
+        if args.emit_mlir:
+            with open(args.emit_mlir, "w") as fh:
+                fh.write(str(emitc))
+        cpp = emitc_to_cpp(emitc)
+        if args.emit_cpp:
+            with open(args.emit_cpp, "w") as fh:
+                fh.write(cpp)
+        compile_so(cpp, args.so, args.include_dir or _default_include_dir())
+    sys.stderr.write("wrote %s\n" % args.so)
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main(sys.argv))
diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py
index ac93ebc8..df124d00 100644
--- a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py
+++ b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py
@@ -29,6 +29,8 @@
 
 import mlir.ir as ir  # noqa: E402
 
+from ._mlir_util import walk_ops, i32, i64, attr_bool
+
 MARKERS = ("linalg.matmul", "math.exp", "math.erf", "math.tanh", "math.sin", "math.cos")
 
 # math op name -> (opcode, imm) for the vcix.v.iv lowering (mirror Math*ToVCIX).
@@ -80,20 +82,12 @@ def _legalize_vector_type(vt, vlen):
     return n, ir.VectorType.get([elt_count >> (n - 1)], elt_ty, scalable=[True])
 
 
-def _i64(v):
-    return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), v)
-
-
-def _i32(v):
-    return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), v)
-
-
 def _viv(operand, result_ty, opcode, imm, rvl=None):
     """Create an unregistered vcix.v.iv (vcix::BinaryImmOp) op at the current IP."""
     operands = [operand] if rvl is None else [operand, rvl]
     return ir.Operation.create(
         "vcix.v.iv", results=[result_ty], operands=operands,
-        attributes={"opcode": _i64(opcode), "imm": _i32(imm)}).results[0]
+        attributes={"opcode": i64(opcode), "imm": i32(imm)}).results[0]
 
 
 def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm):
@@ -104,7 +98,7 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm):
     scalable = legal_ty.scalable
     rvl = None
     if scalable:
-        rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), _i64(9)).result
+        rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), i64(9)).result
     if n == 1:
         return _viv(vec, legal_ty, opcode, imm, rvl)
     elt_ty = legal_ty.element_type
@@ -119,24 +113,16 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm):
         for i in range(total // elt_count):
             ext = vector.ExtractStridedSliceOp(
                 legal_ty, vec,
-                ir.ArrayAttr.get([_i64(i * elt_count)]),
-                ir.ArrayAttr.get([_i64(elt_count)]),
-                ir.ArrayAttr.get([_i64(1)])).result
+                ir.ArrayAttr.get([i64(i * elt_count)]),
+                ir.ArrayAttr.get([i64(elt_count)]),
+                ir.ArrayAttr.get([i64(1)])).result
             v = _viv(ext, legal_ty, opcode, imm, rvl)
             res = vector.InsertStridedSliceOp(
-                v, res, ir.ArrayAttr.get([_i64(i * elt_count)]),
-                ir.ArrayAttr.get([_i64(1)])).result
+                v, res, ir.ArrayAttr.get([i64(i * elt_count)]),
+                ir.ArrayAttr.get([i64(1)])).result
     return res
 
 
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
-
-
 # ---------------------------------------------------------------------------
 # matmul lowering helpers (mirror MatmulOpLowering)
 # ---------------------------------------------------------------------------
@@ -146,11 +132,6 @@ def _elt_bits(elt_ty):
     return ir.FloatType(elt_ty).width
 
 
-def _bool_attr_true(op, key):
-    a = op.attributes
-    return key in a and ir.BoolAttr(a[key]).value
-
-
 def _enclosing_loops(op):
     """Walk ancestor ops; return (accumulation, outer, inner) affine.for lists,
     outermost-first (mirror the C++ insert-at-begin)."""
@@ -158,11 +139,11 @@ def _enclosing_loops(op):
     parent = op.operation.parent
     while parent is not None:
         if parent.name == "affine.for":
-            if _bool_attr_true(parent, "accumulation_loop"):
+            if attr_bool(parent, "accumulation_loop"):
                 acc.insert(0, parent)
-            if _bool_attr_true(parent, "outer_loop"):
+            if attr_bool(parent, "outer_loop"):
                 outer.insert(0, parent)
-            if _bool_attr_true(parent, "inner_loop"):
+            if attr_bool(parent, "inner_loop"):
                 inner.insert(0, parent)
         parent = parent.parent
     return acc, outer, inner
@@ -200,7 +181,7 @@ def _scan_conv_offsets(ow_loop, o_h, k_h, o_w, k_w):
     """Mirror the heuristic offset scan: find affine.apply(o_h,k_h)/(o_w,k_w) in the
     o_w loop and read the constant in its map (default 1)."""
     offset_h = offset_w = 1
-    for o in _iter_ops(ow_loop.regions[0].blocks[0]):
+    for o in walk_ops(ow_loop.regions[0].blocks[0]):
         if o.operation.name != "affine.apply":
             continue
         ops = list(o.operation.operands)
@@ -391,7 +372,7 @@ def _root(v):
                 return owner.operands[0]
         return v
     rootA, rootB = _root(A), _root(B)
-    for o in _iter_ops(outer[-1].regions[0].blocks[0]):
+    for o in walk_ops(outer[-1].regions[0].blocks[0]):
         if o.operation.name == "affine.vector_store":
             dest = _root(o.operation.operands[1])
             if dest == rootA:
@@ -488,6 +469,14 @@ def _root(v):
         # --- B dma_wait ---
         nacc = len(acc)
         acc_ivs = [_loop_iv(l) for l in acc]
+        # LEGACY behavior: coefficient -1 on each accumulation (reduction) loop var
+        # is a SENTINEL marking "this tag dim is the reduction axis", not an
+        # arithmetic offset. The legacy TOG path (TileGraphParser.cc) honors it by
+        # routing those vars to a separate accum tag component and skipping stride
+        # -1. The C++ trace path does NOT honor it: build_skeleton._strip_accum_terms
+        # drops these -1 terms so the memory_barrier slot stays subtile-only and
+        # pairs with its async load. Kept here for byte-identity with the C++
+        # -test-pytorchsim-to-vcix pass; remove (do not flag) once legacy retires.
         bexpr = ir.AffineDimExpr.get(0) * -1
         for i in range(1, nacc):
             bexpr = bexpr + ir.AffineDimExpr.get(i) * -1
@@ -544,6 +533,10 @@ def _root(v):
 
     with body_ip:
         # --- A dma_wait ---
+        # LEGACY behavior (see the B dma_wait above): the -1 coefficients mark the
+        # reduction axis for the legacy TOG path; the trace path strips them in
+        # build_skeleton._strip_accum_terms. Kept for byte-identity with the C++
+        # -test-pytorchsim-to-vcix pass; remove once legacy retires.
         aexpr = ir.AffineDimExpr.get(0) * -1
         for i in range(1, nacc):
             aexpr = aexpr + ir.AffineDimExpr.get(i) * -1
@@ -617,7 +610,7 @@ def run(module, vectorlane=128, vlen=128, **_):
     mms = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for o in _iter_ops(b):
+            for o in walk_ops(b):
                 if o.operation.name == "linalg.matmul":
                     mms.append(o.operation)
     for o in mms:
@@ -625,7 +618,7 @@ def run(module, vectorlane=128, vlen=128, **_):
     targets = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for op in _iter_ops(b):
+            for op in walk_ops(b):
                 if op.operation.name in _MATH_VIV:
                     targets.append(op.operation)
     for op in targets:
diff --git a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py
index 76e30cb3..3ed0a394 100644
--- a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py
+++ b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py
@@ -24,13 +24,7 @@
 OP_NAME = "torchsim.vlane_idx"
 MARKERS = (OP_NAME,)
 
-
-def _iter_ops(block):
-    for op in list(block.operations):
-        yield op
-        for region in op.operation.regions:
-            for b in region.blocks:
-                yield from _iter_ops(b)
+from ._mlir_util import walk_ops
 
 
 def run(module, **_):
@@ -46,7 +40,7 @@ def run(module, **_):
     targets = []
     for region in module.operation.regions:
         for b in region.blocks:
-            for op in _iter_ops(b):
+            for op in walk_ops(b):
                 if op.operation.name == OP_NAME:
                     targets.append(op.operation)
 
diff --git a/PyTorchSimFrontend/mlir/passes/togsim_ops.py b/PyTorchSimFrontend/mlir/passes/togsim_ops.py
new file mode 100644
index 00000000..740a8f2f
--- /dev/null
+++ b/PyTorchSimFrontend/mlir/passes/togsim_ops.py
@@ -0,0 +1,106 @@
+"""Shared vocabulary for the skeleton+API MLIR form (C1).
+
+The trace pipeline (docs/design/togsim_cpp_trace.md) reduces a kernel's MLIR to
+a *loop skeleton + API calls*: native `affine.for`/`scf.for` loops (bounds kept
+as-is, symbolic preserved) plus a handful of `togsim.*` ops that stand for the
+runtime API. This module is the single source of truth for those op names and
+attribute keys, shared by:
+
+  * build_skeleton (C2) -- produces the skeleton+API MLIR, and
+  * togsim->emitc lowering (C4) -- rewrites each op to an `emitc.call_opaque`.
+
+The ops are kept *unregistered* (like the existing `togsim.transfer`), so there
+is no C++ dialect to register; C4 is a custom rewrite, not a registered
+ConversionPass.
+
+Grammar (each op lowers 1:1 to a `togsim_runtime.h` free function):
+
+    "togsim.dma"(%dram_idx, %tag_idx) {         -> togsim_dma(ctx, dir, arg_id,
+            dir = 0 | 1,            # LOAD|STORE      offset, ndim, dims, strides,
+            dims = [..], strides = [..],                elem_bits, is_async,
+            elem_bits = i32, is_async = bool,           tag_id, tag_slot,
+            tag_id = i32, arg_id = i32,                 read_bufs, write_bufs)
+            read_bufs = [..], write_bufs = [..]
+         } : (index, index) -> ()
+
+    "togsim.compute"() {                        -> togsim_compute(ctx, tile_id,
+            tile_id = i64, compute_type = i32,          compute_type, ndim, dims,
+            read_bufs = [..], write_bufs = [..]         read_bufs, write_bufs)
+         } : () -> ()
+
+    "togsim.memory_barrier"(%tag_idx) {         -> togsim_memory_barrier(ctx,
+            tag_id = i32, write_bufs = [..]             tag_id, tag_slot, write_bufs)
+         } : (index) -> ()
+
+    "togsim.compute_barrier"() : () -> ()       -> togsim_compute_barrier(ctx)
+
+How an async dma pairs with its sync point: NOT by a compile-time id. One static
+`togsim.dma` op runs once per loop iteration, each with a different RUNTIME tag
+slot `%tag[%idx]`, so the pairing must be a runtime key. `togsim.dma` carries a
+`tag_id` (its tag memref identity) and the runtime `%tag[%idx]` operand; the
+original `memref.dma_wait` becomes an explicit `togsim.memory_barrier` carrying
+the same `tag_id` + tag index. They pair at runtime by `(tag_id, tag_slot)` via
+the Core's tag table (the dma signals the tag at data-arrival; the barrier waits
+it). `tag_id` (which tag memref) is distinct from `tag_slot` (the SRAM tile slot,
+used for the double-buffer / capacity model). A sync (non-async) dma is blocking,
+so it needs no barrier. (Supersedes the earlier static `event_id` + `togsim.wait`
+design, which could not express per-iteration pairing.)
+
+Keep this in lockstep with TOGSim/include/togsim_runtime.h (TOGSIM_ABI_VERSION).
+"""
+
+# ---- op names -------------------------------------------------------------
+DMA    = "togsim.dma"
+COMPUTE = "togsim.compute"
+COMPUTE_BAR = "togsim.compute_barrier"  # fence: drain async compute before a consumer (sec 10.7)
+MEMORY_BAR = "togsim.memory_barrier"    # explicit async-DMA sync (the original dma_wait); tag-keyed
+
+#: every op this module owns (for matchers / DCE roots in C2).
+OP_NAMES = (DMA, COMPUTE, COMPUTE_BAR, MEMORY_BAR)
+
+#: op name -> the togsim_runtime.h symbol C4 lowers it to.
+EMITC_CALLEE = {
+    DMA:     "togsim_dma",
+    COMPUTE: "togsim_compute",
+    COMPUTE_BAR: "togsim_compute_barrier",
+    MEMORY_BAR: "togsim_memory_barrier",
+}
+
+#: producer entry-point symbol the TOGSim loader resolves (see togsim_runtime.h).
+ENTRY_SYMBOL = "togsim_kernel"
+
+#: outlined per-work-item function the dispatcher hands to togsim_dispatch
+#: (uniform signature (ctx, int64* iv, i32 n); see togsim_cpp_trace.md sec 9.3).
+TILE_SYMBOL = "togsim_kernel_tile"
+
+#: runtime callees emitted directly by lower_to_emitc (not skeleton ops), kept in
+#: lockstep with togsim_runtime.h. DISPATCH_CALLEE is the higher-order wrapper the
+#: dispatcher loop calls per work-item (round-robins a core + TILE_BEGIN/END);
+#: TILE_SYMBOL is passed to it as the function pointer.
+DISPATCH_CALLEE = "togsim_dispatch"
+
+# ---- attribute keys -------------------------------------------------------
+ATTR_DIR       = "dir"        # i32: DIR_LOAD | DIR_STORE
+ATTR_DIMS      = "dims"       # i64 array: tile extents
+ATTR_STRIDES   = "strides"    # i64 array: tile strides
+ATTR_ELEM_BITS = "elem_bits"  # i32
+ATTR_IS_ASYNC  = "is_async"   # bool
+ATTR_TILE_ID   = "tile_id"    # i64: key into the precomputed tile_id->cycle table
+ATTR_COMPUTE_TYPE = "compute_type"  # i32: 0 vector / 1 matmul / 2 preload (Core enum)
+ATTR_READ_BUFS  = "read_bufs"   # i64 array: SRAM buffer ids this op reads  (sec 10 dataflow)
+ATTR_WRITE_BUFS = "write_bufs"  # i64 array: SRAM buffer ids this op writes (sec 10 dataflow)
+ATTR_TAG_ID    = "tag_id"     # i32: identity of the DMA's tag memref; pairs an async dma with
+                              #      its memory_barrier by the RUNTIME tag slot (tag_id + tag index)
+ATTR_ARG_ID    = "arg_id"     # i32: which tensor (func arg) this DMA's base is
+
+# Must match togsim_dma_dir in togsim_runtime.h.
+DIR_LOAD  = 0
+DIR_STORE = 1
+
+
+def is_togsim_op(op):
+    """Return whether `op` (an Operation, or a view wrapping one) is a togsim.* op."""
+    op_name = getattr(op, "name", None)
+    if op_name is None:  # no direct name: look through the `.operation` wrapper
+        op_name = getattr(getattr(op, "operation", None), "name", None)
+    return op_name in OP_NAMES
diff --git a/Simulator/simulator.py b/Simulator/simulator.py
index 2b9f05be..5f6ed08e 100644
--- a/Simulator/simulator.py
+++ b/Simulator/simulator.py
@@ -560,7 +560,20 @@ def run_standalone(
             os.fsync(trace_file.fileno())
 
         try:
-            cmd = f"{TOGSimulator.get_togsim_command(config_path, togsim_path)} --models_list {trace_file_path}"
+            # The C++ TOG (trace) path is the DEFAULT: drive the simulation from the
+            # emitted trace.so. The legacy ONNX TOG is the opt-in fallback via
+            # TORCHSIM_LEGACY_TOG=1. Autotune candidates each retile while the .so is
+            # one tiling, so they always run legacy; the trace path drives the final
+            # (chosen-tiling) run. Fall back to legacy if the .so was not emitted.
+            trace_so = os.path.join(os.path.dirname(str(model_path)), "trace.so")
+            cycle_tsv = os.path.join(os.path.dirname(str(model_path)), "trace_cycles.tsv")
+            base_cmd = TOGSimulator.get_togsim_command(config_path, togsim_path)
+            use_trace = (os.environ.get("TORCHSIM_LEGACY_TOG") != "1"
+                         and not autotune_mode and os.path.exists(trace_so))
+            if use_trace:
+                cmd = f"{base_cmd} --trace_so {trace_so} --cycle_table {cycle_tsv}"
+            else:
+                cmd = f"{base_cmd} --models_list {trace_file_path}"
             if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL:
                 cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}"
 
diff --git a/TOGSim/include/Core.h b/TOGSim/include/Core.h
index 286feb5f..0b6f8595 100644
--- a/TOGSim/include/Core.h
+++ b/TOGSim/include/Core.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <robin_hood.h>
 #include <unordered_set>
+#include <map>
 #include <memory>
 #include <vector>
 #include <fmt/core.h>
@@ -24,6 +25,10 @@ class Core {
   Core(uint32_t id, SimulationConfig config);
   ~Core()=default;
   virtual bool running();
+  // True if this core has work actively in flight (DMA / compute pipeline / queues)
+  // that will produce a future finish event -- i.e. running() minus "tiles waiting".
+  // Used by the frozen-state (spad-too-small) guard.
+  bool has_inflight();
   virtual bool can_issue(const std::shared_ptr<Tile>& op);
   virtual void issue(std::shared_ptr<Tile> tile);
   virtual std::shared_ptr<Tile> pop_finished_tile();
@@ -55,6 +60,16 @@ class Core {
   void sa_cycle();
   bool can_issue_compute(std::shared_ptr<Instruction>& inst);
   void update_stats();
+  // SRAM-capacity throttle (sec 10.x): a consumer frees the buffer-versions it
+  // read (refcount -> 0 releases the spad bytes). Called when COMP/MOVOUT issue.
+  void release_sram(const std::shared_ptr<Instruction>& inst);
+  // SA weight-buffer throttle (sec 10.x): pick a systolic array that has a free
+  // weight slot (round-robin among free); -1 if all full -> the preload stalls.
+  int pick_free_weight_sa();
+  // Free weight slots due this cycle: a matmul releases its slot at its
+  // streaming-end (finish - overlapping, when it stops reading the weight),
+  // scheduled at issue in _weight_release_q. Last consumer frees it.
+  void process_weight_releases();
 
   /* Core id & config file */
   const uint32_t _id;
@@ -103,4 +118,20 @@ class Core {
   std::queue<mem_fetch*> _request_queue;
   std::queue<mem_fetch*> _response_queue;
   uint32_t _waiting_write_reqs;
+
+  // SRAM-capacity throttle (sec 10.x). _sram_used = current per-core spad bytes;
+  // _sram_capacity = limit (0 = disabled); _sram_allocs maps a buffer-version id
+  // to its accumulated footprint bytes (freed when its last reader issues).
+  size_t _sram_used = 0;
+  size_t _sram_capacity = 0;
+  std::unordered_map<int64_t, size_t> _sram_allocs;
+
+  // SA weight-buffer throttle (sec 10.x). _weight_slots_used[s] = weights resident
+  // on SA s (loaded by a preload, not yet freed by their last matmul);
+  // _weight_slot_depth = per-SA capacity (0 = disabled -> plain round-robin).
+  std::vector<int> _weight_slots_used;
+  uint32_t _weight_slot_depth = 0;
+  // Pending weight-slot releases keyed by cycle (each matmul's streaming-end);
+  // process_weight_releases() drains those due and decrements the token.
+  std::multimap<cycle_type, std::shared_ptr<WeightToken>> _weight_release_q;
 };
\ No newline at end of file
diff --git a/TOGSim/include/Instruction.h b/TOGSim/include/Instruction.h
index bb62a440..fa5d4ca1 100644
--- a/TOGSim/include/Instruction.h
+++ b/TOGSim/include/Instruction.h
@@ -12,7 +12,16 @@
 #include <memory>
 #include <vector>
 
-enum class Opcode { MOVIN, MOVOUT, COMP, BAR, COUNT};
+// MEMORY_BAR: the DMA/memory barrier (waits a DMA tag in the tag table).
+// COMPUTE_BAR: the compute barrier -- waits for the async computes it gates
+//              (tracked by update_fence_finish) to drain, then finishes. The
+//              explicit fence before a store consumes matmul results (sec 10.7).
+enum class Opcode { MOVIN, MOVOUT, COMP, MEMORY_BAR, COMPUTE_BAR, COUNT};
+
+// One weight slot on systolic array `sa` (sec 10.x). A preload sets refcount =
+// the matmuls reusing the weight; each frees it at its streaming-end, the last
+// one releases the slot. Shared (shared_ptr) by the preload's matmul consumers.
+struct WeightToken { int sa; int refcount; };
 
 typedef uint64_t addr_type;
 typedef uint64_t cycle_type;
@@ -29,6 +38,26 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   Instruction(Opcode opcode);
   void finish_instruction();
   void add_child(std::shared_ptr<Instruction> child);
+  // Occupancy (SA-pipeline) dependency: the child is released when THIS op is
+  // ISSUED (enters the pipeline), not when it finishes -- so a preload/matmul
+  // successor overlaps it instead of waiting its full latency (sec 10.7).
+  void add_pipeline_child(std::shared_ptr<Instruction> child);
+  void release_pipeline_children();
+  // SA weight-buffer model: the SA this op is pinned to (a preload picks it, its
+  // matmul consumers inherit it) and the shared weight slot the matmuls release.
+  const std::set<std::shared_ptr<Instruction>>& get_pipeline_children() { return _pipeline_children; }
+  void set_assigned_sa(int s) { _assigned_sa = s; }
+  int get_assigned_sa() const { return _assigned_sa; }
+  void set_weight_token(const std::shared_ptr<WeightToken>& t) { _weight_token = t; }
+  const std::shared_ptr<WeightToken>& get_weight_token() const { return _weight_token; }
+  // Trace-only: which work-item (togsim_dispatch tile) this op belongs to, for
+  // grouping/coloring in the timeline. Set by the bridge per TILE_BEGIN.
+  void set_tile_group(int g) { _tile_group = g; }
+  int get_tile_group() const { return _tile_group; }
+  // COMPUTE_BAR fence: the max finish_cycle of the async computes it gates (its
+  // own dispatch only), so it drains those instead of every SA pipeline.
+  void update_fence_finish(cycle_type c) { if (c > _fence_finish) _fence_finish = c; }
+  cycle_type get_fence_finish() const { return _fence_finish; }
   bool check_ready() { return ready_counter == 0; }
   const Opcode get_opcode() { return opcode; }
   bool is_dma_read() { return opcode == Opcode::MOVIN; }
@@ -51,6 +80,9 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   void inc_waiting_request();
   void dec_waiting_request();
   size_t get_waiting_request() { return _nr_waiting_request; }
+  // trace: log only the FIRST DRAM response of a load (when data starts arriving).
+  bool got_first_response() const { return _got_first_response; }
+  void mark_first_response() { _got_first_response = true; }
   std::vector<size_t>& get_tile_size() { return tile_size; }
   std::vector<int>& get_tile_stride() { return tile_stride; }
   void set_overlapping_cycle(cycle_type cycle) { overlapping_cycle = cycle; }
@@ -86,12 +118,26 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   std::set<std::shared_ptr<Instruction>>& get_child_inst() { return child_inst; }
   uint64_t get_global_inst_id() const { return _global_inst_id; }
 
-  cycle_type start_cycle;
-  cycle_type finish_cycle;
+  // SRAM-capacity model (sec 10.x). A load contributes its footprint to a
+  // buffer-version allocation; the version is freed when its LAST consumer (the
+  // program-order-last reader, tagged by the bridge) issues. The bridge fills
+  // these; Core enforces them.
+  //   _sram_alloc_id      : which buffer-version this load fills (-1 = untracked)
+  //   _sram_release_allocs: versions this consumer frees on issue (tagged only on
+  //                         each version's last reader)
+  void set_sram_alloc(int64_t id) { _sram_alloc_id = id; }
+  int64_t get_sram_alloc() const { return _sram_alloc_id; }
+  void add_sram_release(int64_t id) { _sram_release_allocs.push_back(id); }
+  const std::vector<int64_t>& get_sram_release() const { return _sram_release_allocs; }
+  // bytes this load occupies in the spad: total tile bits rounded UP to bytes (sub-byte elems must not yield 0).
+  size_t sram_footprint() const { return (_tile_numel * _elem_bits + 7) / 8; }
+
+  cycle_type start_cycle = 0;
+  cycle_type finish_cycle = 0;
   cycle_type bubble_cycle=0;
 
   bool finished=false;
-  int subgraph_id;
+  int subgraph_id = 0;
  private:
   uint64_t _global_inst_id = 0;
   static uint64_t _next_global_inst_id;
@@ -99,16 +145,19 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   void *_owner = nullptr;
   std::list<std::shared_ptr<Instruction>>* _owner_ready_queue_ref = nullptr;
   Opcode opcode;
-  cycle_type compute_cycle;
-  cycle_type overlapping_cycle;
-  size_t ready_counter;
+  cycle_type compute_cycle = 0;
+  cycle_type overlapping_cycle = 0;
+  size_t ready_counter = 0;   // parents not yet finished; the minimal Instruction(Opcode)
+                              // ctor (barriers) relies on this default + inc_ready_counter
   std::set<std::shared_ptr<Instruction>> child_inst;
+  std::set<std::shared_ptr<Instruction>> _pipeline_children;  // released at issue (sec 10.7)
   std::vector<size_t> tile_size;
   std::vector<int> tile_stride;
-  size_t _tile_numel;
+  size_t _tile_numel = 0;
   size_t _nr_waiting_request=0;
+  bool _got_first_response=false;
   size_t _elem_bits = 0;
-  addr_type dram_addr;
+  addr_type dram_addr = 0;
   uint32_t _numa_id = 0; // For DMA instruction
   int _compute_type = 0;
   std::vector<int64_t> _tag_idx_list;
@@ -123,4 +172,12 @@ class Instruction : public std::enable_shared_from_this<Instruction> {
   bool _is_indirect_mode=false;
   bool _is_sparse_inst=false;
   std::string _indirect_index_path="";
+  // SRAM-capacity model (see the setters above).
+  int64_t _sram_alloc_id = -1;
+  std::vector<int64_t> _sram_release_allocs;
+  // SA weight-buffer model (see the setters above).
+  int _assigned_sa = -1;
+  std::shared_ptr<WeightToken> _weight_token;
+  int _tile_group = -1;   // trace-only work-item id (see set_tile_group)
+  cycle_type _fence_finish = 0;   // COMPUTE_BAR: drain target (see update_fence_finish)
 };
\ No newline at end of file
diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h
index 2ef08618..7785ff7a 100644
--- a/TOGSim/include/SimulationConfig.h
+++ b/TOGSim/include/SimulationConfig.h
@@ -27,6 +27,16 @@ struct SimulationConfig {
   uint32_t num_systolic_array_per_core = 1;
   uint32_t num_stonne_per_core = 1;
   uint32_t num_stonne_port = 1;
+  // Per-core VMEM/spad capacity (KB) for the trace-path DMA throttle (sec 10.x):
+  // a load that would overflow the spad does not issue until a consumer frees a
+  // tile. Provided by the config (the TPU configs set 16384 = 16 MB VMEM). 0 =
+  // unset -> gate disabled (unlimited). Only affects trace-path instructions
+  // (legacy TileGraphParser insts have alloc id -1 -> never gated).
+  uint32_t core_spad_size_kb = 0;
+  // SA weight-buffer depth (sec 10.x): weight tiles a systolic array holds; a
+  // preload stalls until a slot frees (its matmuls finished). 2 = weight
+  // double-buffer (convention default, tunable). 0 = disabled.
+  uint32_t sa_weight_buffer_depth = 2;
 
   /* DRAM config */
   DramType dram_type;
diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h
index e3542d51..91baf5b5 100644
--- a/TOGSim/include/Simulator.h
+++ b/TOGSim/include/Simulator.h
@@ -48,6 +48,9 @@ class Simulator {
   void dram_cycle();
   void icnt_cycle();
   bool running();
+  // Spad-too-small guard: if the sim stays frozen (running() but nothing in
+  // flight) past kWedgeThreshold cycles, error out and exit. Called each cycle.
+  void check_frozen();
   void set_cycle_mask();
   uint32_t get_dest_node(mem_fetch *access);
   SimulationConfig _config;
diff --git a/TOGSim/include/TraceLogTags.h b/TOGSim/include/TraceLogTags.h
index 6c158099..759a4fdb 100644
--- a/TOGSim/include/TraceLogTags.h
+++ b/TOGSim/include/TraceLogTags.h
@@ -24,6 +24,7 @@ inline constexpr const char* kInstructionFinished = "INST_FINISHED";
 inline constexpr const char* kInstructionSkipped = "INST_SKIP";
 
 inline constexpr const char* kAsyncDmaAllRequestsIssued = "ASYNC_DMA_ISSUE";
+inline constexpr const char* kFirstDramResponse = "DRAM_RESP_FIRST";
 inline constexpr const char* kAllDramResponsesReceived = "DRAM_RESP_DONE";
 
 inline constexpr const char* kL2CacheableStatusForAddress = "L2CACHE_STAT";
diff --git a/TOGSim/include/togsim_loader.h b/TOGSim/include/togsim_loader.h
new file mode 100644
index 00000000..6c1273ee
--- /dev/null
+++ b/TOGSim/include/togsim_loader.h
@@ -0,0 +1,76 @@
+#pragma once
+// togsim_loader.h
+// -----------------------------------------------------------------------------
+// TOGSim-side loader for the compiled trace producer (C6, P3 task 5). NOT part
+// of the producer ABI (togsim_runtime.h) -- this is the TOGSim half that
+// `dlopen`s a producer `.so`, runs its `togsim_kernel`, and records the emitted
+// instruction stream. See docs/design/togsim_cpp_trace.md sec 5.3 / 9.7.
+//
+// This first cut is the "materializing sink": the callbacks resolve each tile's
+// DRAM address (base[arg_id] + offset*elem_bytes) and per-tile compute cost
+// (the cycle table) and append a TraceRec per modeled instruction. The recorded
+// stream is turned into a TileGraph for the existing timing core
+// (Core/Simulator) by togsim_trace_bridge.h, for cycle-equivalence vs the
+// build_tog path.
+// -----------------------------------------------------------------------------
+
+#include <cstdint>
+#include <vector>
+
+#include "togsim_runtime.h"
+
+namespace togsim {
+
+// One modeled instruction recorded by the runtime callbacks.
+struct TraceRec {
+  enum Kind { TILE_BEGIN, TILE_END, DMA, COMPUTE, MEMORY_BAR, COMPUTE_BAR } kind;
+  int32_t  core;          // work-item -> core binding (set by togsim_dispatch)
+  // DMA / MEMORY_BAR
+  int32_t  dir;           // togsim_dma_dir
+  int32_t  arg_id;        // tensor
+  int32_t  elem_bits;
+  int32_t  is_async;
+  uint64_t addr;          // resolved DRAM byte address = base[arg_id] + off*bytes
+  int32_t  tag_id;        // DMA/MEMORY_BAR: tag memref identity; with tag_slot the
+                          // runtime pairing key (an async dma <-> its memory_barrier)
+  uint64_t tag_slot;      // SRAM tile slot (double-buffer / capacity model)
+  std::vector<int64_t> dims;     // tile extents (DMA)
+  std::vector<int64_t> strides;  // tile strides (DMA)
+  std::vector<int64_t> read_bufs;   // SRAM buffer ids read  (sec 10 dataflow DAG)
+  std::vector<int64_t> write_bufs;  // SRAM buffer ids written (MEMORY_BAR: released bufs)
+  // COMPUTE
+  uint64_t tile_id;
+  int32_t  compute_type;  // 0 vector / 1 matmul / 2 preload (Core unit enum)
+  int64_t  cycle;         // looked up from the cycle table
+  int64_t  overlapping;   // looked up from the cycle table
+};
+
+struct RunResult {
+  bool ok = false;
+  std::vector<TraceRec> trace;
+};
+
+// Load `so_path`, run its `togsim_kernel(shape_args, n_shape)` against a freshly
+// built EmitCtx, and return the recorded trace.
+//   tensor_base[arg_id] : DRAM base address of each kernel tensor argument
+//   cyc[tile_id] / ovl[tile_id] : the cycle table (cycle, overlapping_cycle)
+//   num_cores : dispatch round-robins work-items across this many cores
+RunResult run_producer(const char* so_path,
+                       const int64_t* shape_args, int32_t n_shape,
+                       const uint64_t* tensor_base, int32_t n_tensors,
+                       const int64_t* cyc, const int64_t* ovl, int32_t n_tiles,
+                       int32_t num_cores);
+
+// First-order reference timing over a recorded trace, to validate that the
+// stream carries enough to be scheduled (it is NOT the production Core -- no
+// DRAM/NoC/L2 contention; the real cycle-equivalence path feeds Tile/TileGraph
+// into Core). Models, per core: a DMA-engine timeline (DMAs serialize, overlap
+// compute), a compute timeline (serial = reduction accumulate, with the
+// finish = prev.finish + cycle - overlapped pipeline overlap of Core.cc), and
+// data dependencies (a compute waits the dmas whose handles its preceding
+// togsim_wait()s named).
+struct TimingParams { uint64_t dma_latency = 100; };
+struct SimResult { uint64_t total_cycle = 0; int n_compute = 0, n_dma = 0; };
+SimResult simulate(const RunResult& run, const TimingParams& params);
+
+}  // namespace togsim
diff --git a/TOGSim/include/togsim_runtime.h b/TOGSim/include/togsim_runtime.h
new file mode 100644
index 00000000..e8fd6b84
--- /dev/null
+++ b/TOGSim/include/togsim_runtime.h
@@ -0,0 +1,177 @@
+#pragma once
+// togsim_runtime.h
+// -----------------------------------------------------------------------------
+// Shared C ABI between a compiled, shape-parametric trace producer (`.so`,
+// generated MLIR -> EmitC -> C++) and TOGSim. See docs/design/togsim_cpp_trace.md.
+//
+// The producer keeps loops as native loops (symbolic bounds become function
+// parameters) and calls the functions below; each call emits one trace record =
+// one modeled instruction. TOGSim `dlopen`s the producer, constructs an
+// `EmitCtx`, calls the entry point, records the emitted stream, and feeds it to
+// the existing timing core. The producer carries NO timing model and NO
+// functional compute -- it is a deterministic trace generator only.
+//
+// ABI shape rationale: `mlir-translate --mlir-to-cpp` lowers our `togsim.*` ops
+// (via `emitc.call_opaque`) to *free function* calls, so the contract is a set
+// of `extern "C"` free functions taking an opaque `EmitCtx*` as the first
+// argument. Implementations live in TOGSim and may dispatch internally; the
+// `EmitCtx` is opaque to the producer. `togsim_abi_version()` guards against a
+// producer `.so` built against a stale header.
+//
+// STATUS: firmed up in P2, revised through P3 (see the version log below). The
+// signatures match what the C4 togsim->emitc lowering
+// (PyTorchSimFrontend/mlir/passes/lower_to_emitc.py) emits as
+// `emitc.call_opaque` targets and what `mlir-translate --mlir-to-cpp` renders.
+// Synchronization is keyed by the RUNTIME tag slot: an async togsim_dma and its
+// togsim_memory_barrier pair on (tag_id, tag_slot). DRAM addresses are formed
+// by the runtime as base[arg_id] + offset*elem_bytes from the producer's offset.
+// -----------------------------------------------------------------------------
+
+#include <cstdint>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bump whenever the signatures below change incompatibly. TOGSim refuses to load
+// a producer whose embedded version (a `togsim_producer_abi_version` symbol, or
+// a value passed at the entry point) does not match.
+//   v1 -> v2 (P2): dma takes an event_id and returns void (was: returns a
+//                  handle); togsim_kernel shape_args is non-const to match the
+//                  emitc/mlir-to-cpp output.
+//   v2 -> v3 (P3): add togsim_dispatch (work-item boundary + core binding) and
+//                  togsim_wait_all (join / barrier).
+//   v3 -> v4 (P3): togsim_dma takes (arg_id, element offset) instead of a
+//                  precomputed base_addr; the producer lowers the address
+//                  arithmetic and the runtime adds the tensor base.
+//   v4 -> v5 (P3): event handles. togsim_dma RETURNS a fresh handle (drops the
+//                  event_id arg); the producer parks it in a heap event buffer
+//                  (togsim_event_alloc/free) and togsim_wait takes the handle.
+//   v5 -> v6 (P3): replace togsim_dispatch with togsim_core_alloc (returns a
+//                  core id; no free) -- the runtime owns the core pool, num_cores
+//                  is never baked into the producer.
+//   v6 -> v7 (P3): togsim_dma takes a tag_slot (SRAM tile slot) for the runtime's
+//                  double-buffer / SRAM-capacity model.
+//   v7 -> v8 (P3): togsim_compute takes a compute_type (vector/matmul/preload) so
+//                  the Core routes it to the right compute unit.
+//   v8 -> v9 (P3 sec10): togsim_dma/compute take read_bufs/write_bufs (SRAM buffer
+//                  ids); the loader builds an explicit dependency DAG by
+//                  last-writer per buffer (replaces in-order/tag dependencies).
+//   v9 -> v10 (P3 sec10.7): add togsim_compute_barrier (the explicit compute fence
+//                  before a store; loader -> COMPUTE_BAR instruction).
+//   v10 -> v11 (P3 sec10): replace the static event-id pairing with the RUNTIME
+//                  tag slot. togsim_dma takes a tag_id (its tag memref identity)
+//                  and returns void; the original dma_wait becomes an explicit
+//                  togsim_memory_barrier(tag_id, tag_slot, write_bufs) that pairs
+//                  with its async dma by the runtime (tag_id, tag_slot) -- one
+//                  static dma op runs once per loop iteration with a different
+//                  %tag[%idx], so only a runtime key can pair them. Drops
+//                  togsim_wait/signal/wait_all/event_alloc/event_free + the
+//                  togsim_event handle (no compile-time pairing token).
+//   v11 -> v12 (P3 sec9.3): replace the bare togsim_core_alloc marker with a
+//                  higher-order togsim_dispatch(ctx, tile_fn, iv, n_iv) wrapper.
+//                  The producer outlines each parallel work-item into a uniform
+//                  togsim_kernel_tile(ctx, iv, n) and the dispatcher loop hands it
+//                  to togsim_dispatch, which round-robins a core and brackets the
+//                  call with TILE_BEGIN/TILE_END. The work-item scope is now the
+//                  function call itself (no implicit "until the next core_alloc"
+//                  range); one general dispatcher serves every kernel (uniform
+//                  iv-array ABI). Core alloc + the begin/end boundary are
+//                  runtime-owned.
+#define TOGSIM_ABI_VERSION 12
+int32_t togsim_abi_version(void);
+
+// Opaque per-invocation context owned by TOGSim. Holds the record sink and the
+// tile_id->cycle lookup. Never dereferenced by the producer.
+typedef struct EmitCtx EmitCtx;
+
+// Direction for togsim_dma.
+typedef enum {
+  TOGSIM_DMA_LOAD  = 0,  // DRAM -> SRAM (MOVIN)
+  TOGSIM_DMA_STORE = 1,  // SRAM -> DRAM (MOVOUT)
+} togsim_dma_dir;
+
+// Emit a DMA.
+//   dir       : load/store
+//   arg_id    : which tensor (kernel func arg) this tile lives in
+//   offset    : ELEMENT offset of this tile within that tensor, computed by the
+//               producer from the loop indices (the affine address arithmetic is
+//               lowered into the producer -- P3 approach A). The runtime forms
+//               the DRAM address as base[arg_id] + offset*elem_bytes (only the
+//               runtime knows the tensors' allocation base addresses).
+//   ndim      : rank of the tile
+//   dims      : ndim tile extents
+//   strides   : ndim tile strides (may be null => contiguous)
+//   elem_bits : element width in bits
+//   is_async  : non-zero => issue-complete is the finish; the consumer must be
+//               gated by an explicit togsim_memory_barrier (data arrives later).
+//               Zero => blocking: the dma finishes at data-arrival.
+//   tag_id    : identity of this dma's tag memref. With tag_slot it forms the
+//               RUNTIME pairing key (tag_id, tag_slot) the matching
+//               togsim_memory_barrier waits on -- not a compile-time id, since
+//               one static dma op runs once per loop iteration.
+//   tag_slot  : the SRAM tile slot this tile occupies (the producer's lowered
+//               tag index, evaluated at runtime). Also the double-buffer /
+//               SRAM-capacity slot. Single-buffer kernels pass 0.
+//   read_bufs/n_read, write_bufs/n_write : SRAM buffer ids this op reads/writes
+//   (sec 10 dataflow). The loader builds the dependency DAG by last-writer per
+//   buffer.
+void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id,
+                uint64_t offset, int32_t ndim, const int64_t* dims,
+                const int64_t* strides, int32_t elem_bits,
+                int32_t is_async, int32_t tag_id, uint64_t tag_slot,
+                const int64_t* read_bufs, int32_t n_read,
+                const int64_t* write_bufs, int32_t n_write);
+
+// Emit a fixed-size tile compute. Cost is looked up from the precomputed
+// tile_id->cycle table (annotation pass / sample-mode); `dims` are passed for
+// logging and future remainder-tile handling, not to compute cost here.
+//   compute_type : 0 vector / 1 matmul / 2 preload (maps to the Core unit enum;
+//                  routes the op to the VPU vs the systolic array).
+void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type,
+                    int32_t ndim, const int64_t* dims,
+                    const int64_t* read_bufs, int32_t n_read,
+                    const int64_t* write_bufs, int32_t n_write);
+
+// Explicit async-DMA sync -- the original memref.dma_wait. Pairs with its async
+// togsim_dma by the RUNTIME tag slot (tag_id, tag_slot) and gates consumers on
+// data-arrival (resp-complete), since an async dma's own finish is only
+// issue-complete. `write_bufs` is the SRAM buffer(s) that dma loaded; the loader
+// makes the barrier the last writer of them so consumers depend on it. Sync DMAs
+// need no barrier (they block to data-arrival themselves).
+void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t tag_slot,
+                           const int64_t* write_bufs, int32_t n_write);
+
+// A parallel work-item body, outlined by the producer (sec 9.3). Uniform across
+// kernels: it takes the EmitCtx, the packed parallel loop indices `iv` (iv[0..
+// n_iv) -- e.g. the (m,n) output-tile indices) and their count. The body emits
+// the work-item's ops (init / reduction / store). One signature => one general
+// dispatcher serves every kernel.
+// (iv is non-const to match the `int64_t*` the EmitC producer emits; the runtime
+// only reads it.)
+typedef void (*togsim_tile_fn)(EmitCtx* ctx, int64_t* iv, int32_t n_iv);
+
+// Dispatch one work-item (sec 9.3). The runtime round-robins a core from the
+// pool, brackets the call with TILE_BEGIN/TILE_END (the work-item boundary), and
+// invokes `fn(ctx, iv, n_iv)` -- so the work-item SCOPE is exactly the function
+// call, not an implicit "ops until the next alloc" range. Core alloc + boundary
+// are runtime-owned; the producer is core-count transparent (never names
+// num_cores or a physical core). Independent work-items land on different cores
+// -> multi-core. A general (kernel-independent) wrapper: it only forwards the
+// opaque iv array to fn.
+void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn,
+                     int64_t* iv, int32_t n_iv);
+
+// Compute fence: drain in-flight async compute (the systolic-array matmuls)
+// before the following op (a store) consumes their result. Explicit barrier in
+// the trace; the loader turns it into a COMPUTE_BAR instruction (sec 10.7).
+void togsim_compute_barrier(EmitCtx* ctx);
+
+// Entry point the loader resolves in the producer `.so`. `shape_args` carries
+// the runtime values for the kernel's symbolic dimensions (in a kernel-specific
+// order recorded alongside the cached `.so`); `n_shape_args` is their count.
+void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n_shape_args);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/TOGSim/include/togsim_trace_bridge.h b/TOGSim/include/togsim_trace_bridge.h
new file mode 100644
index 00000000..f0213ef5
--- /dev/null
+++ b/TOGSim/include/togsim_trace_bridge.h
@@ -0,0 +1,18 @@
+#pragma once
+// togsim_trace_bridge.h
+// -----------------------------------------------------------------------------
+// Bridge from the recorded trace (togsim_loader.h RunResult) to a TileGraph the
+// existing Simulator/Core can run, for production cycle-equivalence (P3 task 5;
+// see togsim_cpp_trace.md sec 9.9). First cut: one Tile per work-item (the
+// TILE_BEGIN..TILE_END span that togsim_dispatch brackets), bound to that
+// work-item's core; the DMA/compute records become MOVIN/MOVOUT/COMP
+// Instructions with RAW dependency edges (last writer per SRAM buffer).
+// -----------------------------------------------------------------------------
+#include <memory>
+
+#include "TileGraph.h"
+#include "togsim_loader.h"
+
+// Build a TileGraph from a recorded trace. `name` labels the graph.
+std::unique_ptr<TileGraph> trace_to_tilegraph(const togsim::RunResult& run,
+                                              const std::string& name);
diff --git a/TOGSim/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt
index 65cd4dd4..d782d4d1 100644
--- a/TOGSim/src/CMakeLists.txt
+++ b/TOGSim/src/CMakeLists.txt
@@ -12,3 +12,8 @@ file(GLOB_RECURSE SRC_FILES
 
 # build
 add_executable(${LIB_NAME} ${SRC_FILES})
+
+# Export the executable's dynamic symbols (-rdynamic) so a dlopen'd trace
+# producer .so resolves the togsim_* runtime callbacks back into this binary
+# (P3 trace pipeline).
+set_target_properties(${LIB_NAME} PROPERTIES ENABLE_EXPORTS ON)
diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc
index 3f84d885..6f9a74d7 100644
--- a/TOGSim/src/Common.cc
+++ b/TOGSim/src/Common.cc
@@ -64,6 +64,10 @@ SimulationConfig initialize_config(const YAML::Node& config,
   parsed_config.core_freq_mhz = get_config_value<uint32_t>(config, "core_freq_mhz");
   if (config["num_systolic_array_per_core"])
     parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"].as<uint32_t>();
+  if (config["core_spad_size_kb"])
+    parsed_config.core_spad_size_kb = config["core_spad_size_kb"].as<uint32_t>();
+  if (config["sa_weight_buffer_depth"])
+    parsed_config.sa_weight_buffer_depth = config["sa_weight_buffer_depth"].as<uint32_t>();
   if (config["num_stonne_per_core"])
     parsed_config.num_stonne_per_core = config["num_stonne_per_core"].as<uint32_t>();
   if (config["num_stonne_port"])
diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc
index 9dad8597..25335c9c 100644
--- a/TOGSim/src/Core.cc
+++ b/TOGSim/src/Core.cc
@@ -17,6 +17,42 @@ Core::Core(uint32_t id, SimulationConfig config)
   _stat_sa_compute_idle_cycle.resize(_num_systolic_array_per_core);
   _stat_inst_count.resize(static_cast<size_t>(Opcode::COUNT), 0);
   _stat_tot_skipped_inst.resize(static_cast<size_t>(Opcode::COUNT), 0);
+  _sram_capacity = (size_t)config.core_spad_size_kb * 1024;  // 0 = throttle disabled
+  _weight_slot_depth = config.sa_weight_buffer_depth;        // 0 = disabled (plain rr)
+  _weight_slots_used.resize(_num_systolic_array_per_core, 0);
+}
+
+// Round-robin a systolic array that still has a free weight slot; -1 if all full
+// (the preload must stall). Advances _systolic_array_rr past the chosen SA.
+int Core::pick_free_weight_sa() {
+  for (uint32_t i = 0; i < _num_systolic_array_per_core; i++) {
+    uint32_t s = (_systolic_array_rr + i) % _num_systolic_array_per_core;
+    if (_weight_slots_used[s] < (int)_weight_slot_depth) {
+      _systolic_array_rr = (s + 1) % _num_systolic_array_per_core;
+      return (int)s;
+    }
+  }
+  return -1;
+}
+
+void Core::process_weight_releases() {
+  while (!_weight_release_q.empty() && _weight_release_q.begin()->first <= _core_cycle) {
+    auto tok = _weight_release_q.begin()->second;
+    _weight_release_q.erase(_weight_release_q.begin());
+    if (--tok->refcount <= 0) _weight_slots_used[tok->sa]--;  // last reader frees the slot
+  }
+}
+
+// The LAST reader of a buffer-version issued (bridge tags only that consumer):
+// free the version's bytes back to the per-core spad.
+void Core::release_sram(const std::shared_ptr<Instruction>& inst) {
+  if (!_sram_capacity) return;
+  for (int64_t id : inst->get_sram_release()) {
+    auto it = _sram_allocs.find(id);
+    if (it == _sram_allocs.end()) continue;
+    _sram_used -= it->second;
+    _sram_allocs.erase(it);
+  }
 }
 
 bool Core::can_issue(const std::shared_ptr<Tile>& op) {
@@ -154,7 +190,7 @@ void Core::dma_cycle() {
       } else if(!finished_inst->is_dma_read()) {
         core_trace_log::log_error_dma_instruction_invalid(_core_cycle, _id);
         exit(EXIT_FAILURE);
-      } else if (finished_inst->get_opcode() == Opcode::BAR) {
+      } else if (finished_inst->get_opcode() == Opcode::MEMORY_BAR) {
         core_trace_log::trace_instruction_line(_core_cycle,
                                                _id,
                                                TraceLogTag::pad15(TraceLogTag::kInstructionFinished),
@@ -200,6 +236,8 @@ void Core::cycle() {
   /* Increase core cycle counter */
   _core_cycle++;
 
+  process_weight_releases();  // free weight slots due this cycle before dispatch
+
   /* Iterate tile while an instruction is issued */
   bool issued = false;
 
@@ -240,6 +278,22 @@ void Core::cycle() {
               _stat_tot_skipped_inst.at(static_cast<size_t>(inst->get_opcode()))++;
               break;
             } else {
+              // SRAM-capacity gate (sec 10.x): a load that would overflow the
+              // per-core spad does not issue this cycle -- leave it in the ready
+              // queue (it++ retries next cycle) until a consumer frees a tile. On
+              // issue, occupy its bytes under its buffer-version allocation.
+              if (_sram_capacity && inst->get_sram_alloc() >= 0) {
+                size_t F = inst->sram_footprint();
+                // Stall if the tile does not fit in the free spad right now. If
+                // it can never fit (the kernel's working set exceeds the whole
+                // spad), the sim wedges -- Simulator::cycle() detects that frozen
+                // state and exits with a "spad too small" error rather than
+                // looping forever.
+                if (_sram_used + F > _sram_capacity)
+                  break;                                       // not issued -> retry next cycle
+                _sram_used += F;
+                _sram_allocs[inst->get_sram_alloc()] += F;     // accumulate version footprint
+              }
               core_trace_log::trace_instruction_line(_core_cycle,
                                                        _id,
                                                        TraceLogTag::pad15(
@@ -254,6 +308,7 @@ void Core::cycle() {
             }
           }
         case Opcode::MOVOUT:
+          release_sram(inst);   // store issued -> free the tiles it drained
           core_trace_log::trace_instruction_line(_core_cycle,
                                                    _id,
                                                    TraceLogTag::pad15(TraceLogTag::kInstructionIssued),
@@ -265,7 +320,44 @@ void Core::cycle() {
           break;
         case Opcode::COMP:
           {
-            auto& target_pipeline = get_compute_pipeline(inst->get_compute_type());
+            const int ct = inst->get_compute_type();
+            // --- SA selection + weight-buffer gate (sec 10.x) ---
+            // A preload picks a systolic array with a free weight slot and pins
+            // its matmul consumers to that SA (they free the slot on finish). A
+            // matmul runs on the SA its weight was preloaded into. This both
+            // bounds preload run-ahead and keeps matmuls on their weight's SA.
+            int sa_idx = -1;
+            if (ct == MATMUL || ct == PRELOAD) {
+              if (ct == PRELOAD) {
+                int n_consumers = 0;   // matmuls reusing this weight
+                for (auto& c : inst->get_pipeline_children())
+                  if (c->get_compute_type() == MATMUL) n_consumers++;
+                if (_weight_slot_depth > 0 && n_consumers > 0) {
+                  sa_idx = pick_free_weight_sa();
+                  if (sa_idx < 0) break;            // all weight slots full -> stall (retry)
+                  _weight_slots_used[sa_idx]++;
+                  auto tok = std::make_shared<WeightToken>(WeightToken{sa_idx, n_consumers});
+                  for (auto& c : inst->get_pipeline_children())
+                    if (c->get_compute_type() == MATMUL) {
+                      c->set_assigned_sa(sa_idx);
+                      c->set_weight_token(tok);
+                    }
+                } else {                            // disabled / no consumers -> plain rr
+                  sa_idx = _systolic_array_rr;
+                  _systolic_array_rr = (_systolic_array_rr + 1) % _num_systolic_array_per_core;
+                }
+              } else {                              // MATMUL
+                sa_idx = inst->get_assigned_sa();
+                if (sa_idx < 0) {                   // no preload pinned it -> rr fallback
+                  sa_idx = _systolic_array_rr;
+                  _systolic_array_rr = (_systolic_array_rr + 1) % _num_systolic_array_per_core;
+                }
+              }
+              inst->set_assigned_sa(sa_idx);         // record the SA actually used (for the trace)
+            }
+            release_sram(inst);   // consumer issued -> free the tiles it read
+            auto& target_pipeline = (ct == VECTOR_UNIT) ? _vu_compute_pipeline
+                                                        : _sa_compute_pipeline.at(sa_idx);
             if (target_pipeline.empty()) {
               inst->finish_cycle = _core_cycle + inst->get_compute_cycle();
               inst->bubble_cycle = inst->get_overlapping_cycle();
@@ -275,6 +367,18 @@ void Core::cycle() {
               inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle;
               inst->bubble_cycle = bubble_cycle;
             }
+            // sec 10.7: release the occupancy (pipeline) dependents so a successor
+            // overlaps this op. finish_cycle is set first so release can feed it to
+            // a COMPUTE_BAR child's per-dispatch fence (see release_pipeline_children).
+            inst->release_pipeline_children();
+
+            // Release this matmul's weight slot at its streaming-end (finish -
+            // overlapping), not at full finish (the drain tail does not read it).
+            if (ct == MATMUL && inst->get_weight_token()) {
+              cycle_type rel = inst->finish_cycle > inst->get_overlapping_cycle()
+                                 ? inst->finish_cycle - inst->get_overlapping_cycle() : _core_cycle;
+              _weight_release_q.emplace(rel, inst->get_weight_token());
+            }
 
             if (inst->get_compute_cycle() == 0) {
               inst->finish_instruction();
@@ -297,7 +401,7 @@ void Core::cycle() {
             }
           }
           break;
-        case Opcode::BAR:
+        case Opcode::MEMORY_BAR:
           {
             auto& key = inst->get_tag_id();
             uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key);
@@ -324,6 +428,24 @@ void Core::cycle() {
             issued = true;
           }
           break;
+        case Opcode::COMPUTE_BAR:
+          {
+            // Compute fence (sec 10.7): finish once THIS dispatch's async computes
+            // have drained -- i.e. the current cycle has reached the max finish of
+            // the computes it gates (fed in via update_fence_finish when each
+            // issued). Scoped to its own dispatch, so an unrelated tile's matmuls
+            // sharing the SA pipelines do not delay it (no cross-dispatch
+            // serialization). Not yet drained -> stays in the ready queue.
+            if (_core_cycle >= inst->get_fence_finish()) {
+              core_trace_log::trace_instruction_line(_core_cycle, _id,
+                  TraceLogTag::pad15(TraceLogTag::kInstructionFinished),
+                  inst->get_global_inst_id(),
+                  core_trace_log::format_instruction_detail_line(*inst));
+              finish_instruction(inst);
+              issued = true;
+            }
+          }
+          break;
         default:
           core_trace_log::log_error_undefined_opcode();
           exit(EXIT_FAILURE);
@@ -387,6 +509,19 @@ void Core::finish_instruction(std::shared_ptr<Instruction>& inst, InstFinishTrac
                                            core_trace_log::format_instruction_detail_line(*inst));
 }
 
+bool Core::has_inflight() {
+  // Mirrors running() minus its "_tiles.size() > 0" term: true while something
+  // is pending that will raise a finish event by itself (so the sim is NOT
+  // frozen). False while tiles remain => only stalled ready instructions are left.
+  bool busy = !_vu_compute_pipeline.empty();
+  for (int sa = 0; !busy && sa < _num_systolic_array_per_core; ++sa)
+    busy = !_sa_compute_pipeline.at(sa).empty();
+  busy = busy || !_dma_waiting_queue.empty() || !_dma_finished_queue.empty();
+  busy = busy || !_dma.empty();
+  busy = busy || !_ld_inst_queue.empty() || !_st_inst_queue.empty();
+  return busy;
+}
+
 bool Core::running() {
   bool running = false;
   running = running || _tiles.size() > 0;
@@ -412,6 +547,13 @@ void Core::push_memory_response(mem_fetch* response) {
   Instruction* owner_inst = static_cast<Instruction*>(response->get_custom_data());
   assert(owner_inst->get_waiting_request());
 
+  if (!owner_inst->got_first_response()) {   // first data of this load arrived
+    owner_inst->mark_first_response();
+    core_trace_log::trace_instruction_line(_core_cycle, _id,
+        TraceLogTag::pad15(TraceLogTag::kFirstDramResponse),
+        owner_inst->get_global_inst_id(),
+        core_trace_log::format_instruction_detail_line(*owner_inst));
+  }
   owner_inst->dec_waiting_request();
   if (!owner_inst->get_waiting_request()) {
     auto it = _dma_waiting_queue.find(owner_inst);
diff --git a/TOGSim/src/CoreTraceLog.cc b/TOGSim/src/CoreTraceLog.cc
index ebc31de0..7086893e 100644
--- a/TOGSim/src/CoreTraceLog.cc
+++ b/TOGSim/src/CoreTraceLog.cc
@@ -31,7 +31,7 @@ std::string format_dma_inst_issued_detail(Instruction& inst) {
   }
   return fmt::format(
       "addr_name={} dram=0x{:016x} rank={} elem_bits={} async={} indirect={} tag=0x{:016x} stride=[{}] size=[{}] "
-      "tag_idx=[{}]",
+      "tag_idx=[{}] tile={}",
       inst.get_addr_name(),
       static_cast<uint64_t>(inst.get_base_dram_address()),
       rank,
@@ -41,7 +41,8 @@ std::string format_dma_inst_issued_detail(Instruction& inst) {
       tag_hex,
       fmt::join(inst.get_tile_stride(), ","),
       fmt::join(ts, ","),
-      fmt::join(tidx, ","));
+      fmt::join(tidx, ","),
+      inst.get_tile_group());
 }
 
 std::string format_dma_inst_issued_trace_line(Instruction& inst) {
@@ -52,31 +53,35 @@ std::string format_instruction_detail_line(Instruction& inst) {
   const Opcode op = inst.get_opcode();
   const std::string opname = opcode_to_string(op);
   if (op == Opcode::COMP) {
-    return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={})",
+    return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={} sa={} tile={})",
                        opname,
                        inst.get_compute_type(),
                        inst.get_compute_cycle(),
-                       inst.get_overlapping_cycle());
+                       inst.get_overlapping_cycle(),
+                       inst.get_assigned_sa(),
+                       inst.get_tile_group());
   }
   if ((op == Opcode::MOVIN || op == Opcode::MOVOUT) && inst.is_async_dma()) {
-    return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])",
+    return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})",
                        opname,
                        inst.subgraph_id,
                        inst.get_addr_name(),
                        format_tag_key_list_hex(inst.get_tag_id()),
                        fmt::join(inst.get_tag_idx_list(), ","),
-                       fmt::join(inst.get_tag_stride_list(), ","));
+                       fmt::join(inst.get_tag_stride_list(), ","),
+                       inst.get_tile_group());
   }
   if (op == Opcode::MOVIN || op == Opcode::MOVOUT) {
-    return fmt::format("{} (addr_name={})", opname, inst.get_addr_name());
+    return fmt::format("{} (addr_name={} tile={})", opname, inst.get_addr_name(), inst.get_tile_group());
   }
-  if (op == Opcode::BAR) {
-    return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])",
+  if (op == Opcode::MEMORY_BAR) {
+    return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})",
                        opname,
                        inst.get_addr_name(),
                        format_tag_key_list_hex(inst.get_tag_id()),
                        fmt::join(inst.get_tag_idx_list(), ","),
-                       fmt::join(inst.get_tag_stride_list(), ","));
+                       fmt::join(inst.get_tag_stride_list(), ","),
+                       inst.get_tile_group());
   }
   return opname;
 }
diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc
index f236d160..d0471226 100644
--- a/TOGSim/src/Instruction.cc
+++ b/TOGSim/src/Instruction.cc
@@ -23,7 +23,8 @@ std::string opcode_to_string(Opcode opcode) {
         case Opcode::MOVIN:        return "MOVIN";
         case Opcode::MOVOUT:       return "MOVOUT";
         case Opcode::COMP:         return "COMP";
-        case Opcode::BAR:          return "BAR";
+        case Opcode::MEMORY_BAR:   return "MEMORY_BAR";
+        case Opcode::COMPUTE_BAR:  return "COMPUTE_BAR";
         default:                   return "Unknown";
     }
 }
@@ -60,6 +61,21 @@ void Instruction::add_child(std::shared_ptr<Instruction> child) {
   child_inst.insert(child);
 }
 
+void Instruction::add_pipeline_child(std::shared_ptr<Instruction> child) {  // register child as a pipeline (occupancy) dependent of this instruction
+  child->inc_ready_counter();  // child now waits on one more release; release_pipeline_children() decrements it
+  _pipeline_children.insert(child);  // NOTE(review): the counter is bumped even if this insert de-duplicates -- confirm a child is never added twice
+}
+
+void Instruction::release_pipeline_children() {
+  // Drop one ready-count from every pipeline dependent, then forget them all. A
+  // COMPUTE_BAR dependent is first fed this instruction's finish_cycle (fence).
+  for (const auto& dep : _pipeline_children) {
+    if (dep->get_opcode() == Opcode::COMPUTE_BAR) dep->update_fence_finish(finish_cycle);
+    dep->dec_ready_counter();
+  }
+  _pipeline_children.clear();
+}
+
 void Instruction::inc_waiting_request() {
   _nr_waiting_request++;
 }
diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc
index d987d787..03dd7bf9 100644
--- a/TOGSim/src/Simulator.cc
+++ b/TOGSim/src/Simulator.cc
@@ -184,6 +184,38 @@ void Simulator::icnt_cycle() {
   _icnt->cycle();
 }
 
+// Consecutive frozen cycles tolerated before declaring the sim wedged (spad too
+// small). Generous so transient idle never false-fires; a true freeze is constant.
+static constexpr uint64_t kWedgeThreshold = 5000;
+
+// Frozen-state guard: work remains (running()) but nothing is in flight to
+// advance it -- the SRAM throttle can never satisfy a load because the kernel's
+// working set exceeds the whole per-core spad (core_spad_size_kb too small). The
+// state repeats every cycle, so after a margin error out instead of looping
+// forever. `stuck` is function-local-static (one running sim at a time; it resets
+// on any progress).
+void Simulator::check_frozen() {
+  static uint64_t frozen_cycles = 0;  // consecutive calls that observed no possible progress
+  // Progress is still possible while the icnt or dram reports running, a core has
+  // work in flight, or a core's partition scheduler reports non-empty.
+  bool progressing = _icnt->running() || _dram->running();
+  for (int core = 0; core < _n_cores && !progressing; ++core) {
+    progressing = _cores[core]->has_inflight() ||
+                  !get_partition_scheduler(core)->empty(core);
+  }
+  const bool wedged = running() && !progressing;
+  if (!wedged) {
+    frozen_cycles = 0;  // any progress (or nothing left to run) resets the streak
+    return;
+  }
+  if (++frozen_cycles <= kWedgeThreshold) return;  // still within the tolerated margin
+  spdlog::error("[Simulator] simulation wedged at cycle {}: work remains but "
+                "nothing is in flight -- the per-core spad (core_spad_size_kb) "
+                "is too small to hold a kernel's working set. Increase it.",
+                _core_cycles);
+  exit(EXIT_FAILURE);
+}
+
 void Simulator::cycle() {
   while (running() || _core_cycles < 1) {
     set_cycle_mask();
@@ -198,6 +230,8 @@ void Simulator::cycle() {
     // Interconnect cycle
     if (IS_ICNT_CYCLE(_cycle_mask))
       icnt_cycle();
+
+    check_frozen();   // spad-too-small guard (errors out if wedged)
   }
   for (auto &core: _cores) {
     core->check_tag();
diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc
index 5060d336..572062e0 100644
--- a/TOGSim/src/TileGraphParser.cc
+++ b/TOGSim/src/TileGraphParser.cc
@@ -543,7 +543,7 @@ std::vector<std::shared_ptr<Tile>> TileLoopNode::get_tiles_from_iter(TileGraphPa
              fmt::join(new_tag_stride_list, ", "));
 
       std::shared_ptr<Instruction> inst = std::make_shared<Instruction>(
-        Opcode::BAR, 0,
+        Opcode::MEMORY_BAR, 0,
         0, base_addr,
         std::vector<size_t>(), std::vector<int>(), 0,
         tag_list, new_tag_stride_list, accum_tag_list
diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc
index 010826ef..a763cdd0 100644
--- a/TOGSim/src/main.cc
+++ b/TOGSim/src/main.cc
@@ -8,6 +8,8 @@
 #include "Simulator.h"
 #include "TileGraphParser.h"
 #include "helper/CommandLineParser.h"
+#include "togsim_loader.h"        // P3 trace pipeline: run a compiled producer .so
+#include "togsim_trace_bridge.h"  // ... and bridge its trace to a TileGraph
 
 namespace fs = std::filesystem;
 namespace po = boost::program_options;
@@ -104,6 +106,11 @@ int main(int argc, char** argv) {
       "models_list", "Path for the trace file (.trace)");
   cmd_parser.add_command_line_option<std::string>(
       "log_level", "Set for log level [trace, debug, info], default = info");
+  cmd_parser.add_command_line_option<std::string>(
+      "trace_so", "Path to a compiled trace producer .so (P3 trace pipeline)");
+  cmd_parser.add_command_line_option<std::string>(
+      "cycle_table", "Path to a 'cycle<TAB>overlapping' per-tile_id sidecar (TSV) "
+                     "for --trace_so; falls back to a flat stub if omitted");
   try {
     cmd_parser.parse(argc, argv);
   } catch (const CommandLineParser::ParsingError& e) {
@@ -147,6 +154,47 @@ int main(int argc, char** argv) {
     exit(1);
   }
 
+  // P3 trace pipeline: if a compiled producer .so is given, run it, bridge the
+  // recorded trace to a TileGraph, and run the existing Simulator on it.
+  std::string trace_so_path;
+  cmd_parser.set_if_defined("trace_so", &trace_so_path);
+  if (!trace_so_path.empty()) {
+    const auto& cfg = simulator->get_hardware_config_yaml();
+    int num_cores = cfg["num_cores"] ? cfg["num_cores"].as<int>() : 1;
+    // First cut: stub tensor bases (real per-tensor addresses come later).
+    std::vector<uint64_t> bases(16);
+    for (size_t i = 0; i < bases.size(); ++i) bases[i] = 0x100000ull * (i + 1);
+    // Cycle table: load the per-tile_id TSV sidecar if given, else a flat stub.
+    std::vector<int64_t> cyc, ovl;
+    std::string cycle_table_path;
+    cmd_parser.set_if_defined("cycle_table", &cycle_table_path);
+    if (!cycle_table_path.empty()) {
+      std::ifstream ct(cycle_table_path);
+      if (!ct.is_open()) { spdlog::error("[TOGSim] cannot open cycle_table {}", cycle_table_path); exit(1); }
+      int64_t c, o;
+      while (ct >> c >> o) { cyc.push_back(c); ovl.push_back(o); }
+      spdlog::info("[TOGSim-trace] loaded cycle table: {} tiles from {}", cyc.size(), cycle_table_path);
+    } else {
+      cyc.assign(256, 128);
+      ovl.assign(256, 0);
+    }
+    auto run = togsim::run_producer(trace_so_path.c_str(), nullptr, 0,
+                                    bases.data(), (int)bases.size(),
+                                    cyc.data(), ovl.data(), (int)cyc.size(),
+                                    num_cores);
+    if (!run.ok) { spdlog::error("[TOGSim] trace producer run failed"); exit(1); }
+    spdlog::info("[TOGSim-trace] recorded {} instructions", run.trace.size());
+    auto tg = trace_to_tilegraph(run, "trace_kernel");
+    tg->set_arrival_time(simulator->get_core_cycle());
+    tg->set_kernel_id(0);
+    simulator->enqueue_graph(0, std::move(tg));
+    simulator->run_simulator();
+    spdlog::info("[TOGSim-trace] Total cycles: {}", simulator->get_core_cycle());
+    spdlog::info("Simulation finished");
+    simulator->print_core_stat();
+    return 0;
+  }
+
   // Get trace file path
   cmd_parser.set_if_defined("models_list", &trace_file_path);
 
diff --git a/TOGSim/src/togsim_runtime.cc b/TOGSim/src/togsim_runtime.cc
new file mode 100644
index 00000000..86de081e
--- /dev/null
+++ b/TOGSim/src/togsim_runtime.cc
@@ -0,0 +1,199 @@
+// togsim_runtime.cc
+// -----------------------------------------------------------------------------
+// C6 runtime + loader for the compiled trace producer (P3 task 5). Implements
+// the producer ABI (togsim_runtime.h) and the TOGSim-side loader
+// (togsim_loader.h). See docs/design/togsim_cpp_trace.md sec 5.3 / 9.6.1 / 9.7.
+//
+// The producer `.so` calls the extern "C" togsim_* functions below; each one
+// records a TraceRec on the EmitCtx. EmitCtx is the opaque type the producer
+// only ever passes back to us. This is the "materializing sink": it resolves
+// addresses and per-tile cycles into a recorded instruction stream. Wiring the
+// stream into the existing timing core (Core/Simulator) is the remaining step.
+// -----------------------------------------------------------------------------
+
+#include "togsim_loader.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <map>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+// Full definition of the opaque handle from togsim_runtime.h. The producer holds
+// only EmitCtx* and never dereferences it; the togsim_* entry points below do.
+struct EmitCtx {
+  // inputs supplied by the loader (run_producer() copies its pointer/count arguments in)
+  const uint64_t* tensor_base = nullptr;  // arg_id -> base address, read by togsim_dma
+  int32_t         n_tensors = 0;  // number of entries in tensor_base
+  const int64_t*  cyc = nullptr;   // tile_id -> cycle
+  const int64_t*  ovl = nullptr;   // tile_id -> overlapping_cycle
+  int32_t         n_tiles = 0;  // number of entries in cyc / ovl (bound-checked in togsim_compute)
+  int32_t         num_cores = 1;  // modulus of the round-robin in togsim_dispatch
+  // mutable run state
+  int32_t  rr = 0;            // round-robin core cursor
+  int32_t  cur_core = -1;     // current work-item's core (-1 until the first togsim_dispatch)
+  std::vector<togsim::TraceRec> trace;  // recorded instruction stream, in emission order
+};
+
+namespace {
+// Value-initialized TraceRec stamped with only its kind and owning core.
+inline togsim::TraceRec blank(togsim::TraceRec::Kind k, int32_t core) {
+  togsim::TraceRec rec{};
+  rec.kind = k; rec.core = core;
+  return rec;
+}
+}  // namespace
+
+extern "C" {
+
+int32_t togsim_abi_version(void) { return TOGSIM_ABI_VERSION; }  // reports the TOGSIM_ABI_VERSION this runtime was compiled with
+
+void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, int64_t* iv, int32_t n_iv) {
+  // Higher-order work-item wrapper (sec 9.3): round-robin a core (the producer
+  // never sees num_cores; core 0 if num_cores <= 0), bracket the work-item with
+  // runtime-owned TILE_BEGIN/TILE_END, and run its body -- the work-item SCOPE is
+  // exactly this fn call. The ops fn emits record under ctx->cur_core.
+  // NOTE(review): TILE_END re-reads cur_core; assumes fn never nests a dispatch.
+  ctx->cur_core = ctx->num_cores > 0 ? (ctx->rr++ % ctx->num_cores) : 0;
+  ctx->trace.push_back(blank(togsim::TraceRec::TILE_BEGIN, ctx->cur_core));
+  fn(ctx, iv, n_iv);
+  ctx->trace.push_back(blank(togsim::TraceRec::TILE_END, ctx->cur_core));
+}
+
+void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id,
+                uint64_t offset, int32_t ndim, const int64_t* dims,
+                const int64_t* strides, int32_t elem_bits,
+                int32_t is_async, int32_t tag_id, uint64_t tag_slot,
+                const int64_t* read_bufs, int32_t n_read,
+                const int64_t* write_bufs, int32_t n_write) {
+  // Record one DMA. addr = tensor_base[arg_id] (0 when arg_id is out of range)
+  // plus offset scaled by elem_bits / 8 (integer division).
+  const bool known_arg = arg_id >= 0 && arg_id < ctx->n_tensors;
+  const uint64_t base = known_arg ? ctx->tensor_base[arg_id] : 0;
+  togsim::TraceRec rec = blank(togsim::TraceRec::DMA, ctx->cur_core);
+  rec.addr = base + offset * (uint64_t)(elem_bits / 8);
+  rec.dir = dir; rec.arg_id = arg_id; rec.elem_bits = elem_bits;
+  rec.is_async = is_async; rec.tag_id = tag_id; rec.tag_slot = tag_slot;
+  for (int32_t d = 0; dims && d < ndim; ++d) rec.dims.push_back(dims[d]);           // skipped if dims is null
+  for (int32_t d = 0; strides && d < ndim; ++d) rec.strides.push_back(strides[d]);  // skipped if strides is null
+  for (int32_t k = 0; k < n_read; ++k) rec.read_bufs.push_back(read_bufs[k]);
+  for (int32_t k = 0; k < n_write; ++k) rec.write_bufs.push_back(write_bufs[k]);
+  ctx->trace.push_back(rec);
+}
+
+void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type,
+                    int32_t ndim, const int64_t* dims,
+                    const int64_t* read_bufs, int32_t n_read,
+                    const int64_t* write_bufs, int32_t n_write) {
+  (void)ndim; (void)dims;  // unused here. Records one COMPUTE; cycle/overlapping come from the tile_id tables when in range
+  togsim::TraceRec r = blank(togsim::TraceRec::COMPUTE, ctx->cur_core);
+  r.tile_id = tile_id; r.compute_type = compute_type;
+  for (int32_t i = 0; i < n_read; ++i) r.read_bufs.push_back(read_bufs[i]);
+  for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]);
+  const bool in_table = ctx->n_tiles > 0 && tile_id < (uint64_t)ctx->n_tiles;  // unsigned compare: narrowing tile_id to int32_t could wrap negative and pass
+  if (ctx->cyc && in_table) r.cycle = ctx->cyc[tile_id];
+  if (ctx->ovl && in_table) r.overlapping = ctx->ovl[tile_id];
+  ctx->trace.push_back(r);
+}
+
+void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t tag_slot,
+                           const int64_t* write_bufs, int32_t n_write) {
+  // Record a MEMORY_BAR for runtime tag (tag_id, tag_slot), filled in place in the trace.
+  ctx->trace.push_back(blank(togsim::TraceRec::MEMORY_BAR, ctx->cur_core));
+  togsim::TraceRec& bar = ctx->trace.back(); bar.tag_id = tag_id; bar.tag_slot = tag_slot;
+  for (int32_t w = 0; w < n_write; ++w) bar.write_bufs.push_back(write_bufs[w]);
+}
+
+void togsim_compute_barrier(EmitCtx* ctx) {
+  ctx->trace.push_back(blank(togsim::TraceRec::COMPUTE_BAR, ctx->cur_core));  // payload-free COMPUTE_BAR record on the current work-item's core
+}
+
+}  // extern "C"
+
+namespace togsim {
+
+RunResult run_producer(const char* so_path,
+                       const int64_t* shape_args, int32_t n_shape,
+                       const uint64_t* tensor_base, int32_t n_tensors,
+                       const int64_t* cyc, const int64_t* ovl, int32_t n_tiles,
+                       int32_t num_cores) {
+  RunResult res;  // returned untouched on any load failure; ok is set only after the producer ran
+  void* lib = dlopen(so_path, RTLD_NOW | RTLD_GLOBAL);
+  if (!lib) { fprintf(stderr, "togsim: dlopen failed: %s\n", dlerror()); return res; }
+  auto emit = (void (*)(EmitCtx*, int64_t*, int32_t))dlsym(lib, "togsim_kernel");
+  if (!emit) { fprintf(stderr, "togsim: dlsym togsim_kernel failed: %s\n", dlerror()); dlclose(lib); return res; }
+  // Entry point resolved (handle is released above if it was not): run it against a fresh EmitCtx.
+  EmitCtx ctx;
+  ctx.tensor_base = tensor_base; ctx.n_tensors = n_tensors;
+  ctx.cyc = cyc; ctx.ovl = ovl; ctx.n_tiles = n_tiles;
+  ctx.num_cores = num_cores > 0 ? num_cores : 1;
+  emit(&ctx, (int64_t*)shape_args, n_shape);
+  // NOTE(review): lib stays loaded on success (no dlclose here) -- confirm that is intended.
+  res.ok = true;
+  res.trace = std::move(ctx.trace);
+  return res;
+}
+
+SimResult simulate(const RunResult& run, const TimingParams& params) {
+  SimResult out;  // filled below: n_dma / n_compute counts and total_cycle
+  std::unordered_map<int, uint64_t> dma_free;     // DMA-engine free time, per core
+  std::unordered_map<int, uint64_t> comp_free;    // compute free time, per core
+  std::unordered_map<int, uint64_t> prev_comp;    // prev compute finish (overlap), per core
+  std::map<std::pair<int32_t, uint64_t>, uint64_t> tag_finish;  // (tag_id,tag_slot) -> finish
+  std::vector<uint64_t> pending;                    // barrier-resolved deps since last compute (one list, not per core)
+  // Replay the recorded stream in order; only DMA, MEMORY_BAR and COMPUTE records affect the timelines.
+  for (const auto& t : run.trace) {
+    const int c = t.core;
+    switch (t.kind) {
+      case TraceRec::DMA: {
+        // DMAs serialize on the core's DMA engine (overlap compute -> separate
+        // timeline). finish = issue + latency, recorded under the runtime tag.
+        uint64_t start = dma_free[c];  // operator[] yields 0 for a core seen for the first time
+        uint64_t fin = start + params.dma_latency;
+        dma_free[c] = fin;
+        tag_finish[{t.tag_id, t.tag_slot}] = fin;  // a later DMA with the same tag overwrites the entry
+        out.n_dma++;
+        break;
+      }
+      case TraceRec::MEMORY_BAR: {
+        // the explicit async-DMA sync: gate the next compute on the paired dma's
+        // data-arrival, found by the runtime tag (tag_id, tag_slot).
+        auto it = tag_finish.find({t.tag_id, t.tag_slot});
+        if (it != tag_finish.end()) pending.push_back(it->second);  // a barrier with no recorded tag adds no dependency
+        break;
+      }
+      case TraceRec::COMPUTE: {
+        uint64_t deps = 0;
+        for (uint64_t f : pending) deps = std::max(deps, f);  // latest data arrival among the gathered barriers
+        pending.clear();
+        uint64_t start = std::max(comp_free[c], deps);  // wait for this core's compute timeline and for the deps
+        uint64_t fin;
+        auto pit = prev_comp.find(c);
+        if (pit != prev_comp.end()) {
+          uint64_t prev = pit->second;
+          uint64_t tail = prev > start ? prev - start : 0;     // prev still running
+          uint64_t overlapped = std::min<uint64_t>(tail, (uint64_t)t.overlapping);  // NOTE(review): comp_free[c] == prev_comp[c] and start >= comp_free[c], so tail (and this) look always 0 -- confirm
+          fin = std::max(start, prev) + (uint64_t)t.cycle - overlapped;
+        } else {
+          fin = start + (uint64_t)t.cycle;  // first compute on this core: nothing to overlap with
+        }
+        comp_free[c] = fin;
+        prev_comp[c] = fin;
+        out.n_compute++;
+        break;
+      }
+      case TraceRec::TILE_BEGIN:
+      case TraceRec::TILE_END:
+      case TraceRec::COMPUTE_BAR:
+        break;  // work-item boundary / compute fence: no cost in this reference timer
+    }
+  }
+  for (auto& kv : dma_free) out.total_cycle = std::max(out.total_cycle, kv.second);   // total_cycle ends >= the latest DMA finish on any core
+  for (auto& kv : comp_free) out.total_cycle = std::max(out.total_cycle, kv.second);  // ... and >= the latest compute finish on any core
+  return out;
+}
+
+}  // namespace togsim
diff --git a/TOGSim/src/togsim_trace_bridge.cc b/TOGSim/src/togsim_trace_bridge.cc
new file mode 100644
index 00000000..e13af2d7
--- /dev/null
+++ b/TOGSim/src/togsim_trace_bridge.cc
@@ -0,0 +1,278 @@
+// togsim_trace_bridge.cc -- see togsim_trace_bridge.h
+#include "togsim_trace_bridge.h"
+
+#include <map>
+#include <utility>
+#include <vector>
+
+#include "Tile.h"
+#include "Instruction.h"
+
+namespace {
+
+// `uniq` is a per-DMA-record unique tag-key id minted by the caller. The Core
+// tag table keys completion on [addr_id, ..., sum(tag_idx*stride)]; using `uniq`
+// as addr_id makes every reduction iteration of one static dma get a DISTINCT
+// key -- so multi-tile-K (and conv, whose reduction is the kh*kw*C nest) do not
+// collide, with no coordinate enumeration. The matching memory_barrier reuses
+// the same `uniq` (current-load map per (tag_id, tag_slot), see
+// trace_to_tilegraph), so the table still pairs them. This works because the
+// recorded stream is already per-iteration (the producer ran the loops) --
+// unlike a compile-time event_id. `tag_idx` (the subtile slot) is retained for
+// the SRAM double-buffer model.
+//
+// FIXME(semantics): the per-iteration tag is still reconstructed HERE from the
+// record order. The producer IR now DOES carry a per-iteration tag:
+// dma_fine_grained emits a fresh tag memref.alloc just before each coarse load
+// (rewiring its dma_wait), so successive reduction iterations allocate distinct tags.
+// But build_skeleton collapses that to one static tag_id (it DCEs the alloc and keys
+// togsim.dma by the alloc's static identity), so this bridge still needs `uniq`
+// to tell iterations apart at runtime. The faithful finish is to thread the
+// per-iteration alloc identity through build_skeleton as an SSA tag handle on the
+// togsim.dma / togsim.memory_barrier (then `uniq` here is unnecessary).
+std::shared_ptr<Instruction> make_dma(const togsim::TraceRec& t, int64_t uniq) {  // one DMA trace record -> MOVIN/MOVOUT Instruction
+  Opcode op = (t.dir == 1) ? Opcode::MOVOUT : Opcode::MOVIN;  // dir == 1 -> MOVOUT (store); anything else -> MOVIN (load)
+  std::vector<size_t> tile_size(t.dims.begin(), t.dims.end());       // element-wise copy/convert of the recorded dims
+  std::vector<int> tile_stride(t.strides.begin(), t.strides.end());  // element-wise copy/convert of the recorded strides
+  std::vector<int64_t> tag_idx{(int64_t)t.tag_slot};  // single-entry tag index: the record's runtime tag slot
+  std::vector<int64_t> tag_stride{1};                 // single stride of 1 to go with the single tag index
+  auto inst = std::make_shared<Instruction>(
+      op, /*compute_cycle=*/0, /*num_parents=*/0, /*dram_addr=*/t.addr,
+      tile_size, tile_stride, (size_t)t.elem_bits, tag_idx, tag_stride,
+      /*accum_tag_idx_list=*/std::vector<int64_t>{});
+  inst->set_is_async(t.is_async != 0);  // any non-zero is_async marks the dma asynchronous
+  inst->set_addr_name("tag" + std::to_string(uniq), uniq);  // caller-minted uniq is both the name suffix and the id
+  inst->prepare_tag_key();  // called after the addr name/id is set -- NOTE(review): presumably derives the key from them; confirm
+  return inst;
+}
+
+// A MEMORY_BAR carrying the SAME `uniq` tag key as the async dma it gates -- the
+// Core's tag table signals it at the dma's DATA-ready (resp-complete), unlike a
+// raw add_child which the async dma releases at issue-complete.
+std::shared_ptr<Instruction> make_mem_bar(const togsim::TraceRec& t, int64_t uniq) {  // MEMORY_BAR record -> barrier Instruction
+  auto bar = std::make_shared<Instruction>(
+      Opcode::MEMORY_BAR, 0, 0, 0,  // compute_cycle, num_parents, dram_addr (same argument order as make_dma): all zero
+      std::vector<size_t>{}, std::vector<int>{}, 0,  // no tile size / stride; element-width argument is 0
+      std::vector<int64_t>{(int64_t)t.tag_slot}, std::vector<int64_t>{1},  // tag_idx = {tag_slot}, tag_stride = {1}, as in make_dma
+      std::vector<int64_t>{});  // empty accum tag list
+  bar->set_addr_name("tag" + std::to_string(uniq), uniq);  // same name/id scheme as make_dma, so an equal uniq gives an equal name and id
+  bar->prepare_tag_key();
+  return bar;
+}
+
+std::shared_ptr<Instruction> make_compute(const togsim::TraceRec& t) {  // COMPUTE trace record -> COMP Instruction
+  auto inst = std::make_shared<Instruction>(
+      Opcode::COMP, /*compute_cycle=*/(cycle_type)t.cycle, /*num_parents=*/0,
+      /*dram_addr=*/0, std::vector<size_t>{}, std::vector<int>{}, /*elem_bits=*/0,
+      std::vector<int64_t>{}, std::vector<int64_t>{}, std::vector<int64_t>{});  // no tag index / tag stride / accum tags
+  inst->set_overlapping_cycle((cycle_type)t.overlapping);  // recorded overlap, passed through unchanged
+  inst->set_compute_type(t.compute_type);  // route to VPU vs systolic array
+  return inst;
+}
+
+}  // namespace
+
+std::unique_ptr<TileGraph> trace_to_tilegraph(const togsim::RunResult& run,
+                                              const std::string& name) {  // run.trace -> TileGraph, one subgraph + tile per work-item
+  using togsim::TraceRec;
+  auto tg = std::make_unique<TileGraph>(name, name);
+  // Empty cache plan (no L2/CMEM persistence) -- append_subgraph propagates it
+  // to each subgraph, and DMA::is_cacheable dereferences it, so it must be a
+  // valid (if empty) IntervalTree rather than null.
+  tg->init_cache_plan({});
+
+  std::shared_ptr<TileSubGraph> sg;
+  std::shared_ptr<Tile> tile;
+  // Explicit dependency DAG (sec 10): a reader depends on the last writer of each
+  // SRAM buffer it reads. Scoped per work-item (reset at each dispatch) -- buffers
+  // are work-item-local, so distinct work-items are independent (-> parallel).
+  std::map<int64_t, std::shared_ptr<Instruction>> last_writer;  // buffer id -> producer
+  // An async dma is paired with its explicit memory_barrier(s) by the runtime tag
+  // (tag_id, tag_slot). It is 1 load : N barriers (the load happens once per
+  // reduction iteration; each consumer in that iteration is preceded by a wait on
+  // the same tag), so we track the CURRENT (most recent) load per (tag_id,
+  // tag_slot) -- like last_writer for a buffer -- not a FIFO. Each load gets a
+  // fresh `uniq` Core key, so successive reduction iterations (multi-tile-K, conv)
+  // never collide in the tag table; the iteration's barriers reuse that load's
+  // uniq. Correct because the load nest and its consumer nest run in order within
+  // the reduction body (no cross-iteration prefetch). Scoped per work-item.
+  std::map<std::pair<int32_t, uint64_t>,
+           std::pair<int64_t, std::shared_ptr<Instruction>>> current_dma;
+  int64_t next_tag = 0;   // mints a unique Core tag key per dma record (reset to 0 per work-item in flush)
+  int cur_tile_group = -1;   // work-item index, bumped per TILE_BEGIN (trace grouping)
+  // Async compute (matmul/preload): issued and pipelined on the systolic array;
+  // they do not block each other. A store then needs the drained result, so it
+  // FLUSHes -- waits all outstanding async compute before running (like a fence
+  // after async ops). No per-op completion events; one barrier before the store.
+  std::vector<std::shared_ptr<Instruction>> outstanding_async;
+  std::shared_ptr<Instruction> pending_bar;   // last COMPUTE_BAR fence, awaited by the next store
+  auto is_async_compute = [](int ct) { return ct == 1 || ct == 2; };  // matmul (1) / preload (2): same values as MATMUL_CT / PRELOAD_CT below
+
+  auto flush = [&]() {  // commit the open work-item (if any), then reset all work-item-scoped state
+    if (sg && tile) {
+      sg->add_tile(tile);
+      tile->set_owner(sg);
+      tg->append_subgraph(sg);
+    }
+    sg.reset();
+    tile.reset();
+    last_writer.clear();
+    current_dma.clear();
+    next_tag = 0;
+    outstanding_async.clear();
+    pending_bar.reset();
+  };
+
+  // Build edges from the recorded read/write buffer sets: reader <- last writer of
+  // each buffer it reads (the virtual SA_WEIGHTS buffer carries preload->matmul;
+  // the Y_spad accumulator carries the reduction chain; the spads carry load->
+  // compute). No in-order chain, no tag matching, no op heuristics.
+  // sec 10.7 occupancy/latency split. An edge from a systolic-array producer
+  // (preload=2 or matmul=1) to a matmul (1) is an OCCUPANCY dependency: the
+  // successor overlaps the producer on the SA pipeline, so use add_pipeline_child
+  // (released when the producer ISSUES). Every other edge is a LATENCY
+  // dependency (the consumer needs the producer's result): load->compute,
+  // init->matmul, matmul->store -> add_child (released at the producer's finish).
+  const int MATMUL_CT = 1, PRELOAD_CT = 2;
+  auto link = [&](std::shared_ptr<Instruction> inst,
+                  const std::vector<int64_t>& reads,
+                  const std::vector<int64_t>& writes) {
+    for (int64_t b : reads) {
+      auto it = last_writer.find(b);
+      if (it == last_writer.end()) continue;  // no recorded writer in this work-item -> no edge
+      int pct = it->second->get_compute_type();
+      if (inst->get_compute_type() == MATMUL_CT && (pct == MATMUL_CT || pct == PRELOAD_CT))
+        it->second->add_pipeline_child(inst);  // SA pipeline -> occupancy (overlap)
+      else
+        it->second->add_child(inst);           // data/result -> latency (full wait)
+    }
+    for (int64_t b : writes) last_writer[b] = inst;  // updated after the reads, so inst never links to itself
+    tile->append_instuction(inst);
+  };
+
+  // --- SRAM-capacity tracking (buffer-version allocations, sec 10.x) ---
+  // A coarse tile = one version of its buffer; the fine DMAs that fill it share
+  // one allocation, freed once all the version's consumers have issued (refcount
+  // -> 0). NOT reset in flush(): the spad is one physical per-core resource, so a
+  // buffer reused by the next reduction iter / work-item is a NEW version that
+  // must wait for the old one to free (WAR / double-buffer). Tracked buffers are
+  // the DMA-loaded ones; the accumulator / virtual SA-weights are never written
+  // by a load, so cur_alloc has no entry and they are skipped. (v1: single-core;
+  // multi-core would key cur_alloc/vers by (core, buf).)
+  int64_t next_alloc = 0;
+  std::map<int64_t, int64_t> cur_alloc;   // buf -> current version id
+  std::map<int64_t, bool> open_ver;       // buf -> version still accepting loads
+  struct Ver { std::vector<std::shared_ptr<Instruction>> loads, readers; };
+  std::map<int64_t, Ver> vers;            // version id -> its loads and readers, in trace order
+  auto sram_on_load = [&](int64_t b, const std::shared_ptr<Instruction>& ld) {
+    if (!cur_alloc.count(b) || !open_ver[b]) {   // a read closed it -> new version
+      cur_alloc[b] = next_alloc++;
+      open_ver[b] = true;
+      vers[cur_alloc[b]] = {};
+    }
+    ld->set_sram_alloc(cur_alloc[b]);
+    vers[cur_alloc[b]].loads.push_back(ld);
+  };
+  auto sram_on_read = [&](int64_t b, const std::shared_ptr<Instruction>& rd) {
+    auto it = cur_alloc.find(b);
+    if (it == cur_alloc.end()) return;           // not a load buffer -> untracked
+    vers[it->second].readers.push_back(rd);
+    open_ver[b] = false;                          // next write starts a new version
+  };
+  auto sram_finalize = [&]() {                    // tag only each version's LAST reader
+    for (auto& kv : vers) {
+      auto& v = kv.second;
+      if (v.readers.empty()) {                    // no consumer -> never freed: untrack
+        for (auto& ld : v.loads) ld->set_sram_alloc(-1);
+        continue;
+      }
+      v.readers.back()->add_sram_release(kv.first);  // it frees the whole version on issue
+    }
+  };
+
+  for (const auto& t : run.trace) {
+    if (t.kind == TraceRec::TILE_BEGIN) {
+      // togsim_dispatch opened a work-item -> new subgraph (bound to its core) +
+      // tile. The scope runs until the matching TILE_END (the dispatch wrapper
+      // brackets the tile fn call), not until the next begin.
+      flush();
+      sg = std::make_shared<TileSubGraph>();
+      sg->set_core_id(t.core);
+      tile = std::make_shared<Tile>(Tile::Status::INITIALIZED);
+      cur_tile_group++;
+      continue;
+    }
+    if (t.kind == TraceRec::TILE_END) {
+      flush();   // close the work-item explicitly (scope = the tile fn call)
+      continue;
+    }
+    if (!tile) continue;  // defensive: ops before the first TILE_BEGIN
+
+    if (t.kind == TraceRec::DMA) {
+      int64_t uniq = next_tag++;                         // fresh Core tag key per dma record
+      auto inst = make_dma(t, uniq);
+      inst->set_tile_group(cur_tile_group);
+      size_t numel = 1;                                  // SRAM footprint (ready-tile ordering)
+      for (auto d : t.dims) numel *= (size_t)d;
+      tile->inc_required_sram_size(numel * (t.elem_bits / 8));  // NOTE(review): integer division -- a sub-byte elem_bits adds 0 bytes; confirm none occur
+      if (t.dir == 1) {                                  // STORE
+        if (pending_bar) {
+          // after a compute fence: wait it (drains the async matmuls) -- covers
+          // the accumulator read, so no per-buffer read edge.
+          pending_bar->add_child(inst);
+          pending_bar.reset();
+          for (int64_t b : t.write_bufs) last_writer[b] = inst;
+          tile->append_instuction(inst);
+        } else {
+          link(inst, t.read_bufs, t.write_bufs);
+        }
+        for (int64_t b : t.read_bufs) sram_on_read(b, inst);  // store frees what it drains
+      } else {                                           // LOAD
+        tile->append_instuction(inst);
+        // async load: record it as the CURRENT load for this (tag_id, tag_slot)
+        // with its fresh uniq; the barriers in this reduction iteration reuse that
+        // uniq (1 load : N barriers). A new iteration's load overwrites it with a
+        // new uniq -> distinct tag key, no collision. last_writer = the dma for now;
+        // the barrier overwrites it so consumers gate on data arrival. A sync load
+        // has no barrier and blocks to arrival itself.
+        if (t.is_async) current_dma[{t.tag_id, t.tag_slot}] = {uniq, inst};
+        for (int64_t b : t.write_bufs) last_writer[b] = inst;
+        for (int64_t b : t.write_bufs) sram_on_load(b, inst);   // occupy spad
+      }
+    } else if (t.kind == TraceRec::MEMORY_BAR) {
+      // the explicit async-DMA sync (the original dma_wait). Pair with the CURRENT
+      // load for this (tag_id, tag_slot), reusing its uniq Core key so the dma and
+      // bar pair in the tag table; the dma releases the bar at issue-complete
+      // (add_child), then the bar parks on the tag until data-ready (resp-complete,
+      // set_tag_finish). Consumers of the loaded buffer then gate on the bar.
+      auto it = current_dma.find({t.tag_id, t.tag_slot});
+      int64_t uniq = next_tag++;                         // fallback if unpaired (next_tag advances even when a pairing is found)
+      std::shared_ptr<Instruction> dma_inst;
+      if (it != current_dma.end()) { uniq = it->second.first; dma_inst = it->second.second; }
+      auto bar = make_mem_bar(t, uniq);
+      bar->set_tile_group(cur_tile_group);
+      if (dma_inst) dma_inst->add_child(bar);
+      tile->append_instuction(bar);
+      for (int64_t b : t.write_bufs) last_writer[b] = bar;  // the bar record's write_bufs now resolve to the bar, not the dma
+    } else if (t.kind == TraceRec::COMPUTE) {
+      auto inst = make_compute(t);
+      inst->set_tile_group(cur_tile_group);
+      link(inst, t.read_bufs, t.write_bufs);
+      for (int64_t b : t.read_bufs) sram_on_read(b, inst);     // frees the tiles it consumes
+      if (is_async_compute(t.compute_type)) outstanding_async.push_back(inst);
+    } else if (t.kind == TraceRec::COMPUTE_BAR) {
+      // explicit compute fence: ready once all outstanding async compute have
+      // ISSUED (pipeline-child release); the Core then waits the SA pipelines to
+      // drain before it finishes (-> the store it gates).
+      auto bar = std::make_shared<Instruction>(Opcode::COMPUTE_BAR);
+      bar->set_tile_group(cur_tile_group);
+      for (auto& a : outstanding_async) a->add_pipeline_child(bar);
+      outstanding_async.clear();
+      tile->append_instuction(bar);
+      pending_bar = bar;
+    }
+  }
+  flush();
+  sram_finalize();   // readers per version are now final -> tag each version's last reader with its release
+  return tg;
+}
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
index 6d2537d9..7fea374b 100644
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml
@@ -22,3 +22,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
index f830419b..3a96b588 100644
--- a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
index 1a8c60f6..41e267b6 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml
@@ -25,3 +25,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
index ff976784..397f0fb7 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
index 2ed1bb12..f080fc69 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
index 1bcc9bb3..f89661b8 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 8
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
index 39d195b0..ca69d930 100644
--- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml
@@ -28,3 +28,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
index bf01913b..b7b03e7a 100644
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
index 8c71c528..903ffcbc 100644
--- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
+++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml
@@ -34,3 +34,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
index d058f188..6a234017 100644
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml
@@ -28,3 +28,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
index 019a0f0f..f0546e56 100644
--- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
+++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml
@@ -27,3 +27,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
index 348babae..08ec26ac 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml
@@ -25,3 +25,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
index a0985aec..a6e073e9 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml
@@ -26,3 +26,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
index 166e2e25..5436b3e8 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml
@@ -29,3 +29,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
index 6119e83d..d928f9d3 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml
@@ -30,3 +30,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
index 9100c22a..dd9dfac7 100644
--- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
+++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml
@@ -28,3 +28,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB.
+core_spad_size_kb: 16384
diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml
index f46d380e..1593e148 100644
--- a/configs/systolic_ws_8x8_c1_booksim.yml
+++ b/configs/systolic_ws_8x8_c1_booksim.yml
@@ -23,3 +23,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB.
+core_spad_size_kb: 1024
diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml
index 1be24b85..b2d16c6a 100644
--- a/configs/systolic_ws_8x8_c1_simple_noc.yml
+++ b/configs/systolic_ws_8x8_c1_simple_noc.yml
@@ -24,3 +24,6 @@ codegen_external_mapping_file: ''
 codegen_autotune_max_retry: 10
 codegen_autotune_template_topk: 4
 codegen_compiler_optimization: all
+
+# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB.
+core_spad_size_kb: 1024
diff --git a/docs/design/togsim_cpp_trace.md b/docs/design/togsim_cpp_trace.md
new file mode 100644
index 00000000..9565bdfb
--- /dev/null
+++ b/docs/design/togsim_cpp_trace.md
@@ -0,0 +1,1006 @@
+# TOGSim C++ Trace Generation — Design Proposal
+
+**Status:** Implemented end-to-end through the real timing Core (256^3 GEMM); see
+§11 for remaining work.
+**Branch:** `feature/togsim-cpp-trace`
+**Scope:** Replace the timing-path TOG producer (MLIR → Python-dict → ONNX → C++
+parser) with a compiled, shape-parametric trace producer (MLIR → C++ → `.so`).
+TOGSim's timing core is preserved.
+
+**Note on the sync mechanism (read before §3, §5, §9).** An earlier version of
+this design synchronized an asynchronous DMA with the consumer that waits on its
+data using a compile-time integer `event_id` — one id per static `togsim.dma`/
+`togsim.wait` op, paired through a heap "event buffer" of opaque handles. That
+mechanism was *removed*: a single static `togsim.dma` op executes once per loop
+iteration, each iteration writing a different runtime tag slot, so one
+compile-time id per op cannot represent the per-iteration pairing. The current
+design (ABI v11) pairs an async DMA with its sync point by the **runtime tag
+slot** instead. Sections below have been rewritten to the runtime-tag model;
+where a section still mentions `event_id` / event handles / `togsim_wait` /
+`togsim_signal`, it is flagged as the superseded design, not current behavior.
+
+---
+
+## 1. Motivation
+
+The current Tile-Operation Graph (TOG) pipeline has accumulated structural debt
+that blocks where we want to go (notably dynamic shape for LLM decode / MoE):
+
+1. **"ONNX in name only."** TOG is serialized as ONNX, but every op is a custom
+   `torchsim_*` attribute. We pay ONNX's costs (rigid schema, protobuf,
+   stringly-typed attribute encoding) and use none of its interop value
+   (onnxruntime, standard ops, netron). The schema lives in three places —
+   Python dict (`extension_op.py`), ONNX (`AsmParser/onnx_utility.py`), C++
+   (`TOGSim/.../TileGraphParser`) — and drifts.
+
+2. **Synchronization is ad-hoc and DMA-specific.** Completion tracking is a
+   counting semaphore in disguise, but unnamed and tangled:
+   - `DMA.h`: `tag_table[subgraph][tag_key] -> uint32` with overloaded magic
+     values (`0` pending, `1` signaled, `>1` consumed-count, `-1` sparse) plus a
+     parallel `waiters` wait-queue. The `tag_key` is a hand-rolled
+     content-addressed vector computed from loop indices/strides (`calc_tag`),
+     with implicit fallbacks (push `0` when an index is missing, dedup by
+     silently `continue`-ing).
+   - A *second*, separate dependency mechanism — `Instruction::ready_counter` +
+     `child_inst` graph edges — handles structural ordering.
+   - Net: one concept ("an async op completed; a consumer may proceed") is
+     expressed two different ways, and the event-like one only works for DMA.
+
+3. **Static shape is baked in.** `build_tog._affine_for_bounds` resolves loop
+   bounds to constants (`_const_index_value`). The graph is fully materialized
+   per static shape, so dynamic shape forces recompile-per-shape — pathological
+   for decode (a new `seq_len` every step) and MoE (variable expert load).
+
+4. **Loop-flattening hackery.** Much of the roughness (`loop_end` tricks,
+   `calc_tag`, dedup-by-skip, magic offsets) exists only to flatten loop nests
+   into a static graph.
+
+See [Appendix A](#appendix-a-current-state-references) for file:line references.
+
+## 2. Key idea: trace-driven → execution-driven
+
+Instead of materializing a flattened graph, **TOG becomes a stream emitted by
+*running* a shape-parametric producer.** The producer is C++ compiled from the
+kernel's MLIR; it keeps loops as loops (with symbolic bounds) and calls a small
+**event-based API**. Each API call emits one trace record = one modeled
+instruction. TOGSim `dlopen`s the producer `.so`, injects a callback context
+that records and times the stream.
+
+This directly resolves the four problems:
+
+| Problem | Resolution |
+|---|---|
+| ONNX-in-name-only / 3-place schema | The API signature is the single contract. No ONNX. |
+| DMA-only, ad-hoc sync | An async DMA and the consumer that waits on its data are paired at runtime by the tile's tag slot, through the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). The DMA signals the tag when its data arrives; an explicit `togsim.memory_barrier` waits on it and becomes the last-writer of the loaded buffer, so consumers gate on data arrival. No content-hashed `calc_tag`, no magic values. |
+| Static shape | Loop bounds flow from MLIR as-is; symbolic bounds become native loop bounds in C++, so trip count is dynamic. |
+| Loop-flatten hacks | Loops stay loops; the trace is generated by executing them. `calc_tag`/dedup disappear. |
+
+It is *not* a dynamic hardware scheduler: control flow is still statically
+emitted by the compiler. The `.so` is a deterministic **trace generator**, not a
+timing model — it keeps the trace-as-data boundary, so TOGSim's timing core is
+untouched.
+
+## 3. Core algebra
+
+Small, orthogonal primitives. Everything else is composition (Layer-1 helpers
+like `double_buffered_loop`, not IR primitives).
+
+- `dma(dir, arg_id, offset, shape, is_async, tag_id, tag_slot, …)` —
+  `dir ∈ {LOAD, STORE}`. Returns void. A **synchronous** (non-async) DMA is
+  blocking: it finishes when its data arrives, and consumers depend on it
+  directly. An **async** DMA returns control immediately and signals its tag at
+  data arrival (DMA response-complete); a later `memory_barrier` is the explicit
+  point that waits on it.
+- `compute(tile_id, dims…)` — references a fixed-size tile kernel; cost is looked
+  up (§6), not computed here.
+- `memory_barrier(tag_id, tag_slot, write_bufs)` — the explicit async-DMA sync.
+  It waits until the async DMA carrying the same `(tag_id, tag_slot)` has
+  delivered its data, then becomes the last-writer of the loaded buffer so
+  consumers gate on data arrival. It is the original `memref.dma_wait` mapped
+  through from the source IR, not a synthesized barrier.
+- `compute_barrier()` — a compute fence inserted before a store, so the store
+  sees the drained accumulator. This is the **one** remaining auto-inserted
+  barrier; it is marked FIXME in the code as something that should also become
+  explicit in the source IR later (§10.7.3).
+- **Control flow lives in the producer** — ordinary `for`/`if`/`while` with
+  runtime bounds. Loop types (normal/parallel/accumulation/inner) and dynamic
+  shape are just producer loops; the emitted trace is already specialized.
+
+Two distinct things share the word "tag", and the design uses **both together**
+as the dma↔barrier pairing key:
+
+- **`tag_id`** — the identity of a DMA's *tag memref*. Together with the runtime
+  `tag_slot` index, it identifies which async DMA a `memory_barrier` is waiting on.
+- **`tag_slot`** — the SRAM tile slot the loaded tile occupies (the
+  double-buffer / SRAM-capacity index). It is *also* part of the pairing key
+  because each load's tile maps to its own slot. The slot is **subtile-only**:
+  `lower_to_vcix` writes the dma_wait tag index with a `-acc_iv` term for each
+  accumulation (reduction) loop var — a sentinel marking the reduction axis, not
+  an arithmetic offset — and `build_skeleton` strips those terms so a
+  `memory_barrier` waits on the same slot its async load wrote. (Mirrors legacy
+  `TileGraphParser`, which skips stride -1; reduction iterations are told apart
+  by the per-iteration tag alloc + a fresh per-record Core key in the bridge, not
+  by the slot.) Without the strip, the producer evaluates `-acc_iv` to a negative
+  slot at reduction iteration > 0 and the pairing fails on subtile + multi-tile-K.
+
+Pairing is done at runtime by the existing TOGSim Core tag table: the async DMA
+calls `prepare_tag_key` and `set_tag_finish` (signal at data arrival), the
+`memory_barrier` calls `register_tag_waiter` (wait on `(tag_id, tag_slot)`).
+A synchronous DMA needs no barrier — it blocks until data arrival itself.
+
+> **Superseded.** An earlier version used a neutral `event` completion token
+> (freely allocated, not tied to memory) with `signal`/`wait`/`wait_all`
+> primitives. That has been removed in favor of the runtime-tag mechanism above.
+
+## 4. Decisions (locked)
+
+| Axis | Decision |
+|---|---|
+| Input MLIR | Use the **given MLIR as-is**. Do not touch inductor / MLIR templates / shape plumbing. Whatever bounds the MLIR carries (const or symbolic) pass through verbatim. |
+| MLIR → C++ | **EmitC dialect + `mlir-translate --mlir-to-cpp`** (upstream). |
+| `.so` ↔ TOGSim | **`dlopen` + `EmitCtx` callback** (execution-driven). The ABI boundary is the main design surface. |
+| `.so` role | **Timing trace only.** Functional correctness stays on the existing Spike/LLVM path. Strip every op without a timing dependency; keep loop skeleton + API ops + ops feeding bounds/addresses. |
+| Compute cycle | A **separate annotation pass** reuses the existing **sample-mode** to produce a **precomputed `tile_id → cycle` table**, looked up at runtime. |
+| Dynamic shape | Falls out of symbolic loop bounds in the MLIR. Per-tile cost is static (tiles are fixed-size); only trip count is dynamic. |
+
+## 5. Architecture
+
+### 5.1 Artifacts (per kernel)
+
+- **Trace `.so`** — compiled from the skeleton+API MLIR. Shape-parametric:
+  symbolic bounds become C++ function parameters. Calls the runtime API
+  (`togsim_dma`, `togsim_compute`, `togsim_memory_barrier`, …).
+- **Cycle table** — `tile_id → cycle`, produced by the annotation pass.
+
+### 5.2 Pipeline (input = given MLIR)
+
+```
+given MLIR (affine/scf.for + memref.dma_start/dma_wait + vcix/vector compute)
+│
+├── Branch A (trace):
+│     C2 build_skeleton pass  (reuse build_tog traversal)
+│        • affine/scf.for kept, bounds as-is (symbolic preserved)
+│        • dma_start → togsim.dma(... tag_id, %tag[%idx], is_async)
+│        • dma_wait  → togsim.memory_barrier(tag_id, %tag[%idx], write_bufs)
+│        • compute block       → togsim.compute(tile_id, dims)
+│        • DCE: drop ops with no dependency to loop/address/API operands
+│     → C4 togsim→emitc lowering  (togsim.* → emitc.call_opaque;
+│        convert-scf/arith-to-emitc; func args incl. symbolic shapes)
+│     → mlir-translate --mlir-to-cpp
+│     → C5 compile → trace .so   (cached by kernel key)
+│
+└── Branch B (cost):
+      C3 annotation pass over the same MLIR
+        • extract per-tile compute bodies, assign tile_id
+        • run through existing sample-mode → tile_id → cycle table
+
+TOGSim (C6):
+  dlopen(trace.so) → resolve togsim_kernel
+  inject EmitCtx { tag table; record sink; cost = cycle_table[tile_id] }
+  togsim_kernel(ctx, runtime_shape_args...)   // producer runs, emits stream
+  → existing timing core consumes the recorded Instruction stream
+```
+
+### 5.3 Components
+
+- **C1 — `togsim` API op vocabulary.** `togsim.dma(...)` (void result, carrying
+  `tag_id`, the runtime tag-index operand, `is_async`),
+  `togsim.memory_barrier(tag_id, tag_slot, write_bufs)`,
+  `togsim.compute(tile_id, dims)`, `togsim.compute_barrier`. Kept *unregistered*
+  (like the existing `togsim.transfer`), so no C++ dialect registration; the
+  togsim→emitc step is a custom Python rewrite, not a registered ConversionPass.
+- **C2 — `build_skeleton` pass.** Sibling to `build_tog.py`, reusing its
+  traversal (matmul FSM, `_dma_start_fields`, loop typing). Emits the
+  skeleton+API MLIR instead of TOG nodes; preserves `is_async`. The original
+  `memref.dma_wait` is mapped through to an explicit `togsim.memory_barrier`
+  carrying the DMA's `tag_id` and the runtime tag-index operand.
+- **C3 — annotation pass + cycle table.** Reuses sample-mode to sample the
+  deterministic per-tile cycle; emits the `tile_id → cycle` table artifact.
+- **C4 — togsim→emitc lowering.** Maps each `togsim.*` op to an
+  `emitc.call_opaque "togsim_*"`; lowers control flow via `convert-scf-to-emitc`
+  / `convert-arith-to-emitc`; func arguments (including symbolic shapes) become
+  C++ parameters. Then `mlir-translate --mlir-to-cpp`.
+- **C5 — `.so` build.** Compile emitted `.cpp` + `togsim_runtime.h` to `.so`
+  via the existing toolchain; cache by kernel key.
+- **C6 — TOGSim runtime + loader.** `togsim_runtime.h/.cc`: `EmitCtx` and the
+  `togsim_dma/compute/memory_barrier/compute_barrier/core_alloc`
+  implementations (compute looks up the cycle table). Loader `dlopen`s the
+  `.so`, calls `togsim_kernel` with runtime shape args, records the stream, feeds
+  the existing timing core. An async DMA and its `memory_barrier` are paired at
+  runtime by `(tag_id, tag_slot)` through the existing Core tag table.
+
+### 5.4 ABI sketch (current: v11)
+
+```c
+// togsim_runtime.h — shared contract between emitted .cpp and TOGSim
+typedef struct EmitCtx EmitCtx;
+
+void togsim_dma(EmitCtx*, int32_t dir, int32_t arg_id, uint64_t offset,
+                int32_t ndim, const int64_t* dims, const int64_t* strides,
+                int32_t elem_bits, int32_t is_async,
+                int32_t tag_id, uint64_t tag_slot,
+                const int64_t* read_bufs, int32_t n_read,
+                const int64_t* write_bufs, int32_t n_write);
+
+void togsim_memory_barrier(EmitCtx*, int32_t tag_id, uint64_t tag_slot,
+                           const int64_t* write_bufs, int32_t n_write);
+
+void togsim_compute(EmitCtx*, uint64_t tile_id, int32_t compute_type, /* dims */ ...);
+void togsim_compute_barrier(EmitCtx*);
+int32_t togsim_core_alloc(EmitCtx*);
+
+// entry point the loader resolves:
+void togsim_kernel(EmitCtx*, int64_t* shape_args, int32_t n_shape_args);
+```
+
+`togsim_dma` returns void (no handle). An async DMA carries `(tag_id, tag_slot)`;
+the matching `togsim_memory_barrier` waits on the same pair through the Core tag
+table. The symbols are resolved as free `extern "C"` functions: the loaded `.so`
+links back into the Simulator binary (built with `ENABLE_EXPORTS`).
+
+> **Superseded.** v2–v10 evolved through a `togsim_event` handle type with
+> `togsim_dma` returning a handle and `togsim_wait`/`togsim_signal`/
+> `togsim_wait_all` plus `togsim_event_alloc`/`togsim_event_free`. v11 removed
+> all of those; see the note at the top of this doc and §9.6.1.
+
+## 6. Compute cost model
+
+The annotation pass (C3) reuses **sample-mode** to measure each tile's
+deterministic cycle once and stores a **precomputed `tile_id → cycle` table**.
+`togsim_compute` looks it up at runtime.
+
+This is consistent with dynamic shape because **tiles are fixed-size**
+(`TILE_M/N/K`): the per-tile cycle is invariant; only the *number* of tiles
+(loop trip count) varies, and that is handled by the symbolic loop in the `.so`.
+
+**Open edge case — remainder tiles.** When a dimension is not divisible by the
+tile size, edge tiles are partial and have a different cycle than the table
+entry. Options: pad to full-tile cost (simple, small error) vs. sample a
+separate `tile_id` for the remainder. Decided at P4.
+
+## 7. Milestones
+
+- **P0** — DONE. New branch; runtime API header (C6 surface) + `togsim` op
+  vocabulary (C1).
+- **P1** — DONE. `build_skeleton` pass (C2) on a matmul kernel; verified against
+  the legacy `build_tog` TOG. The async DMA's `memref.dma_wait` is mapped through
+  to an explicit `togsim.memory_barrier` carrying the DMA's `tag_id` and the
+  runtime tag-index operand; the IR verifies across sibling prefetch/compute loop
+  nests because the pairing is by runtime tag slot, not a cross-region SSA edge.
+- **P2** — DONE. togsim→emitc (C4) + `mlir-translate` + compile (C5) → `.so` for
+  that kernel (static shape). C4 rewrites the unregistered `togsim.*`/signature
+  then drives the upstream `lower-affine`/`convert-*-to-emitc` passes, with a
+  small fold for residual `emitc.for` bound casts (see §8). Base addresses
+  stubbed to 0 (wired in P3).
+- **P3** — DONE. TOGSim loader + runtime (C6) + cycle table (C3); runs end-to-end
+  through the real Simulator/Core (256^3 GEMM via `--trace_so`). Parallelism /
+  reduction / core dispatch design is locked in **§9** (core-transparent work
+  function + `togsim_core_alloc` hook). Async DMA↔consumer sync is the runtime
+  tag-slot mechanism (`togsim.memory_barrier`), not an event-id.
+- **P4** — Symbolic bounds end-to-end on a decode-style kernel; verify trace
+  length scales with runtime shape; decide remainder-tile handling.
+- **P5** — Migrate remaining op families (conv, SDPA, vector).
+
+## 8. Risks / open questions
+
+- **Remainder tiles vs. precomputed table** (§6) — P4.
+- **ABI versioning** — RESOLVED. Free `extern "C"` symbols (the `.so` links back
+  into the Simulator binary via `ENABLE_EXPORTS`); `TOGSIM_ABI_VERSION` is v11.
+- **togsim→emitc for unregistered ops** — must be a custom rewrite to
+  `emitc.call_opaque`, since unregistered ops have no registered conversion
+  patterns.
+- **EmitC coverage** — RESOLVED (P2). C4 uses the upstream conversion passes
+  (`lower-affine`, `convert-scf-to-emitc`, `convert-arith-to-emitc`,
+  `convert-func-to-emitc`). One gap in this LLVM 20 build:
+  `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so
+  `convert-arith-to-emitc` leaves `builtin.unrealized_conversion_cast` on the
+  bounds (`emitc.size_t`↔`index`) that `--reconcile-unrealized-casts` cannot
+  fold and `mlir-to-cpp` cannot print. C4 adds a small post-pass
+  (`_retype_for_to_size_t`) that retypes each `emitc.for` to `!emitc.size_t`
+  bounds + IV (`emitc.for` accepts size_t with the explicit type) and folds the
+  residual index<->size_t casts. A size_t IV also makes the lowered *address*
+  arithmetic cast-free, which is what lets P3 wire real addresses (approach A):
+  `togsim_dma` passes `(arg_id, element offset)` where the offset is computed
+  from the loop IVs and lowered by `convert-arith-to-emitc`.
+- **async/fire-and-forget** — `is_async` is preserved on `togsim.dma`. An async
+  DMA signals its tag at data arrival; a sync DMA is blocking. An async DMA with
+  no matching `memory_barrier` is fire-and-forget (nothing waits on its tag).
+
+## 9. P3 design: parallelism, reduction, and core dispatch (locked)
+
+How the trace producer expresses *which core runs what*, *what is parallel*, and
+*what is a reduction* (cross-iteration dependency). This is the design for P3.
+
+### 9.1 Where the semantics come from
+
+Nothing new has to be inferred — the post-vcix `affine.for` already carries the
+mapping decision the frontend made, and `build_skeleton` preserves it:
+
+| attribute | meaning | role |
+|---|---|---|
+| `outer_loop` | PARALLEL axis (e.g. GEMM m, n) | independent output tiles -> distributable across cores |
+| `accumulation_loop` | REDUCTION axis (e.g. GEMM k) | partial sums into one output tile -> ordered dependency |
+| `inner_loop` | tile micro-loop | within one tile |
+
+This matches what legacy TOGSim already does with `torchsim_loop_type`
+(`TileGraphParser`: PARALLEL -> `outer_loop_idx` selects a core; ACCUMULATION ->
+`accum_tag` groups dependent partials). The current gap is only that
+`lower_to_emitc` (P2) *drops* these attributes when it lowers `affine.for` to
+`emitc.for`, producing a flat single-stream producer.
+
+### 9.2 Principle: bake intrinsic, parameterize extrinsic
+
+Two different kinds of hardware dependence must be treated differently:
+
+- **Intrinsic** (vlane / vector width, `TILE_M/N/K`, systolic size) — defines the
+  *content and cost of each instruction*. Already baked into the IR; correct.
+- **Extrinsic** (`num_cores`) — defines only the *distribution* of an otherwise
+  fixed set of work-items. The tile set, the per-tile cost table
+  (`tile_id -> cycle`), and the DMA tile shapes are all `num_cores`-invariant.
+
+Therefore `num_cores` is **not** baked into the producer. The producer is
+**core-count transparent**: it knows nothing about how many cores exist.
+
+### 9.3 Model: core-transparent work function + dispatch hook
+
+The producer is two functions, split at the PARALLEL/ACCUMULATION boundary:
+
+```c
+// WORK: trace for ONE independent output tile. Core-transparent: takes the
+// PARALLEL indices directly, names no core. Reduction (k) is program order ->
+// the dependency is implicit (the accumulator is core-local). An async load is
+// synced to its consumer by an explicit memory_barrier on the same tag slot.
+void togsim_kernel_tile(EmitCtx* ctx, int64_t mi, int64_t ni, int64_t* shape) {
+  togsim_core_alloc(ctx);                // first line: new work-item + pick core
+  togsim_compute(ctx, /*tile_id=*/0, ...);            // acc init
+  for (size_t ki = 0; ki < KT; ++ki) {                // REDUCTION = program order
+    togsim_dma(ctx, LOAD, A, offA(mi,ki), ..., /*is_async=*/1, /*tag_id=*/0, ki%D, ...);
+    togsim_dma(ctx, LOAD, B, offB(ki,ni), ..., /*is_async=*/1, /*tag_id=*/1, ki%D, ...);
+    togsim_memory_barrier(ctx, /*tag_id=*/1, ki%D, ...); togsim_compute(ctx, 1, ...);
+    togsim_memory_barrier(ctx, /*tag_id=*/0, ki%D, ...); togsim_compute(ctx, 2, ...);
+  }
+  togsim_dma(ctx, STORE, C, offC(mi,ni), ...);
+}
+
+// DISPATCH: enumerate the PARALLEL domain, one call per work-item.
+extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape, int32_t n) {
+  size_t MT = shape[0]/256, NT = shape[1]/256;
+  for (size_t mi = 0; mi < MT; ++mi)
+    for (size_t ni = 0; ni < NT; ++ni)
+      togsim_kernel_tile(ctx, mi, ni, shape);
+}
+```
+
+Reduced to three orthogonal concepts:
+
+- **Parallel** = each `togsim_kernel_tile` call is an independent work-item (no
+  tags shared across calls). TOGSim is free to place it on any core.
+- **Reduction** = ordering *inside* one work-item: program order on its core
+  (no explicit barrier). The `memory_barrier`/tag-slot mechanism is only the
+  async-DMA → consumer data sync.
+- **Core assignment** = `togsim_core_alloc(ctx)` (a runtime callback, body in
+  TOGSim) marks the work-item boundary and binds the following ops to a chosen
+  core. The producer never sees `core_id`/`num_cores`; those live only in
+  TOGSim's dispatch policy (round-robin / blocked / cost-aware via the cycle
+  table).
+
+The boundary callback lives at the start of each work-item; it cannot be folded
+away because TOGSim cannot intercept the producer-internal work-function call --
+only `togsim_*` callbacks are visible across the `dlopen` boundary.
+
+> FINAL API (supersedes the `togsim_dispatch` naming used below): the boundary +
+> core binding is **`int32_t togsim_core_alloc(EmitCtx*)`** (header v6). The
+> producer calls it at each work-item start; the **runtime owns the core pool**
+> and round-robins -- `num_cores` is NEVER baked into the producer (it is purely
+> a runtime quantity). There is **no free**: a core is an assignment, not a held
+> resource; the next `togsim_core_alloc` starts the next work-item. The returned
+> id is discarded by the producer. This keeps the producer core-count transparent
+> while making the core mapping an explicit runtime allocation. Wherever the text
+> below says `togsim_dispatch`, read `togsim_core_alloc`.
+
+### 9.4 Codegen (lower_to_emitc) and ABI deltas
+
+- `lower_to_emitc` splits the loop nest at the PARALLEL/ACCUMULATION boundary
+  into two `emitc.func`: the PARALLEL loops become `togsim_kernel` (dispatcher,
+  passing the loop indices as args); the ACCUMULATION+INNER body becomes
+  `togsim_kernel_tile`, with `togsim_core_alloc(ctx)` inserted at its entry.
+- ABI additions in `togsim_runtime.h`: `int32_t togsim_core_alloc(EmitCtx*)`
+  (runtime owns the core pool; no `num_cores` in the producer; no free).
+  `togsim_kernel_tile` may stay internal (`static`) for now; export it only if a
+  future loader wants to own the parallel enumeration (which would also need a
+  `num_tiles`-style count — not required now).
+- `tile_id -> cycle` table unchanged (num_cores-invariant).
+
+> Implementation status (P3, ABI v12): `lower_to_emitc` OUTLINES the innermost
+> PARALLEL-loop body into a uniform `togsim_kernel_tile(ctx, iv, n)` func and the
+> dispatcher loop hands it to `togsim_dispatch(ctx, fn, iv, n)` -- a higher-order
+> runtime wrapper that round-robins a core and brackets the call with
+> TILE_BEGIN/TILE_END. The work-item SCOPE is now the function call itself (not an
+> implicit "ops until the next core_alloc" range), and one general dispatcher
+> serves every kernel (uniform iv-array ABI). Earlier this was a single
+> `togsim_kernel` with a bare `togsim_core_alloc` marker; the emitted *trace* is
+> identical (one work-item bracket, then the work ops), so cycles are unchanged --
+> the outline was done to make the boundary explicit, not for timing. Address
+> arithmetic is wired (approach A): each `togsim_dma` passes `(arg_id, element
+> offset)` with the offset computed from the loop IVs (lowered by
+> `convert-arith-to-emitc`, cast-free thanks to the size_t IV retype); the runtime
+> adds the tensor base. The parallel IVs reach the tile fn through the iv array.
+
+### 9.5 Stance and the split-K exception
+
+This refines the design's "not a dynamic scheduler / static control flow":
+**per-work-item trace is static and deterministic; only the work-item -> core
+binding is dynamic** (decided by `togsim_core_alloc`). That is independent-task
+distribution, not data-dependent control flow, and it matches a real tile
+scheduler more closely.
+
+The transparent model holds while work-items are independent (data-parallel over
+output tiles). **Split-K** (a reduction split *across* cores) breaks
+independence: the producer must emit `c` partials + a combine, so the
+instruction stream then depends on `num_cores`, and the cross-core dependency
+must be a real dataflow edge (not program order). Treat split-K as a deliberate,
+scoped exception — start P3 with data-parallel only.
+
+### 9.6 Work-items form a DAG (barriers, cross-parallel reduction)
+
+Work-items are not always a flat independent set. When there is a computation
+*between* parallel loops (e.g. an op at the m-level after the inner n parallel
+loop), it can only run once the inner parallel region completes — a join /
+barrier:
+
+```
+parallel for m:
+  parallel for n: A(m,n)     # leaf work-items
+  B(m)                       # join: needs all n of this m
+```
+
+This needs **no new primitive**: it is the same dataflow-edge mechanism the trace
+already uses (§10), just at work-item granularity. The join op declares the
+leaves' output buffers as its inputs, so the bridge makes it depend on every leaf
+through the last-writer-per-buffer analysis:
+
+```
+parallel for m:
+  parallel for n: A(m,n)   // each writes a tile of m's intermediate buffer
+  B(m)                     // reads that buffer -> depends on all n of this m
+```
+
+So the general picture: **work-items form a DAG; edges are buffer producer →
+consumer dependencies.** The independent data-parallel case is the degenerate
+edge-less DAG; barriers, reduction-across-a-parallel-axis, and split-K are the
+same DAG with real dataflow edges. (Async-DMA data arrival is the one edge that
+needs an explicit `memory_barrier` on the tag slot, because the buffer write
+completes only at DMA response-complete, later than the producing op's
+issue — see §10.7.4.)
+
+> **Superseded.** An earlier version expressed these joins with a per-leaf
+> completion `event` plus `togsim_wait_all`. Those primitives were removed; joins
+> are now ordinary buffer dependencies in the dataflow DAG (§10).
+
+### 9.6.1 How a barrier finds its DMA: runtime tag-slot pairing (locked)
+
+How the explicit `togsim.memory_barrier` (lowered from `memref.dma_wait`) finds
+*which* `togsim.dma` instance's data it must wait for. The hard case is a
+reduction loop: one static `togsim.dma` op executes once per iteration, each
+iteration loading a different tile into a different runtime tag slot. The pairing
+must therefore key on a *runtime* value, not a compile-time one.
+
+The locked model: pair by the **runtime tag slot**, using the existing TOGSim
+Core tag table.
+
+- **A DMA carries `(tag_id, tag_slot)`.** `tag_id` is the compile-time identity
+  of the DMA's tag memref (which logical channel — e.g. A-load vs B-load).
+  `tag_slot` is the *runtime* tag index `%tag[%idx]`, i.e. the SRAM tile slot
+  the loaded tile occupies this iteration. Together they uniquely name this
+  iteration's load.
+- **An async DMA signals; the barrier waits.** At DMA response-complete (the
+  moment data has actually arrived in SRAM), the runtime calls
+  `set_tag_finish(tag_id, tag_slot)`. The matching `togsim.memory_barrier`
+  carries the same `(tag_id, tag_slot)`; it calls `register_tag_waiter` and is
+  woken at that signal. The barrier then becomes the **last-writer** of the
+  loaded SRAM buffer (`write_bufs`), so every consumer that reads the buffer
+  gates on data arrival through the ordinary dataflow-edge analysis (§10).
+- **A synchronous DMA needs no barrier.** It is blocking — it finishes at data
+  arrival itself, and consumers depend on it directly.
+- **Reduction iterations do not collide.** Because `tag_slot` is the runtime
+  index, iteration `i`'s DMA and iteration `i`'s barrier share a slot that is
+  distinct from (or correctly reused after) other iterations — exactly the
+  per-iteration pairing a compile-time id could not express. The
+  double-buffer/pipeline depth is the slot's lifetime, owned by the Core's tag
+  table.
+
+**What this drops vs legacy `tag_table`:** no `calc_tag` content-hash, no magic
+values (`0`/`1`/`-1`/`>1`), no FIFO, no in-order assumption. The pairing key is
+`(tag_id, tag_slot)`, both carried explicitly on the trace ops.
+
+> Status: IMPLEMENTED (ABI v11). `build_skeleton` maps `memref.dma_wait` to
+> `togsim.memory_barrier` and tags `togsim.dma` with `tag_id` + the runtime
+> tag-index operand; `lower_to_emitc` lowers both; the runtime pairs them via
+> `prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`. Verified bad=0 on the
+> 256^3 GEMM. (All current fixtures have tag memref size 1, i.e. single-buffer;
+> deeper double-buffer pipelines exercise more slots but use the same key.)
+>
+> **Superseded.** ABI v5–v10 used a dynamically minted `togsim_event` handle
+> parked in a heap "event buffer" (`togsim_event_alloc`/`togsim_event_free`),
+> with `togsim_dma` returning the handle and `togsim_wait(handle)` consuming it.
+> That mechanism — and the earlier static `event_id` it replaced — could not
+> represent per-iteration reduction pairing and was removed in v11 in favor of
+> the runtime tag slot above.
+
+### 9.7 Execution / simulation model: trace generation (not co-execution)
+
+The producer is a **pure trace (DAG) generator**: running its loops *emits* the
+ordered op stream + dependency edges. It never computes cycles, models hardware,
+or schedules. Two consequences pin the model:
+
+- **What is an edge vs. what blocks.** Data dependencies (buffer producer →
+  consumer edges, plus the async-DMA `memory_barrier` on its tag slot) are
+  recorded *edges* — the producer does not block on them. The only thing that
+  ever blocks the producer is *resource backpressure* (finite cores,
+  double-buffer / SRAM slots, DMA-queue depth), and that is pure flow control,
+  not timing semantics.
+- **Cores, double-buffering, DRAM/NoC are the timing core's job — reused, not
+  reimplemented.** TOGSim's timing core already models all of this when it
+  consumes the legacy TOG (Appendix A: `tag_table` double-buffer sync,
+  `num_cores`). The producer stays oblivious; depths/counts are consumer-side
+  config.
+
+Consumption is staged via a swappable **sink** behind the callbacks, so the
+choice does not touch the producer or the ABI:
+
+| | sink | threads | when |
+|---|---|---|---|
+| **P3** | *materializing* — callbacks append to the timing core's input; reuse its existing scheduler/timing | none | static shape; like-for-like cycle-equivalence vs `build_tog` |
+| **P4+** | *streaming* — callbacks push to a bounded queue; the producer runs as a fiber/coroutine and blocks on backpressure; the DES loop advances time, frees resources, resumes it | producer fiber | only when dynamic-shape trace size makes full materialization impractical |
+
+This is **not** timing co-execution: even the streaming sink only blocks the
+producer on resource flow-control, never on timing-resolved data events. It is
+the lazy/streamed realization of the same trace model. Decision: **do P3 with
+the materializing sink (no threads); defer streaming to P4 as a sink swap.** The
+single forward-compat requirement is that the callback sink is an interface.
+
+### 9.8 P3 task list
+
+1. DONE. `togsim_runtime.h` + `togsim_runtime.cc`/`togsim_loader.h`: C6 runtime
+   (`EmitCtx`) + `dlopen` loader (`run_producer`), materializing sink. Callees:
+   `togsim_core_alloc` (runtime core pool), `togsim_dma` (records a tile load/
+   store, signals its tag at data arrival), `togsim_compute` (cycle-table lookup),
+   `togsim_memory_barrier` (waits on the matching `(tag_id, tag_slot)`),
+   `togsim_compute_barrier`.
+2. DONE (single-buffer). `lower_to_emitc`: OUTLINES the work-item body into
+   `togsim_kernel_tile(ctx, iv, n)` + a `togsim_dispatch` call at the work-item
+   boundary (ABI v12; was a bare `togsim_core_alloc` marker), lowers
+   `togsim.memory_barrier`, and reads `loop_type`. (Two-function outline DONE;
+   trace identical.)
+3. DONE. Real tile addresses wired (approach A): build_skeleton keeps the DRAM
+   index operand on `togsim.dma`; lower_to_emitc passes `(arg_id, offset)` and
+   `convert-arith-to-emitc` lowers the offset (size_t IV retype makes it
+   cast-free). Verified on 1024^3 GEMM: per-tile offsets are correct
+   (A[m,k]=m*1024+k, B[k,n]=k*1024+n).
+4. PARTIAL. C3 cycle table: `cycle_table.py` builds `tile_id -> (cycle,
+   overlapping_cycle)` from a per-tile `cycle_list`, with `overlapping_cycle =
+   max(cycle - offset[type], 0)` (the legacy formula) and a JSON sidecar dump.
+   Remaining (folds into task 5): feed it the gem5 sample-mode `cycle_list`
+   already computed in `extension_codecache` (reused -> both paths stay
+   cycle-consistent), and have `togsim_compute` set BOTH cycle and
+   overlapping_cycle on the Instruction.
+5. PARTIAL. C6 runtime + loader: `TOGSim/src/togsim_runtime.cc` +
+   `togsim_loader.h` implement the producer ABI and `run_producer` -- dlopen the
+   `.so`, run `togsim_kernel` against an `EmitCtx`, and record a `TraceRec` stream
+   (the materializing sink): each dma resolves `base[arg_id] + offset*elem_bytes`
+   and signals its tag at data arrival, each compute looks up the cycle table,
+   core_alloc round-robins the core. Verified standalone on the 256^3 GEMM:
+   addresses/cycles resolved correctly. Core hand-off DONE (§10, §10.7.4): the
+   recorded stream is fed into the existing timing core (Core/Simulator) --
+   TraceRec maps to `Instruction` (compute_cycle + overlapping_cycle, dataflow-buffer
+   deps + runtime-tag barriers). Still open: robust gem5 `cycle_list` wiring (§11.2 item 3).
+
+Legacy path: the ONNX-TOG producer (`run_tog` -> `tog_generator` -> ONNX ->
+C++ `TileGraphParser`) is marked DEPRECATED in place (comments in
+`extension_codecache.py` and `tog_generator.py`) but kept live -- it must not
+break during the transition. It is retired only once this trace pipeline is
+stable. The cycle measurement (`cycle_list`, `x_offset`/`w_offset`) is shared,
+so the two paths stay cycle-consistent meanwhile.
+
+### 9.9 Task-5 completion roadmap: TraceRec -> Core (DONE; see §10)
+
+> **Status: implemented.** This roadmap is retained for context. The dependency
+> model it sketches (a per-`togsim_wait`-handle RAW edge) was *superseded* during
+> implementation by the explicit dataflow-DAG model in §10: edges come from SRAM
+> last-writer-per-buffer plus the vcix preload/matmul FSM, and async-DMA data
+> arrival is gated by an explicit `togsim.memory_barrier` paired on the runtime
+> `(tag_id, tag_slot)` (§10.7.4) — not by a returned event handle. Read the
+> bullets below as the original target shape, with that one substitution.
+
+Grounded by reading `Instruction.h`, `Core.cc`, `TileGraphParser.h/.cc`,
+`Simulator.cc`.
+
+**Target architecture (legacy, reused):** `ONNX -> TileGraphParser -> TileGraph
+(TileLoopNode / TileMemoryNode / TileMemoryWaitNode / TileComputeNode) ->
+Simulator distributes Tiles to Cores -> Core runs Instructions`. We replace only
+the front: build the same `TileGraph` / `Instruction`s from the recorded
+`TraceRec` stream, then hand it to the existing `Simulator`.
+
+**Mapping (TraceRec -> Instruction):** `Instruction(opcode, compute_cycle,
+num_parents, dram_addr, tile_size, tile_stride, elem_bits, tag_idx_list,
+tag_stride_list, accum_tag_idx_list)`; `ready_counter = num_parents`.
+- DMA load/store -> `MOVIN`/`MOVOUT`: `dram_addr = TraceRec.addr`, `tile_size`/
+  `tile_stride`/`elem_bits` from the dma, `tag_idx_list = {tag_slot}` (the
+  SRAM-slot key), `is_async` set. compute_cycle 0.
+- COMPUTE -> `COMP`: `compute_cycle = TraceRec.cycle`,
+  `set_overlapping_cycle(TraceRec.overlapping)`, `set_compute_type(...)`.
+- Dependency (RAW): a compute depends on its loads through the SRAM
+  last-writer-per-buffer analysis (§10); for an async load the last-writer is the
+  `togsim.memory_barrier` paired on the load's runtime `(tag_id, tag_slot)`, so
+  the compute's `ready_counter` only clears once the data has arrived (§10.7.4).
+- SRAM double-buffer / capacity (WAR): the existing Core enforces it through the
+  tag mechanism (`register_tag`/`set_tag_finish`/`mark_tag_used`, DMA.h) keyed by
+  `tag_idx_list`; our `(arg_id, tag_slot)` is that key. Reduction grouping ->
+  `accum_tag_idx_list` (the accumulation-loop index).
+
+**Build/wiring:** compile the bridge into TOGSim (it needs the conan deps;
+include flags are in `TOGSim/build/compile_commands.json`, notably
+`-D_GLIBCXX_USE_CXX11_ABI=0` and the `/root/.conan/data/{robin-hood,spdlog,fmt,
+yaml-cpp,boost}` include dirs). Add `togsim_runtime.cc` + the bridge to
+`TOGSim/CMakeLists.txt`. Either (a) build `TileGraph`/`Tile` nodes from TraceRec
+(maximal reuse of `Simulator`'s tile distribution + Core), or (b) build the
+`Instruction` DAG directly and drive a single Core. (a) is closer to legacy and
+gives multi-core for free.
+
+**Cycle-table feed:** reuse the gem5 `cycle_list` already computed in
+`extension_codecache` (so both paths stay cycle-consistent); pass it +
+`x_offset`/`w_offset` to `cycle_table.build_cycle_table`, dump the sidecar, and
+have the loader populate `EmitCtx.cyc/ovl`.
+
+**Validation:** same post-vcix fixture through both paths; compare the
+`Simulator`'s total cycles / DRAM traffic. Start with the 256^3 GEMM (static
+shape, single-buffer), then multi-tile / double-buffer kernels.
+
+This is a focused C++ integration (TOGSim build + TileGraph construction), not a
+small increment -- best executed as its own push; all the producer-side inputs
+(addresses, cycles, handles, core, tag_slot) are already in the trace.
+
+## Appendix A: current-state references
+
+- `TOGSim/include/DMA.h:27-115` — `tag_table` (overloaded `0/1/-1/>1`) +
+  `waiters`; `register_tag` / `set_tag_finish` / `register_tag_waiter` /
+  `mark_tag_used` (= init / signal / wait / consume).
+- `TOGSim/src/Core.cc:118-140, 214-324` — async-DMA signal path and the `BAR`
+  wait/consume path over the tag table.
+- `TOGSim/include/Instruction.h:40-48, 104-117` — `ready_counter` / `child_inst`
+  (the second, separate dependency mechanism) and the tag fields.
+- `PyTorchSimFrontend/mlir/passes/build_tog.py` — `TogBuilder.print_operation`
+  dispatch (`affine.for` / `memref.dma_start` / `memref.dma_wait` / `vcix.*`);
+  `_affine_for_bounds` (constant-bound resolution → static shape).
+- `PyTorchSimFrontend/mlir/passes/__init__.py`,
+  `PyTorchSimFrontend/mlir/passes/lower_to_llvm.py` — in-process Python MLIR pass
+  orchestration via the bindings; the functional Spike/LLVM path (unchanged).
+- `PyTorchSimFrontend/mlir/mlir_gemm_template.py` — kernel template emitting the
+  `affine.for` nest + `linalg.matmul` + `togsim.transfer` DMA ops.
+
+## 10. Explicit dependency-edge trace (revised dependency model)
+
+Supersedes the in-order / runtime-tag approach for expressing dependencies. The
+trace is an explicit dataflow DAG: every op declares the producers of the data it
+consumes; the consumer (Core) does all resource scheduling. Reached after finding
+that (a) flat in-order over-serializes parallel tiles, (b) the current TOG pass
+does NO dependency analysis (it emits a lexical loop tree + tags resolved at
+runtime by the C++ tag_table), and (c) compute I/O is collapsed away by
+build_skeleton, so dependencies must be recovered before the collapse.
+
+### 10.1 Representation
+
+The dependency edge is "consumer reads the buffer that producer wrote". As
+landed (ABI v9 onward; see STATUS "sec 10 explicit-edge bridge"), each op
+declares the **SRAM buffer ids** it reads and writes (`read_bufs` / `write_bufs`);
+the bridge builds the Instruction DAG by **last-writer per buffer**, scoped per
+work-item. There is no SSA event token threaded by the producer and no event
+handle returned by an op.
+
+- The edge source is data, not order: an op that reads buffer `b` gets an edge
+  from whatever op most recently wrote `b`.
+- No in-order chain, no runtime tag content-hash, no op-pattern heuristics.
+- Resource scheduling -- SA round-robin, double-buffer (<=N in flight), SRAM --
+  stays entirely in the Core. The trace never reasons about SRAM occupancy or
+  timing; it only states producer->consumer order.
+- One exception: an **async** DMA's write completes only at data arrival (DMA
+  response-complete), later than its issue, so its last-writer edge is routed
+  through an explicit `togsim.memory_barrier` that waits on the load's runtime
+  `(tag_id, tag_slot)` (§10.7.4). A synchronous DMA is blocking and needs no
+  barrier.
+
+> The sketch below uses an `out_ev = op(ctx, in_events[])` SSA notation to
+> *illustrate* the edges; it predates the landed `read_bufs`/`write_bufs` form
+> and is no longer the literal ABI. Read `in={…}` as "reads these buffers".
+
+Producer C++ form (events threaded like SSA; loop-carried = a reassigned var):
+
+    for mi, ni:                                  // PARALLEL: independent tiles
+      ev acc = compute(ctx, INIT, in={});
+      for ki:                                    // REDUCTION: loop-carried acc
+        ev a = dma_load(ctx, A[mi,ki], in={});
+        ev b = dma_load(ctx, B[ki,ni], in={});
+        ev w = compute(ctx, PRELOAD, in={b});
+        acc  = compute(ctx, MATMUL,  in={a,w,acc});  // new acc event each iter
+      dma_store(ctx, C[mi,ni], in={acc});
+
+The INIT dependency reaches every accumulate transitively through the acc chain
+(INIT -> mm_k0 -> mm_k1 -> store); each node only needs edges to its immediate
+producers. Different (mi,ni) -> separate acc chains -> independent -> parallel.
+
+### 10.2 Two dependency sources (both available pre-collapse in the TOG pass)
+
+A single "SRAM access" analysis is necessary but NOT sufficient -- verified on the
+GEMM post-vcix:
+
+| dependency | source | visible in SRAM? |
+|---|---|---|
+| load -> compute (DMA writes X_spad/W_spad, preload/matmul read) | SRAM last-writer per (buffer, slot) | yes |
+| accumulator chain (INIT writes Y_spad; the drain/epilogue read-modify-writes Y_spad; store reads it) | SRAM last-writer on Y_spad | yes |
+| **preload -> matmul** (preload loads weights into the systolic-array registers; matmul consumes them) | **vcix opcode FSM** (op1=preload pairs with the following op0=matmul; build_tog already tracks this via `current_preload_node`) | **no -- SA-internal, not a memref access** |
+
+So the analysis derives edges from (1) SRAM (buffer, slot) last-writer for loads
+and the accumulator, and (2) the vcix preload/matmul pairing for the SA-weight
+dependency. The slot is a concrete value at run time (the producer runs the
+loops), so matching is by value -- no static affine-overlap math.
+
+Key facts (256^3 GEMM, post-vcix): SRAM buffers are %0=X_spad(A), %1=W_spad(B),
+%2=Y_spad(acc/out). matmul (vcix op0) reads %0 only; preload (vcix op1) reads %1;
+the matmul does NOT read %1 (weights come from the SA), which is exactly why a
+memref-only analysis lets it run before the weight load -- the preload->matmul
+edge must come from the FSM. The accumulation is the epilogue's `transfer_read
+%2 + addf + transfer_write %2`, which IS SRAM-visible.
+
+### 10.3 Components changed (as landed)
+
+- TOG pass (`build_skeleton` + `dep_analysis`, on post-vcix before collapse): per
+  op, the read/write SRAM buffer ids + the preload->matmul pairing (folded as a
+  virtual `SA_WEIGHTS` buffer) -> the read/write buffer sets.
+- ABI (`togsim_runtime.h`): `togsim_dma`/`togsim_compute` carry
+  `read_bufs`/`write_bufs`; an async DMA also carries `(tag_id, tag_slot)` for the
+  `togsim.memory_barrier` pairing. No `in_events[]`, no returned event, no
+  `event_id`/handle-buffer mechanism.
+- `lower_to_emitc`: emits the buffer-id arrays on each op (and lowers
+  `togsim.memory_barrier`).
+- bridge: builds the Instruction DAG by last-writer per buffer (`add_child`);
+  no in-order chain, no runtime tag content-hash.
+- Core: unchanged (ready_counter DAG + SA pipeline + double-buffer already exist).
+
+### 10.4 Open decisions
+
+- Reduction timing: completion-serial acc chain (conservative, simple) vs.
+  SA-pipelined (matches legacy's overlap) — RESOLVED: SA-pipelined, via the
+  occupancy/latency split (§10.7).
+- Buffer-id lifetime: the last-writer map is scoped per work-item (reset at each
+  `togsim_core_alloc`).
+
+### 10.5 Known issue: preload concurrency not bounded by #systolic-arrays
+
+Observed in the --trace_so run (256^3 GEMM): 4 PRELOADs execute concurrently
+(issue ~1028, finish ~1119-1122), but with num_systolic_array_per_core = 2 at
+most 2 should overlap, and two preloads on the same SA should serialize (one
+weight register file per array). Cause: a preload's overlapping_cycle equals its
+compute_cycle (91 == 91), so its occupancy (compute - overlapping) is ~0 and the
+Core's SA compute pipeline accepts unbounded back-to-back preloads.
+
+This is a PRE-EXISTING Core SA-model property, NOT introduced by the trace
+pipeline: the legacy build_tog path shows the same -- its 4 preloads issue at
+1215-1218 and finish 1306-1309 (4 concurrent). So it is not a trace-vs-legacy
+regression, but it is a real hardware-fidelity gap: the model should cap
+concurrent preloads at the systolic-array count and serialize same-SA preloads on
+the single weight buffer. Track separately from the trace work (affects both
+paths equally).
+
+### 10.6 Known issue: accumulator dependency over-serializes the reduction
+
+Observed in the --trace_so run: consecutive matmuls run 396 cycles apart (fully
+serial: issue 1120, 1516, 1912, ...), but physically matmuls that accumulate into
+the same output should PIPELINE on the systolic array (the partial sums stream
+through; consecutive matmuls overlap by overlapping_cycle, ~128 effective). They
+should NOT wait for the previous matmul to complete.
+
+Cause: the explicit-edge bridge builds a hard completion edge (add_child) for the
+Y_spad accumulator read-modify-write, so matmul_k1 waits matmul_k0's
+finish_instruction -> when it issues, k0 is already done -> the overlapping_cycle
+window is empty -> no pipeline. This is the mechanism behind the 4888 vs legacy
+2095 gap (legacy has NO inter-matmul edges, so its matmuls pipeline on 2 SAs:
+finishes 1704,1707 | 1832,1835 = +128 within an SA, +3 across SAs).
+
+So the accumulator (Y_spad) dependency is a PIPELINED/ordering dependency, not a
+completion barrier. add_child cannot express that. Fix direction: do not create a
+matmul->matmul completion edge through the accumulator -- the accumulation order
+is preserved implicitly by same-SA issue order + the SA pipeline (overlapping_cycle),
+exactly as legacy does. Keep the real barriers: load->compute, and
+last-matmul->store (the store needs the final accumulator). The asymmetry (a
+matmul consuming Y pipelines; the store consuming Y waits) is the crux to model --
+likely "do not barrier when the consumer is a same-unit pipelined compute".
+
+Related to the same root as 10.5 (the SA/compute-pipeline occupancy model): both
+are about modeling the systolic array's streaming/pipelined execution rather than
+treating each compute as an atomic completion.
+
+### 10.7 Occupancy/latency split for pipelined computes (design + prototype)
+
+Idea (keeps add_child uniform): give each compute two completion points instead of
+one. A systolic-array op occupies its unit for occupancy = compute_cycle -
+overlapping_cycle (the initiation interval, ~128 for the matmul) and its result is
+ready at latency = compute_cycle (~395). Then add_child releases:
+  - a same-unit pipelined successor (next matmul, accumulator RMW) at OCCUPANCY
+    -> it starts ~128 later -> pipeline;
+  - a result consumer (the store reads the drained accumulator) at LATENCY
+    -> it waits for the full drain (tail).
+So a single add_child mechanism stays, but the release point depends on whether
+the edge is an occupancy-dependency (same-unit pipeline) or a latency-dependency
+(reads the result). This also fixes 10.5: a preload then occupies its SA for its
+occupancy, so concurrent preloads are naturally capped at the SA count.
+
+Prototype (bridge stopgap, committed): skip the matmul->matmul accumulator edge
+(treat it as pipelined, not a barrier); keep every other edge. Result on 256^3
+GEMM: matmuls now issue back-to-back (1120-1127) and finish pipelined on 2 SAs
+(1515,1516 | 1643,1644 | 1771,1772 | 1899,1900 = +128 within an SA, +1 across),
+exactly like legacy. Total 4888 -> 2501 (vs legacy 2095 / 2608-incl-store; our
+matmuls finish at 1900 vs legacy 2091 -- our load chain is shorter). This
+confirms the accumulator dependency is pipelined. The clean replacement is the
+occupancy/latency split above in the Core so add_child stays uniform and the
+bridge needs no matmul-specific skip.
+
+#### 10.7.1 preload->matmul is also an occupancy dependency (preload fully overlaps)
+
+The preload->matmul edge is the SAME kind as matmul->matmul: a same-SA pipeline
+(occupancy) dependency, not a latency barrier. A preload's overlapping_cycle
+equals its compute_cycle (91 == 91), so its occupancy = compute - overlapping = 0
+-- it fully overlaps. With the occupancy/latency split, the matmul (successor)
+released at the preload's OCCUPANCY (= preload issue + 0) starts immediately, so
+the preload's 91-cycle latency is entirely hidden under the matmul.
+
+In the current prototype the preload->matmul edge is still an add_child barrier
+(only matmul->matmul was skipped), so the matmul issues at 1120 -- right after the
+preload finishes at ~1119 -- paying the full 91. The bridge cannot cleanly skip
+preload->matmul (skipping it outright loses the ordering: the matmul could be
+ready before the preload and reach the SA without weights). So preload-overlap is
+another reason the proper fix is the Core occupancy/latency split (10.7), which
+releases the matmul at the preload's occupancy (0) while keeping the issue order.
+
+Net: the Core occupancy/latency split resolves three notes at once -- 10.5
+(concurrent preloads capped at SA count via preload occupancy), 10.6 (matmuls
+pipeline), 10.7.1 (preload fully overlaps) -- all instances of "model the SA as a
+pipeline (occupancy + latency) instead of atomic completion".
+
+#### 10.7.2 Occupancy/latency split: implemented + POC result
+
+Implemented uniformly: Instruction gains add_pipeline_child /
+release_pipeline_children; the Core releases an op's pipeline children when it
+ISSUES (enters the SA pipeline), and its normal children at finish. The bridge
+classifies edges: a preload/matmul -> matmul edge is occupancy (add_pipeline_child),
+everything else is latency (add_child). No matmul-specific skip heuristic.
+
+256^3 GEMM result: preloads issue 1028-1031, matmuls issue 1032-1039 (right after
+the preloads ISSUE, not after they finish at ~1119 -> preload fully overlaps), and
+matmuls finish pipelined on 2 SAs (1427,1428 | 1555,1556 | 1683,1684 | 1811,1812
+= +128 within an SA, +1 across). Total 4888 -> 2501 (matmul-skip) -> 2413
+(occupancy/latency). Legacy is 2095 (matmul completion; our matmuls finish at 1812
+vs legacy 2091 -- shorter load chain -- and our 2413 includes the store).
+
+Note on 10.5 (preload concurrency): NOT fixed by this alone. A preload's
+overlapping_cycle == compute_cycle, so its occupancy is 0 -> it does not hold the
+SA -> 4 preloads still issue concurrently (1028-1031). Capping concurrent preloads
+at the SA count needs the preload to have a non-zero occupancy reflecting the
+weight-load time (a cycle-model input), separate from this edge-release change.
+
+#### 10.7.3 Explicit compute fence: implemented (COMPUTE_BAR), BAR -> MEMORY_BAR
+
+The compute fence is now a first-class trace entity, not a bridge-internal edge:
+  - togsim_ops: `togsim.compute_barrier`; ABI v10 adds `togsim_compute_barrier(ctx)`.
+  - build_skeleton emits a `togsim.compute_barrier` before each store DMA; lower_to_emitc
+    lowers it; the runtime records a COMPUTE_BAR TraceRec.
+  - The two barrier kinds are now named distinctly: Opcode::BAR -> Opcode::MEMORY_BAR
+    (the DMA/tag memory barrier, unchanged) and a new Opcode::COMPUTE_BAR.
+  - Core: COMPUTE_BAR finishes only once ALL compute pipelines drain (every systolic
+    array + the VPU empty); until then it stays in the ready queue (re-checked each
+    cycle). Its ready_counter is gated (pipeline-child of the outstanding async
+    computes) so it is only evaluated after they have ISSUED into the pipeline.
+  - bridge: a COMPUTE_BAR record -> a COMPUTE_BAR Instruction (pipeline-child of the
+    outstanding async matmuls); the following store add_child's the fence.
+
+256^3 GEMM: trace shows `... matmul x N -> COMPUTE_BAR -> STORE`; the COMPUTE_BAR
+instruction finishes at 1813 (after the SAs drain, last matmul ~1812), the store
+issues at 1814. Total 2414 (matches the implicit-flush 2413 + the 1-cycle fence).
+Multiple SAs handled (drains all _sa_compute_pipeline[*]). 7 python tests pass.
+
+#### 10.7.4 load->compute uses MEMORY_BAR (async DMA data wait); fixes a real bug
+
+Bug found: a consumer reading an async-loaded buffer ran BEFORE the data arrived
+(preload issued @1028 but its weight load W finished @1131). Cause: a raw
+add_child on an async DMA fires at the load's ISSUE-complete (program flow), not
+its DATA-ready (resp-complete) -- the async DMA signals data only via the tag
+table (set_tag_finish at resp-complete). So the buffer-edge model alone cannot
+gate compute on async-loaded data.
+
+Fix (symmetric with COMPUTE_BAR): route async load -> compute through a MEMORY_BAR
+that carries the load's tag. The load registers the tag at issue; the MEMORY_BAR
+(made ready after the load issues, via add_child) parks on the tag and is woken at
+resp-complete; consumers depend on the MEMORY_BAR (last_writer[buf] = bar). So the
+memory-arrival notification (set_tag_finish) connects to compute via the existing
+tag mechanism -- now explicit in the trace as a MEMORY_BAR instruction.
+
+256^3 GEMM: preload now issues @1132 (after W resp-done @1131), correct. Total
+2414 (buggy/optimistic) -> 2518 (correct: compute waits for the slow weight load).
+Both barriers are explicit and symmetric: MEMORY_BAR (DMA tag, resp-complete) for
+load->compute, COMPUTE_BAR (SA pipeline drain) for compute->store.
+
+## 11. Remaining work + next-session handoff
+
+### 11.1 Status
+
+PR #267 (feature/togsim-cpp-trace -> develop). The trace pipeline runs end-to-end
+through the REAL Simulator/Core on a 256^3 GEMM via `--trace_so`, with an explicit
+dataflow dependency model (SRAM last-writer + vcix FSM) and two explicit barriers:
+MEMORY_BAR for async load->compute data (paired to its DMA by the runtime
+`(tag_id, tag_slot)` tag slot) and COMPUTE_BAR for the SA drain before a store.
+The async-DMA sync is the runtime tag slot, NOT a compile-time event-id (ABI
+bumped to v11; the event-id / event-handle / wait/signal design was removed).
+Legacy ONNX-TOG path kept + DEPRECATED. All togsim python tests pass; TOGSim
+builds.
+
+**Validation (256^3 GEMM, real gem5 cycle table):** through the real Core the
+trace path totals **2518 cycles** vs the legacy path's **2698** on the same
+table. The earlier 10.x notes (with a stub table) report different absolute
+numbers; 2518-vs-2698 is the current real-table figure.
+
+### 11.2 Remaining work (priority order)
+
+1. **Cycle-equivalence closure.** Characterize/close the trace-vs-legacy gap on the
+   256^3 GEMM with the SAME gem5 cycle_list. Items 2-3 below are the main drivers.
+2. **Preload concurrency cap (sec 10.5).** 4 preloads run concurrently though there
+   are 2 SAs, because a preload's occupancy is 0 (overlapping_cycle == compute).
+   Give the preload a non-zero occupancy (the weight-load time) so concurrent
+   preloads are capped at the SA count. Pre-existing in BOTH paths.
+3. **Robust gem5 cycle_list wiring.** The extension_codecache `TORCHSIM_DUMP_TRACE_SO=1`
+   hook dumps trace.so + trace_cycles.tsv from the real cycle_list, but is flaky
+   under concurrent compiles (saw cycle_list==[] once). Make it robust (or force a
+   single-thread compile), so `--trace_so --cycle_table` uses real per-tile cycles.
+4. **Parallel output tiles / multi-core.** One dispatch per work-item today; for
+   distributing independent output tiles across cores, emit a dispatch per parallel
+   (m_sub, n_sub) tile. The inner sub-tile loops are currently unlabeled (only the
+   macro loops carry subtile/accumulation), so the axis role must be recovered.
+5. **Cleanup.** The obsolete WAIT/SIGNAL trace records and the event-handle
+   buffer are dropped (v11). COMPUTE_BAR logs finish twice (cosmetic). The
+   preload node mis-attributes an X_spad read (build_tog `_steal_leading_transfer_read`)
+   -> a harmless extra edge.
+6. **P5 op coverage.** Only GEMM is exercised. Extend to conv / SDPA / vector / pool.
+7. **P4.** Symbolic/dynamic shape; streaming sink (coroutine, alloc-blocks).
+8. **Two-function outline** (togsim_kernel_tile) -- DONE (ABI v12). The work-item
+   body is outlined into a uniform `togsim_kernel_tile(ctx, iv, n)` and run via the
+   higher-order `togsim_dispatch` wrapper (round-robin core + TILE_BEGIN/TILE_END);
+   the work-item scope is now the function call. Trace/cycles identical to the old
+   single-function `togsim_core_alloc` form. One general dispatcher serves every
+   kernel.
+9. **Retire the legacy ONNX-TOG path** once the trace path is stable.
+
+### 11.3 Next-session context
+
+- Worktree `/workspace/PyTorchSim-cpptrace`, branch `feature/togsim-cpp-trace`,
+  PR #267 -> develop. The branch is rebased ONTO develop (the retire-floormod base
+  was dropped -- develop already has it). `source .envrc` in the worktree.
+- Build TOGSim: submodules are init'd; `cd TOGSim/build && cmake .. -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`.
+  The Simulator target has ENABLE_EXPORTS (so a dlopen'd .so resolves the togsim_*
+  callbacks); togsim_runtime.cc + togsim_trace_bridge.cc are picked up by the src glob.
+- Run the trace path:
+  `python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc <postvcix.mlir> --so trace.so [--emit-cpp x.cpp]`
+  then `bin/Simulator --config <yml> --trace_so trace.so [--cycle_table cyc.tsv] [--log_level trace]`.
+- Get a post-vcix fixture: a real torch.compile GEMM with `TORCHSIM_DUMP_MLIR_IR=1
+  pytorchsim_functional_mode=False` writes `outputs/<hash>/..._sample_postvcix.mlir`.
+  Real cycle data + legacy reference: add `TORCHSIM_DUMP_TRACE_SO=1` to also dump
+  trace.so + trace_cycles.tsv in `outputs/<hash>/` (see 11.2 #3). (Prior /tmp
+  fixtures are ephemeral -- regenerate.)
+- Env (.envrc): gem5 `/gem5/release/gem5.opt`, spike `/release/bin/spike`,
+  LLVM `/riscv-llvm/bin`.
+- Tests: `TOGSIM_SKELETON_FIXTURE=<postvcix.mlir> pytest tests/test_togsim_{skeleton,emitc,runtime}.py`.
+  These are NOT in the CI allowlist (`.github/workflows/pytorchsim_test.yml`) -- register them to gate CI.
+- Key files: passes `build_skeleton.py`, `lower_to_emitc.py`, `dep_analysis.py`,
+  `cycle_table.py`, `togsim_ops.py`; `TOGSim/include/{togsim_runtime.h, togsim_loader.h, togsim_trace_bridge.h}`,
+  `TOGSim/src/{togsim_runtime.cc, togsim_trace_bridge.cc}`; `Core.cc`/`Instruction.{h,cc}`
+  (COMPUTE_BAR + MEMORY_BAR rename); `main.cc` (--trace_so); `extension_codecache.py`
+  (TORCHSIM_DUMP_TRACE_SO hook).
+- Local-only backups of the pre-squash/pre-rebase 28-commit history: tag
+  `pr-backup-ccfea43e`, branch `backup-presquash-3cfd4a3f` (NOT pushed).
diff --git a/docs/design/togsim_cpp_trace_HANDOFF.md b/docs/design/togsim_cpp_trace_HANDOFF.md
new file mode 100644
index 00000000..23f642bb
--- /dev/null
+++ b/docs/design/togsim_cpp_trace_HANDOFF.md
@@ -0,0 +1,191 @@
+# Handoff — TOGSim C++ Trace Generation
+
+Continuation notes for picking this work up in a fresh session. Read alongside
+the full design: [`togsim_cpp_trace.md`](./togsim_cpp_trace.md) and the snapshot
+[`togsim_cpp_trace_STATUS.md`](./togsim_cpp_trace_STATUS.md).
+
+## Goal (one line)
+
+Replace the timing-path TOG producer (MLIR -> Python-dict -> ONNX -> C++ parser)
+with a compiled, shape-parametric trace producer (MLIR -> EmitC -> C++ -> `.so`);
+TOGSim's timing core is preserved.
+
+## Current state (one paragraph)
+
+The trace pipeline is implemented end-to-end and runs through the REAL
+Simulator/Core on a 256^3 GEMM (`--trace_so`). Dependencies are an explicit
+dataflow DAG (SRAM last-writer per buffer + the vcix preload/matmul FSM). An
+asynchronous DMA is synced to the consumer of its data by the **runtime tag
+slot** `(tag_id, tag_slot)` through an explicit `togsim.memory_barrier` (lowered
+from the source `memref.dma_wait`); a sync DMA is blocking. ABI is **v11**. An
+earlier design used a compile-time `event_id` / heap event handle with
+`wait`/`signal`; it was removed because one static DMA op runs once per loop
+iteration into a different tag slot, which a compile-time id cannot pair per
+iteration. **Validation:** on the 256^3 GEMM with the real gem5 cycle table, the
+trace path totals **2518 cycles** vs the legacy path's **2698** through the real
+Core; all togsim python tests pass; TOGSim builds.
+
+## Branch
+
+- Work branch: `feature/togsim-cpp-trace` (PR #267 -> develop)
+
+## Status
+
+| Milestone | State |
+|---|---|
+| P0 — ABI header + op vocabulary | DONE (ABI evolved to v11) |
+| P1 — `build_skeleton` pass | DONE, verified — runs on a real GEMM fixture, module verifies, compute grouping + dma/barrier counts match the legacy `build_tog` TOG. |
+| P2 — togsim -> emitc -> cpp -> .so | DONE — `lower_to_emitc.py` builds EmitC, `mlir-translate` -> C++, `g++ -shared` -> `.so`; validated by build/symbol checks and a dlopen run harness. |
+| P3 — TOGSim loader + runtime + cycle table; real-Core run | DONE — runs end-to-end through the real Simulator/Core (256^3 GEMM, `--trace_so`). Runtime tag-slot pairing (ABI v11, `togsim.memory_barrier`), explicit dataflow DAG (read/write_bufs last-writer + vcix FSM), real tile addresses, cycle_table. `togsim_runtime.cc`/`togsim_loader.h`/`togsim_trace_bridge.cc` feed TraceRec into the real Core. Cycle comparison vs legacy on the real gem5 table: trace 2518 vs legacy 2698. Legacy ONNX-TOG path DEPRECATED in place, kept live. |
+| P4 — symbolic-bound dynamic shape, streaming sink | not started |
+| P5 — op-family migration (conv/SDPA/vector) | not started |
+
+### Async-DMA sync: runtime tag slot (current), event-id (removed)
+
+The original P1 threaded the dma->wait dependency as an SSA `!togsim.event`
+value, which fails `module.verify()` on a software-pipelined kernel (the
+`togsim.dma` sits in the prefetch loop nest, its consumer in a sibling compute
+nest, so the value does not dominate its use). An intermediate fix used a
+compile-time `event_id` attribute (later a heap-allocated event handle). Both
+were **removed**: one static `togsim.dma` op executes once per loop iteration
+into a *different* runtime tag slot, so a compile-time id (one per static op)
+cannot pair iteration i's DMA with iteration i's wait.
+
+Current mechanism (ABI v11): `togsim.dma` carries `tag_id` (its tag-memref
+identity) plus the runtime tag-index operand `%tag[%idx]` and returns void. The
+source `memref.dma_wait` is mapped through to an explicit
+`togsim.memory_barrier {tag_id, write_bufs}` carrying the runtime tag index. At
+runtime an async DMA and its barrier are paired by `(tag_id, tag_slot)` through
+the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/
+`register_tag_waiter`): the DMA signals at data arrival, the barrier waits, and
+the barrier becomes the loaded buffer's last-writer so consumers gate on
+arrival. (The one remaining auto-inserted barrier is `togsim.compute_barrier`,
+the compute fence before a store — marked FIXME to become explicit later.)
+
+### P2 decisions
+
+* **ABI v11 (runtime tag slot).** `togsim_dma` returns void and carries
+  `(is_async, tag_id, tag_slot, read_bufs, write_bufs)`. The
+  `togsim_memory_barrier(tag_id, tag_slot, write_bufs)` is the explicit
+  async-DMA sync. No `event_id`, no event handle, no `wait`/`signal`.
+* **C4 drives the upstream EmitC conversion passes** (it does not hand-build
+  EmitC). It only does the parts upstream cannot: rewrite the *unregistered*
+  `togsim.*` ops to `emitc.call_opaque` and rewrite the kernel signature to the
+  ABI form. Then it runs, in-process (`mlir.passmanager`),
+  `func.func(lower-affine), convert-scf-to-emitc, convert-arith-to-emitc,
+  convert-func-to-emitc`. One local fixup: in this LLVM 20 build
+  `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so
+  `convert-arith-to-emitc` (constants -> `!emitc.size_t`) leaves
+  `unrealized_conversion_cast` on the bounds that nothing folds and
+  `mlir-to-cpp` can't print (design sec 8 risk). `_fold_for_bound_casts`
+  rewrites those bound constants to `index`-typed `emitc.constant`, clearing
+  the casts. (`emitc.for` *does* accept `size_t` bounds with an explicit
+  `: !emitc.size_t`, but keeping the bounds `index` avoids retyping the IV.)
+* **Addresses (wired in P3, approach A):** `togsim_dma` passes `(arg_id, element
+  offset)` with the offset computed from the loop IVs; the runtime adds the
+  tensor base. `togsim.compute` is keyed by `tile_id` for cost.
+
+## Files (key)
+
+- `TOGSim/include/togsim_runtime.h` — extern "C" ABI v11 (`togsim_dma`,
+  `togsim_memory_barrier`, `togsim_compute`, `togsim_compute_barrier`,
+  `togsim_core_alloc`, `togsim_kernel` entry, `TOGSIM_ABI_VERSION`, opaque
+  `EmitCtx`).
+- `PyTorchSimFrontend/mlir/passes/togsim_ops.py` — single source of truth for the
+  skeleton+API MLIR vocabulary (op names, attr keys, op->callee map).
+- `PyTorchSimFrontend/mlir/passes/build_skeleton.py` + `dep_analysis.py` — the P1
+  pass + dependency analysis (reuse build_tog's `TogBuilder`/`_build`; map
+  dma_start->togsim.dma, dma_wait->togsim.memory_barrier, attach read/write_bufs;
+  use-based DCE).
+- `TOGSim/src/togsim_runtime.cc`, `TOGSim/include/togsim_loader.h`,
+  `TOGSim/src/togsim_trace_bridge.cc` — C6 runtime, dlopen loader, and the bridge
+  that feeds the recorded TraceRec stream into the real Core.
+- `tests/test_togsim_skeleton.py` — `test_togsim_ops_contract` (runs anywhere) +
+  `test_build_skeleton_on_fixture` (gated on bindings + a fixture).
+- `PyTorchSimFrontend/mlir/passes/lower_to_emitc.py` — the P2/C4 pass: skeleton
+  module -> EmitC `togsim_kernel` -> C++ (`mlir-translate`) -> `.so` (`g++`).
+  Entry points: `lower_to_emitc(module)`, `build_trace_so(postvcix_path, so)`,
+  and a `__main__` CLI (`--so`, `--emit-cpp`, `--include-dir`).
+- `tests/test_togsim_emitc.py` — `test_build_trace_so` (EmitC + symbol checks) +
+  `test_trace_so_runs` (dlopen the `.so` against a stub runtime, run it). Gated
+  on bindings + `mlir-translate` + a C++ compiler + the fixture.
+
+## Reproduce P1 + P2 (one GEMM kernel)
+
+```bash
+# 1. post-vcix fixture: compile a GEMM (needs the built PyTorchSimDevice .so).
+export pytorchsim_functional_mode=False
+python tests/ops/gemm/test_matmul.py
+FIX=$(find "${TORCHSIM_DUMP_PATH:-.}" -name '*_postvcix.mlir' | head -1)
+# build_skeleton/lower_to_emitc only need the .mlir + bindings, not torch, so a
+# fixture compiled in any worktree is fine.
+
+# 2. P1: skeleton+API MLIR.
+python -m PyTorchSimFrontend.mlir.passes.build_skeleton "$FIX" --out /tmp/skel.mlir
+#   stderr: "skeleton: compute=.. dma=.. memory_barrier=.."
+
+# 3. P2: skeleton -> EmitC -> C++ -> .so (reads skel from $FIX via build_skeleton).
+python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc "$FIX" \
+    --so /tmp/trace.so --emit-cpp /tmp/trace.cpp
+nm -D /tmp/trace.so | grep togsim     # togsim_kernel = T; togsim_dma/memory_barrier/compute = U
+
+# 4. tests
+TOGSIM_SKELETON_FIXTURE="$FIX" python -m pytest \
+    tests/test_togsim_skeleton.py tests/test_togsim_emitc.py -q
+```
+
+Note: `mlir-opt`/`mlir-translate` live in `$TORCHSIM_LLVM_PATH` but are not on
+`$PATH`; `lower_to_emitc` resolves `mlir-translate` from `TORCHSIM_LLVM_PATH`.
+
+## Next steps (P3 is done; remaining work)
+
+The producer is wired into TOGSim and runs through the real Core (trace 2518 vs
+legacy 2698 on the 256^3 GEMM). The parallelism / reduction / core-dispatch
+design is in `togsim_cpp_trace.md` §9. Summary: the producer is core-transparent
+(knows nothing about `num_cores`); it enumerates parallel output-tile work-items
+and calls `togsim_core_alloc` at each work-item boundary. Parallel = independent
+work-items; reduction = program order inside one work-item; core binding = the
+`togsim_core_alloc` runtime callback (policy lives in TOGSim). Async-DMA data
+sync = the runtime `(tag_id, tag_slot)` via `togsim.memory_barrier`. `num_cores`
+is extrinsic so it is never baked; vlane/tile sizes are intrinsic and stay baked.
+Split-K is a deferred exception.
+
+Remaining (priority order; full list in STATUS §7 and design §11.2):
+
+- **SRAM tile lifecycle (double-buffer throttle).** `togsim.dma` carries
+  `tag_slot` (the SRAM slot key); the consumer must use it to throttle in-flight
+  loads to the buffer depth on multi-tile / double-buffered kernels.
+- **Preload concurrency cap (design §10.5).** Give a preload a non-zero occupancy
+  (its weight-load time) so concurrent preloads are capped at the SA count.
+  Pre-existing in BOTH paths.
+- **Per-output-tile dispatch / multi-core.** One `togsim_core_alloc` per
+  work-item today; distribute independent output tiles across cores.
+- **Robust gem5 cycle_list wiring.** The extension_codecache
+  `TORCHSIM_DUMP_TRACE_SO=1` hook is flaky under concurrent compiles.
+- **P5 op coverage** (conv/SDPA/vector) and **P4** (symbolic shape, streaming
+  sink), then **retire the legacy ONNX-TOG path**.
+
+Full design: `togsim_cpp_trace.md` §5-11.
+
+## Environment requirements (for the new session)
+
+- MLIR Python bindings importable (`import mlir.ir`). They ship with the LLVM
+  build at `${TORCHSIM_LLVM_PATH%/bin}/python_packages/mlir_core`; the CI docker
+  image `ghcr.io/psal-postech/torchsim-ci` has them. `passes/__init__` also
+  derives the path from `TORCHSIM_LLVM_PATH`.
+- `pytest` to run the test files directly (`pip install pytest` if absent).
+- `mlir-translate` (in `$TORCHSIM_LLVM_PATH`) and a host C++ compiler (`g++`/
+  `$CXX`) for the P2 `.so` path.
+- TOGSim build (for `--trace_so`): `cd TOGSim/build && cmake ..
+  -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`. The Simulator target has
+  ENABLE_EXPORTS so a dlopen'd `.so` resolves the `togsim_*` callbacks.
+- When iterating on passes, clear the codegen caches (`$TORCHSIM_DUMP_PATH`,
+  default `outputs/`) between runs — see CLAUDE.md "Codegen changes are sticky".
+
+## Verification that already passes anywhere (sanity)
+
+```bash
+python -m py_compile PyTorchSimFrontend/mlir/passes/build_skeleton.py \
+    PyTorchSimFrontend/mlir/passes/togsim_ops.py tests/test_togsim_skeleton.py
+# contract test (no bindings needed): see test_togsim_ops_contract
+```
diff --git a/docs/design/togsim_cpp_trace_STATUS.md b/docs/design/togsim_cpp_trace_STATUS.md
new file mode 100644
index 00000000..ebf05701
--- /dev/null
+++ b/docs/design/togsim_cpp_trace_STATUS.md
@@ -0,0 +1,226 @@
+# TOGSim C++ Trace Generation — Status Report
+
+Branch: `feature/togsim-cpp-trace`. Design of record: `togsim_cpp_trace.md` (esp.
+§9); continuation notes: `togsim_cpp_trace_HANDOFF.md`. This file is a snapshot of
+progress.
+
+## 1. Goal
+
+Replace the timing-path TOG producer (`MLIR -> Python dict -> ONNX -> C++
+TileGraphParser`) with a compiled, shape-parametric trace producer
+(`MLIR -> skeleton -> EmitC -> C++ -> .so`). TOGSim's timing core is preserved;
+only the producer of its input changes. The key idea: do not flatten the TOG;
+instead **run** a compiled C++ producer that emits the trace as a stream of API
+calls.
+
+Each API call emits one trace record = one modeled instruction, fed to the
+existing timing Core. Dependencies are an explicit dataflow DAG (SRAM
+last-writer per buffer + the vcix preload/matmul FSM). An asynchronous DMA is
+synced to the consumer of its data by the **runtime tag slot** `(tag_id,
+tag_slot)` through an explicit `togsim.memory_barrier` (ABI v11). An earlier
+design used a compile-time `event_id` / event handle with `wait`/`signal`; that
+was removed because one static DMA op runs once per loop iteration into a
+different tag slot, which a single compile-time id cannot pair per iteration.
+
+## 2. Pipeline
+
+```
+post-vcix .mlir (torch.compile output)
+  | build_skeleton.py + dep_analysis.py (P1)  keep loops;
+  |   memref.dma_start -> togsim.dma(tag_id, %tag[%idx], is_async, read/write_bufs);
+  |   memref.dma_wait  -> togsim.memory_barrier(tag_id, tag_slot, write_bufs);
+  |   compute block    -> togsim.compute; DCE the rest
+  v
+skeleton+API MLIR
+  | lower_to_emitc.py (P2/C4)  togsim.* -> emitc.call_opaque; ABI signature; drive upstream
+  |                            lower-affine/convert-*-to-emitc; _retype_for_to_size_t fixups
+  v
+EmitC --mlir-translate--> C++ --g++ -shared--> trace.so
+                                                 | TOGSim loader (C6): dlopen + EmitCtx callbacks
+                                                 v
+                                       TraceRec stream (materializing sink)
+                                                 | togsim_trace_bridge.cc -> existing Core timing
+                                                 v
+                                       cycles / DRAM traffic (real Core)
+```
+
+Side artifact: cycle table `tile_id -> (cycle, overlapping_cycle)` (cycle_table.py).
+
+## 3. Milestones
+
+| | State |
+|---|---|
+| P0 ABI header + togsim vocabulary | DONE (ABI evolved to v11) |
+| P1 build_skeleton | DONE, verified (compute/dma/barrier match legacy TOG) |
+| P2 lower_to_emitc -> .so | DONE (real GEMM .so built and run) |
+| P3 loader/runtime + cycle table + real-Core run | DONE (runs end-to-end through the real Simulator/Core; below) |
+| P4 symbolic/dynamic shape, streaming sink | TODO |
+| P5 op-family migration (conv/SDPA/vector) | TODO |
+
+P3 detail:
+
+| | State |
+|---|---|
+| ABI (core_alloc, runtime tag pairing, dma address) | DONE (v11) |
+| work-item boundary (togsim_core_alloc) | DONE |
+| real tile DRAM addresses (approach A) | DONE, verified on 1024^3 |
+| cycle_table builder (cycle + overlapping) | DONE |
+| async DMA <-> consumer sync (runtime tag slot, memory_barrier) | DONE |
+| explicit dataflow DAG (read/write_bufs last-writer) | DONE |
+| C6 runtime + dlopen loader (materializing) | DONE |
+| TraceRec -> existing Core timing feed | DONE (runs end-to-end through real Core) |
+| cycle comparison vs build_tog (real gem5 table) | DONE: trace 2518 vs legacy 2698 |
+| SRAM tile lifecycle / preload-occupancy refinements | partial (see §7) |
+
+### TraceRec -> Core: now running end-to-end
+
+`TOGSim/src/togsim_trace_bridge.cc` (`trace_to_tilegraph`) + a `--trace_so` mode
+in `main.cc` feed the recorded trace into the REAL Simulator/Core. The producer
+`.so` is `dlopen`'d (the Simulator is built with ENABLE_EXPORTS so the `.so`
+resolves the `togsim_*` callbacks back into the binary), its trace recorded, then
+bridged to a `TileGraph`: one `TileSubGraph` per work-item (core_alloc marker)
+bound to its core, one `Tile` of MOVIN/MOVOUT/COMP/MEMORY_BAR/COMPUTE_BAR
+`Instruction`s. Dependency edges are built by **last-writer per SRAM buffer**
+(`read_bufs`/`write_bufs`); an async load's last-writer is the MEMORY_BAR paired
+to it by the runtime `(tag_id, tag_slot)` (so a consumer waits for actual data
+arrival), and a COMPUTE_BAR drains the systolic-array pipeline before a store.
+Build it (`cd TOGSim/build && cmake .. && make`) and run:
+`bin/Simulator --config <yml> --trace_so gemm_trace.so`.
+
+### Cycle comparison vs legacy build_tog (256^3 GEMM, real gem5 table)
+
+Ran the same kernel through the legacy path (torch.compile -> gem5 -> build_tog
+-> Simulator) and the trace path (the same post-vcix IR -> trace .so + the SAME
+gem5 cycle_list -> --trace_so), both through the REAL Core. extension_codecache
+has an opt-in TORCHSIM_DUMP_TRACE_SO=1 hook that dumps trace.so + trace_cycles.tsv
+from the same cycle_list/offsets (best-effort, never breaks the legacy path);
+compute-unit routing uses compute_type and the tag key uses a per-tensor addr_id
+(set_addr_name(arg_id)+prepare_tag_key) so A and B don't collide on tag_slot 0.
+
+**Result: the trace path totals 2518 cycles vs the legacy path's 2698 on the
+same gem5 cycle table.** All togsim python tests pass; TOGSim builds. Compute
+work and DRAM traffic match; the remaining difference is scheduling (the
+explicit dataflow DAG plus the occupancy/latency SA-pipeline model overlap
+differently than legacy's per-iteration BARs).
+
+**Subtile + multi-tile-K now runs** (256x512x256 forced to 128x128 subtiles, 2
+K-tiles: 5774 cycles, no crash). This needed `build_skeleton` to strip the
+`-acc_iv` accumulation marker from the dma_wait tag index so the memory_barrier
+slot stays subtile-only and pairs with its load (see §3, `tag_slot`); before the
+strip the producer evaluated `-acc_iv` to a negative slot at the 2nd K-tile and
+TOGSim aborted with "Key does not exist in ... tag table".
+
+## 4. Components
+
+- `build_skeleton.py` + `dep_analysis.py` — in-place reduction of post-vcix to
+  "loop skeleton + togsim.* API"; `memref.dma_wait` mapped through to an explicit
+  `togsim.memory_barrier`; read/write SRAM buffer ids attached; reuses legacy
+  `TogBuilder` traversal.
+- `lower_to_emitc.py` — skeleton -> EmitC by driving the upstream conversion
+  passes plus `_retype_for_to_size_t` (clears residual index<->size_t casts).
+  `togsim_dma` carries `(tag_id, runtime tag-index, is_async, read/write_bufs)`
+  and returns void; `togsim_memory_barrier` carries `(tag_id, tag_slot,
+  write_bufs)`; `togsim_core_alloc` inserted at the work-item boundary.
+- `cycle_table.py` — `tile_id -> (cycle, overlapping)`, overlapping
+  `= max(cycle - offset[type], 0)` (legacy formula); JSON sidecar.
+- `TOGSim/src/togsim_runtime.cc` + `TOGSim/include/togsim_loader.h` — C6 runtime
+  and `run_producer` (dlopen -> togsim_kernel -> records TraceRec). dma resolves
+  `base[arg] + offset*elem_bytes` and signals its tag at data arrival; the
+  matching memory_barrier waits on the `(tag_id, tag_slot)`; compute looks up the
+  cycle table; core_alloc round-robins a runtime core pool.
+- `TOGSim/src/togsim_trace_bridge.cc` — bridges the recorded TraceRec stream into
+  the existing `TileGraph`/`Instruction` form for the real Core.
+- `TOGSim/include/togsim_runtime.h` — producer ABI v11.
+
+## 5. Locked design decisions
+
+1. **Trace is a DAG, not a time order.** The consumer (existing Core) schedules
+   per-core timelines from: op kind -> hardware unit, SRAM-buffer last-writer ->
+   data dependency, same-core -> serial (reduction accumulate), SRAM slot ->
+   capacity. Emission order != execution order.
+2. **Async-DMA sync = runtime tag slot.** A `togsim.dma` carries `(tag_id,
+   tag_slot)`; the matching `togsim.memory_barrier` (lowered from the source
+   `memref.dma_wait`) waits on the same pair through the existing Core tag table
+   (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). The DMA signals at
+   data arrival; the barrier becomes the loaded buffer's last-writer so consumers
+   gate on arrival. A sync DMA is blocking (no barrier). This replaced an earlier
+   `event_id` / heap event-handle design, which could not pair a DMA op with its
+   wait per loop iteration (one static op, a different tag slot each iteration).
+   No `calc_tag` content-hash, no magic values, no FIFO.
+3. **Core = runtime allocation.** `togsim_core_alloc` returns a core id (no free).
+   `num_cores` is never baked into the producer -- it is the runtime pool size.
+   A work-item's reduction stays on one core (sticky); different work-items get
+   different cores -> multi-core.
+4. **Intrinsic baked / extrinsic parametric.** vlane / tile sizes / systolic-array
+   parameters define the instructions (baked); `num_cores` only distributes work (runtime).
+5. **Execution model:** P3 materializing (run producer to completion -> record ->
+   feed existing Core); P4 streaming (coroutine, alloc-blocks on resources).
+6. **Double-buffer = resource constraint.** Producer emits everything (no skew);
+   capacity is the consumer's throttle. Requires SRAM tile lifecycle
+   (alloc/free) in the trace -- the currently missing piece.
+
+## 6. Verification (reproducible)
+
+- togsim python tests pass: skeleton (contract + fixture), emitc (build + dlopen
+  run), cycle_table, runtime. TOGSim builds.
+- 256^3 GEMM: core_alloc -> dma(tag_id, tag_slot) -> memory_barrier(tag_id,
+  tag_slot) -> compute; addresses A/B/C resolved (offset 0, single tile).
+- 1024^3 GEMM: per-tile addresses correct (A[m,k]=m*1024+k -> 0,256,512;
+  B[k,n]=k*1024+n -> 0,262144,524288).
+- End-to-end through the real Core (256^3 GEMM, real gem5 table): trace 2518
+  cycles vs legacy 2698.
+- Legacy ONNX-TOG path untouched (comment-only diff), marked DEPRECATED, kept as
+  the comparison reference.
+
+## 6b. Reference timer (early sanity check; superseded by the real Core feed)
+
+`togsim::simulate(RunResult, TimingParams)` (togsim_runtime.cc) was an early
+standalone scheduler that timed the recorded TraceRec to prove the stream is
+sufficient to be timed: per core a DMA-engine timeline (DMAs serialize, overlap
+compute), a compute timeline (serial = reduction accumulate, with the `finish =
+prev.finish + cycle - overlapped` pipeline overlap of Core.cc), and data deps.
+It is NOT the production Core (no DRAM/NoC/L2 contention). It has since been
+superseded: the recorded stream is now bridged into the real Tile/TileGraph ->
+Core (see §3, and the 2518-vs-2698 result above). Retained here as context.
+
+## 7. Remaining work (priority order)
+
+1. DONE. Map TraceRec -> existing TOGSim Core Instructions (Tile/TileGraph,
+   compute_cycle+overlapping, dataflow-buffer deps + runtime-tag barriers) and
+   run through the real Core. Result: trace 2518 vs legacy 2698 on the same gem5
+   table.
+2. SRAM tile lifecycle in the trace (double-buffer throttle). togsim_dma carries
+   `tag_slot` (the lowered SRAM tag index = the slot key the existing Core's
+   Instruction.tag_idx needs); 0 for single-buffer kernels. Remaining: the
+   consumer must use it to throttle in-flight loads to the buffer depth. The
+   SRAM-buffer key is effectively (arg_id, tag_slot) since each load's DRAM
+   tensor maps to its spad.
+3. Preload concurrency cap / preload occupancy (design doc §10.5): give a preload
+   a non-zero occupancy so concurrent preloads are capped at the SA count.
+   Pre-existing in BOTH paths.
+4. (later) deeper double-buffer pipelines (more tag slots), two-function outline,
+   P4 streaming, symbolic shape, P5 op coverage (conv/SDPA/vector).
+
+## 8. Risks / open
+
+- SRAM lifecycle (double-buffer throttle) not yet implemented -- central to
+  double-buffer/capacity accuracy on multi-tile kernels.
+- LLVM 20 emitc constraints absorbed: emitc.for index bounds; old
+  subscript-returns-element model; arith.divui/remui not lowerable -> core id is
+  a runtime allocation (which became a design improvement).
+
+### Explicit dataflow-edge dependency model: implemented
+
+The dependency model is an explicit dataflow DAG, not in-order or runtime-tag
+content-hashing. `togsim_dma`/`togsim_compute` carry read_bufs/write_bufs (SRAM
+buffer ids; a virtual SA_WEIGHTS buffer folds the preload->matmul edge).
+dep_analysis + build_skeleton attach them; lower_to_emitc emits them; the runtime
+records them; the bridge builds the Instruction DAG by last-writer per buffer,
+scoped per work-item. The one runtime-paired edge is the async-DMA data wait,
+routed through an explicit `togsim.memory_barrier` keyed on `(tag_id, tag_slot)`
+(see design doc §10.7.4). The systolic-array pipeline uses the occupancy/latency
+split (§10.7), so accumulating matmuls pipeline rather than serialize.
+
+Net (256^3 GEMM, real gem5 table, real Core): trace 2518 vs legacy 2698.
+Per-output-tile dispatch for multi-core distribution is the next refinement
+(today one dispatch per work-item).
diff --git a/scripts/trace_timeline.py b/scripts/trace_timeline.py
new file mode 100644
index 00000000..5cf9608b
--- /dev/null
+++ b/scripts/trace_timeline.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""Convert a TOGSim `--log_level trace` log into a Chrome Trace Event JSON that
+opens in Perfetto (https://ui.perfetto.dev) or chrome://tracing as an interactive
+timeline (Gantt).
+
+Each instruction becomes one duration slice, grouped per core (pid). Lanes:
+  dram-rd -- loads crossing the DRAM bus (read bandwidth)
+  dram-wr -- stores crossing the DRAM bus (write bandwidth)
+  sa / sa0.. -- COMP compute_type 1 (matmul) / 2 (preload)
+  vector  -- COMP compute_type 0 (vector)
+Time unit = core cycles. Barriers (MEMORY_BAR/COMPUTE_BAR) are not drawn. A load's bar
+runs from its first DRAM response (DRAM_RESP_FIRST, logged by the Core -- so it captures
+data moving even while still injecting) to data-ready; a store's bar runs from issue to
+finish. Each direction is serialized so every op is one visible bar (packed row =
+saturated bus). A compute slice's width is its occupancy (compute_cycle - overlapping).
+
+Usage:
+  bin/Simulator --config <yml> --trace_so <so> --cycle_table <tsv> --log_level trace \
+      2>&1 | python scripts/trace_timeline.py -o timeline.json
+  # or
+  python scripts/trace_timeline.py trace.log -o timeline.json
+Then drag timeline.json into https://ui.perfetto.dev .
+"""
+import argparse
+import json
+import re
+import sys
+
+# [cycle][Core C][TAG ][INST_ID=N] OPCODE (detail...)
+_LINE = re.compile(
+    r"\[(\d+)\]\[Core (\d+)\]\[([A-Z_]+)\s*\](?:\[INST_ID=(-?\d+)\])?\s*(\w+)?(.*)")
+
+# 3 unit buckets per core (dma -> dram-rd/dram-wr lanes, vector, sa); barriers dropped (see _HIDE).
+_LANE = {"MOVIN": "dma", "MOVOUT": "dma"}
+_HIDE = {"MEMORY_BAR", "COMPUTE_BAR", "TILE_BEGIN", "TILE_END"}  # opcodes to_chrome skips
+_CT_NAME = {0: "vector", 1: "matmul", 2: "preload"}  # COMP compute_type -> label word
+
+# Perfetto/catapult reserved color names; slices are tinted by tile (= the
+# togsim_dispatch work-item / output tile) so one tile's ops share a color across
+# lanes/cores. 16 names so a core's tiles (which stride by num_cores) stay
+# distinct -- an 8-name palette collapsed to 4 colors per core under 2-core
+# even/odd assignment.
+_TILE_PALETTE = ["good", "bad", "terrible", "yellow", "olive", "rail_response",
+                 "rail_load", "rail_animation", "rail_idle", "thread_state_running",
+                 "thread_state_runnable", "thread_state_iowait",
+                 "thread_state_uninterruptible", "generic_work", "startup",
+                 "vsync_highlight_color"]
+
+
+def _tile_color(detail):  # -> palette color name for the slice's tile, or None if no match
+    m = re.search(r"\btile=(\d+)", detail or "")  # digits only: a negative tile id gets no tint
+    return _TILE_PALETTE[int(m.group(1)) % len(_TILE_PALETTE)] if m else None
+
+
+_DMA_SHORT = {"MOVIN": "MVIN", "MOVOUT": "MVOUT"}  # short opcode names used in DMA labels
+
+
+def _tile_of(detail):  # -> tile id as a string (sign kept), or "?" when no tile= field
+    m = re.search(r"\btile=(-?\d+)", detail or "")  # detail may be None/empty
+    return m.group(1) if m else "?"
+
+
+def _label(opcode, detail):  # -> display name for one instruction's slice
+    if opcode == "COMP":
+        m = re.search(r"compute_type=(\d+)", detail)  # NOTE: no `or ""` guard; detail must be a str
+        ct = int(m.group(1)) if m else -1  # -1 = field absent -> generic 'comp' name below
+        return f"T{_tile_of(detail)} {_CT_NAME.get(ct, 'comp')}"
+    # DMA: keep each load's OWN identity (addr_name) so the input/weight/K-panel
+    # loads stay distinct; tile is conveyed by color (and args), not the name.
+    m = re.search(r"addr_name=(\w+)", detail or "")
+    who = m.group(1) if m else "?"  # "?" when the log line carries no addr_name
+    return f"{who} (T{_tile_of(detail)} {_DMA_SHORT.get(opcode, opcode)})"
+
+
+def _lane(opcode, detail):  # -> unit bucket name: "vector", "sa" or "dma"
+    if opcode == "COMP":
+        m = re.search(r"compute_type=(\d+)", detail)
+        ct = int(m.group(1)) if m else -1
+        return "vector" if ct == 0 else "sa"  # any non-zero or missing compute_type -> "sa"
+    return _LANE.get(opcode, "dma")  # every non-COMP opcode resolves to "dma"
+
+
+def parse(lines):  # lines: iterable of log lines -> {(core, inst_id): record dict}
+    # key = (core, inst_id) -> record
+    insts = {}
+    for ln in lines:
+        m = _LINE.search(ln)
+        if not m:
+            continue  # not a "[cycle][Core N][TAG]..." log line
+        cyc, core, tag, iid, opcode, detail = m.groups()
+        if iid is None or opcode is None:
+            continue  # only lines carrying both INST_ID and an opcode word are tracked
+        cyc, core, iid = int(cyc), int(core), int(iid)
+        key = (core, iid)
+        r = insts.setdefault(key, {  # first sighting of this instruction creates its record
+            "core": core, "iid": iid, "opcode": opcode, "detail": detail,
+            "issued": None, "finished": None, "resp": None, "dma_issue": None,
+            "first_resp": None})
+        if not r["opcode"] or r["opcode"] == opcode:  # a differing opcode word leaves the record as is
+            r["opcode"] = opcode
+            if detail.strip():
+                r["detail"] = detail  # keep the latest non-empty detail text
+        if tag == "INST_ISSUED" and r["issued"] is None:  # first issue cycle wins
+            r["issued"] = cyc
+        elif tag == "INST_FINISHED":  # no None guard: the last occurrence wins
+            r["finished"] = cyc
+        elif tag == "DRAM_RESP_DONE":  # stored as "resp"; last occurrence wins
+            r["resp"] = cyc
+        elif tag == "DRAM_RESP_FIRST" and r["first_resp"] is None:  # first data arrived
+            r["first_resp"] = cyc
+        elif tag == "ASYNC_DMA_ISSUE":   # all requests injected (engine done)
+            r["dma_issue"] = cyc
+    return insts
+
+
+def _occ(detail):
+    """(compute_cycle, overlapping_cycle) from a COMP detail string; a missing field reads as 0."""
+    cc = re.search(r"compute_cycle=(\d+)", detail)
+    ov = re.search(r"overlapping_cycle=(\d+)", detail)
+    return (int(cc.group(1)) if cc else 0, int(ov.group(1)) if ov else 0)
+
+
+def to_chrome(insts, num_sa=1):
+    """Model each hardware unit as a server and replay its ops in issue order, so
+    real idle gaps (bubbles) show and slices don't nest:
+      dma    : MOVIN/MOVOUT -- 1 DMA engine; slice = actual transfer
+               (ASYNC_DMA_ISSUE -> data-ready).
+      vector : COMP type 0  -- 1 VPU.
+      sa     : COMP type 1/2 -- each op on the SA the Core reports (`sa=` field;
+               weight-pinned), so lanes auto-split sa0..; rr fallback if absent.
+    A compute slice's width is compute_cycle - overlapping_cycle (its occupancy =
+    latency minus the tail that overlaps the next op), starting when the unit
+    actually picks it up: start = max(issue, unit_free). num_sa>1 -> lanes sa0.. ."""
+    by_core = {}
+    for r in insts.values():
+        op, detail, core = r["opcode"], r["detail"], r["core"]
+        if op in _HIDE:
+            continue
+        u = by_core.setdefault(core, {"dma": [], "vector": [], "sa": []})
+        if op == "COMP":
+            m = re.search(r"compute_type=(\d+)", detail)
+            ct = int(m.group(1)) if m else -1
+            u["vector" if ct == 0 else "sa"].append(r)
+        else:
+            u["dma"].append(r)
+
+    events, lanes, cores = [], set(), set()
+
+    def add(core, lane, ts, dur, name, r):
+        lanes.add((core, lane))
+        cores.add(core)
+        args = {"inst_id": r["iid"], "tile": _tile_of(r["detail"]),
+                "issued": r["issued"], "first_data": r["first_resp"],
+                "finished": r["finished"], "data_ready": r["resp"]}
+        am = re.search(r"addr_name=(\w+)", r["detail"] or "")
+        if am:
+            args["addr"] = am.group(1)
+        ev = {"name": name, "cat": lane, "ph": "X", "ts": ts,
+              "dur": max(dur, 1), "pid": core, "tid": lane, "args": args}
+        cname = _tile_color(r["detail"])
+        if cname:
+            ev["cname"] = cname
+        events.append(ev)
+
+    def issue_key(r):
+        return r["issued"] if r["issued"] is not None else 0
+
+    nsa = max(num_sa, 1)
+    for core, u in sorted(by_core.items()):
+        # DMA data crossing the DRAM bus, split by direction (reads and writes are
+        # asymmetric). A LOAD's data comes back on the response, so its bar runs
+        # [first DRAM response, data-ready]. A STORE's data goes out with the
+        # request (fire-and-forget; its acks arrive after it has finished), so its
+        # bar runs [issued, finished]. Serialized per direction so each op is one
+        # visible bar: a packed row = the bus is saturated, gaps = it is idle.
+        for lane, op, sk, ek in (("dram-rd", "MOVIN", "first_resp", "resp"),
+                                 ("dram-wr", "MOVOUT", "issued", "finished")):
+            free = 0
+            rows = [r for r in u["dma"] if r["opcode"] == op
+                    and r[sk] is not None and r[ek] is not None and r[ek] > r[sk]]
+            for r in sorted(rows, key=lambda r: r[ek]):
+                start = max(r[sk], free)
+                free = max(r[ek], start + 1)
+                add(core, lane, start, free - start, _label(r["opcode"], r["detail"]), r)
+        # VPU: one server; slice = occupancy (compute_cycle - overlapping_cycle).
+        free = 0
+        for r in sorted(u["vector"], key=issue_key):
+            if r["issued"] is None:
+                continue
+            cc, ov = _occ(r["detail"])
+            dur = max(cc - ov, 1)
+            start = max(r["issued"], free)
+            free = start + dur
+            add(core, "vector", start, dur, "vector", r)
+        # SA: each op runs on the systolic array the Core reports (the `sa=` field
+        # = its weight-pinned / round-robin assignment); fall back to round-robin
+        # by issue order for older logs without the field. Each SA is one server.
+        rows = sorted(u["sa"], key=issue_key)
+
+        def _sa_of(r, i):
+            m = re.search(r"\bsa=(-?\d+)", r["detail"])
+            return int(m.group(1)) if (m and int(m.group(1)) >= 0) else (i % nsa)
+
+        max_sa = max([nsa] + [_sa_of(r, i) + 1 for i, r in enumerate(rows)])
+        sa_free = [0] * max_sa
+        for i, r in enumerate(rows):
+            if r["issued"] is None:
+                continue
+            s = _sa_of(r, i)
+            cc, ov = _occ(r["detail"])
+            dur = max(cc - ov, 1)
+            start = max(r["issued"], sa_free[s])
+            sa_free[s] = start + dur
+            lane = "sa" if max_sa == 1 else f"sa{s}"
+            add(core, lane, start, dur, _label(r["opcode"], r["detail"]), r)
+
+    for c in sorted(cores):
+        events.append({"name": "process_name", "ph": "M", "pid": c, "tid": 0,
+                       "args": {"name": f"Core {c}"}})
+    order = {"dram-rd": 0, "dram-wr": 1,
+             "sa": 2, "sa0": 2, "sa1": 3, "sa2": 4, "sa3": 5, "vector": 7}
+    for c, lane in sorted(lanes, key=lambda x: (x[0], order.get(x[1], 5))):
+        events.append({"name": "thread_name", "ph": "M", "pid": c, "tid": lane,
+                       "args": {"name": lane}})
+        events.append({"name": "thread_sort_index", "ph": "M", "pid": c, "tid": lane,
+                       "args": {"sort_index": order.get(lane, 5)}})
+    return {"traceEvents": events, "displayTimeUnit": "ns"}
+
+
+def main(argv):
+    """CLI entry: parse a trace log (file or stdin), write Chrome-trace JSON to --out; return 0."""
+    import contextlib  # local import: nullcontext lets stdin share the with-block unclosed
+    ap = argparse.ArgumentParser()
+    ap.add_argument("input", nargs="?", help="trace log file (default: stdin)")
+    ap.add_argument("-o", "--out", default="timeline.json")
+    ap.add_argument("-s", "--num-sa", type=int, default=1, help="systolic arrays per core "
+                    "(num_systolic_array_per_core); >1 splits into sa0..saN-1 lanes")
+    a = ap.parse_args(argv[1:])
+    with (open(a.input) if a.input else contextlib.nullcontext(sys.stdin)) as src:
+        trace = to_chrome(parse(src), a.num_sa)  # a named input is closed on exit
+    with open(a.out, "w") as fh:
+        json.dump(trace, fh)
+    n = sum(1 for e in trace["traceEvents"] if e["ph"] == "X")
+    sys.stderr.write(f"wrote {a.out}: {n} slices -> open in https://ui.perfetto.dev\n")
+    return 0
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    sys.exit(main(sys.argv))  # main()'s return value becomes the process exit status
diff --git a/tests/test_togsim_emitc.py b/tests/test_togsim_emitc.py
new file mode 100644
index 00000000..b0bd2d8e
--- /dev/null
+++ b/tests/test_togsim_emitc.py
@@ -0,0 +1,152 @@
+"""Tests for the C4 emitc lowering + compiled .so trace producer (P2).
+
+The pipeline under test (docs/design/togsim_cpp_trace.md, sec 5-7):
+
+    post-vcix .mlir --build_skeleton--> skeleton+API
+                    --lower_to_emitc--> EmitC module
+                    --mlir-translate--> C++
+                    --g++ -shared----> trace .so  (exports togsim_kernel;
+                                                    togsim_* left undefined)
+
+`test_build_trace_so` builds the .so and checks the EmitC/symbol-table shape.
+`test_trace_so_runs` additionally dlopens it against a stub runtime and confirms
+the producer executes and emits a non-empty deterministic trace.
+
+Both are skipped unless the MLIR bindings, `mlir-translate` (from
+TORCHSIM_LLVM_PATH), a host C++ compiler, AND a post-vcix `.mlir` fixture (via
+`TOGSIM_SKELETON_FIXTURE`) are available -- the same fixture used by
+test_togsim_skeleton.py.
+"""
+import importlib.util
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pytest
+
+_ROOT = pathlib.Path(__file__).resolve().parents[1]
+_CXX = os.environ.get("CXX", "g++")
+_INCLUDE = _ROOT / "TOGSim" / "include"
+
+
+def _mlir_translate():
+    llvm_bin = os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin")  # tool dir; /usr/bin if unset
+    return os.path.join(llvm_bin, "mlir-translate")
+
+
+def _tools_ready():  # gate used by the skipif markers: True when every needed tool is found
+    return (importlib.util.find_spec("mlir") is not None  # the `mlir` package can be located
+            and os.path.isfile(_mlir_translate())  # mlir-translate exists at the resolved path
+            and shutil.which(_CXX) is not None)  # the C++ compiler command is found
+
+
+def _fixture():
+    path = os.environ.get("TOGSIM_SKELETON_FIXTURE")
+    if not (path and os.path.isfile(path)):  # unset, empty, or not an existing file
+        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
+    return path
+
+
+_HARNESS = r'''
+#include <cstdio>
+#include <cstdint>
+#include <cstdlib>
+#include <dlfcn.h>
+#include "togsim_runtime.h"
+static int n_dma=0, n_membar=0, n_compute=0, n_core=0, bad=0;
+extern "C" {
+void togsim_dma(EmitCtx*, int32_t, int32_t, uint64_t, int32_t,
+                const int64_t*, const int64_t*, int32_t, int32_t,
+                int32_t, uint64_t, const int64_t*, int32_t,
+                const int64_t*, int32_t){ ++n_dma; }
+void togsim_compute(EmitCtx*, uint64_t, int32_t, int32_t, const int64_t*,
+                    const int64_t*, int32_t, const int64_t*, int32_t){ ++n_compute; }
+void togsim_memory_barrier(EmitCtx*, int32_t tag_id, uint64_t, const int64_t*, int32_t){
+  ++n_membar; if(tag_id<0) ++bad; }   // tag_id pairs it with its async dma
+void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, int64_t* iv, int32_t n){
+  ++n_core; fn(ctx, iv, n); }   // count a work-item + run its (outlined) body
+void togsim_compute_barrier(EmitCtx*){}
+}
+int main(int argc, char** argv){
+  void* h = dlopen(argv[1], RTLD_NOW | RTLD_GLOBAL);
+  if(!h){ printf("dlopen failed: %s\n", dlerror()); return 2; }
+  auto emit = (void(*)(EmitCtx*, int64_t*, int32_t))dlsym(h, "togsim_kernel");
+  if(!emit){ printf("dlsym failed: %s\n", dlerror()); return 3; }
+  emit(nullptr, nullptr, 0);
+  printf("TRACE core=%d dma=%d membar=%d compute=%d bad=%d\n",
+         n_core, n_dma, n_membar, n_compute, bad);
+  return 0;
+}
+'''
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler")
+def test_build_trace_so():
+    fixture_path = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as workdir:
+        so_path = os.path.join(workdir, "trace.so")
+        emitc_ir = c4.build_trace_so(fixture_path, so_path)
+        assert os.path.isfile(so_path)
+
+        # EmitC form: one entry func; dma/memory_barrier/compute reached via call_opaque.
+        assert "emitc.func" in emitc_ir
+        assert ("@%s" % c4.ENTRY) in emitc_ir
+        for hook in ("togsim_dma", "togsim_memory_barrier", "togsim_compute"):
+            assert ('emitc.call_opaque "%s"' % hook) in emitc_ir
+
+        # Symbol table: entry exported (defined, text), runtime hooks undefined
+        # so the TOGSim loader resolves them at dlopen.
+        nm_out = subprocess.run(["nm", "-D", so_path], capture_output=True, text=True).stdout
+        sym_type = {}
+        for fields in (row.split() for row in nm_out.splitlines()):
+            if len(fields) >= 2:
+                sym_type[fields[-1]] = fields[-2]  # last column = name, the one before = type
+        assert sym_type.get("togsim_kernel") == "T", nm_out
+        for hook in ("togsim_dma", "togsim_dispatch", "togsim_memory_barrier"):
+            assert sym_type.get(hook) == "U", nm_out
+        # The per-work-item dispatch wrapper is emitted (outlined tile fn).
+        assert 'emitc.call_opaque "togsim_dispatch"' in emitc_ir
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler")
+def test_trace_so_runs():
+    fixture_path = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as workdir:
+        so_path = os.path.join(workdir, "trace.so")
+        c4.build_trace_so(fixture_path, so_path)
+
+        src_path = os.path.join(workdir, "harness.cpp")
+        exe_path = os.path.join(workdir, "harness")
+        with open(src_path, "w") as src:
+            src.write(_HARNESS)
+        # -rdynamic so the harness's togsim_* stubs are visible to the dlopened .so.
+        compile_cmd = [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
+                       src_path, "-o", exe_path, "-ldl"]
+        compiled = subprocess.run(compile_cmd, capture_output=True, text=True)
+        assert compiled.returncode == 0, compiled.stderr
+
+        ran = subprocess.run([exe_path, so_path], capture_output=True, text=True)
+        assert ran.returncode == 0, ran.stdout + ran.stderr
+        summary = ran.stdout.strip()
+        assert summary.startswith("TRACE "), summary
+        # The harness prints one line: "TRACE" followed by name=value counters;
+        # fold those counters into a dict keyed by name.
+        counts = dict(field.split("=") for field in summary.split()[1:])
+        # The producer ran and emitted a real trace, with >=1 work-item (core alloc).
+        for key in ("core", "dma", "compute"):
+            assert int(counts[key]) >= 1
+        # Async loads are synced by explicit memory barriers, each carrying a
+        # valid (non-negative) tag_id that pairs it with its dma.
+        assert int(counts["membar"]) >= 1, summary
+        assert int(counts["bad"]) == 0, summary
diff --git a/tests/test_togsim_runtime.py b/tests/test_togsim_runtime.py
new file mode 100644
index 00000000..f17bccef
--- /dev/null
+++ b/tests/test_togsim_runtime.py
@@ -0,0 +1,181 @@
+"""P3 task 5: the TOGSim C6 runtime + loader (togsim_runtime.cc / togsim_loader.h).
+
+Builds a producer `.so` from a post-vcix fixture, links the real C6 runtime, runs
+the loader (`run_producer`) against the `.so`, and checks the recorded trace:
+DRAM addresses are resolved (base[arg_id] + offset*elem_bytes), compute cycles
+are looked up from the cycle table, and every wait gets a handle a dma minted.
+
+Skipped unless the MLIR bindings, `mlir-translate`, a C++ compiler, and a
+post-vcix `.mlir` fixture (`TOGSIM_SKELETON_FIXTURE`) are available.
+"""
+import importlib.util
+import os
+import pathlib
+import shutil
+import subprocess
+import sys
+import tempfile
+
+import pytest
+
+_ROOT = pathlib.Path(__file__).resolve().parents[1]
+_CXX = os.environ.get("CXX", "g++")
+_INCLUDE = _ROOT / "TOGSim" / "include"
+_RUNTIME = _ROOT / "TOGSim" / "src" / "togsim_runtime.cc"
+
+
+def _mlir_translate():
+    llvm_bin = os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin")  # tool dir; /usr/bin if unset
+    return os.path.join(llvm_bin, "mlir-translate")
+
+
+def _tools_ready():  # gate used by the skipif markers: True when every prerequisite is found
+    return (importlib.util.find_spec("mlir") is not None  # the `mlir` package can be located
+            and os.path.isfile(_mlir_translate())  # mlir-translate exists at the resolved path
+            and shutil.which(_CXX) is not None  # the C++ compiler command is found
+            and _RUNTIME.is_file())  # togsim_runtime.cc is present in this checkout
+
+
+def _fixture():
+    path = os.environ.get("TOGSIM_SKELETON_FIXTURE")
+    if not (path and os.path.isfile(path)):  # unset, empty, or not an existing file
+        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
+    return path
+
+
+# Drives the loader with known tensor bases + a synthetic cycle table, then
+# checks the recorded trace. Tailored to a single-output-tile GEMM (256^3):
+# 3 dmas A/B/C at offset 0 -> addr == base; args 0/1/2; dirs load/load/store.
+_MAIN = r'''
+#include <cstdio>
+#include <cstdint>
+#include <utility>
+#include <vector>
+#include "togsim_loader.h"
+using namespace togsim;
+int main(int argc, char** argv) {
+  uint64_t bases[3] = {0x1000, 0x2000, 0x3000};
+  int64_t  cyc[3]   = {100, 200, 300};
+  int64_t  ovl[3]   = {0, 200, 172};
+  RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, 1);
+  if (!r.ok) { printf("run failed\n"); return 2; }
+  int ndisp=0, nd=0, nc=0, nm=0, fail=0;
+  std::vector<uint64_t> dma_a; std::vector<int> dma_arg, dma_dir;
+  std::vector<std::pair<int,uint64_t>> async_tags;  // (tag_id, tag_slot) of async dmas
+  for (auto& t : r.trace) {
+    if (t.kind == TraceRec::TILE_BEGIN) ndisp++;   // one per work-item
+    else if (t.kind == TraceRec::DMA) {
+      nd++; dma_a.push_back(t.addr);
+      dma_arg.push_back(t.arg_id); dma_dir.push_back(t.dir);
+      if (t.is_async) async_tags.push_back({t.tag_id, t.tag_slot});
+    } else if (t.kind == TraceRec::COMPUTE) {
+      nc++;
+      int64_t want = (t.tile_id < 3) ? cyc[t.tile_id] : -1;
+      if (t.cycle != want) { printf("compute %lu cyc %ld!=%ld\n",
+          (unsigned long)t.tile_id, (long)t.cycle, (long)want); fail++; }
+    } else if (t.kind == TraceRec::MEMORY_BAR) {
+      nm++; bool ok=false;
+      for (auto& k : async_tags) if (k.first==t.tag_id && k.second==t.tag_slot) ok=true;
+      if (!ok) { printf("membar tag (%d,%lu) pairs no async dma\n",
+          t.tag_id, (unsigned long)t.tag_slot); fail++; }
+    }
+  }
+  const uint64_t exp[3] = {0x1000, 0x2000, 0x3000};
+  const int ea[3] = {0,1,2}, ed[3] = {0,0,1};
+  for (int i = 0; i < nd && i < 3; ++i)
+    if (dma_a[i]!=exp[i] || dma_arg[i]!=ea[i] || dma_dir[i]!=ed[i]) {
+      printf("dma[%d] addr=%#lx arg=%d dir=%d\n", i,
+             (unsigned long)dma_a[i], dma_arg[i], dma_dir[i]); fail++;
+    }
+  printf("dispatch=%d dma=%d compute=%d membar=%d fail=%d\n", ndisp, nd, nc, nm, fail);
+  printf(fail ? "RESULT FAIL\n" : "RESULT PASS\n");
+  return fail ? 1 : 0;
+}
+'''
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler + runtime")
+def test_runtime_loads_and_records():
+    fix = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as d:
+        so = os.path.join(d, "trace.so")
+        c4.build_trace_so(fix, so)
+
+        main_cpp = os.path.join(d, "main.cpp")
+        binp = os.path.join(d, "runtime_test")
+        with open(main_cpp, "w") as fh:
+            fh.write(_MAIN)
+        build = subprocess.run(  # compile the driver together with the runtime source
+            [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
+             main_cpp, str(_RUNTIME), "-o", binp, "-ldl"],
+            capture_output=True, text=True)
+        assert build.returncode == 0, build.stderr
+
+        run = subprocess.run([binp, so], capture_output=True, text=True)
+        out = run.stdout
+        assert "RESULT PASS" in out, out + run.stderr
+        assert run.returncode == 0, out
+        # Only >=1 dma and >=1 compute are required here; per-record checks are tallied in fail=.
+        line = [l for l in out.splitlines() if l.startswith("dispatch=")][0]
+        counts = dict(kv.split("=") for kv in line.split())
+        assert int(counts["dma"]) >= 1
+        assert int(counts["compute"]) >= 1
+        assert int(counts["fail"]) == 0
+
+
+_SIM_MAIN = r'''
+#include <cstdio>
+#include <cstdint>
+#include "togsim_loader.h"
+using namespace togsim;
+int main(int argc, char** argv) {
+  uint64_t bases[3] = {0x1000, 0x2000, 0x3000};
+  int64_t  cyc[3]   = {100, 200, 300};
+  int64_t  ovl[3]   = {0, 200, 172};
+  RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, 1);
+  if (!r.ok) { printf("run failed\n"); return 2; }
+  TimingParams p; p.dma_latency = 100;
+  SimResult s = simulate(r, p);
+  // serial baseline: no overlap at all.
+  uint64_t serial = 0;
+  for (auto& t : r.trace) {
+    if (t.kind == TraceRec::DMA) serial += p.dma_latency;
+    else if (t.kind == TraceRec::COMPUTE) serial += (uint64_t)t.cycle;
+  }
+  printf("SIM total=%lu compute=%d dma=%d serial=%lu\n",
+         (unsigned long)s.total_cycle, s.n_compute, s.n_dma, (unsigned long)serial);
+  // The trace is schedulable into cycles; overlap (dma||compute, compute
+  // pipelining) makes it no worse than the fully-serial baseline.
+  bool ok = s.total_cycle > 0 && s.n_compute > 0 && s.total_cycle <= serial;
+  printf(ok ? "RESULT PASS\n" : "RESULT FAIL\n");
+  return ok ? 0 : 1;
+}
+'''
+
+
+@pytest.mark.skipif(not _tools_ready(),
+                    reason="need mlir bindings + mlir-translate + C++ compiler + runtime")
+def test_simulate_produces_cycles():
+    fixture_path = _fixture()
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4
+
+    with tempfile.TemporaryDirectory() as workdir:
+        so_path = os.path.join(workdir, "trace.so")
+        c4.build_trace_so(fixture_path, so_path)
+        src_path = os.path.join(workdir, "sim.cpp")
+        exe_path = os.path.join(workdir, "sim_test")
+        with open(src_path, "w") as src:
+            src.write(_SIM_MAIN)
+        # Compile the driver together with the runtime source into one executable.
+        compile_cmd = [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
+                       src_path, str(_RUNTIME), "-o", exe_path, "-ldl"]
+        compiled = subprocess.run(compile_cmd, capture_output=True, text=True)
+        assert compiled.returncode == 0, compiled.stderr
+        ran = subprocess.run([exe_path, so_path], capture_output=True, text=True)
+        assert "RESULT PASS" in ran.stdout, ran.stdout + ran.stderr
+        assert ran.returncode == 0, ran.stdout
diff --git a/tests/test_togsim_skeleton.py b/tests/test_togsim_skeleton.py
new file mode 100644
index 00000000..56601966
--- /dev/null
+++ b/tests/test_togsim_skeleton.py
@@ -0,0 +1,184 @@
+"""Tests for the C++ trace-generation front-end pieces (docs/design/togsim_cpp_trace.md).
+
+Two layers:
+
+* `test_togsim_ops_contract` runs anywhere (no MLIR bindings, no torch). It pins
+  the skeleton+API vocabulary (`togsim_ops.py`) and checks it stays in lockstep
+  with the runtime ABI header (`togsim_runtime.h`) -- the single thing most
+  likely to silently drift.
+* `test_build_skeleton_on_fixture` exercises the real `build_skeleton` pass, and
+  is skipped unless the MLIR bindings are importable AND a post-vcix `.mlir`
+  fixture is supplied via the `TOGSIM_SKELETON_FIXTURE` env var. (A valid
+  build_tog-compatible fixture is hard to hand-write reliably; point this at a
+  kernel dump from a real run.)
+"""
+import os
+import importlib.util
+import pathlib
+
+import pytest
+
+_ROOT = pathlib.Path(__file__).resolve().parents[1]
+_OPS_PY = _ROOT / "PyTorchSimFrontend" / "mlir" / "passes" / "togsim_ops.py"
+_HEADER = _ROOT / "TOGSim" / "include" / "togsim_runtime.h"
+
+
+def _load_togsim_ops():  # load togsim_ops.py straight from its file path, not via the package
+    spec = importlib.util.spec_from_file_location("togsim_ops", _OPS_PY)
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)  # runs the file's top-level code to populate `mod`
+    return mod
+
+
+def test_togsim_ops_contract():
+    ops = _load_togsim_ops()
+    abi_header = _HEADER.read_text()
+
+    # Every op maps to a callee, and every callee name appears in the header text.
+    assert set(ops.EMITC_CALLEE) == set(ops.OP_NAMES)
+    for fn_name in ops.EMITC_CALLEE.values():
+        assert fn_name in abi_header, f"{fn_name} missing from togsim_runtime.h"
+
+    # Entry point symbol agrees with the header.
+    assert ops.ENTRY_SYMBOL == "togsim_kernel"
+    assert ops.ENTRY_SYMBOL in abi_header
+
+    # Runtime callee emitted directly by lower_to_emitc: the work-item dispatch
+    # wrapper. (The outlined tile fn TILE_SYMBOL is producer-generated.)
+    assert ops.DISPATCH_CALLEE in abi_header
+
+    # Direction enum agrees with the header's togsim_dma_dir.
+    assert (ops.DIR_LOAD, ops.DIR_STORE) == (0, 1)
+    for decl in ("TOGSIM_DMA_LOAD  = 0", "TOGSIM_DMA_STORE = 1"):
+        assert decl in abi_header
+
+
+def _mlir_available():  # True when the `mlir` Python package can be located (it is not imported)
+    return importlib.util.find_spec("mlir") is not None
+
+
+@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
+def test_build_skeleton_on_fixture():
+    path = os.environ.get("TOGSIM_SKELETON_FIXTURE")
+    if not (path and os.path.isfile(path)):
+        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
+
+    import sys
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import build_skeleton
+
+    import mlir.ir as ir
+    context = ir.Context()
+    context.allow_unregistered_dialects = True
+    with context:
+        mod = ir.Module.parse(pathlib.Path(path).read_text(), context)
+        summary = build_skeleton.build_skeleton(mod)
+        ir_text = str(mod)
+
+    # The data-movement ops are gone; the API ops took their place.
+    for removed in ("memref.dma_start", "memref.dma_wait"):
+        assert removed not in ir_text
+    assert "togsim.dma" in ir_text
+    assert "togsim.memory_barrier" in ir_text   # the explicit async-DMA sync (was dma_wait)
+    assert "event_id" not in ir_text            # static pairing replaced by the runtime tag
+    # Loop skeleton is preserved.
+    assert any(loop_op in ir_text for loop_op in ("affine.for", "scf.for"))
+    assert mod.operation.verify()
+    print(summary)
+
+
+@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
+def test_strip_accum_terms_drops_reduction_marker():
+    """Regression: the dma_wait tag index built by lower_to_vcix carries a `-d_i`
+    term for each accumulation (reduction) loop var -- a sentinel marker, not an
+    offset. build_skeleton must drop those so a memory_barrier waits on the same
+    subtile slot the async load wrote; otherwise the producer evaluates `-acc_iv`
+    to a negative slot at reduction iteration > 0, the recorded barrier slot
+    diverges from the load slot, and TOGSim aborts with "Key does not exist in ...
+    tag table" on subtile + multi-tile-K. See docs/design/togsim_cpp_trace.md and
+    legacy TileGraphParser.cc (which skips stride -1 for the same reason)."""
+    import sys
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import build_skeleton as bs
+
+    import mlir.ir as ir
+    ctx = ir.Context()
+    ctx.allow_unregistered_dialects = True
+    with ctx, ir.Location.unknown(ctx):
+        module = ir.Module.parse(
+            "func.func @k() {\n"
+            "  %r = arith.constant 1 : index\n"   # stand-in reduction iv
+            "  %a = arith.constant 0 : index\n"   # subtile dim 1
+            "  %b = arith.constant 0 : index\n"   # subtile dim 2
+            "  return\n"
+            "}", ctx)
+        block = module.body.operations[0].regions[0].blocks[0]  # body block of func @k
+        consts = [op.results[0] for op in block.operations if op.name == "arith.constant"]
+        anchor = [op for op in block.operations if op.name == "func.return"][0]  # insertion anchor
+        r, a, b = consts
+
+        def neg_dims(val):  # non-None bs._neg_coeff_dim results over the map's additive terms
+            amap = ir.AffineMapAttr(val.owner.attributes["map"]).value
+            return [p for p in (bs._neg_coeff_dim(s) for s in bs._flatten_add(amap.results[0]))
+                    if p is not None]
+
+        # #map8-style: -d0 (reduction) + d1 + d2 floordiv 2.
+        d0, d1, d2 = (ir.AffineDimExpr.get(i) for i in range(3))
+        expr = d0 * -1 + d1 + ir.AffineExpr.get_floor_div(d2, 2)
+        with ir.InsertionPoint(anchor):
+            apply = ir.Operation.create(
+                "affine.apply", results=[ir.IndexType.get()], operands=[r, a, b],
+                attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(3, 0, [expr]))})
+        tag_in = apply.results[0]
+        assert neg_dims(tag_in) == [0]                       # the reduction marker is present
+
+        tag_out = bs._strip_accum_terms(ctx, tag_in, anchor)
+        assert tag_out is not tag_in                         # a new, reduced apply was emitted
+        out_map = ir.AffineMapAttr(tag_out.owner.attributes["map"]).value
+        assert out_map.n_dims == 2                           # the reduction dim was dropped
+        assert neg_dims(tag_out) == []                       # no reduction marker remains
+        assert list(tag_out.owner.operands) == [a, b]        # only the subtile operands survive
+
+        # No-op: an index with no reduction marker is returned unchanged.
+        plain = d0 + d1
+        with ir.InsertionPoint(anchor):
+            papply = ir.Operation.create(
+                "affine.apply", results=[ir.IndexType.get()], operands=[a, b],
+                attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(2, 0, [plain]))})
+        pin = papply.results[0]
+        assert bs._strip_accum_terms(ctx, pin, anchor) is pin  # the very same value comes back
+
+        assert module.operation.verify()  # the module must still verify after both calls
+
+
+@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
+def test_cycle_table_on_fixture():
+    path = os.environ.get("TOGSIM_SKELETON_FIXTURE")
+    if not (path and os.path.isfile(path)):
+        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
+
+    import sys
+    sys.path.insert(0, str(_ROOT))
+    from PyTorchSimFrontend.mlir.passes import build_skeleton, cycle_table
+
+    import mlir.ir as ir
+    context = ir.Context()
+    context.allow_unregistered_dialects = True
+    with context:
+        mod = ir.Module.parse(pathlib.Path(path).read_text(), context)
+        build_skeleton.build_skeleton(mod)
+        kinds = cycle_table._compute_types(mod)
+        # synthetic per-tile cycles 10, 20, 30, ... (gem5 sample-mode is reused at P3 task 5).
+        synth = list(range(10, 10 * len(kinds) + 1, 10))
+        x_offset, w_offset = 4, 0
+        table = cycle_table.build_cycle_table(mod, synth, x_offset, w_offset)
+
+    assert len(kinds) >= 1 and len(table) == len(kinds)
+    # cycle is carried verbatim; overlapping_cycle follows the legacy formula.
+    for (cycle, overlap), kind, raw in zip(table, kinds, synth):
+        assert cycle == raw
+        if kind == cycle_table.VECTOR_COMPUTE:
+            assert overlap == 0
+            continue
+        skew = w_offset if kind == cycle_table.MATMUL_PRELOAD else x_offset
+        assert overlap == max(raw - skew, 0)