diff --git a/AsmParser/tog_generator.py b/AsmParser/tog_generator.py index a12460e3..0de76246 100644 --- a/AsmParser/tog_generator.py +++ b/AsmParser/tog_generator.py @@ -1,3 +1,9 @@ +# DEPRECATED (timing path): legacy ONNX Tile-Operation-Graph producer. Builds +# the TOG and serializes it to ONNX for the C++ TileGraphParser. Superseded by +# the C++ trace pipeline (PyTorchSimFrontend/mlir/passes/build_skeleton.py + +# lower_to_emitc.py + cycle_table.py -> a compiled trace .so). Kept live so the +# current pipeline does not break; to be retired once the trace pipeline (P3+) +# stabilizes. See docs/design/togsim_cpp_trace.md. import os import sys import importlib.util diff --git a/PyTorchSimFrontend/extension_codecache.py b/PyTorchSimFrontend/extension_codecache.py index 492133a3..e573d1a5 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -241,8 +241,19 @@ def load(cls, source_code, # Run cyclesim cyclesim = CycleSimulator() cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), vectorlane_size, silent_mode=silent_mode) + # Snapshot for the P3-trace hook below: generate_tile_graph consumes + # cycle_list in place (cycle_list.pop(0) per tile), leaving it empty. + cycle_list_for_trace = list(cycle_list) # Create TOG + # DEPRECATED (timing path): this ONNX-TOG producer -- run_tog -> + # tog_generator.generate_tile_graph -> ONNX -> C++ TileGraphParser -- + # is being superseded by the C++ trace pipeline (build_skeleton + + # lower_to_emitc -> compiled .so, + the cycle_table sidecar). The + # per-tile cycle_list / x_offset / w_offset computed here are exactly + # what cycle_table.build_cycle_table will reuse, so both paths stay + # cycle-consistent during the transition. Kept live (pipeline must not + # break); to be retired once the trace pipeline (P3+) stabilizes. 
w_offset, x_offset = vectorlane_size, vectorlane_size if kwargs['loop_size'] is not None and kwargs['loop_size'][-3] < vectorlane_size: x_offset = kwargs['loop_size'][-3] @@ -258,6 +269,36 @@ def load(cls, source_code, w_offset=w_offset, # FIXME. vector_lane=vectorlane_size ) + + # Trace pipeline (DEFAULT): emit the compiled trace producer .so + the + # cycle-table TSV from the post-vcix IR and gem5 cycle_list/offsets. This + # is the default simulation path (the C++ TOG); the legacy ONNX TOG is the + # opt-in fallback via TORCHSIM_LEGACY_TOG=1, in which case the .so is unused + # so skip emitting it. Best-effort: never breaks the compile. + if os.environ.get("TORCHSIM_LEGACY_TOG") != "1": + try: + import mlir.ir as ir + from PyTorchSimFrontend.mlir.passes import ( + build_skeleton as _bs, cycle_table as _ct, lower_to_emitc as _l2e) + pv = sample_mlir_path + "_postvcix.mlir" + _ctx = ir.Context(); _ctx.allow_unregistered_dialects = True + with _ctx: + _mod = ir.Module.parse(open(pv).read(), _ctx) + _bs.build_skeleton(_mod) + _ntiles = len(_ct._compute_types(_mod)) + # align lengths: gem5 gives one numCycles per compute node; + # pad with the last value / truncate if it disagrees. 
+ _cl = list(cycle_list_for_trace) + if _cl and len(_cl) != _ntiles: + _cl = (_cl + [_cl[-1]] * _ntiles)[:_ntiles] + logger.info(f"[P3-trace] cycle_list={cycle_list_for_trace} -> {_cl} " + f"(#tiles={_ntiles}, x_off={x_offset}, w_off={w_offset})") + _tbl = _ct.build_cycle_table(_mod, _cl, x_offset, w_offset) + _ct.dump_cycle_table_tsv(_tbl, os.path.join(write_path, "trace_cycles.tsv")) + _l2e.build_trace_so(pv, os.path.join(write_path, "trace.so")) + logger.info(f"[P3-trace] wrote trace.so + trace_cycles.tsv in {write_path}") + except Exception as e: + logger.warning(f"[P3-trace] trace .so/sidecar dump skipped: {e}") return key class CustomAsyncCompile(AsyncCompile): diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 725e0dc6..1ee62f36 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1120,9 +1120,13 @@ def codegen_nodes(self, nodes, kernel_name): src_code, meta_code = super().codegen_nodes(nodes, kernel_name) self._prepare_simulator_headers(src_code) if "autotune" in extension_config.codegen_mapping_strategy and extension_config.pytorchsim_timing_mode: - optimal_src_code, meta_code = self.autotune(nodes, kernel_name)[:2] + # Use temporaries: autotune returns [None, None, None] when it cannot + # autotune (e.g. a size-1 pointwise kernel with ranges == [1]), and + # unpacking into meta_code would clobber the valid arg_attributes that + # the fall-through below returns. 
+ optimal_src_code, optimal_meta_code = self.autotune(nodes, kernel_name)[:2] if optimal_src_code is not None: - return optimal_src_code, meta_code + return optimal_src_code, optimal_meta_code return src_code, meta_code def _prepare_simulator_headers(self, src_code): diff --git a/PyTorchSimFrontend/mlir/passes/_mlir_util.py b/PyTorchSimFrontend/mlir/passes/_mlir_util.py new file mode 100644 index 00000000..e39f9d6f --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/_mlir_util.py @@ -0,0 +1,87 @@ +"""Small, dependency-light helpers shared across the MLIR passes. + +Every pass had its own copy of the same op-walk generator (named variously +`_iter_ops` / `_walk` / `_walk_ops`) and the same one-line attribute builders +(`_i32` / `_i64` / ...). This module is the single source for both. + +Import-safety: `walk_ops` is pure block/op attribute access and needs no MLIR +bindings, so this module does NOT import `mlir.ir` at top level -- some passes +(e.g. lower_vlane_idx, decompose_transfer) are deliberately importable without +the bindings present and only touch `mlir.ir` inside their run functions. The +attribute builders therefore import `mlir.ir` lazily; they require an active +MLIR context (the caller's `with ctx:`), exactly as the per-pass copies did. +""" + + +def walk_ops(block): + """Yield every op under `block` in program order, recursing into regions. 
+ + Snapshots each block's operation list, so a caller may erase ops while + iterating (the strictest of the former copies; a superset of the rest).""" + for op in list(block.operations): + yield op + for region in op.operation.regions: + for b in region.blocks: + yield from walk_ops(b) + + +def _ir(): + import mlir.ir as ir + return ir + + +def i32(v): + """`i32` IntegerAttr for `v` (uses the active MLIR context).""" + ir = _ir() + return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), int(v)) + + +def i64(v): + """`i64` IntegerAttr for `v`.""" + ir = _ir() + return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), int(v)) + + +def i64_array(vals): + """ArrayAttr of `i64` IntegerAttrs for `vals`.""" + ir = _ir() + i = ir.IntegerType.get_signless(64) + return ir.ArrayAttr.get([ir.IntegerAttr.get(i, int(v)) for v in vals]) + + +def str_attr(v): + """StringAttr of `str(v)`.""" + ir = _ir() + return ir.StringAttr.get(str(v)) + + +# --------------------------------------------------------------------------- +# attribute readers -- accept an OpView or an Operation; `default` is returned +# when `key` is absent (callers that want the strict "must be present" behaviour +# simply never pass an absent key). 
+# --------------------------------------------------------------------------- +def _attrs(op): + return getattr(op, "operation", op).attributes + + +def attr_int(op, key, default=None): + """Integer value of `op`'s `key` attribute, or `default` if absent.""" + ir = _ir() + a = _attrs(op) + return ir.IntegerAttr(a[key]).value if key in a else default + + +def attr_bool(op, key, default=False): + """Bool value of `op`'s `key` attribute, or `default` if absent.""" + ir = _ir() + a = _attrs(op) + return bool(ir.BoolAttr(a[key]).value) if key in a else default + + +def attr_i64_array(op, key, default=None): + """`op`'s `key` ArrayAttr of integers as a Python list, or `default` if + absent (pass `default=[]` for the "missing -> empty" convention).""" + ir = _ir() + a = _attrs(op) + return ([ir.IntegerAttr(x).value for x in ir.ArrayAttr(a[key])] + if key in a else default) diff --git a/PyTorchSimFrontend/mlir/passes/build_skeleton.py b/PyTorchSimFrontend/mlir/passes/build_skeleton.py new file mode 100644 index 00000000..df4c6046 --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/build_skeleton.py @@ -0,0 +1,512 @@ +"""build_skeleton pass (C2): reduce a kernel's post-vcix MLIR to the +*skeleton + API* form, in place. + +The trace pipeline (docs/design/togsim_cpp_trace.md) compiles a kernel to a +shape-parametric C++ trace producer. The producer is just the kernel's loop +skeleton with the data computation replaced by calls to the event-based runtime +API. This pass performs that reduction at the MLIR level: + + * `memref.dma_start` -> `togsim.dma(...) {tag_id, is_async, ...}` carrying the + runtime tag index operand (`%tag[%idx]`). + * `memref.dma_wait` -> `togsim.memory_barrier(tag_idx) {tag_id, write_bufs}`, + the explicit async-DMA sync. 
It pairs with its dma by + the RUNTIME tag slot (tag_id + the tag index), not a + compile-time id: one static dma op runs once per loop + iteration with a different `%tag[%idx]`, so only the + runtime slot can pair iteration i's dma with its wait. + * each compute node -> a single `togsim.compute {tile_id, compute_type}` + * everything else -> removed by a use-based DCE, keeping the loops and the + index/address arithmetic the survivors depend on. + +It reuses build_tog's traversal (`TogBuilder` / `_build`): loops, DMAs and +compute blocks are already identified there, each with a back-pointer to its +MLIR op(s), so this pass only adds the *rewrite*. Keeping a single traversal +guarantees the skeleton and the legacy TOG see the same structure. + +Counterpart to `build_tog.build_tog_and_mutate`. + +The DCE is safe by construction: it never erases an op whose results still have +uses, so at worst it leaves extra ops in the dump (visible for diagnosis) rather +than producing invalid IR. + +Requires the MLIR Python bindings (importing `build_tog` pulls in `mlir.ir`). +""" + +from . import togsim_ops as ts +from ._mlir_util import walk_ops, i32, i64, i64_array, str_attr +from .build_tog import ( + ir, + TogBuilder, + _build, + _reset_ids, + _find_kernel, + _value_key, + TOGDMANode, + TOGDMAWaitNode, + _COMPUTE_TYPE_NAME, +) + +#: Marker op names for the passes/__init__ fast-path (skip parsing if absent). +MARKERS = ("memref.dma_start", "memref.dma_wait") + +#: Ops the DCE must never remove (loops, terminators, our API ops). 
+_KEEP = { + "affine.for", "scf.for", "scf.while", + "affine.yield", "scf.yield", "func.return", + ts.DMA, ts.COMPUTE, ts.COMPUTE_BAR, ts.MEMORY_BAR, +} + + +def _kernel_block(module): + func_op = _find_kernel(module) + if func_op is None: + return None + return func_op.regions[0].blocks[0] + + +# --------------------------------------------------------------------------- +# op construction +# --------------------------------------------------------------------------- +def _arg_id_of(base_addr): + """Tensor func-arg ordinal from a build_tog base name ("arg3" -> 3); -1 if + it is not a plain block-arg base.""" + s = str(base_addr) + return int(s[3:]) if s.startswith("arg") and s[3:].isdigit() else -1 + + +def _emit_dma(ctx, dma_node, tag_id, dram_index, tag_index, read_bufs, write_bufs): + """Insert a `togsim.dma` before the original `memref.dma_start`. + + `tag_id` is the identity of this DMA's tag memref. An async DMA pairs with + its `togsim.memory_barrier` (the original dma_wait) by the RUNTIME tag slot + -- (tag_id, tag_index) -- not a compile-time identifier: one static dma op runs + once per loop iteration, each with a different runtime `%tag[%idx]` slot, so + only a runtime key can pair iteration i's dma with iteration i's wait. + + `dram_index` is the original linear DRAM index Value (the `affine.apply` + result that indexed the tensor in the `memref.dma_start`) -- carried as an + operand so the DCE keeps the address arithmetic live and the C4 lowering can + compute the real `base_addr = base[arg_id] + index*elem` (P3, approach A). + + `tag_index` is the original SRAM tag index Value (`%tag[%idx]`), carried as a + second operand: the runtime tag slot, used both to pair with the barrier and + for the double-buffer / SRAM-capacity (WAR) model. 
+ Operand order: [dram_index, tag_index] (each omitted if absent).""" + op = dma_node.op + attrs = { + ts.ATTR_DIR: i32(ts.DIR_STORE if dma_node.is_write else ts.DIR_LOAD), + ts.ATTR_DIMS: i64_array(dma_node.tile_size), + ts.ATTR_STRIDES: i64_array(dma_node.tile_stride), + ts.ATTR_ELEM_BITS: i32(dma_node.element_size), + ts.ATTR_IS_ASYNC: ir.BoolAttr.get(bool(dma_node.is_async)), + ts.ATTR_TAG_ID: i32(tag_id), + ts.ATTR_ARG_ID: i32(_arg_id_of(dma_node.base_addr)), + "base": str_attr(dma_node.base_addr), + # SRAM spad this DMA touches (load writes it, store reads it) -- sec 10. + ts.ATTR_READ_BUFS: i64_array(read_bufs), + ts.ATTR_WRITE_BUFS: i64_array(write_bufs), + } + operands = [v for v in (dram_index, tag_index) if v is not None] + ir.Operation.create( + ts.DMA, + results=[], + operands=operands, + attributes=attrs, + loc=ir.Location.unknown(ctx), + ip=ir.InsertionPoint(op), + ) + + +def _emit_compute_bar(ctx, anchor_op): + """Insert a `togsim.compute_barrier` before `anchor_op` -- the fence that + drains in-flight async compute (the systolic-array matmuls) before a store + consumes their result (sec 10.7). + + FIXME: this is the one barrier still synthesized here rather than read from + the IR. Like the async-load memory barrier (now mapped 1:1 from the explicit + dma_wait), the compute fence should eventually appear explicitly in the input + MLIR and be mapped through, not auto-inserted -- no surprising insertion.""" + ir.Operation.create( + ts.COMPUTE_BAR, results=[], operands=[], attributes={}, + loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(anchor_op)) + + +def _emit_memory_bar(ctx, anchor_op, tag_id, tag_index, write_bufs): + """Insert a `togsim.memory_barrier` before `anchor_op` -- the explicit + async-DMA sync that was the original `memref.dma_wait`. 
It pairs with its + async `togsim.dma` by the RUNTIME tag slot (tag_id + tag_index), and carries + the SRAM buffer that dma loaded so consumers gate on data-arrival, not on the + async dma's issue-complete.""" + attrs = { + ts.ATTR_TAG_ID: i32(tag_id), + ts.ATTR_WRITE_BUFS: i64_array(write_bufs), + } + operands = [tag_index] if tag_index is not None else [] + ir.Operation.create( + ts.MEMORY_BAR, results=[], operands=operands, attributes=attrs, + loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(anchor_op)) + + +def _flatten_add(expr): + """Top-level additive summands of an AffineExpr (`.lhs`/`.rhs` come back typed + as the base AffineExpr, so use the `isinstance`/cast pattern, not Python + isinstance).""" + if ir.AffineAddExpr.isinstance(expr): + a = ir.AffineAddExpr(expr) + return _flatten_add(a.lhs) + _flatten_add(a.rhs) + return [expr] + + +def _neg_coeff_dim(summand): + """If `summand` is `dim * c` with a negative constant `c`, return that dim's + position; else None. lower_to_vcix tags each accumulation (reduction) loop var + with coefficient -1 in the dma_wait tag index -- a SENTINEL marking the + reduction axis, not an arithmetic offset (legacy TileGraphParser skips stride + -1 for the same reason).""" + if not ir.AffineMulExpr.isinstance(summand): + return None + mul = ir.AffineMulExpr(summand) + l, r = mul.lhs, mul.rhs + dim = l if ir.AffineDimExpr.isinstance(l) else (r if ir.AffineDimExpr.isinstance(r) else None) + con = l if ir.AffineConstantExpr.isinstance(l) else (r if ir.AffineConstantExpr.isinstance(r) else None) + if dim is None or con is None or ir.AffineConstantExpr(con).value >= 0: + return None + return ir.AffineDimExpr(dim).position + + +def _strip_accum_terms(ctx, tag_index, anchor_op): + """Return a tag-index Value with the accumulation-marked (-1 coefficient) terms + dropped, so a memory_barrier waits on the SAME subtile slot its async load + wrote. 
+ + The wait tag index built by lower_to_vcix carries `-acc_iv` for each reduction + loop var; the matching load index (dma_fine_grained) is subtile-only. Without + this, at reduction iteration > 0 the producer EVALUATES `-acc_iv` to a negative + slot, so the recorded barrier slot diverges from the load slot and the runtime + tag pairing fails (TOGSim aborts with "Key does not exist in ... tag table"). + Dropping the -1 terms mirrors legacy TileGraphParser.cc, which skips stride -1 + and routes the reduction axis to a separate accum tag component; here the + per-iteration tag alloc (dma_fine_grained) already separates the reductions, so + the barrier only needs the subtile slot. + + Falls through (returns `tag_index` unchanged) for anything that is not an + affine.apply whose single result carries such a term -- e.g. the single-tile + case, whose index has no reduction term.""" + if tag_index is None: + return None + try: + apply_op = tag_index.owner + if apply_op.name != "affine.apply": + return tag_index + amap = ir.AffineMapAttr(apply_op.attributes["map"]).value + except Exception: + return tag_index + if amap.n_dims == 0 or amap.n_symbols != 0 or len(amap.results) != 1: + return tag_index + expr = amap.results[0] + dropped = sorted({p for p in (_neg_coeff_dim(s) for s in _flatten_add(expr)) + if p is not None}) + if not dropped: + return tag_index + n = amap.n_dims + kept = [i for i in range(n) if i not in dropped] + new_pos = {old: i for i, old in enumerate(kept)} + # compose the original expr with a selector that sends each dropped dim to 0 + # and renumbers the kept dims 0..k-1. 
+ sel = [ir.AffineConstantExpr.get(0) if i in dropped + else ir.AffineDimExpr.get(new_pos[i]) for i in range(n)] + new_expr = expr.compose(ir.AffineMap.get(len(kept), 0, sel)) + new_map = ir.AffineMap.get(len(kept), 0, [new_expr]) + operands = list(apply_op.operands) + new_operands = [operands[i] for i in kept] + new_apply = ir.Operation.create( + "affine.apply", + results=[ir.IndexType.get(ctx)], + operands=new_operands, + attributes={"map": ir.AffineMapAttr.get(new_map)}, + loc=ir.Location.unknown(ctx), + ip=ir.InsertionPoint(anchor_op), + ) + return new_apply.results[0] + + +def _emit_compute(ctx, compute_node, tile_id, read_bufs, write_bufs): + front = compute_node.operations[0] + attrs = { + ts.ATTR_TILE_ID: i64(tile_id), + # int code (0 vector / 1 matmul / 2 preload) consumed by the C4 lowering; + # maps directly to the Core compute-unit enum. Keep the readable name too. + ts.ATTR_COMPUTE_TYPE: i32(int(compute_node.compute_type)), + "compute_type_name": str_attr(_COMPUTE_TYPE_NAME[compute_node.compute_type]), + # SRAM buffer ids read/written (sec 10 dataflow); the bridge builds the + # dependency DAG by last-writer per buffer. + ts.ATTR_READ_BUFS: i64_array(read_bufs), + ts.ATTR_WRITE_BUFS: i64_array(write_bufs), + } + ir.Operation.create( + ts.COMPUTE, + results=[], + operands=[], + attributes=attrs, + loc=ir.Location.unknown(ctx), + ip=ir.InsertionPoint(front), + ) + + +# --------------------------------------------------------------------------- +# DCE +# --------------------------------------------------------------------------- +def _has_nonempty_region(op): + for region in op.operation.regions: + for b in region.blocks: + if len(list(b.operations)) > 0: + return True + return False + + +def _results_unused(op): + for r in op.operation.results: + if len(list(r.uses)) > 0: + return False + return True + + +def _dce(block): + """Erase non-kept ops with no used results, to a fixed point. 
Safe: an op + with live SSA uses is never touched.""" + changed = True + while changed: + changed = False + victims = [] + for op in walk_ops(block): + name = op.operation.name + if name in _KEEP: + continue + if _has_nonempty_region(op): + continue + if _results_unused(op): + victims.append(op) + for op in victims: + try: + op.operation.erase() + changed = True + except Exception: + # Still referenced via something we will erase next round; retry. + pass + + +# --------------------------------------------------------------------------- +# driver +# --------------------------------------------------------------------------- +def _collect_dma_nodes(builder): + """Map op-identity -> DMA/DMAWait node, by walking the built tree.""" + by_op = {} + seen = set() + + def visit(n): + if id(n) in seen: + return + seen.add(id(n)) + if isinstance(n, (TOGDMANode, TOGDMAWaitNode)) and n.op is not None: + by_op[id(n.op.operation)] = n + for c in n.children: + visit(c) + + for ln in builder.loop_nodes: + visit(ln) + return by_op + + +class _BufferIds: + """Assigns each SRAM buffer name a stable small int id, shared by DMA and + compute so the bridge can match a reader to its buffer's writer (sec 10). + The virtual SA_WEIGHTS buffer (preload -> matmul) is numbered here too, on + first sight. `None` (a non-buffer base) is -1.""" + + def __init__(self): + self._ids = {} + + def of(self, name): + if name is None: + return -1 + return self._ids.setdefault(name, len(self._ids)) + + +class _TagIds: + """Identity of a DMA's tag memref -> stable small int, plus the SRAM buffer + that tag's async DMA loads. An async dma and its memory_barrier (the original + dma_wait) share a tag memref; this assigns it a tag_id (so the runtime can + pair them by the runtime tag slot) and remembers the loaded buffer so the + barrier can release it to consumers. 
Pairing is by tag, never a static id.""" + + def __init__(self): + self._ids = {} # tag value-key -> tag_id + self._buf = {} # tag value-key -> SRAM buffer id the dma loads + + def bind(self, key, buf): + tag_id = self._ids.setdefault(key, len(self._ids)) + self._buf[key] = buf + return tag_id + + def lookup(self, key): + """(tag_id, buffer) for a tag memref, or None if no dma used it.""" + if key not in self._ids: + return None + return self._ids[key], self._buf[key] + + +def _emit_computes(ctx, builder, bufs): + """Step 1: each compute node -> one togsim.compute carrying its tile_id and + the ids of the SRAM buffers it reads/writes. Returns the count.""" + from . import dep_analysis as dep # lazy: dep_analysis imports build_skeleton + n = 0 + for tile_id, cn in enumerate(builder.compute_nodes): + if not cn.operations: + continue + reads, writes = dep.compute_buffers(cn) + _emit_compute(ctx, cn, tile_id, + sorted(bufs.of(b) for b in reads), + sorted(bufs.of(b) for b in writes)) + n += 1 + return n + + +def _emit_one_dma(ctx, op, node, builder, bufs, tags): + """Rewrite one memref.dma_start as togsim.dma. A load reads DRAM and writes + its SRAM spad; a store reads the spad and writes DRAM -- which sets the + read/write buffer that drives the dependency edge (sec 10). The tag memref is + bound to a tag_id (with its loaded buffer) so the paired memory_barrier finds + it by the runtime tag slot.""" + from . import dep_analysis as dep # lazy: dep_analysis imports build_skeleton + f = builder._dma_start_fields(op) + dram_indices = f["dst_indices"] if node.is_write else f["src_indices"] + dram_index = dram_indices[0] if dram_indices else None + tag_indices = f["tag_indices"] + tag_index = tag_indices[0] if tag_indices else None + # the spad is the SRAM side of the copy: dst for a load, src for a store. 
+ spad_id = bufs.of(dep._global_of(f["src"] if node.is_write else f["dst"])) + read_bufs = [spad_id] if node.is_write else [] + write_bufs = [] if node.is_write else [spad_id] + tag_id = tags.bind(_value_key(f["tag"]), spad_id) + if node.is_write: + _emit_compute_bar(ctx, op) # FIXME(sec10.7): auto-inserted; should be explicit in the IR. + _emit_dma(ctx, node, tag_id, dram_index, tag_index, read_bufs, write_bufs) + + +def _emit_one_wait(ctx, op, tags): + """Rewrite one memref.dma_wait as togsim.memory_barrier -- the explicit + async-DMA sync already in the IR. Paired with its dma by the tag memref + (tag_id) and the runtime tag index; carries the buffer the dma loaded. + Returns True iff emitted (a wait whose tag no dma used is dropped).""" + operands = list(op.operation.operands) + tag = operands[0] + tag_index = operands[1] if len(operands) >= 2 else None + binding = tags.lookup(_value_key(tag)) + if binding is None: + return False + tag_id, buf = binding + # honor lower_to_vcix's -1 accumulation marker: strip the reduction terms so + # the barrier slot equals the subtile slot the paired async load wrote. + tag_index = _strip_accum_terms(ctx, tag_index, op) + _emit_memory_bar(ctx, op, tag_id, tag_index, [buf]) + return True + + +def _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs): + """Step 2: rewrite memref.dma_start -> togsim.dma and memref.dma_wait -> + togsim.memory_barrier in program order. An async dma and its barrier are + paired by the RUNTIME tag slot (tag_id + tag index), not a compile-time id: + one static dma op runs per loop iteration with a different `%tag[%idx]`, so + only the runtime slot can pair iteration i's dma with iteration i's wait. 
+ Returns the original ops to erase and the (dma, wait) counts.""" + tags = _TagIds() + originals = [] + n_dma = n_wait = 0 + for op in list(walk_ops(block)): + name = op.operation.name + if name == "memref.dma_start": + node = dma_by_op.get(id(op.operation)) + if node is None: + continue + _emit_one_dma(ctx, op, node, builder, bufs, tags) + originals.append(op) + n_dma += 1 + elif name == "memref.dma_wait": + if _emit_one_wait(ctx, op, tags): + n_wait += 1 + originals.append(op) + return originals, n_dma, n_wait + + +def build_skeleton(module): + """Reduce `func.func @kernel` in `module` to the skeleton+API form, in place. + + Four steps: analyze the kernel into loop/compute/DMA nodes, emit a + togsim.compute per compute node, rewrite the DMAs/waits to togsim.dma/togsim.memory_barrier, + then DCE the leftover data computation. Returns a short text report (counts). + """ + _reset_ids() + builder = TogBuilder() + _build(module, builder) # populates loop/compute nodes + op back-pointers + + block = _kernel_block(module) + if block is None: + return "no @kernel found" + ctx = module.context + dma_by_op = _collect_dma_nodes(builder) + bufs = _BufferIds() + + n_compute = _emit_computes(ctx, builder, bufs) + originals, n_dma, n_wait = _emit_dmas_and_waits(ctx, block, builder, dma_by_op, bufs) + + # erase the now-replaced originals (result-less -> safe), then strip the + # leftover data computation. + for op in originals: + try: + op.operation.erase() + except Exception: + pass + _dce(block) + + return ("skeleton: compute=%d dma=%d wait=%d (unpaired waits dropped)" + % (n_compute, n_dma, n_wait)) + + +def run(module, vectorlane=128): + """passes/__init__ pass protocol entry (vectorlane unused; kept for parity).""" + build_skeleton(module) + + +def run_skeleton(in_path, out_path=None): + """Read post-vcix MLIR at `in_path`, reduce to skeleton+API, write it out. + + Requires the MLIR bindings. 
+ """ + if out_path is None: + out_path = in_path + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(in_path).read(), ctx) + report = build_skeleton(module) + with open(out_path, "w") as fh: + fh.write(str(module)) + return report + + +def main(argv): + import argparse + + parser = argparse.ArgumentParser(prog="build_skeleton.py") + parser.add_argument("input") + parser.add_argument("--out", default=None) + args = parser.parse_args(argv[1:]) + report = run_skeleton(args.input, args.out) + import sys + sys.stderr.write(report + "\n") + return 0 + + +if __name__ == "__main__": + import sys + sys.exit(main(sys.argv)) diff --git a/PyTorchSimFrontend/mlir/passes/cycle_table.py b/PyTorchSimFrontend/mlir/passes/cycle_table.py new file mode 100644 index 00000000..40dd3459 --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/cycle_table.py @@ -0,0 +1,103 @@ +"""cycle_table (C3): the precomputed tile_id -> (cycle, overlapping_cycle) table +the C++ trace pipeline looks up at runtime (docs/design/togsim_cpp_trace.md sec +6, sec 9.8 task 4). + +A `togsim.compute(tile_id=...)` in the trace says *which* tile to compute, not +how long it takes. Because tiles are fixed size, each tile's cost is invariant +(only the trip count varies with shape), so it is sampled once and stored here, +keyed by `tile_id`. Two numbers per tile, mirroring the legacy TOG: + + * `cycle` -- full compute latency, sampled by gem5 sample-mode + (the existing measurement: `_rewrite_loop_steps` + + `_insert_compute_markers` in build_tog, run through + CycleSimulator -> the per-tile `cycle_list`). + * `overlapping_cycle` -- the portion that overlaps the previous instruction in + the systolic pipeline; the timing core uses it as + `finish = prev.finish + cycle - overlapped` (Core.cc). 
+ Derived exactly as the legacy path does + (tog_generator.generate_tile_graph): + type 0 (VectorCompute) -> 0 + type 1 (MatmulCompute) -> max(cycle - x_offset, 0) + type 2 (MatmulPreload) -> max(cycle - w_offset, 0) + +This module only *builds/serializes* the table from a cycle_list; obtaining the +cycle_list reuses the existing sample-mode + gem5 path (wired in P3 task 5). The +`tile_id` order matches build_skeleton's `compute_nodes` order, which matches the +legacy TOG, so the same sampling keys both paths. + +Requires the MLIR Python bindings (to read the skeleton's togsim.compute ops). +""" + +import json + +from . import togsim_ops as ts +from ._mlir_util import walk_ops +from .build_tog import ( + ir, + VECTOR_COMPUTE, + MATMUL_COMPUTE, # noqa: F401 (documents the type enum used by the formula) + MATMUL_PRELOAD, +) + + +def overlapping_cycle(cycle, compute_type, x_offset, w_offset): + """Hideable (pipeline-overlapped) portion of `cycle`. Mirrors + tog_generator.generate_tile_graph.""" + if compute_type <= VECTOR_COMPUTE: # VectorCompute: no systolic overlap + return 0 + offset = w_offset if compute_type == MATMUL_PRELOAD else x_offset + return max(int(cycle) - int(offset), 0) + + +def _compute_types(skeleton_module): + """tile_id-ordered list of compute_type ints, from the skeleton's + togsim.compute ops.""" + items = [] + for op in walk_ops(skeleton_module.body): + if op.operation.name != ts.COMPUTE: + continue + tid = ir.IntegerAttr(op.operation.attributes[ts.ATTR_TILE_ID]).value + ct = ir.IntegerAttr(op.operation.attributes[ts.ATTR_COMPUTE_TYPE]).value + items.append((tid, ct)) + items.sort() + return [t for _, t in items] + + +def build_cycle_table(skeleton_module, cycle_list, x_offset, w_offset): + """Return `[(cycle, overlapping_cycle), ...]` indexed by tile_id. + + `cycle_list` is the per-tile gem5 measurement (compute_nodes order == + tile_id order). 
`x_offset`/`w_offset` are the systolic-fill offsets the + legacy path computes from the vector-lane size / loop size.""" + types = _compute_types(skeleton_module) + if len(cycle_list) != len(types): + raise ValueError( + "cycle_list (%d) does not match #compute tiles (%d)" + % (len(cycle_list), len(types))) + return [(int(c), overlapping_cycle(c, t, x_offset, w_offset)) + for c, t in zip(cycle_list, types)] + + +def dump_cycle_table(table, path, x_offset=None, w_offset=None): + """Serialize the table as a sidecar JSON next to the trace `.so`. The P3 C6 + loader reads it and sets compute_cycle + overlapping_cycle on each emitted + Instruction.""" + with open(path, "w") as fh: + json.dump({"x_offset": x_offset, "w_offset": w_offset, + "table": [list(e) for e in table]}, fh) + return path + + +def load_cycle_table(path): + with open(path) as fh: + return json.load(fh) + + +def dump_cycle_table_tsv(table, path): + """Plain tab-separated `cycle` and `overlapping` values, one pair per line, in tile_id order -- the trivial + format the C++ `--cycle_table` loader (main.cc, P3 trace pipeline) reads with + ifstream (no JSON dependency in TOGSim).""" + with open(path, "w") as fh: + for cycle, overlapping in table: + fh.write("%d\t%d\n" % (int(cycle), int(overlapping))) + return path diff --git a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py index c0e82b66..10b2edfb 100644 --- a/PyTorchSimFrontend/mlir/passes/decompose_transfer.py +++ b/PyTorchSimFrontend/mlir/passes/decompose_transfer.py @@ -32,13 +32,7 @@ OP_NAME = "togsim.transfer" MARKERS = (OP_NAME,) - -def _iter_ops(block): - for op in list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) +from ._mlir_util import walk_ops def _int_array(attr): @@ -92,7 +86,7 @@ def run(module, vectorlane=128, **_): targets = [] for region in module.operation.regions: for b in region.blocks: - for op in _iter_ops(b): + for op in walk_ops(b): if 
op.operation.name == OP_NAME: targets.append(op.operation) diff --git a/PyTorchSimFrontend/mlir/passes/dep_analysis.py b/PyTorchSimFrontend/mlir/passes/dep_analysis.py new file mode 100644 index 00000000..bc53bfc9 --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/dep_analysis.py @@ -0,0 +1,194 @@ +"""dep_analysis.py -- dependency-edge analysis for the C++ trace pipeline (P3, sec 10). + +The current TOG pass does NO dependency analysis (it emits a lexical loop tree + +runtime tags). This module derives the producer->consumer edges that the explicit +dataflow trace needs, from two sources available on the post-vcix IR (before +build_skeleton collapses the compute regions): + + 1. SRAM access: each DMA/compute's read/write SRAM buffer(s), recovered by + following SSA (a vcix.iv's input vector -> its vector.transfer_read -> the + memref -> @global), and the DMA's spad operand. Edge: a reader depends on + the last node that wrote the same buffer. + 2. vcix preload/matmul pairing: a matmul (vcix opcode 0) consumes the weights a + preceding preload (opcode 1) loaded into the systolic array -- an SA-internal + dependency NOT visible as a memref access, so it comes from the opcode order. + +This is a node-level analysis (one node per build_tog compute/DMA node); the loops +replay the nodes, so loop-carried edges (the Y_spad accumulator) are materialized +per iteration downstream. First cut: buffer granularity (slot-level value matching +is a later refinement). Output is an edge list for validation / to drive emit. +""" +import sys +import os + +from .build_tog import TogBuilder, ir, _reset_ids +from . import build_skeleton as _bs + + +def _global_of(memref_val): + """memref SSA value -> @global symbol name (e.g. 
'X_spad'), or None.""" + owner = memref_val.owner + op = owner if isinstance(owner, ir.Operation) else getattr(owner, "operation", None) + if op is None: + return None + if op.name == "memref.get_global": + return str(op.attributes["name"]).strip('@" ') + # walk through view-like ops (subview/cast) to their source + if op.operands: + try: + return _global_of(op.operands[0]) + except Exception: + return None + return None + + +def _read_buffers_of_compute(cn): + """SRAM buffers a compute node reads: (a) each vcix.iv input traced to its + vector.transfer_read source (activations/weights streamed into the SA), and + (b) any direct vector.transfer_read in the node (the epilogue's accumulator + read-modify-write of Y_spad).""" + bufs = set() + for op in cn.operations: + if op.name == "vector.transfer_read" and list(op.operands): + b = _global_of(op.operands[0]) + if b: + bufs.add(b) + elif op.name == "vcix.iv" and list(op.operands): + v = op.operands[0] + defop = v.owner if isinstance(v.owner, ir.Operation) else getattr(v.owner, "operation", None) + if defop is not None and defop.name == "vector.transfer_read" and list(defop.operands): + b = _global_of(defop.operands[0]) + if b: + bufs.add(b) + return bufs + + +def _write_buffers_of_compute(cn): + """SRAM buffers a compute node writes: vector.transfer_write / vector_store target.""" + bufs = set() + for op in cn.operations: + if op.name in ("vector.transfer_write", "affine.vector_store", "vector.store"): + # target memref is the last memref operand + for v in op.operands: + try: + if ir.MemRefType.isinstance(v.type): + b = _global_of(v) + if b: + bufs.add(b) + except Exception: + pass + return bufs + + +def _dma_buffer(builder, dma_node): + """The SRAM spad buffer a DMA touches (dst for load, src for store).""" + try: + f = builder._dma_start_fields(dma_node.op) + except Exception: + return None + val = f["dst"] if not dma_node.is_write else f["src"] + return _global_of(val) + + +# Virtual buffer for the systolic-array 
weight registers: a preload writes it, +# the following matmul reads it. This folds the SA-internal preload->matmul +# dependency (not a memref access) into the uniform "last-writer per buffer" rule. +SA_WEIGHTS = "__SA_WEIGHTS__" + + +def compute_buffers(cn): + """(read_buffers, write_buffers) for one compute node, including the virtual + SA_WEIGHTS edge (preload writes it, matmul reads it).""" + reads = set(_read_buffers_of_compute(cn)) + writes = set(_write_buffers_of_compute(cn)) + if cn.compute_type == 1: # MATMUL consumes the preloaded weights + reads.add(SA_WEIGHTS) + elif cn.compute_type == 2: # PRELOAD loads them + writes.add(SA_WEIGHTS) + return reads, writes + + +def analyze(module): + """Return (nodes, edges). nodes: list of dicts; edges: list of (consumer_idx, + producer_idx, reason).""" + _reset_ids() + builder = TogBuilder() + _bs._build(module, builder) + + nodes = [] + # DMA nodes only (the map also contains TOGDMAWaitNode; keep real DMAs). + dma_nodes = [dn for dn in dict.fromkeys(_bs._collect_dma_nodes(builder).values()) + if hasattr(dn, "is_write")] + for dn in dma_nodes: + buf = _dma_buffer(builder, dn) + nodes.append({ + "kind": "STORE" if dn.is_write else "LOAD", + "buf": buf, "arg": str(dn.base_addr), + "reads": {buf} if dn.is_write else set(), + "writes": {buf} if not dn.is_write else set(), + "node": dn, + }) + for cn in builder.compute_nodes: + if not cn.operations: + continue + ct = {0: "VECTOR", 1: "MATMUL", 2: "PRELOAD"}.get(cn.compute_type, f"c{cn.compute_type}") + nodes.append({ + "kind": ct, + "reads": _read_buffers_of_compute(cn), + "writes": _write_buffers_of_compute(cn), + "node": cn, + "compute_type": cn.compute_type, + }) + + # Order nodes by program position (last-writer needs program order: e.g. the + # store reads Y_spad written by the matmul, which lexically precedes it). 
+ pos = {} + idx = [0] + def _index(op): + pos[op] = idx[0]; idx[0] += 1 + for r in op.regions: + for b in r.blocks: + for o in b.operations: + _index(o) + _index(module.operation) + def _key(n): + node = n["node"] + op = getattr(node, "op", None) or (node.operations[0] if getattr(node, "operations", None) else None) + return pos.get(op, 1 << 30) + nodes.sort(key=_key) + + # Edges: (1) buffer last-writer, (2) preload->matmul. + edges = [] + last_writer = {} # buffer -> node idx + prev_preload = None + for i, n in enumerate(nodes): + for b in sorted(n["reads"]): + if b in last_writer: + edges.append((i, last_writer[b], f"reads {b}")) + if n["kind"] == "MATMUL" and prev_preload is not None: + edges.append((i, prev_preload, "uses preloaded weights (vcix op1->op0)")) + for b in n["writes"]: + last_writer[b] = i + if n["kind"] == "PRELOAD": + prev_preload = i + return nodes, edges + + +def _main(): + path = sys.argv[1] + ctx = ir.Context(); ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(path).read(), ctx) + nodes, edges = analyze(module) + print("=== nodes ===") + for i, n in enumerate(nodes): + r = ",".join(sorted(n["reads"])) or "-" + w = ",".join(sorted(n["writes"])) or "-" + print(f" #{i:<2} {n['kind']:<8} reads[{r}] writes[{w}]") + print("=== edges (consumer -> producer) ===") + for c, p, why in edges: + print(f" #{c} ({nodes[c]['kind']}) -> #{p} ({nodes[p]['kind']}) [{why}]") + + +if __name__ == "__main__": + _main() diff --git a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py index 3f583ef2..f1872dca 100644 --- a/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py +++ b/PyTorchSimFrontend/mlir/passes/dma_fine_grained.py @@ -30,6 +30,8 @@ import mlir.ir as ir # noqa: E402 +from ._mlir_util import walk_ops, attr_i64_array + MARKERS = ("subtile_size",) # only subtile DMAs are split MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3 @@ -54,12 +56,6 @@ def _const_int(value, default=-1): 
return default -def _int_array_attr(op, key): - if key not in op.attributes: - return [] - return [ir.IntegerAttr(a).value for a in ir.ArrayAttr(op.attributes[key])] - - def _is_block_arg(v): return isinstance(v, ir.BlockArgument) @@ -106,13 +102,13 @@ def tile_shape(self): return list(mt.shape) def subtile_size(self): - return _int_array_attr(self.op, "subtile_size") + return attr_i64_array(self.op, "subtile_size", default=[]) def sram_stride(self): - return _int_array_attr(self.op, "sram_stride") + return attr_i64_array(self.op, "sram_stride", default=[]) def dram_stride(self): - return _int_array_attr(self.op, "dram_stride") + return attr_i64_array(self.op, "dram_stride", default=[]) def is_async(self): a = self.op.attributes @@ -244,6 +240,27 @@ def _const_index(v, ip): ir.IntegerAttr.get(ir.IndexType.get(), v), ip=ip).result +def _fresh_tag(dma): + """Give this DMA a fresh tag memref.alloc right BEFORE the (pre-split) coarse + dma_start, and rewire every use of the old tag -- the dma_start re-emitted + below AND its dma_wait -- to it. The coarse dma sits at the reduction-loop body + level (it has not been wrapped in a subtile load nest yet), so the alloc there + dominates both the load nest fine-grained is about to build and the sibling + wait nest. Each reduction iteration thus allocates its own tag -> successive + iterations are distinct (multi-tile-K / conv) and the per-iteration tag + semantics is in the IR, not reconstructed downstream. Old alloc becomes dead.""" + old = dma.tag + new_tag = ir.Operation.create("memref.alloc", results=[old.type], + operands=[], ip=ir.InsertionPoint(dma.op)).results[0] + old.replace_all_uses_with(new_tag) + dma.tag = new_tag + # the old (func-entry, per-tensor unique) alloc is now dead -- erase it. 
+ try: + old.owner.erase() + except Exception: + pass + + # --------------------------------------------------------------------------- # Loop-nest construction # --------------------------------------------------------------------------- @@ -293,20 +310,12 @@ def _reaches(value, target): # --------------------------------------------------------------------------- # Pass driver # --------------------------------------------------------------------------- -def _iter_ops(block): - for op in list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) - - def _run_func(func, vectorlane): from mlir.dialects import linalg # First matmul only. matmul = None dmas = [] - for op in _iter_ops(func.regions[0].blocks[0]): + for op in walk_ops(func.regions[0].blocks[0]): name = op.operation.name if name == "linalg.matmul" and matmul is None: matmul = op @@ -363,6 +372,12 @@ def _run_func(func, vectorlane): for d, f in enumerate(fuse["w_to_fused"]): bounds[f] = w_counts[d] + # Give each load a fresh per-iteration tag alloc just before its coarse dma + # (rewiring its dma_wait via the old tag's uses), so the tag is distinct per + # reduction iteration -- positioned to match the per-iteration tag semantics. + _fresh_tag(mvin_input) + _fresh_tag(mvin_weight) + # Insert the fused nest at the weight DMA (the later of the two): both DMAs' # original DRAM base indices (src_idx[0], computed in the enclosing loops) must # dominate the nest. 
Codegen emits input before weight, matching the C++ pass diff --git a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py index f5b841bb..998a6db5 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py +++ b/PyTorchSimFrontend/mlir/passes/lower_dma_to_gemmini.py @@ -22,6 +22,8 @@ WAIT_NAME = "memref.dma_wait" MARKERS = (OP_NAME, WAIT_NAME) +from ._mlir_util import attr_i64_array + # func7 instruction codes (CustomDMAAttribute.h) CONFIG, CONFIG2, CONFIG3, CONFIG4 = 0, 4, 5, 6 MVIN, MVIN2, MVIN3, MVOUT = 2, 1, 14, 3 @@ -124,8 +126,8 @@ def elem_addr_i64(memref_val, indices, mtype, elem_bytes): tile_shape = _subtile(op) if tile_shape is None: tile_shape = list(dst_ty.shape) if is_mvin else list(src_ty.shape) - dram_strides = _int_array(op, "dram_stride") - spad_strides = _int_array(op, "sram_stride") + dram_strides = attr_i64_array(op, "dram_stride") + spad_strides = attr_i64_array(op, "sram_stride") assert len(tile_shape) == len(dram_strides) == len(spad_strides), \ f"shape/stride rank mismatch: {tile_shape} {dram_strides} {spad_strides}" @@ -180,11 +182,6 @@ def _subtile(op): return [IntegerAttr(a).value for a in ArrayAttr(op.attributes["subtile_size"])] -def _int_array(op, name): - from mlir.ir import ArrayAttr, IntegerAttr - return [IntegerAttr(a).value for a in ArrayAttr(op.attributes[name])] - - def _elem_bytes(elem_type): from mlir.ir import IntegerType, FloatType bits = (IntegerType(elem_type).width if IntegerType.isinstance(elem_type) diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py new file mode 100644 index 00000000..6ade7442 --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/lower_to_emitc.py @@ -0,0 +1,613 @@ +"""lower_to_emitc pass (C4): skeleton+API MLIR -> EmitC -> C++ -> trace `.so`. + +Second stage of the C++ trace pipeline (docs/design/togsim_cpp_trace.md, sec +5-7). 
Takes the skeleton+API module from `build_skeleton` (loop nest + +`togsim.*` ops) and produces an EmitC module whose single entry function + + extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n) + +mirrors the loop skeleton, with every `togsim.*` op as an `emitc.call_opaque` +to the matching `togsim_runtime.h` free function (`togsim_ops.EMITC_CALLEE`). +`mlir-translate --mlir-to-cpp` renders it to C++, compiled to a `.so` that +exports `togsim_kernel` and leaves `togsim_dma/wait/compute/signal` undefined for +the TOGSim loader to resolve at `dlopen`. + +How the lowering is done -- it drives the *upstream* EmitC conversion passes and +adds only the glue they cannot do: + + 1. (python) Rewrite the unregistered `togsim.*` ops to `emitc.call_opaque`. + Unregistered ops have no registered conversion patterns, so this must be a + custom rewrite (design sec 8). Also rewrite the kernel's signature to the + ABI form (drop the memref tensor args -- the trace producer never touches + tensor data; base addresses are deferred to P3) and drop the aux + globals / wrapper func. + 2. (upstream passes, in-process PassManager) + func.func(lower-affine) -> convert-scf-to-emitc + -> convert-arith-to-emitc -> convert-func-to-emitc + This is the EmitC infrastructure: it lowers the affine/scf loop nest to + `emitc.for`, the index/arith (loop bounds, and in P3 the address + arithmetic) to EmitC, and the func to `emitc.func`. + 3. (python) Two small fixups the passes leave behind in this LLVM 20 build: + * `convert-scf-to-emitc` emits `emitc.for` with `index`-typed bounds, so + `convert-arith-to-emitc` (which makes constants `!emitc.size_t`) leaves + `builtin.unrealized_conversion_cast` on the bounds that nothing folds + and `mlir-to-cpp` cannot print (design sec 8 "EmitC coverage" risk). + `_fold_for_bound_casts` rewrites those casts away. + * add the `extern "C"` specifier so `dlsym` finds the entry unmangled. + +Requires the MLIR Python bindings (incl. 
`mlir.passmanager`); the .cpp/.so +steps additionally require `mlir-translate` (TORCHSIM_LLVM_PATH) and a host C++ +compiler. +""" + +import os +import subprocess + +from mlir.passmanager import PassManager + +from . import togsim_ops as ts +from ._mlir_util import walk_ops, i32, i64, attr_int, attr_i64_array +from .build_tog import ir, _find_kernel + +#: emitted entry symbol (== ts.ENTRY_SYMBOL == "togsim_kernel"). +ENTRY = ts.ENTRY_SYMBOL + +#: EmitC type of the opaque EmitCtx* threaded through every call. +CTX_TYPE = '!emitc.ptr>' + +#: upstream EmitC conversion pipeline (the infrastructure this pass drives). +_PIPELINE = ("builtin.module(" + "func.func(lower-affine)," + "convert-scf-to-emitc," + "convert-arith-to-emitc," + "convert-func-to-emitc)") + +#: prepended to the mlir-to-cpp output; pulls in size_t/intN_t and the ABI. +_PRELUDE = ( + "#include \n" + "#include \n" + "using std::size_t;\n" + '#include "togsim_runtime.h"\n' +) + + +# --------------------------------------------------------------------------- +# attribute builders / readers +# --------------------------------------------------------------------------- +def _idx(v): + return ir.IntegerAttr.get(ir.IndexType.get(), int(v)) + + +def _opaque(ctx, text): + return ir.Attribute.parse('#emitc.opaque<"%s">' % text, ctx) + + +def _arr(ctx, vals): + """A C compound-literal `(const int64_t[]){...}` arg, or `nullptr` if empty + (the call site decays it to a `const int64_t*`).""" + vals = list(vals) + if not vals: + return _opaque(ctx, "nullptr") + return _opaque(ctx, "(const int64_t[]){%s}" % ", ".join(str(int(v)) for v in vals)) + + +def _attr_bool(op, key): + return 1 if ir.BoolAttr(op.operation.attributes[key]).value else 0 + + +# --------------------------------------------------------------------------- +# step 1: rewrite signature + togsim.* ops (the unregistered-op glue) +# --------------------------------------------------------------------------- +def _strip_aux(module): + """Erase memref.global 
decls and every func except @kernel (the wrapper).""" + victims = [] + for op in module.body.operations: + name = op.operation.name + if name == "memref.global": + victims.append(op) + elif name == "func.func": + if ir.StringAttr(op.operation.attributes["sym_name"]).value != "kernel": + victims.append(op) + for op in victims: + op.operation.erase() + + +def _rewrite_signature(kernel, ctx): + """Replace @kernel's memref tensor args with the ABI args + (EmitCtx*, int64_t* shape_args, int32_t n) and rename it to togsim_kernel. + Returns the ctx Value.""" + block = kernel.regions[0].blocks[0] + for arg in block.arguments: + if len(list(arg.uses)) > 0: + raise ValueError( + "kernel arg still used after build_skeleton; cannot drop it " + "(expected the DCE to have removed all tensor-data ops)") + # erase existing (memref) args high-to-low, then append the ABI args. + for i in reversed(range(len(block.arguments))): + block.erase_argument(i) + ptr = ir.Type.parse(CTX_TYPE, ctx) + i64ptr = ir.Type.parse("!emitc.ptr", ctx) + i32 = ir.IntegerType.get_signless(32) + loc = ir.Location.unknown(ctx) + block.add_argument(ptr, loc) + block.add_argument(i64ptr, loc) + block.add_argument(i32, loc) + kernel.operation.attributes["function_type"] = ir.TypeAttr.get( + ir.FunctionType.get([ptr, i64ptr, i32], [])) + kernel.operation.attributes["sym_name"] = ir.StringAttr.get(ENTRY) + return block.arguments[0] + + +def _call(ctx, ctx_val, op, callee, arg_attrs): + """Insert emitc.call_opaque (ctx) {args=[0:index, ...]} before `op`. 
+ The leading `0 : index` references operand 0 (ctx); other entries are + literal C args (integer attr -> literal, #emitc.opaque -> verbatim).""" + ir.Operation.create( + "emitc.call_opaque", results=[], operands=[ctx_val], + attributes={"callee": ir.StringAttr.get(callee), + "args": ir.ArrayAttr.get([_idx(0)] + arg_attrs)}, + loc=ir.Location.unknown(ctx), ip=ir.InsertionPoint(op)) + + +def _innermost_outer_loop(block): + """Deepest `affine.for {outer_loop=true}` (the PARALLEL/ACCUMULATION + boundary). Returns the op or None if the kernel has no parallel loop.""" + found = [None] + + def is_outer(op): + a = op.operation.attributes + return "outer_loop" in a and ir.BoolAttr(a["outer_loop"]).value + + def walk(b): + for op in b.operations: + if op.operation.name == "affine.for" and is_outer(op): + found[0] = op # nested outer loops overwrite -> deepest wins + for r in op.operation.regions: + for bb in r.blocks: + walk(bb) + + walk(block) + return found[0] + + +def _is_outer(forop): + a = forop.operation.attributes + return "outer_loop" in a and ir.BoolAttr(a["outer_loop"]).value + + +def _parallel_loop_chain(block): + """The nested chain of `affine.for {outer_loop}` from `block` inward (one + work-item's parallel indices). 
Empty if the kernel has no parallel loop.""" + chain = [] + cur = block + while True: + nxt = None + for op in cur.operations: + if op.operation.name == "affine.for" and _is_outer(op): + nxt = op + break + if nxt is None: + break + chain.append(nxt) + cur = nxt.operation.regions[0].blocks[0] + return chain + + +def _const_op(value): + """The defining arith/emitc constant Operation if `value` is a constant + result, else None (block args / other ops).""" + owner = value.owner + if isinstance(owner, ir.Block): + return None + return owner if owner.name in ("arith.constant", "emitc.constant") else None + + +def _outline_work_item(ctx, kernel, ctx_val): + """Outline the innermost parallel work-item body into a uniform + `togsim_kernel_tile(ctx, iv, n)` func, replacing it with a + `togsim_dispatch(ctx, togsim_kernel_tile, iv, n)` call (sec 9.3). The + work-item SCOPE becomes the function body; the runtime wrapper owns the + core-alloc + the TILE_BEGIN/TILE_END boundary (a decorator). One uniform tile + signature -> a single general dispatcher serves every kernel. + + Runs after `_rewrite_togsim_ops`, so the moved body holds emitc.call_opaque + (not togsim.* ops). 
The only values captured from outside the body are ctx, + the enclosing parallel induction vars, and constants -- threaded via the iv + array (parallel IVs) / cloned (constants); anything else is unsupported + (dynamic shape -> P4).""" + kblk = kernel.regions[0].blocks[0] + chain = _parallel_loop_chain(kblk) + if chain: + L = chain[-1] + Lbody = L.operation.regions[0].blocks[0] + ivs = [c.operation.regions[0].blocks[0].arguments[0] for c in chain] + else: # no parallel loop -> the whole kernel body is one work-item + L = None + Lbody = kblk + ivs = [] + + i64 = ir.IntegerType.get_signless(64) + i32 = ir.IntegerType.get_signless(32) + idxty = ir.IndexType.get() + ctxty = ir.Type.parse(CTX_TYPE, ctx) + i64ptr = ir.Type.parse("!emitc.ptr", ctx) + loc = ir.Location.unknown(ctx) + + # --- the outlined tile function (before the kernel so C defines it first) --- + tile = ir.Operation.create( + "func.func", results=[], regions=1, + attributes={ + "function_type": ir.TypeAttr.get(ir.FunctionType.get([ctxty, i64ptr, i32], [])), + "sym_name": ir.StringAttr.get(ts.TILE_SYMBOL), + "sym_visibility": ir.StringAttr.get("private")}, + loc=loc, ip=ir.InsertionPoint(kernel)) + with loc: + tblk = tile.regions[0].blocks.append(ctxty, i64ptr, i32) + ctx2, iv2, _n2 = tblk.arguments + with ir.InsertionPoint(tblk): + tret = ir.Operation.create("func.return", results=[], operands=[], loc=loc) + + # in the tile fn: recover each parallel index = index_cast(iv[k]). + idx_vals = [] + with ir.InsertionPoint(tret): + for k in range(len(ivs)): + kc = ir.Operation.create("emitc.constant", results=[i64], + attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0] + elem = ir.Operation.create("emitc.subscript", results=[i64], + operands=[iv2, kc], loc=loc).results[0] + idx_vals.append(ir.Operation.create("arith.index_cast", results=[idxty], + operands=[elem], loc=loc).results[0]) + + # move the work-item body into the tile fn (terminators stay behind). 
+ for op in [o for o in Lbody.operations + if o.operation.name not in ("affine.yield", "func.return")]: + op.operation.move_before(tret) + + # remap captures (Value `==` is identity): ctx -> ctx2, each parallel IV -> + # its index_cast, each external constant -> a clone inside the tile fn. A + # constant defined inside the tile fn (moved/read) is internal -> left alone. + caps = [(ctx_val, ctx2)] + list(zip(ivs, idx_vals)) + internal_consts = [] + def _collect_internal(block): + for op in block.operations: + c = _const_op(op.operation.results[0]) if len(op.operation.results) == 1 else None + if c is not None: + internal_consts.append(op.operation.results[0]) + for rg in op.operation.regions: + for b in rg.blocks: + _collect_internal(b) + _collect_internal(tblk) + const_clones = [] + ext_consts = [] + def _find_ext_consts(block): + for op in block.operations: + for opnd in op.operation.operands: + if _const_op(opnd) is None: + continue + if any(opnd == ic for ic in internal_consts): + continue + if any(opnd == e for e in ext_consts): + continue + ext_consts.append(opnd) + for rg in op.operation.regions: + for b in rg.blocks: + _find_ext_consts(b) + _find_ext_consts(tblk) + top = ir.InsertionPoint(tblk.operations[0]) + for e in ext_consts: + c = _const_op(e) + clone = ir.Operation.create(c.name, results=[e.type], + attributes={"value": c.attributes["value"]}, loc=loc, ip=top).results[0] + const_clones.append((e, clone)) + + allcaps = caps + const_clones + def _remap(block): + for op in block.operations: + for i in range(len(op.operation.operands)): + cur = op.operation.operands[i] + for orig, new in allcaps: + if cur == orig: + op.operation.operands[i] = new + break + for rg in op.operation.regions: + for b in rg.blocks: + _remap(b) + _remap(tblk) + + # --- the dispatcher: marshal the IVs and hand the tile fn to togsim_dispatch --- + term = [o for o in Lbody.operations + if o.operation.name in ("affine.yield", "func.return")][0] + fn_ref = _opaque(ctx, 
ts.TILE_SYMBOL) # function name -> verbatim pointer in C + with ir.InsertionPoint(term): + if ivs: + arrty = ir.Type.parse("!emitc.array<%dxi64>" % len(ivs), ctx) + arr = ir.Operation.create("emitc.variable", results=[arrty], + attributes={"value": _opaque(ctx, "")}, loc=loc).results[0] + for k, iv in enumerate(ivs): + kc = ir.Operation.create("emitc.constant", results=[i64], + attributes={"value": ir.IntegerAttr.get(i64, k)}, loc=loc).results[0] + v64 = ir.Operation.create("arith.index_cast", results=[i64], + operands=[iv], loc=loc).results[0] + sub = ir.Operation.create("emitc.subscript", results=[i64], + operands=[arr, kc], loc=loc).results[0] + # emitc.assign operands are (lvalue dest, value). + ir.Operation.create("emitc.assign", results=[], operands=[sub, v64], loc=loc) + ir.Operation.create( + "emitc.call_opaque", results=[], operands=[ctx_val, arr], + attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE), + "args": ir.ArrayAttr.get( + [_idx(0), fn_ref, _idx(1), ir.IntegerAttr.get(i32, len(ivs))])}, + loc=loc) + else: + ir.Operation.create( + "emitc.call_opaque", results=[], operands=[ctx_val], + attributes={"callee": ir.StringAttr.get(ts.DISPATCH_CALLEE), + "args": ir.ArrayAttr.get( + [_idx(0), fn_ref, _opaque(ctx, "nullptr"), ir.IntegerAttr.get(i32, 0)])}, + loc=loc) + + +def _rewrite_togsim_ops(ctx, kernel, ctx_val): + block = kernel.regions[0].blocks[0] + victims = [] + for op in walk_ops(block): + name = op.operation.name + ipo = ir.InsertionPoint(op) + if name == ts.DMA: + dims = attr_i64_array(op, ts.ATTR_DIMS) + # The DRAM element offset is the togsim.dma operand (the original + # affine index, kept live by build_skeleton); pass it as a call + # operand so convert-arith-to-emitc lowers the address arithmetic + # into the producer (P3 approach A). The runtime adds the tensor base. + # Operands carried by build_skeleton: [dram_index, tag_index] (each + # optional). 
Pass each as a call operand so convert-arith-to-emitc + # lowers it; reference it from `args` by its operand position. offset + # -> DRAM byte address (runtime adds the tensor base); tag_slot -> the + # SRAM tile slot (runtime uses it for double-buffer/SRAM-capacity). + ins = list(op.operation.operands) + dram_operand = ins[0] if len(ins) >= 1 else None + tag_operand = ins[1] if len(ins) >= 2 else None + operands = [ctx_val] + offset_arg = i64(0) + tag_arg = i64(0) + if dram_operand is not None: + operands.append(dram_operand) + offset_arg = _idx(len(operands) - 1) + if tag_operand is not None: + operands.append(tag_operand) + tag_arg = _idx(len(operands) - 1) + args = [_idx(0), + i32(attr_int(op, ts.ATTR_DIR)), + i32(attr_int(op, ts.ATTR_ARG_ID)), + offset_arg, + i32(len(dims)), + _arr(ctx, dims), + _arr(ctx, attr_i64_array(op, ts.ATTR_STRIDES)), + i32(attr_int(op, ts.ATTR_ELEM_BITS)), + i32(_attr_bool(op, ts.ATTR_IS_ASYNC)), + i32(attr_int(op, ts.ATTR_TAG_ID)), + tag_arg] + _rb = attr_i64_array(op, ts.ATTR_READ_BUFS) + _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS) + args += [_arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))] + # togsim_dma is void: the dma is paired with its barrier by the runtime + # (tag_id, tag_slot), not a returned handle. + ir.Operation.create( + "emitc.call_opaque", results=[], operands=operands, + attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.DMA]), + "args": ir.ArrayAttr.get(args)}, + loc=ir.Location.unknown(ctx), ip=ipo) + victims.append(op) + elif name == ts.MEMORY_BAR: + # explicit async-DMA sync (the original dma_wait) -> + # togsim_memory_barrier(ctx, tag_id, tag_slot, write_bufs). The tag + # index operand (if any) is the runtime tag slot. 
+ ins = list(op.operation.operands) + operands = [ctx_val] + tag_arg = i64(0) + if ins: + operands.append(ins[0]) + tag_arg = _idx(len(operands) - 1) + _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS) + ir.Operation.create( + "emitc.call_opaque", results=[], operands=operands, + attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.MEMORY_BAR]), + "args": ir.ArrayAttr.get( + [_idx(0), i32(attr_int(op, ts.ATTR_TAG_ID)), tag_arg, + _arr(ctx, _wb), i32(len(_wb))])}, + loc=ir.Location.unknown(ctx), ip=ipo) + victims.append(op) + elif name == ts.COMPUTE: + # skeleton compute carries no dims (cost is keyed by tile_id) -> 0/null. + _rb = attr_i64_array(op, ts.ATTR_READ_BUFS) + _wb = attr_i64_array(op, ts.ATTR_WRITE_BUFS) + _call(ctx, ctx_val, op, ts.EMITC_CALLEE[ts.COMPUTE], + [i64(attr_int(op, ts.ATTR_TILE_ID)), + i32(attr_int(op, ts.ATTR_COMPUTE_TYPE)), + i32(0), _opaque(ctx, "nullptr"), + _arr(ctx, _rb), i32(len(_rb)), _arr(ctx, _wb), i32(len(_wb))]) + victims.append(op) + elif name == ts.COMPUTE_BAR: + # explicit compute fence -> togsim_compute_barrier(ctx) (sec 10.7). + ir.Operation.create( + "emitc.call_opaque", results=[], operands=[ctx_val], + attributes={"callee": ir.StringAttr.get(ts.EMITC_CALLEE[ts.COMPUTE_BAR]), + "args": ir.ArrayAttr.get([_idx(0)])}, + loc=ir.Location.unknown(ctx), ip=ipo) + victims.append(op) + for op in victims: + op.operation.erase() + + +# --------------------------------------------------------------------------- +# step 3: post-conversion fixups +# --------------------------------------------------------------------------- +def _retype_for_to_size_t(module): + """Make every `emitc.for` use `!emitc.size_t` bounds + induction variable, + then drop the `index`<->`!emitc.size_t` `unrealized_conversion_cast` ops that + `convert-scf-to-emitc` / `convert-arith-to-emitc` leave behind (mlir-to-cpp + cannot print them; --reconcile cannot fold them). 
+ + `emitc.for` accepts `size_t` bounds with the explicit type, and a `size_t` IV + makes the lowered address arithmetic (`convert-arith-to-emitc`, which works + in `size_t`) cast-free. So: set each IV to size_t, then for every + index<->size_t cast replace its result with its source (every consumer here + -- `emitc.for` bounds, `emitc.call_opaque` operands, `emitc` arith -- accepts + either, and after the IV retype each such cast bridges equal types).""" + idx = ir.IndexType.get() + st = ir.Type.parse("!emitc.size_t", module.context) + + for op in list(walk_ops(module.body)): + if op.operation.name == "emitc.for": + op.operation.regions[0].blocks[0].arguments[0].set_type(st) + + dead = [] + for op in list(walk_ops(module.body)): + if op.operation.name != "builtin.unrealized_conversion_cast": + continue + res = op.results[0] + src = list(op.operation.operands)[0] + # idx<->size_t bridges (incl. the size_t->size_t identities left after + # the IV retype): every consumer here accepts either, so fold to source. + if src.type in (idx, st) and res.type in (idx, st): + res.replace_all_uses_with(src) + dead.append(op) + for d in dead: + try: + d.operation.erase() + except Exception: + pass + + +def _add_extern_c(module, ctx): + for op in module.body.operations: + if (op.operation.name == "emitc.func" + and ir.StringAttr(op.operation.attributes["sym_name"]).value == ENTRY): + op.operation.attributes["specifiers"] = ir.ArrayAttr.get( + [ir.StringAttr.get('extern "C"')]) + return + raise ValueError("emitc.func @%s not found after conversion" % ENTRY) + + +# --------------------------------------------------------------------------- +# driver +# --------------------------------------------------------------------------- +def lower_to_emitc(skeleton_module): + """Lower a skeleton+API module (in place) to an EmitC module with the + `togsim_kernel` entry function. 
Returns the same module.""" + ctx = skeleton_module.context + kernel = _find_kernel(skeleton_module) + if kernel is None: + raise ValueError("no @kernel found in skeleton module") + + _strip_aux(skeleton_module) + ctx_val = _rewrite_signature(kernel, ctx) + _rewrite_togsim_ops(ctx, kernel, ctx_val) # togsim.* -> emitc.call_opaque + _outline_work_item(ctx, kernel, ctx_val) # work-item body -> togsim_kernel_tile + dispatch + + PassManager.parse(_PIPELINE, ctx).run(skeleton_module.operation) + + _retype_for_to_size_t(skeleton_module) + _add_extern_c(skeleton_module, ctx) + return skeleton_module + + +# --------------------------------------------------------------------------- +# C++ / .so backend +# --------------------------------------------------------------------------- +def _mlir_translate_bin(): + return os.path.join(os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin"), + "mlir-translate") + + +def emitc_to_cpp(emitc_module, mlir_translate=None): + """Render `emitc_module` to C++ source (prelude + mlir-to-cpp body).""" + mlir_translate = mlir_translate or _mlir_translate_bin() + proc = subprocess.run( + [mlir_translate, "--mlir-to-cpp"], + input=str(emitc_module), capture_output=True, text=True) + if proc.returncode != 0: + raise RuntimeError("mlir-translate --mlir-to-cpp failed:\n" + proc.stderr) + return _PRELUDE + proc.stdout + + +def compile_so(cpp_text, so_path, include_dir, cxx=None): + """Compile producer C++ to `so_path`. `include_dir` must hold + togsim_runtime.h. 
togsim_* symbols are left undefined (resolved at dlopen).""" + cxx = cxx or os.environ.get("CXX", "g++") + cpp_path = os.path.splitext(so_path)[0] + ".cpp" + with open(cpp_path, "w") as fh: + fh.write(cpp_text) + proc = subprocess.run( + [cxx, "-shared", "-fPIC", "-std=gnu++17", "-O2", + "-I", include_dir, cpp_path, "-o", so_path], + capture_output=True, text=True) + if proc.returncode != 0: + raise RuntimeError("%s failed:\n%s" % (cxx, proc.stderr)) + return so_path + + +def _default_include_dir(): + root = os.environ.get("TORCHSIM_DIR") + if not root: + root = os.path.dirname(os.path.dirname(os.path.dirname( + os.path.dirname(os.path.abspath(__file__))))) + return os.path.join(root, "TOGSim", "include") + + +def skeleton_to_so(skeleton_module, so_path, include_dir=None): + """skeleton module -> EmitC -> C++ -> compiled trace `.so`. Returns the + EmitC module text (for inspection / caching).""" + emitc = lower_to_emitc(skeleton_module) + cpp = emitc_to_cpp(emitc) + compile_so(cpp, so_path, include_dir or _default_include_dir()) + return str(emitc) + + +def build_trace_so(postvcix_path, so_path, include_dir=None): + """Full P2 path from a post-vcix kernel .mlir to a trace `.so`.""" + from . 
import build_skeleton as bs + + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(postvcix_path).read(), ctx) + bs.build_skeleton(module) + return skeleton_to_so(module, so_path, include_dir) + + +def main(argv): + import argparse + + parser = argparse.ArgumentParser(prog="lower_to_emitc.py") + parser.add_argument("input", help="post-vcix kernel .mlir") + parser.add_argument("--so", required=True, help="output .so path") + parser.add_argument("--include-dir", default=None, + help="dir holding togsim_runtime.h (default: TOGSim/include)") + parser.add_argument("--emit-cpp", default=None, + help="also write the generated C++ here") + parser.add_argument("--emit-mlir", default=None, + help="also write the EmitC MLIR here") + args = parser.parse_args(argv[1:]) + + from . import build_skeleton as bs + ctx = ir.Context() + ctx.allow_unregistered_dialects = True + with ctx: + module = ir.Module.parse(open(args.input).read(), ctx) + bs.build_skeleton(module) + emitc = lower_to_emitc(module) + if args.emit_mlir: + open(args.emit_mlir, "w").write(str(emitc)) + cpp = emitc_to_cpp(emitc) + if args.emit_cpp: + open(args.emit_cpp, "w").write(cpp) + compile_so(cpp, args.so, args.include_dir or _default_include_dir()) + import sys + sys.stderr.write("wrote %s\n" % args.so) + return 0 + + +if __name__ == "__main__": + import sys + sys.exit(main(sys.argv)) diff --git a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py index ac93ebc8..df124d00 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py +++ b/PyTorchSimFrontend/mlir/passes/lower_to_vcix.py @@ -29,6 +29,8 @@ import mlir.ir as ir # noqa: E402 +from ._mlir_util import walk_ops, i32, i64, attr_bool + MARKERS = ("linalg.matmul", "math.exp", "math.erf", "math.tanh", "math.sin", "math.cos") # math op name -> (opcode, imm) for the vcix.v.iv lowering (mirror Math*ToVCIX). 
@@ -80,20 +82,12 @@ def _legalize_vector_type(vt, vlen): return n, ir.VectorType.get([elt_count >> (n - 1)], elt_ty, scalable=[True]) -def _i64(v): - return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), v) - - -def _i32(v): - return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), v) - - def _viv(operand, result_ty, opcode, imm, rvl=None): """Create an unregistered vcix.v.iv (vcix::BinaryImmOp) op at the current IP.""" operands = [operand] if rvl is None else [operand, rvl] return ir.Operation.create( "vcix.v.iv", results=[result_ty], operands=operands, - attributes={"opcode": _i64(opcode), "imm": _i32(imm)}).results[0] + attributes={"opcode": i64(opcode), "imm": i32(imm)}).results[0] def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm): @@ -104,7 +98,7 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm): scalable = legal_ty.scalable rvl = None if scalable: - rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), _i64(9)).result + rvl = arith.ConstantOp(ir.IntegerType.get_signless(64), i64(9)).result if n == 1: return _viv(vec, legal_ty, opcode, imm, rvl) elt_ty = legal_ty.element_type @@ -119,24 +113,16 @@ def _make_sf_vc_v_iv(vec, op_vt, n, legal_ty, opcode, imm): for i in range(total // elt_count): ext = vector.ExtractStridedSliceOp( legal_ty, vec, - ir.ArrayAttr.get([_i64(i * elt_count)]), - ir.ArrayAttr.get([_i64(elt_count)]), - ir.ArrayAttr.get([_i64(1)])).result + ir.ArrayAttr.get([i64(i * elt_count)]), + ir.ArrayAttr.get([i64(elt_count)]), + ir.ArrayAttr.get([i64(1)])).result v = _viv(ext, legal_ty, opcode, imm, rvl) res = vector.InsertStridedSliceOp( - v, res, ir.ArrayAttr.get([_i64(i * elt_count)]), - ir.ArrayAttr.get([_i64(1)])).result + v, res, ir.ArrayAttr.get([i64(i * elt_count)]), + ir.ArrayAttr.get([i64(1)])).result return res -def _iter_ops(block): - for op in list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) - - # 
--------------------------------------------------------------------------- # matmul lowering helpers (mirror MatmulOpLowering) # --------------------------------------------------------------------------- @@ -146,11 +132,6 @@ def _elt_bits(elt_ty): return ir.FloatType(elt_ty).width -def _bool_attr_true(op, key): - a = op.attributes - return key in a and ir.BoolAttr(a[key]).value - - def _enclosing_loops(op): """Walk ancestor ops; return (accumulation, outer, inner) affine.for lists, outermost-first (mirror the C++ insert-at-begin).""" @@ -158,11 +139,11 @@ def _enclosing_loops(op): parent = op.operation.parent while parent is not None: if parent.name == "affine.for": - if _bool_attr_true(parent, "accumulation_loop"): + if attr_bool(parent, "accumulation_loop"): acc.insert(0, parent) - if _bool_attr_true(parent, "outer_loop"): + if attr_bool(parent, "outer_loop"): outer.insert(0, parent) - if _bool_attr_true(parent, "inner_loop"): + if attr_bool(parent, "inner_loop"): inner.insert(0, parent) parent = parent.parent return acc, outer, inner @@ -200,7 +181,7 @@ def _scan_conv_offsets(ow_loop, o_h, k_h, o_w, k_w): """Mirror the heuristic offset scan: find affine.apply(o_h,k_h)/(o_w,k_w) in the o_w loop and read the constant in its map (default 1).""" offset_h = offset_w = 1 - for o in _iter_ops(ow_loop.regions[0].blocks[0]): + for o in walk_ops(ow_loop.regions[0].blocks[0]): if o.operation.name != "affine.apply": continue ops = list(o.operation.operands) @@ -391,7 +372,7 @@ def _root(v): return owner.operands[0] return v rootA, rootB = _root(A), _root(B) - for o in _iter_ops(outer[-1].regions[0].blocks[0]): + for o in walk_ops(outer[-1].regions[0].blocks[0]): if o.operation.name == "affine.vector_store": dest = _root(o.operation.operands[1]) if dest == rootA: @@ -488,6 +469,14 @@ def _root(v): # --- B dma_wait --- nacc = len(acc) acc_ivs = [_loop_iv(l) for l in acc] + # LEGACY behavior: coefficient -1 on each accumulation (reduction) loop var + # is a SENTINEL marking 
"this tag dim is the reduction axis", not an + # arithmetic offset. The legacy TOG path (TileGraphParser.cc) honors it by + # routing those vars to a separate accum tag component and skipping stride + # -1. The C++ trace path does NOT honor it: build_skeleton._strip_accum_terms + # drops these -1 terms so the memory_barrier slot stays subtile-only and + # pairs with its async load. Kept here for byte-identity with the C++ + # -test-pytorchsim-to-vcix pass; remove (do not flag) once legacy retires. bexpr = ir.AffineDimExpr.get(0) * -1 for i in range(1, nacc): bexpr = bexpr + ir.AffineDimExpr.get(i) * -1 @@ -544,6 +533,10 @@ def _root(v): with body_ip: # --- A dma_wait --- + # LEGACY behavior (see the B dma_wait above): the -1 coefficients mark the + # reduction axis for the legacy TOG path; the trace path strips them in + # build_skeleton._strip_accum_terms. Kept for byte-identity with the C++ + # -test-pytorchsim-to-vcix pass; remove once legacy retires. aexpr = ir.AffineDimExpr.get(0) * -1 for i in range(1, nacc): aexpr = aexpr + ir.AffineDimExpr.get(i) * -1 @@ -617,7 +610,7 @@ def run(module, vectorlane=128, vlen=128, **_): mms = [] for region in module.operation.regions: for b in region.blocks: - for o in _iter_ops(b): + for o in walk_ops(b): if o.operation.name == "linalg.matmul": mms.append(o.operation) for o in mms: @@ -625,7 +618,7 @@ def run(module, vectorlane=128, vlen=128, **_): targets = [] for region in module.operation.regions: for b in region.blocks: - for op in _iter_ops(b): + for op in walk_ops(b): if op.operation.name in _MATH_VIV: targets.append(op.operation) for op in targets: diff --git a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py index 76e30cb3..3ed0a394 100644 --- a/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py +++ b/PyTorchSimFrontend/mlir/passes/lower_vlane_idx.py @@ -24,13 +24,7 @@ OP_NAME = "torchsim.vlane_idx" MARKERS = (OP_NAME,) - -def _iter_ops(block): - for op in 
list(block.operations): - yield op - for region in op.operation.regions: - for b in region.blocks: - yield from _iter_ops(b) +from ._mlir_util import walk_ops def run(module, **_): @@ -46,7 +40,7 @@ def run(module, **_): targets = [] for region in module.operation.regions: for b in region.blocks: - for op in _iter_ops(b): + for op in walk_ops(b): if op.operation.name == OP_NAME: targets.append(op.operation) diff --git a/PyTorchSimFrontend/mlir/passes/togsim_ops.py b/PyTorchSimFrontend/mlir/passes/togsim_ops.py new file mode 100644 index 00000000..740a8f2f --- /dev/null +++ b/PyTorchSimFrontend/mlir/passes/togsim_ops.py @@ -0,0 +1,106 @@ +"""Shared vocabulary for the skeleton+API MLIR form (C1). + +The trace pipeline (docs/design/togsim_cpp_trace.md) reduces a kernel's MLIR to +a *loop skeleton + API calls*: native `affine.for`/`scf.for` loops (bounds kept +as-is, symbolic preserved) plus a handful of `togsim.*` ops that stand for the +runtime API. This module is the single source of truth for those op names and +attribute keys, shared by: + + * build_skeleton (C2) -- produces the skeleton+API MLIR, and + * togsim->emitc lowering (C4) -- rewrites each op to an `emitc.call_opaque`. + +The ops are kept *unregistered* (like the existing `togsim.transfer`), so there +is no C++ dialect to register; C4 is a custom rewrite, not a registered +ConversionPass. + +Grammar (each op lowers 1:1 to a `togsim_runtime.h` free function): + + "togsim.dma"(%dram_idx, %tag_idx) { -> togsim_dma(ctx, dir, arg_id, + dir = 0 | 1, # LOAD|STORE offset, ndim, dims, strides, + dims = [..], strides = [..], elem_bits, is_async, + elem_bits = i32, is_async = bool, tag_id, tag_slot, + tag_id = i32, arg_id = i32, read_bufs, write_bufs) + read_bufs = [..], write_bufs = [..] + } : (index, index) -> () + + "togsim.compute"() { -> togsim_compute(ctx, tile_id, + tile_id = i64, compute_type = i32, compute_type, ndim, dims, + read_bufs = [..], write_bufs = [..] 
read_bufs, write_bufs) + } : () -> () + + "togsim.memory_barrier"(%tag_idx) { -> togsim_memory_barrier(ctx, + tag_id = i32, write_bufs = [..] tag_id, tag_slot, write_bufs) + } : (index) -> () + + "togsim.compute_barrier"() : () -> () -> togsim_compute_barrier(ctx) + +How an async dma pairs with its sync point: NOT by a compile-time id. One static +`togsim.dma` op runs once per loop iteration, each with a different RUNTIME tag +slot `%tag[%idx]`, so the pairing must be a runtime key. `togsim.dma` carries a +`tag_id` (its tag memref identity) and the runtime `%tag[%idx]` operand; the +original `memref.dma_wait` becomes an explicit `togsim.memory_barrier` carrying +the same `tag_id` + tag index. They pair at runtime by `(tag_id, tag_slot)` via +the Core's tag table (the dma signals the tag at data-arrival; the barrier waits +it). `tag_id` (which tag memref) is distinct from `tag_slot` (the SRAM tile slot, +used for the double-buffer / capacity model). A sync (non-async) dma is blocking, +so it needs no barrier. (Supersedes the earlier static `event_id` + `togsim.wait` +design, which could not express per-iteration pairing.) + +Keep this in lockstep with TOGSim/include/togsim_runtime.h (TOGSIM_ABI_VERSION). +""" + +# ---- op names ------------------------------------------------------------- +DMA = "togsim.dma" +COMPUTE = "togsim.compute" +COMPUTE_BAR = "togsim.compute_barrier" # fence: drain async compute before a consumer (sec 10.7) +MEMORY_BAR = "togsim.memory_barrier" # explicit async-DMA sync (the original dma_wait); tag-keyed + +#: every op this module owns (for matchers / DCE roots in C2). +OP_NAMES = (DMA, COMPUTE, COMPUTE_BAR, MEMORY_BAR) + +#: op name -> the togsim_runtime.h symbol C4 lowers it to. +EMITC_CALLEE = { + DMA: "togsim_dma", + COMPUTE: "togsim_compute", + COMPUTE_BAR: "togsim_compute_barrier", + MEMORY_BAR: "togsim_memory_barrier", +} + +#: producer entry-point symbol the TOGSim loader resolves (see togsim_runtime.h). 
+ENTRY_SYMBOL = "togsim_kernel" + +#: outlined per-work-item function the dispatcher hands to togsim_dispatch +#: (uniform signature (ctx, int64* iv, i32 n); see togsim_cpp_trace.md sec 9.3). +TILE_SYMBOL = "togsim_kernel_tile" + +#: runtime callees emitted directly by lower_to_emitc (not skeleton ops), kept in +#: lockstep with togsim_runtime.h. DISPATCH_CALLEE is the higher-order wrapper the +#: dispatcher loop calls per work-item (round-robins a core + TILE_BEGIN/END); +#: TILE_SYMBOL is passed to it as the function pointer. +DISPATCH_CALLEE = "togsim_dispatch" + +# ---- attribute keys ------------------------------------------------------- +ATTR_DIR = "dir" # i32: DIR_LOAD | DIR_STORE +ATTR_DIMS = "dims" # i64 array: tile extents +ATTR_STRIDES = "strides" # i64 array: tile strides +ATTR_ELEM_BITS = "elem_bits" # i32 +ATTR_IS_ASYNC = "is_async" # bool +ATTR_TILE_ID = "tile_id" # i64: key into the precomputed tile_id->cycle table +ATTR_COMPUTE_TYPE = "compute_type" # i32: 0 vector / 1 matmul / 2 preload (Core enum) +ATTR_READ_BUFS = "read_bufs" # i64 array: SRAM buffer ids this op reads (sec 10 dataflow) +ATTR_WRITE_BUFS = "write_bufs" # i64 array: SRAM buffer ids this op writes (sec 10 dataflow) +ATTR_TAG_ID = "tag_id" # i32: identity of the DMA's tag memref; pairs an async dma with + # its memory_barrier by the RUNTIME tag slot (tag_id + tag index) +ATTR_ARG_ID = "arg_id" # i32: which tensor (func arg) this DMA's base is + +# Must match togsim_dma_dir in togsim_runtime.h. 
+DIR_LOAD = 0 +DIR_STORE = 1 + + +def is_togsim_op(op): + """True if `op` (an Operation or a wrapping view) is one of ours.""" + name = getattr(op, "name", None) + if name is None: + name = getattr(getattr(op, "operation", None), "name", None) + return name in OP_NAMES diff --git a/Simulator/simulator.py b/Simulator/simulator.py index 2b9f05be..5f6ed08e 100644 --- a/Simulator/simulator.py +++ b/Simulator/simulator.py @@ -560,7 +560,20 @@ def run_standalone( os.fsync(trace_file.fileno()) try: - cmd = f"{TOGSimulator.get_togsim_command(config_path, togsim_path)} --models_list {trace_file_path}" + # The C++ TOG (trace) path is the DEFAULT: drive the simulation from the + # emitted trace.so. The legacy ONNX TOG is the opt-in fallback via + # TORCHSIM_LEGACY_TOG=1. Autotune candidates each retile while the .so is + # one tiling, so they always run legacy; the trace path drives the final + # (chosen-tiling) run. Fall back to legacy if the .so was not emitted. + trace_so = os.path.join(os.path.dirname(str(model_path)), "trace.so") + cycle_tsv = os.path.join(os.path.dirname(str(model_path)), "trace_cycles.tsv") + base_cmd = TOGSimulator.get_togsim_command(config_path, togsim_path) + use_trace = (os.environ.get("TORCHSIM_LEGACY_TOG") != "1" + and not autotune_mode and os.path.exists(trace_so)) + if use_trace: + cmd = f"{base_cmd} --trace_so {trace_so} --cycle_table {cycle_tsv}" + else: + cmd = f"{base_cmd} --models_list {trace_file_path}" if extension_config.CONFIG_TOGSIM_DEBUG_LEVEL: cmd += f" --log_level {extension_config.CONFIG_TOGSIM_DEBUG_LEVEL}" diff --git a/TOGSim/include/Core.h b/TOGSim/include/Core.h index 286feb5f..0b6f8595 100644 --- a/TOGSim/include/Core.h +++ b/TOGSim/include/Core.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -24,6 +25,10 @@ class Core { Core(uint32_t id, SimulationConfig config); ~Core()=default; virtual bool running(); + // True if this core has work actively in flight (DMA / compute pipeline / 
queues) + // that will produce a future finish event -- i.e. running() minus "tiles waiting". + // Used by the frozen-state (spad-too-small) guard. + bool has_inflight(); virtual bool can_issue(const std::shared_ptr& op); virtual void issue(std::shared_ptr tile); virtual std::shared_ptr pop_finished_tile(); @@ -55,6 +60,16 @@ class Core { void sa_cycle(); bool can_issue_compute(std::shared_ptr& inst); void update_stats(); + // SRAM-capacity throttle (sec 10.x): a consumer frees the buffer-versions it + // read (refcount -> 0 releases the spad bytes). Called when COMP/MOVOUT issue. + void release_sram(const std::shared_ptr& inst); + // SA weight-buffer throttle (sec 10.x): pick a systolic array that has a free + // weight slot (round-robin among free); -1 if all full -> the preload stalls. + int pick_free_weight_sa(); + // Free weight slots due this cycle: a matmul releases its slot at its + // streaming-end (finish - overlapping, when it stops reading the weight), + // scheduled at issue in _weight_release_q. Last consumer frees it. + void process_weight_releases(); /* Core id & config file */ const uint32_t _id; @@ -103,4 +118,20 @@ class Core { std::queue _request_queue; std::queue _response_queue; uint32_t _waiting_write_reqs; + + // SRAM-capacity throttle (sec 10.x). _sram_used = current per-core spad bytes; + // _sram_capacity = limit (0 = disabled); _sram_allocs maps a buffer-version id + // to its accumulated footprint bytes (freed when its last reader issues). + size_t _sram_used = 0; + size_t _sram_capacity = 0; + std::unordered_map _sram_allocs; + + // SA weight-buffer throttle (sec 10.x). _weight_slots_used[s] = weights resident + // on SA s (loaded by a preload, not yet freed by their last matmul); + // _weight_slot_depth = per-SA capacity (0 = disabled -> plain round-robin). 
+ std::vector _weight_slots_used; + uint32_t _weight_slot_depth = 0; + // Pending weight-slot releases keyed by cycle (each matmul's streaming-end); + // process_weight_releases() drains those due and decrements the token. + std::multimap> _weight_release_q; }; \ No newline at end of file diff --git a/TOGSim/include/Instruction.h b/TOGSim/include/Instruction.h index bb62a440..fa5d4ca1 100644 --- a/TOGSim/include/Instruction.h +++ b/TOGSim/include/Instruction.h @@ -12,7 +12,16 @@ #include #include -enum class Opcode { MOVIN, MOVOUT, COMP, BAR, COUNT}; +// MEMORY_BAR: the DMA/memory barrier (waits a DMA tag in the tag table). +// COMPUTE_BAR: the compute barrier -- waits the systolic-array compute pipeline(s) +// to drain (all SAs empty), then finishes. Used as the explicit +// fence before a store consumes async matmul results (sec 10.7). +enum class Opcode { MOVIN, MOVOUT, COMP, MEMORY_BAR, COMPUTE_BAR, COUNT}; + +// One weight slot on systolic array `sa` (sec 10.x). A preload sets refcount = +// the matmuls reusing the weight; each frees it at its streaming-end, the last +// one releases the slot. Shared (shared_ptr) by the preload's matmul consumers. +struct WeightToken { int sa; int refcount; }; typedef uint64_t addr_type; typedef uint64_t cycle_type; @@ -29,6 +38,26 @@ class Instruction : public std::enable_shared_from_this { Instruction(Opcode opcode); void finish_instruction(); void add_child(std::shared_ptr child); + // Occupancy (SA-pipeline) dependency: the child is released when THIS op is + // ISSUED (enters the pipeline), not when it finishes -- so a preload/matmul + // successor overlaps it instead of waiting its full latency (sec 10.7). + void add_pipeline_child(std::shared_ptr child); + void release_pipeline_children(); + // SA weight-buffer model: the SA this op is pinned to (a preload picks it, its + // matmul consumers inherit it) and the shared weight slot the matmuls release. 
+ const std::set>& get_pipeline_children() { return _pipeline_children; } + void set_assigned_sa(int s) { _assigned_sa = s; } + int get_assigned_sa() const { return _assigned_sa; } + void set_weight_token(const std::shared_ptr& t) { _weight_token = t; } + const std::shared_ptr& get_weight_token() const { return _weight_token; } + // Trace-only: which work-item (togsim_dispatch tile) this op belongs to, for + // grouping/coloring in the timeline. Set by the bridge per TILE_BEGIN. + void set_tile_group(int g) { _tile_group = g; } + int get_tile_group() const { return _tile_group; } + // COMPUTE_BAR fence: the max finish_cycle of the async computes it gates (its + // own dispatch only), so it drains those instead of every SA pipeline. + void update_fence_finish(cycle_type c) { if (c > _fence_finish) _fence_finish = c; } + cycle_type get_fence_finish() const { return _fence_finish; } bool check_ready() { return ready_counter == 0; } const Opcode get_opcode() { return opcode; } bool is_dma_read() { return opcode == Opcode::MOVIN; } @@ -51,6 +80,9 @@ class Instruction : public std::enable_shared_from_this { void inc_waiting_request(); void dec_waiting_request(); size_t get_waiting_request() { return _nr_waiting_request; } + // trace: log only the FIRST DRAM response of a load (when data starts arriving). + bool got_first_response() const { return _got_first_response; } + void mark_first_response() { _got_first_response = true; } std::vector& get_tile_size() { return tile_size; } std::vector& get_tile_stride() { return tile_stride; } void set_overlapping_cycle(cycle_type cycle) { overlapping_cycle = cycle; } @@ -86,12 +118,26 @@ class Instruction : public std::enable_shared_from_this { std::set>& get_child_inst() { return child_inst; } uint64_t get_global_inst_id() const { return _global_inst_id; } - cycle_type start_cycle; - cycle_type finish_cycle; + // SRAM-capacity model (sec 10.x). 
A load contributes its footprint to a + // buffer-version allocation; the version is freed when its LAST consumer (the + // program-order-last reader, tagged by the bridge) issues. The bridge fills + // these; Core enforces them. + // _sram_alloc_id : which buffer-version this load fills (-1 = untracked) + // _sram_release_allocs: versions this consumer frees on issue (tagged only on + // each version's last reader) + void set_sram_alloc(int64_t id) { _sram_alloc_id = id; } + int64_t get_sram_alloc() const { return _sram_alloc_id; } + void add_sram_release(int64_t id) { _sram_release_allocs.push_back(id); } + const std::vector& get_sram_release() const { return _sram_release_allocs; } + // bytes this load occupies in the spad (from the tile it moves in). + size_t sram_footprint() const { return _tile_numel * (_elem_bits / 8); } + + cycle_type start_cycle = 0; + cycle_type finish_cycle = 0; cycle_type bubble_cycle=0; bool finished=false; - int subgraph_id; + int subgraph_id = 0; private: uint64_t _global_inst_id = 0; static uint64_t _next_global_inst_id; @@ -99,16 +145,19 @@ class Instruction : public std::enable_shared_from_this { void *_owner = nullptr; std::list>* _owner_ready_queue_ref = nullptr; Opcode opcode; - cycle_type compute_cycle; - cycle_type overlapping_cycle; - size_t ready_counter; + cycle_type compute_cycle = 0; + cycle_type overlapping_cycle = 0; + size_t ready_counter = 0; // parents not yet finished; the minimal Instruction(Opcode) + // ctor (barriers) relies on this default + inc_ready_counter std::set> child_inst; + std::set> _pipeline_children; // released at issue (sec 10.7) std::vector tile_size; std::vector tile_stride; - size_t _tile_numel; + size_t _tile_numel = 0; size_t _nr_waiting_request=0; + bool _got_first_response=false; size_t _elem_bits = 0; - addr_type dram_addr; + addr_type dram_addr = 0; uint32_t _numa_id = 0; // For DMA instruction int _compute_type = 0; std::vector _tag_idx_list; @@ -123,4 +172,12 @@ class Instruction : 
public std::enable_shared_from_this { bool _is_indirect_mode=false; bool _is_sparse_inst=false; std::string _indirect_index_path=""; + // SRAM-capacity model (see the setters above). + int64_t _sram_alloc_id = -1; + std::vector _sram_release_allocs; + // SA weight-buffer model (see the setters above). + int _assigned_sa = -1; + std::shared_ptr _weight_token; + int _tile_group = -1; // trace-only work-item id (see set_tile_group) + cycle_type _fence_finish = 0; // COMPUTE_BAR: drain target (see update_fence_finish) }; \ No newline at end of file diff --git a/TOGSim/include/SimulationConfig.h b/TOGSim/include/SimulationConfig.h index 2ef08618..7785ff7a 100644 --- a/TOGSim/include/SimulationConfig.h +++ b/TOGSim/include/SimulationConfig.h @@ -27,6 +27,16 @@ struct SimulationConfig { uint32_t num_systolic_array_per_core = 1; uint32_t num_stonne_per_core = 1; uint32_t num_stonne_port = 1; + // Per-core VMEM/spad capacity (KB) for the trace-path DMA throttle (sec 10.x): + // a load that would overflow the spad does not issue until a consumer frees a + // tile. Provided by the config (the TPU configs set 16384 = 16 MB VMEM). 0 = + // unset -> gate disabled (unlimited). Only affects trace-path instructions + // (legacy TileGraphParser insts have alloc id -1 -> never gated). + uint32_t core_spad_size_kb = 0; + // SA weight-buffer depth (sec 10.x): weight tiles a systolic array holds; a + // preload stalls until a slot frees (its matmuls finished). 2 = weight + // double-buffer (convention default, tunable). 0 = disabled. 
+ uint32_t sa_weight_buffer_depth = 2; /* DRAM config */ DramType dram_type; diff --git a/TOGSim/include/Simulator.h b/TOGSim/include/Simulator.h index e3542d51..91baf5b5 100644 --- a/TOGSim/include/Simulator.h +++ b/TOGSim/include/Simulator.h @@ -48,6 +48,9 @@ class Simulator { void dram_cycle(); void icnt_cycle(); bool running(); + // Spad-too-small guard: if the sim stays frozen (running() but nothing in + // flight) past kWedgeThreshold cycles, error out and exit. Called each cycle. + void check_frozen(); void set_cycle_mask(); uint32_t get_dest_node(mem_fetch *access); SimulationConfig _config; diff --git a/TOGSim/include/TraceLogTags.h b/TOGSim/include/TraceLogTags.h index 6c158099..759a4fdb 100644 --- a/TOGSim/include/TraceLogTags.h +++ b/TOGSim/include/TraceLogTags.h @@ -24,6 +24,7 @@ inline constexpr const char* kInstructionFinished = "INST_FINISHED"; inline constexpr const char* kInstructionSkipped = "INST_SKIP"; inline constexpr const char* kAsyncDmaAllRequestsIssued = "ASYNC_DMA_ISSUE"; +inline constexpr const char* kFirstDramResponse = "DRAM_RESP_FIRST"; inline constexpr const char* kAllDramResponsesReceived = "DRAM_RESP_DONE"; inline constexpr const char* kL2CacheableStatusForAddress = "L2CACHE_STAT"; diff --git a/TOGSim/include/togsim_loader.h b/TOGSim/include/togsim_loader.h new file mode 100644 index 00000000..6c1273ee --- /dev/null +++ b/TOGSim/include/togsim_loader.h @@ -0,0 +1,76 @@ +#pragma once +// togsim_loader.h +// ----------------------------------------------------------------------------- +// TOGSim-side loader for the compiled trace producer (C6, P3 task 5). NOT part +// of the producer ABI (togsim_runtime.h) -- this is the TOGSim half that +// `dlopen`s a producer `.so`, runs its `togsim_kernel`, and records the emitted +// instruction stream. See docs/design/togsim_cpp_trace.md sec 5.3 / 9.7. 
+// +// This first cut is the "materializing sink": the callbacks resolve each tile's +// DRAM address (base[arg_id] + offset*elem_bytes) and per-tile compute cost +// (the cycle table), record the (tag_id, tag_slot) pairing key, and append a TraceRec per modeled +// instruction. Feeding the recorded stream into the existing timing core +// (Core/Simulator) for cycle-equivalence vs the build_tog path is the remaining +// task-5 step. +// ----------------------------------------------------------------------------- + +#include +#include + +#include "togsim_runtime.h" + +namespace togsim { + +// One modeled instruction recorded by the runtime callbacks. +struct TraceRec { + enum Kind { TILE_BEGIN, TILE_END, DMA, COMPUTE, MEMORY_BAR, COMPUTE_BAR } kind; + int32_t core; // work-item -> core binding (set by togsim_dispatch) + // DMA / MEMORY_BAR + int32_t dir; // togsim_dma_dir + int32_t arg_id; // tensor + int32_t elem_bits; + int32_t is_async; + uint64_t addr; // resolved DRAM byte address = base[arg_id] + off*bytes + int32_t tag_id; // DMA/MEMORY_BAR: tag memref identity; with tag_slot the + // runtime pairing key (an async dma <-> its memory_barrier) + uint64_t tag_slot; // SRAM tile slot (double-buffer / capacity model) + std::vector dims; // tile extents (DMA) + std::vector strides; // tile strides (DMA) + std::vector read_bufs; // SRAM buffer ids read (sec 10 dataflow DAG) + std::vector write_bufs; // SRAM buffer ids written (MEMORY_BAR: released bufs) + // COMPUTE + uint64_t tile_id; + int32_t compute_type; // 0 vector / 1 matmul / 2 preload (Core unit enum) + int64_t cycle; // looked up from the cycle table + int64_t overlapping; // looked up from the cycle table +}; + +struct RunResult { + bool ok = false; + std::vector trace; +}; + +// Load `so_path`, run its `togsim_kernel(shape_args, n_shape)` against a freshly +// built EmitCtx, and return the recorded trace. 
+// tensor_base[arg_id] : DRAM base address of each kernel tensor argument +// cyc[tile_id] / ovl[tile_id] : the cycle table (cycle, overlapping_cycle) +// num_cores : dispatch round-robins work-items across this many cores +RunResult run_producer(const char* so_path, + const int64_t* shape_args, int32_t n_shape, + const uint64_t* tensor_base, int32_t n_tensors, + const int64_t* cyc, const int64_t* ovl, int32_t n_tiles, + int32_t num_cores); + +// First-order reference timing over a recorded trace, to validate that the +// stream carries enough to be scheduled (it is NOT the production Core -- no +// DRAM/NoC/L2 contention; the real cycle-equivalence path feeds Tile/TileGraph +// into Core). Models, per core: a DMA-engine timeline (DMAs serialize, overlap +// compute), a compute timeline (serial = reduction accumulate, with the +// finish = prev.finish + cycle - overlapped pipeline overlap of Core.cc), and +// data dependencies (a compute waits the async dmas that its preceding +// MEMORY_BAR records pair with by (tag_id, tag_slot)). +struct TimingParams { uint64_t dma_latency = 100; }; +struct SimResult { uint64_t total_cycle = 0; int n_compute = 0, n_dma = 0; }; +SimResult simulate(const RunResult& run, const TimingParams& params); + +} // namespace togsim diff --git a/TOGSim/include/togsim_runtime.h b/TOGSim/include/togsim_runtime.h new file mode 100644 index 00000000..e8fd6b84 --- /dev/null +++ b/TOGSim/include/togsim_runtime.h @@ -0,0 +1,177 @@ +#pragma once +// togsim_runtime.h +// ----------------------------------------------------------------------------- +// Shared C ABI between a compiled, shape-parametric trace producer (`.so`, +// generated MLIR -> EmitC -> C++) and TOGSim. See docs/design/togsim_cpp_trace.md. +// +// The producer keeps loops as native loops (symbolic bounds become function +// parameters) and calls the functions below; each call emits one trace record = +// one modeled instruction. 
TOGSim `dlopen`s the producer, constructs an +// `EmitCtx`, calls the entry point, records the emitted stream, and feeds it to +// the existing timing core. The producer carries NO timing model and NO +// functional compute -- it is a deterministic trace generator only. +// +// ABI shape rationale: `mlir-translate --mlir-to-cpp` lowers our `togsim.*` ops +// (via `emitc.call_opaque`) to *free function* calls, so the contract is a set +// of `extern "C"` free functions taking an opaque `EmitCtx*` as the first +// argument. Implementations live in TOGSim and may dispatch internally; the +// `EmitCtx` is opaque to the producer. `togsim_abi_version()` guards against a +// producer `.so` built against a stale header. +// +// STATUS: firmed up in P2, revised in P3. The signatures below match what the C4 +// togsim->emitc lowering (PyTorchSimFrontend/mlir/passes/lower_to_emitc.py) +// emits as `emitc.call_opaque` targets and what `mlir-translate --mlir-to-cpp` +// renders. Synchronization is keyed by the RUNTIME tag slot (ABI v11): an async +// togsim_dma and its togsim_memory_barrier pair by (tag_id, tag_slot); the +// earlier static `event_id` pairing is gone. Tile DRAM addresses are formed by +// the runtime as base[arg_id] + offset*elem_bytes (ABI v4), not passed as a stub. +// ----------------------------------------------------------------------------- + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Bump whenever the signatures below change incompatibly. TOGSim refuses to load +// a producer whose embedded version (a `togsim_producer_abi_version` symbol, or +// a value passed at the entry point) does not match. +// v1 -> v2 (P2): dma takes an event_id and returns void (was: returns a +// handle); togsim_kernel shape_args is non-const to match the +// emitc/mlir-to-cpp output. +// v2 -> v3 (P3): add togsim_dispatch (work-item boundary + core binding) and +// togsim_wait_all (join / barrier). 
+// v3 -> v4 (P3): togsim_dma takes (arg_id, element offset) instead of a +// precomputed base_addr; the producer lowers the address +// arithmetic and the runtime adds the tensor base. +// v4 -> v5 (P3): event handles. togsim_dma RETURNS a fresh handle (drops the +// event_id arg); the producer parks it in a heap event buffer +// (togsim_event_alloc/free) and togsim_wait takes the handle. +// v5 -> v6 (P3): replace togsim_dispatch with togsim_core_alloc (returns a +// core id; no free) -- the runtime owns the core pool, num_cores +// is never baked into the producer. +// v6 -> v7 (P3): togsim_dma takes a tag_slot (SRAM tile slot) for the runtime's +// double-buffer / SRAM-capacity model. +// v7 -> v8 (P3): togsim_compute takes a compute_type (vector/matmul/preload) so +// the Core routes it to the right compute unit. +// v8 -> v9 (P3 sec10): togsim_dma/compute take read_bufs/write_bufs (SRAM buffer +// ids); the loader builds an explicit dependency DAG by +// last-writer per buffer (replaces in-order/tag dependencies). +// v9 -> v10 (P3 sec10.7): add togsim_compute_barrier (the explicit compute fence +// before a store; loader -> COMPUTE_BAR instruction). +// v10 -> v11 (P3 sec10): replace the static event-id pairing with the RUNTIME +// tag slot. togsim_dma takes a tag_id (its tag memref identity) +// and returns void; the original dma_wait becomes an explicit +// togsim_memory_barrier(tag_id, tag_slot, write_bufs) that pairs +// with its async dma by the runtime (tag_id, tag_slot) -- one +// static dma op runs once per loop iteration with a different +// %tag[%idx], so only a runtime key can pair them. Drops +// togsim_wait/signal/wait_all/event_alloc/event_free + the +// togsim_event handle (no compile-time pairing token). +// v11 -> v12 (P3 sec9.3): replace the bare togsim_core_alloc marker with a +// higher-order togsim_dispatch(ctx, tile_fn, iv, n_iv) wrapper. 
+// The producer outlines each parallel work-item into a uniform +// togsim_kernel_tile(ctx, iv, n) and the dispatcher loop hands it +// to togsim_dispatch, which round-robins a core and brackets the +// call with TILE_BEGIN/TILE_END. The work-item scope is now the +// function call itself (no implicit "until the next core_alloc" +// range); one general dispatcher serves every kernel (uniform +// iv-array ABI). Core alloc + the begin/end boundary are +// runtime-owned. +#define TOGSIM_ABI_VERSION 12 +int32_t togsim_abi_version(void); + +// Opaque per-invocation context owned by TOGSim. Holds the record sink and the +// tile_id->cycle lookup. Never dereferenced by the producer. +typedef struct EmitCtx EmitCtx; + +// Direction for togsim_dma. +typedef enum { + TOGSIM_DMA_LOAD = 0, // DRAM -> SRAM (MOVIN) + TOGSIM_DMA_STORE = 1, // SRAM -> DRAM (MOVOUT) +} togsim_dma_dir; + +// Emit a DMA. +// dir : load/store +// arg_id : which tensor (kernel func arg) this tile lives in +// offset : ELEMENT offset of this tile within that tensor, computed by the +// producer from the loop indices (the affine address arithmetic is +// lowered into the producer -- P3 approach A). The runtime forms +// the DRAM address as base[arg_id] + offset*elem_bytes (only the +// runtime knows the tensors' allocation base addresses). +// ndim : rank of the tile +// dims : ndim tile extents +// strides : ndim tile strides (may be null => contiguous) +// elem_bits : element width in bits +// is_async : non-zero => issue-complete is the finish; the consumer must be +// gated by an explicit togsim_memory_barrier (data arrives later). +// Zero => blocking: the dma finishes at data-arrival. +// tag_id : identity of this dma's tag memref. With tag_slot it forms the +// RUNTIME pairing key (tag_id, tag_slot) the matching +// togsim_memory_barrier waits on -- not a compile-time id, since +// one static dma op runs once per loop iteration. 
+// tag_slot : the SRAM tile slot this tile occupies (the producer's lowered +// tag index, evaluated at runtime). Also the double-buffer / +// SRAM-capacity slot. Single-buffer kernels pass 0. +// read_bufs/n_read, write_bufs/n_write : SRAM buffer ids this op reads/writes +// (sec 10 dataflow). The loader builds the dependency DAG by last-writer per +// buffer. +void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id, + uint64_t offset, int32_t ndim, const int64_t* dims, + const int64_t* strides, int32_t elem_bits, + int32_t is_async, int32_t tag_id, uint64_t tag_slot, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write); + +// Emit a fixed-size tile compute. Cost is looked up from the precomputed +// tile_id->cycle table (annotation pass / sample-mode); `dims` are passed for +// logging and future remainder-tile handling, not to compute cost here. +// compute_type : 0 vector / 1 matmul / 2 preload (maps to the Core unit enum; +// routes the op to the VPU vs the systolic array). +void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type, + int32_t ndim, const int64_t* dims, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write); + +// Explicit async-DMA sync -- the original memref.dma_wait. Pairs with its async +// togsim_dma by the RUNTIME tag slot (tag_id, tag_slot) and gates consumers on +// data-arrival (resp-complete), since an async dma's own finish is only +// issue-complete. `write_bufs` is the SRAM buffer(s) that dma loaded; the loader +// makes the barrier the last writer of them so consumers depend on it. Sync DMAs +// need no barrier (they block to data-arrival themselves). +void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t tag_slot, + const int64_t* write_bufs, int32_t n_write); + +// A parallel work-item body, outlined by the producer (sec 9.3). Uniform across +// kernels: it takes the EmitCtx, the packed parallel loop indices `iv` (iv[0.. 
+// n_iv) -- e.g. the (m,n) output-tile indices) and their count. The body emits +// the work-item's ops (init / reduction / store). One signature => one general +// dispatcher serves every kernel. +// (iv is non-const to match the `int64_t*` the EmitC producer emits; the runtime +// only reads it.) +typedef void (*togsim_tile_fn)(EmitCtx* ctx, int64_t* iv, int32_t n_iv); + +// Dispatch one work-item (sec 9.3). The runtime round-robins a core from the +// pool, brackets the call with TILE_BEGIN/TILE_END (the work-item boundary), and +// invokes `fn(ctx, iv, n_iv)` -- so the work-item SCOPE is exactly the function +// call, not an implicit "ops until the next alloc" range. Core alloc + boundary +// are runtime-owned; the producer is core-count transparent (never names +// num_cores or a physical core). Independent work-items land on different cores +// -> multi-core. A general (kernel-independent) wrapper: it only forwards the +// opaque iv array to fn. +void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, + int64_t* iv, int32_t n_iv); + +// Compute fence: drain in-flight async compute (the systolic-array matmuls) +// before the following op (a store) consumes their result. Explicit barrier in +// the trace; the loader turns it into a COMPUTE_BAR instruction (sec 10.7). +void togsim_compute_barrier(EmitCtx* ctx); + +// Entry point the loader resolves in the producer `.so`. `shape_args` carries +// the runtime values for the kernel's symbolic dimensions (in a kernel-specific +// order recorded alongside the cached `.so`); `n_shape_args` is their count. 
+void togsim_kernel(EmitCtx* ctx, int64_t* shape_args, int32_t n_shape_args); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/TOGSim/include/togsim_trace_bridge.h b/TOGSim/include/togsim_trace_bridge.h new file mode 100644 index 00000000..f0213ef5 --- /dev/null +++ b/TOGSim/include/togsim_trace_bridge.h @@ -0,0 +1,18 @@ +#pragma once +// togsim_trace_bridge.h +// ----------------------------------------------------------------------------- +// Bridge from the recorded trace (togsim_loader.h RunResult) to a TileGraph the +// existing Simulator/Core can run, for production cycle-equivalence (P3 task 5; +// see togsim_cpp_trace.md sec 9.9). First cut: one Tile per work-item (the span +// togsim_dispatch brackets with TILE_BEGIN/TILE_END), bound to its core; the +// DMA/compute records become MOVIN/MOVOUT/COMP Instructions with the RAW +// dependency edges (a compute waits the dmas its preceding waits named). +// ----------------------------------------------------------------------------- +#include + +#include "TileGraph.h" +#include "togsim_loader.h" + +// Build a TileGraph from a recorded trace. `name` labels the graph. +std::unique_ptr trace_to_tilegraph(const togsim::RunResult& run, + const std::string& name); diff --git a/TOGSim/src/CMakeLists.txt b/TOGSim/src/CMakeLists.txt index 65cd4dd4..d782d4d1 100644 --- a/TOGSim/src/CMakeLists.txt +++ b/TOGSim/src/CMakeLists.txt @@ -12,3 +12,8 @@ file(GLOB_RECURSE SRC_FILES # build add_executable(${LIB_NAME} ${SRC_FILES}) + +# Export the executable's dynamic symbols (-rdynamic) so a dlopen'd trace +# producer .so resolves the togsim_* runtime callbacks back into this binary +# (P3 trace pipeline).
+set_target_properties(${LIB_NAME} PROPERTIES ENABLE_EXPORTS ON) diff --git a/TOGSim/src/Common.cc b/TOGSim/src/Common.cc index 3f84d885..6f9a74d7 100644 --- a/TOGSim/src/Common.cc +++ b/TOGSim/src/Common.cc @@ -64,6 +64,10 @@ SimulationConfig initialize_config(const YAML::Node& config, parsed_config.core_freq_mhz = get_config_value(config, "core_freq_mhz"); if (config["num_systolic_array_per_core"]) parsed_config.num_systolic_array_per_core = config["num_systolic_array_per_core"].as(); + if (config["core_spad_size_kb"]) + parsed_config.core_spad_size_kb = config["core_spad_size_kb"].as(); + if (config["sa_weight_buffer_depth"]) + parsed_config.sa_weight_buffer_depth = config["sa_weight_buffer_depth"].as(); if (config["num_stonne_per_core"]) parsed_config.num_stonne_per_core = config["num_stonne_per_core"].as(); if (config["num_stonne_port"]) diff --git a/TOGSim/src/Core.cc b/TOGSim/src/Core.cc index 9dad8597..25335c9c 100644 --- a/TOGSim/src/Core.cc +++ b/TOGSim/src/Core.cc @@ -17,6 +17,42 @@ Core::Core(uint32_t id, SimulationConfig config) _stat_sa_compute_idle_cycle.resize(_num_systolic_array_per_core); _stat_inst_count.resize(static_cast(Opcode::COUNT), 0); _stat_tot_skipped_inst.resize(static_cast(Opcode::COUNT), 0); + _sram_capacity = (size_t)config.core_spad_size_kb * 1024; // 0 = throttle disabled + _weight_slot_depth = config.sa_weight_buffer_depth; // 0 = disabled (plain rr) + _weight_slots_used.resize(_num_systolic_array_per_core, 0); +} + +// Round-robin a systolic array that still has a free weight slot; -1 if all full +// (the preload must stall). Advances _systolic_array_rr past the chosen SA. 
+int Core::pick_free_weight_sa() { + for (uint32_t i = 0; i < _num_systolic_array_per_core; i++) { + uint32_t s = (_systolic_array_rr + i) % _num_systolic_array_per_core; + if (_weight_slots_used[s] < (int)_weight_slot_depth) { + _systolic_array_rr = (s + 1) % _num_systolic_array_per_core; + return (int)s; + } + } + return -1; +} + +void Core::process_weight_releases() { + while (!_weight_release_q.empty() && _weight_release_q.begin()->first <= _core_cycle) { + auto tok = _weight_release_q.begin()->second; + _weight_release_q.erase(_weight_release_q.begin()); + if (--tok->refcount <= 0) _weight_slots_used[tok->sa]--; // last reader frees the slot + } +} + +// The LAST reader of a buffer-version issued (bridge tags only that consumer): +// free the version's bytes back to the per-core spad. +void Core::release_sram(const std::shared_ptr& inst) { + if (!_sram_capacity) return; + for (int64_t id : inst->get_sram_release()) { + auto it = _sram_allocs.find(id); + if (it == _sram_allocs.end()) continue; + _sram_used -= it->second; + _sram_allocs.erase(it); + } } bool Core::can_issue(const std::shared_ptr& op) { @@ -154,7 +190,7 @@ void Core::dma_cycle() { } else if(!finished_inst->is_dma_read()) { core_trace_log::log_error_dma_instruction_invalid(_core_cycle, _id); exit(EXIT_FAILURE); - } else if (finished_inst->get_opcode() == Opcode::BAR) { + } else if (finished_inst->get_opcode() == Opcode::MEMORY_BAR) { core_trace_log::trace_instruction_line(_core_cycle, _id, TraceLogTag::pad15(TraceLogTag::kInstructionFinished), @@ -200,6 +236,8 @@ void Core::cycle() { /* Increase core cycle counter */ _core_cycle++; + process_weight_releases(); // free weight slots due this cycle before dispatch + /* Iterate tile while an instruction is issued */ bool issued = false; @@ -240,6 +278,22 @@ void Core::cycle() { _stat_tot_skipped_inst.at(static_cast(inst->get_opcode()))++; break; } else { + // SRAM-capacity gate (sec 10.x): a load that would overflow the + // per-core spad does not 
issue this cycle -- leave it in the ready + // queue (it++ retries next cycle) until a consumer frees a tile. On + // issue, occupy its bytes under its buffer-version allocation. + if (_sram_capacity && inst->get_sram_alloc() >= 0) { + size_t F = inst->sram_footprint(); + // Stall if the tile does not fit in the free spad right now. If + // it can never fit (the kernel's working set exceeds the whole + // spad), the sim wedges -- Simulator::cycle() detects that frozen + // state and exits with a "spad too small" error rather than + // looping forever. + if (_sram_used + F > _sram_capacity) + break; // not issued -> retry next cycle + _sram_used += F; + _sram_allocs[inst->get_sram_alloc()] += F; // accumulate version footprint + } core_trace_log::trace_instruction_line(_core_cycle, _id, TraceLogTag::pad15( @@ -254,6 +308,7 @@ void Core::cycle() { } } case Opcode::MOVOUT: + release_sram(inst); // store issued -> free the tiles it drained core_trace_log::trace_instruction_line(_core_cycle, _id, TraceLogTag::pad15(TraceLogTag::kInstructionIssued), @@ -265,7 +320,44 @@ void Core::cycle() { break; case Opcode::COMP: { - auto& target_pipeline = get_compute_pipeline(inst->get_compute_type()); + const int ct = inst->get_compute_type(); + // --- SA selection + weight-buffer gate (sec 10.x) --- + // A preload picks a systolic array with a free weight slot and pins + // its matmul consumers to that SA (they free the slot on finish). A + // matmul runs on the SA its weight was preloaded into. This both + // bounds preload run-ahead and keeps matmuls on their weight's SA. 
+ int sa_idx = -1; + if (ct == MATMUL || ct == PRELOAD) { + if (ct == PRELOAD) { + int n_consumers = 0; // matmuls reusing this weight + for (auto& c : inst->get_pipeline_children()) + if (c->get_compute_type() == MATMUL) n_consumers++; + if (_weight_slot_depth > 0 && n_consumers > 0) { + sa_idx = pick_free_weight_sa(); + if (sa_idx < 0) break; // all weight slots full -> stall (retry) + _weight_slots_used[sa_idx]++; + auto tok = std::make_shared(WeightToken{sa_idx, n_consumers}); + for (auto& c : inst->get_pipeline_children()) + if (c->get_compute_type() == MATMUL) { + c->set_assigned_sa(sa_idx); + c->set_weight_token(tok); + } + } else { // disabled / no consumers -> plain rr + sa_idx = _systolic_array_rr; + _systolic_array_rr = (_systolic_array_rr + 1) % _num_systolic_array_per_core; + } + } else { // MATMUL + sa_idx = inst->get_assigned_sa(); + if (sa_idx < 0) { // no preload pinned it -> rr fallback + sa_idx = _systolic_array_rr; + _systolic_array_rr = (_systolic_array_rr + 1) % _num_systolic_array_per_core; + } + } + inst->set_assigned_sa(sa_idx); // record the SA actually used (for the trace) + } + release_sram(inst); // consumer issued -> free the tiles it read + auto& target_pipeline = (ct == VECTOR_UNIT) ? _vu_compute_pipeline + : _sa_compute_pipeline.at(sa_idx); if (target_pipeline.empty()) { inst->finish_cycle = _core_cycle + inst->get_compute_cycle(); inst->bubble_cycle = inst->get_overlapping_cycle(); @@ -275,6 +367,18 @@ void Core::cycle() { inst->finish_cycle = target_pipeline.back()->finish_cycle + inst->get_compute_cycle() - overlapped_cycle; inst->bubble_cycle = bubble_cycle; } + // sec 10.7: release the occupancy (pipeline) dependents so a successor + // overlaps this op. finish_cycle is set first so release can feed it to + // a COMPUTE_BAR child's per-dispatch fence (see release_pipeline_children). 
+ inst->release_pipeline_children(); + + // Release this matmul's weight slot at its streaming-end (finish - + // overlapping), not at full finish (the drain tail does not read it). + if (ct == MATMUL && inst->get_weight_token()) { + cycle_type rel = inst->finish_cycle > inst->get_overlapping_cycle() + ? inst->finish_cycle - inst->get_overlapping_cycle() : _core_cycle; + _weight_release_q.emplace(rel, inst->get_weight_token()); + } if (inst->get_compute_cycle() == 0) { inst->finish_instruction(); @@ -297,7 +401,7 @@ void Core::cycle() { } } break; - case Opcode::BAR: + case Opcode::MEMORY_BAR: { auto& key = inst->get_tag_id(); uint32_t finished = _dma.get_tag_finish(inst->subgraph_id, key); @@ -324,6 +428,24 @@ void Core::cycle() { issued = true; } break; + case Opcode::COMPUTE_BAR: + { + // Compute fence (sec 10.7): finish once THIS dispatch's async computes + // have drained -- i.e. the current cycle has reached the max finish of + // the computes it gates (fed in via update_fence_finish when each + // issued). Scoped to its own dispatch, so an unrelated tile's matmuls + // sharing the SA pipelines do not delay it (no cross-dispatch + // serialization). Not yet drained -> stays in the ready queue. + if (_core_cycle >= inst->get_fence_finish()) { + core_trace_log::trace_instruction_line(_core_cycle, _id, + TraceLogTag::pad15(TraceLogTag::kInstructionFinished), + inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line(*inst)); + finish_instruction(inst); + issued = true; + } + } + break; default: core_trace_log::log_error_undefined_opcode(); exit(EXIT_FAILURE); @@ -387,6 +509,19 @@ void Core::finish_instruction(std::shared_ptr& inst, InstFinishTrac core_trace_log::format_instruction_detail_line(*inst)); } +bool Core::has_inflight() { + // running() without the "_tiles.size() > 0" term: work that will produce a + // finish event on its own (so the sim is NOT frozen). 
If this is false but + // tiles remain, only stalled ready instructions are left. + if (!_vu_compute_pipeline.empty()) return true; + for (int i = 0; i < _num_systolic_array_per_core; i++) + if (!_sa_compute_pipeline.at(i).empty()) return true; + if (!_dma_waiting_queue.empty() || !_dma_finished_queue.empty()) return true; + if (!_dma.empty()) return true; + if (!_ld_inst_queue.empty() || !_st_inst_queue.empty()) return true; + return false; +} + bool Core::running() { bool running = false; running = running || _tiles.size() > 0; @@ -412,6 +547,13 @@ void Core::push_memory_response(mem_fetch* response) { Instruction* owner_inst = static_cast(response->get_custom_data()); assert(owner_inst->get_waiting_request()); + if (!owner_inst->got_first_response()) { // first data of this load arrived + owner_inst->mark_first_response(); + core_trace_log::trace_instruction_line(_core_cycle, _id, + TraceLogTag::pad15(TraceLogTag::kFirstDramResponse), + owner_inst->get_global_inst_id(), + core_trace_log::format_instruction_detail_line(*owner_inst)); + } owner_inst->dec_waiting_request(); if (!owner_inst->get_waiting_request()) { auto it = _dma_waiting_queue.find(owner_inst); diff --git a/TOGSim/src/CoreTraceLog.cc b/TOGSim/src/CoreTraceLog.cc index ebc31de0..7086893e 100644 --- a/TOGSim/src/CoreTraceLog.cc +++ b/TOGSim/src/CoreTraceLog.cc @@ -31,7 +31,7 @@ std::string format_dma_inst_issued_detail(Instruction& inst) { } return fmt::format( "addr_name={} dram=0x{:016x} rank={} elem_bits={} async={} indirect={} tag=0x{:016x} stride=[{}] size=[{}] " - "tag_idx=[{}]", + "tag_idx=[{}] tile={}", inst.get_addr_name(), static_cast(inst.get_base_dram_address()), rank, @@ -41,7 +41,8 @@ std::string format_dma_inst_issued_detail(Instruction& inst) { tag_hex, fmt::join(inst.get_tile_stride(), ","), fmt::join(ts, ","), - fmt::join(tidx, ",")); + fmt::join(tidx, ","), + inst.get_tile_group()); } std::string format_dma_inst_issued_trace_line(Instruction& inst) { @@ -52,31 +53,35 @@ std::string 
format_instruction_detail_line(Instruction& inst) { const Opcode op = inst.get_opcode(); const std::string opname = opcode_to_string(op); if (op == Opcode::COMP) { - return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={})", + return fmt::format("{} (compute_type={} compute_cycle={} overlapping_cycle={} sa={} tile={})", opname, inst.get_compute_type(), inst.get_compute_cycle(), - inst.get_overlapping_cycle()); + inst.get_overlapping_cycle(), + inst.get_assigned_sa(), + inst.get_tile_group()); } if ((op == Opcode::MOVIN || op == Opcode::MOVOUT) && inst.is_async_dma()) { - return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])", + return fmt::format("{} (ASYNC subgraph_id={} addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})", opname, inst.subgraph_id, inst.get_addr_name(), format_tag_key_list_hex(inst.get_tag_id()), fmt::join(inst.get_tag_idx_list(), ","), - fmt::join(inst.get_tag_stride_list(), ",")); + fmt::join(inst.get_tag_stride_list(), ","), + inst.get_tile_group()); } if (op == Opcode::MOVIN || op == Opcode::MOVOUT) { - return fmt::format("{} (addr_name={})", opname, inst.get_addr_name()); + return fmt::format("{} (addr_name={} tile={})", opname, inst.get_addr_name(), inst.get_tile_group()); } - if (op == Opcode::BAR) { - return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}])", + if (op == Opcode::MEMORY_BAR) { + return fmt::format("{} (addr_name={} tag_id=[{}] tag_idx=[{}] tag_stride=[{}] tile={})", opname, inst.get_addr_name(), format_tag_key_list_hex(inst.get_tag_id()), fmt::join(inst.get_tag_idx_list(), ","), - fmt::join(inst.get_tag_stride_list(), ",")); + fmt::join(inst.get_tag_stride_list(), ","), + inst.get_tile_group()); } return opname; } diff --git a/TOGSim/src/Instruction.cc b/TOGSim/src/Instruction.cc index f236d160..d0471226 100644 --- a/TOGSim/src/Instruction.cc +++ b/TOGSim/src/Instruction.cc @@ -23,7 +23,8 @@ std::string 
opcode_to_string(Opcode opcode) { case Opcode::MOVIN: return "MOVIN"; case Opcode::MOVOUT: return "MOVOUT"; case Opcode::COMP: return "COMP"; - case Opcode::BAR: return "BAR"; + case Opcode::MEMORY_BAR: return "MEMORY_BAR"; + case Opcode::COMPUTE_BAR: return "COMPUTE_BAR"; default: return "Unknown"; } } @@ -60,6 +61,21 @@ void Instruction::add_child(std::shared_ptr child) { child_inst.insert(child); } +void Instruction::add_pipeline_child(std::shared_ptr child) { + child->inc_ready_counter(); + _pipeline_children.insert(child); +} + +void Instruction::release_pipeline_children() { + for (auto& c : _pipeline_children) { + // a COMPUTE_BAR child fences only its own dispatch -> it drains the max + // finish of the computes it gates, fed here as each one issues. + if (c->get_opcode() == Opcode::COMPUTE_BAR) c->update_fence_finish(finish_cycle); + c->dec_ready_counter(); + } + _pipeline_children.clear(); +} + void Instruction::inc_waiting_request() { _nr_waiting_request++; } diff --git a/TOGSim/src/Simulator.cc b/TOGSim/src/Simulator.cc index d987d787..03dd7bf9 100644 --- a/TOGSim/src/Simulator.cc +++ b/TOGSim/src/Simulator.cc @@ -184,6 +184,38 @@ void Simulator::icnt_cycle() { _icnt->cycle(); } +// Consecutive frozen cycles tolerated before declaring the sim wedged (spad too +// small). Generous so transient idle never false-fires; a true freeze is constant. +static constexpr uint64_t kWedgeThreshold = 5000; + +// Frozen-state guard: work remains (running()) but nothing is in flight to +// advance it -- the SRAM throttle can never satisfy a load because the kernel's +// working set exceeds the whole per-core spad (core_spad_size_kb too small). The +// state repeats every cycle, so after a margin error out instead of looping +// forever. `stuck` is function-local-static (one running sim at a time; it resets +// on any progress). 
+void Simulator::check_frozen() { + static uint64_t stuck = 0; + // In flight = anything that will produce a future state change: icnt/dram busy, + // a core with DMA/compute pending, or a tile still schedulable. + bool inflight = _icnt->running() || _dram->running(); + for (int id = 0; id < _n_cores && !inflight; id++) { + if (_cores[id]->has_inflight()) inflight = true; + else if (!get_partition_scheduler(id)->empty(id)) inflight = true; + } + if (running() && !inflight) { + if (++stuck > kWedgeThreshold) { + spdlog::error("[Simulator] simulation wedged at cycle {}: work remains but " + "nothing is in flight -- the per-core spad (core_spad_size_kb) " + "is too small to hold a kernel's working set. Increase it.", + _core_cycles); + exit(EXIT_FAILURE); + } + } else { + stuck = 0; + } +} + void Simulator::cycle() { while (running() || _core_cycles < 1) { set_cycle_mask(); @@ -198,6 +230,8 @@ void Simulator::cycle() { // Interconnect cycle if (IS_ICNT_CYCLE(_cycle_mask)) icnt_cycle(); + + check_frozen(); // spad-too-small guard (errors out if wedged) } for (auto &core: _cores) { core->check_tag(); diff --git a/TOGSim/src/TileGraphParser.cc b/TOGSim/src/TileGraphParser.cc index 5060d336..572062e0 100644 --- a/TOGSim/src/TileGraphParser.cc +++ b/TOGSim/src/TileGraphParser.cc @@ -543,7 +543,7 @@ std::vector> TileLoopNode::get_tiles_from_iter(TileGraphPa fmt::join(new_tag_stride_list, ", ")); std::shared_ptr inst = std::make_shared( - Opcode::BAR, 0, + Opcode::MEMORY_BAR, 0, 0, base_addr, std::vector(), std::vector(), 0, tag_list, new_tag_stride_list, accum_tag_list diff --git a/TOGSim/src/main.cc b/TOGSim/src/main.cc index 010826ef..a763cdd0 100644 --- a/TOGSim/src/main.cc +++ b/TOGSim/src/main.cc @@ -8,6 +8,8 @@ #include "Simulator.h" #include "TileGraphParser.h" #include "helper/CommandLineParser.h" +#include "togsim_loader.h" // P3 trace pipeline: run a compiled producer .so +#include "togsim_trace_bridge.h" // ... 
and bridge its trace to a TileGraph namespace fs = std::filesystem; namespace po = boost::program_options; @@ -104,6 +106,11 @@ int main(int argc, char** argv) { "models_list", "Path for the trace file (.trace)"); cmd_parser.add_command_line_option( "log_level", "Set for log level [trace, debug, info], default = info"); + cmd_parser.add_command_line_option( + "trace_so", "Path to a compiled trace producer .so (P3 trace pipeline)"); + cmd_parser.add_command_line_option( + "cycle_table", "Path to a 'cycleoverlapping' per-tile_id sidecar (TSV) " + "for --trace_so; falls back to a flat stub if omitted"); try { cmd_parser.parse(argc, argv); } catch (const CommandLineParser::ParsingError& e) { @@ -147,6 +154,47 @@ int main(int argc, char** argv) { exit(1); } + // P3 trace pipeline: if a compiled producer .so is given, run it, bridge the + // recorded trace to a TileGraph, and run the existing Simulator on it. + std::string trace_so_path; + cmd_parser.set_if_defined("trace_so", &trace_so_path); + if (!trace_so_path.empty()) { + const auto& cfg = simulator->get_hardware_config_yaml(); + int num_cores = cfg["num_cores"] ? cfg["num_cores"].as() : 1; + // First cut: stub tensor bases (real per-tensor addresses come later). + std::vector bases(16); + for (size_t i = 0; i < bases.size(); ++i) bases[i] = 0x100000ull * (i + 1); + // Cycle table: load the per-tile_id TSV sidecar if given, else a flat stub. 
+ std::vector cyc, ovl; + std::string cycle_table_path; + cmd_parser.set_if_defined("cycle_table", &cycle_table_path); + if (!cycle_table_path.empty()) { + std::ifstream ct(cycle_table_path); + if (!ct.is_open()) { spdlog::error("[TOGSim] cannot open cycle_table {}", cycle_table_path); exit(1); } + int64_t c, o; + while (ct >> c >> o) { cyc.push_back(c); ovl.push_back(o); } + spdlog::info("[TOGSim-trace] loaded cycle table: {} tiles from {}", cyc.size(), cycle_table_path); + } else { + cyc.assign(256, 128); + ovl.assign(256, 0); + } + auto run = togsim::run_producer(trace_so_path.c_str(), nullptr, 0, + bases.data(), (int)bases.size(), + cyc.data(), ovl.data(), (int)cyc.size(), + num_cores); + if (!run.ok) { spdlog::error("[TOGSim] trace producer run failed"); exit(1); } + spdlog::info("[TOGSim-trace] recorded {} instructions", run.trace.size()); + auto tg = trace_to_tilegraph(run, "trace_kernel"); + tg->set_arrival_time(simulator->get_core_cycle()); + tg->set_kernel_id(0); + simulator->enqueue_graph(0, std::move(tg)); + simulator->run_simulator(); + spdlog::info("[TOGSim-trace] Total cycles: {}", simulator->get_core_cycle()); + spdlog::info("Simulation finished"); + simulator->print_core_stat(); + return 0; + } + // Get trace file path cmd_parser.set_if_defined("models_list", &trace_file_path); diff --git a/TOGSim/src/togsim_runtime.cc b/TOGSim/src/togsim_runtime.cc new file mode 100644 index 00000000..86de081e --- /dev/null +++ b/TOGSim/src/togsim_runtime.cc @@ -0,0 +1,199 @@ +// togsim_runtime.cc +// ----------------------------------------------------------------------------- +// C6 runtime + loader for the compiled trace producer (P3 task 5). Implements +// the producer ABI (togsim_runtime.h) and the TOGSim-side loader +// (togsim_loader.h). See docs/design/togsim_cpp_trace.md sec 5.3 / 9.6.1 / 9.7. +// +// The producer `.so` calls the extern "C" togsim_* functions below; each one +// records a TraceRec on the EmitCtx. 
EmitCtx is the opaque type the producer +// only ever passes back to us. This is the "materializing sink": it resolves +// addresses and per-tile cycles into a recorded instruction stream. Wiring the +// stream into the existing timing core (Core/Simulator) is the remaining step. +// ----------------------------------------------------------------------------- + +#include "togsim_loader.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// Full definition of the opaque handle from togsim_runtime.h. The producer holds +// only EmitCtx* and never dereferences it. +struct EmitCtx { + // inputs supplied by the loader + const uint64_t* tensor_base = nullptr; + int32_t n_tensors = 0; + const int64_t* cyc = nullptr; // tile_id -> cycle + const int64_t* ovl = nullptr; // tile_id -> overlapping_cycle + int32_t n_tiles = 0; + int32_t num_cores = 1; + // mutable run state + int32_t rr = 0; // round-robin core cursor + int32_t cur_core = -1; // current work-item's core + std::vector trace; +}; + +namespace { +inline togsim::TraceRec blank(togsim::TraceRec::Kind k, int32_t core) { + togsim::TraceRec r{}; + r.kind = k; + r.core = core; + return r; +} +} // namespace + +extern "C" { + +int32_t togsim_abi_version(void) { return TOGSIM_ABI_VERSION; } + +void togsim_dispatch(EmitCtx* ctx, togsim_tile_fn fn, int64_t* iv, int32_t n_iv) { + // Higher-order work-item wrapper (sec 9.3): round-robin a core (the producer + // never sees num_cores), bracket the work-item with TILE_BEGIN/TILE_END, and + // run its body. The work-item SCOPE is exactly this fn call -- the begin/end + // are runtime-owned, so the producer never relies on an implicit "ops until + // the next alloc" boundary. The ops fn emits record under ctx->cur_core. + ctx->cur_core = ctx->num_cores > 0 ? 
(ctx->rr++ % ctx->num_cores) : 0; + ctx->trace.push_back(blank(togsim::TraceRec::TILE_BEGIN, ctx->cur_core)); + fn(ctx, iv, n_iv); + ctx->trace.push_back(blank(togsim::TraceRec::TILE_END, ctx->cur_core)); +} + +void togsim_dma(EmitCtx* ctx, int32_t dir, int32_t arg_id, + uint64_t offset, int32_t ndim, const int64_t* dims, + const int64_t* strides, int32_t elem_bits, + int32_t is_async, int32_t tag_id, uint64_t tag_slot, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write) { + uint64_t base = (arg_id >= 0 && arg_id < ctx->n_tensors) + ? ctx->tensor_base[arg_id] : 0; + uint64_t addr = base + offset * (uint64_t)(elem_bits / 8); + togsim::TraceRec r = blank(togsim::TraceRec::DMA, ctx->cur_core); + r.dir = dir; r.arg_id = arg_id; r.elem_bits = elem_bits; + r.is_async = is_async; r.addr = addr; r.tag_id = tag_id; r.tag_slot = tag_slot; + for (int32_t i = 0; i < ndim; ++i) { + if (dims) r.dims.push_back(dims[i]); + if (strides) r.strides.push_back(strides[i]); + } + for (int32_t i = 0; i < n_read; ++i) r.read_bufs.push_back(read_bufs[i]); + for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]); + ctx->trace.push_back(r); +} + +void togsim_compute(EmitCtx* ctx, uint64_t tile_id, int32_t compute_type, + int32_t ndim, const int64_t* dims, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write) { + (void)ndim; (void)dims; + togsim::TraceRec r = blank(togsim::TraceRec::COMPUTE, ctx->cur_core); + r.tile_id = tile_id; + r.compute_type = compute_type; + for (int32_t i = 0; i < n_read; ++i) r.read_bufs.push_back(read_bufs[i]); + for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]); + if (ctx->cyc && (int32_t)tile_id < ctx->n_tiles) r.cycle = ctx->cyc[tile_id]; + if (ctx->ovl && (int32_t)tile_id < ctx->n_tiles) r.overlapping = ctx->ovl[tile_id]; + ctx->trace.push_back(r); +} + +void togsim_memory_barrier(EmitCtx* ctx, int32_t tag_id, uint64_t tag_slot, + const 
int64_t* write_bufs, int32_t n_write) { + togsim::TraceRec r = blank(togsim::TraceRec::MEMORY_BAR, ctx->cur_core); + r.tag_id = tag_id; r.tag_slot = tag_slot; + for (int32_t i = 0; i < n_write; ++i) r.write_bufs.push_back(write_bufs[i]); + ctx->trace.push_back(r); +} + +void togsim_compute_barrier(EmitCtx* ctx) { + ctx->trace.push_back(blank(togsim::TraceRec::COMPUTE_BAR, ctx->cur_core)); +} + +} // extern "C" + +namespace togsim { + +RunResult run_producer(const char* so_path, + const int64_t* shape_args, int32_t n_shape, + const uint64_t* tensor_base, int32_t n_tensors, + const int64_t* cyc, const int64_t* ovl, int32_t n_tiles, + int32_t num_cores) { + RunResult res; + void* lib = dlopen(so_path, RTLD_NOW | RTLD_GLOBAL); + if (!lib) { fprintf(stderr, "togsim: dlopen failed: %s\n", dlerror()); return res; } + auto emit = (void (*)(EmitCtx*, int64_t*, int32_t))dlsym(lib, "togsim_kernel"); + if (!emit) { fprintf(stderr, "togsim: dlsym togsim_kernel failed: %s\n", dlerror()); dlclose(lib); return res; } // release the handle: nothing from the library is used on this path + + EmitCtx ctx; + ctx.tensor_base = tensor_base; ctx.n_tensors = n_tensors; + ctx.cyc = cyc; ctx.ovl = ovl; ctx.n_tiles = n_tiles; + ctx.num_cores = num_cores > 0 ? num_cores : 1; + emit(&ctx, (int64_t*)shape_args, n_shape); + + res.ok = true; + res.trace = std::move(ctx.trace); + return res; +} + +SimResult simulate(const RunResult& run, const TimingParams& params) { + SimResult out; + std::unordered_map dma_free; // DMA-engine free time, per core + std::unordered_map comp_free; // compute free time, per core + std::unordered_map prev_comp; // prev compute finish (overlap), per core + std::map, uint64_t> tag_finish; // (tag_id,tag_slot) -> finish + std::vector pending; // barrier-resolved deps since last compute + + for (const auto& t : run.trace) { + const int c = t.core; + switch (t.kind) { + case TraceRec::DMA: { + // DMAs serialize on the core's DMA engine (overlap compute -> separate + // timeline). finish = issue + latency, recorded under the runtime tag. 
+ uint64_t start = dma_free[c]; + uint64_t fin = start + params.dma_latency; + dma_free[c] = fin; + tag_finish[{t.tag_id, t.tag_slot}] = fin; + out.n_dma++; + break; + } + case TraceRec::MEMORY_BAR: { + // the explicit async-DMA sync: gate the next compute on the paired dma's + // data-arrival, found by the runtime tag (tag_id, tag_slot). + auto it = tag_finish.find({t.tag_id, t.tag_slot}); + if (it != tag_finish.end()) pending.push_back(it->second); + break; + } + case TraceRec::COMPUTE: { + uint64_t deps = 0; + for (uint64_t f : pending) deps = std::max(deps, f); + pending.clear(); + uint64_t start = std::max(comp_free[c], deps); + uint64_t fin; + auto pit = prev_comp.find(c); + if (pit != prev_comp.end()) { + uint64_t prev = pit->second; + uint64_t tail = prev > start ? prev - start : 0; // prev still running. NOTE(review): comp_free[c] and prev_comp[c] are always assigned the same fin and start >= comp_free[c], so tail is always 0 here and t.overlapping never shortens fin -- confirm whether start was meant to be deps only + uint64_t overlapped = std::min(tail, (uint64_t)t.overlapping); + fin = std::max(start, prev) + (uint64_t)t.cycle - overlapped; + } else { + fin = start + (uint64_t)t.cycle; + } + comp_free[c] = fin; + prev_comp[c] = fin; + out.n_compute++; + break; + } + case TraceRec::TILE_BEGIN: + case TraceRec::TILE_END: + case TraceRec::COMPUTE_BAR: + break; // work-item boundary / compute fence: no cost in this reference timer + } + } + for (auto& kv : dma_free) out.total_cycle = std::max(out.total_cycle, kv.second); + for (auto& kv : comp_free) out.total_cycle = std::max(out.total_cycle, kv.second); + return out; +} + +} // namespace togsim diff --git a/TOGSim/src/togsim_trace_bridge.cc b/TOGSim/src/togsim_trace_bridge.cc new file mode 100644 index 00000000..e13af2d7 --- /dev/null +++ b/TOGSim/src/togsim_trace_bridge.cc @@ -0,0 +1,278 @@ +// togsim_trace_bridge.cc -- see togsim_trace_bridge.h +#include "togsim_trace_bridge.h" + +#include +#include +#include + +#include "Tile.h" +#include "Instruction.h" + +namespace { + +// `uniq` is a per-DMA-record unique tag-key id minted by the caller. 
The Core +// tag table keys completion on [addr_id, ..., sum(tag_idx*stride)]; using `uniq` +// as addr_id makes every reduction iteration of one static dma get a DISTINCT +// key -- so multi-tile-K (and conv, whose reduction is the kh*kw*C nest) do not +// collide, with no coordinate enumeration. The matching memory_barrier reuses +// the same `uniq` (current-load map per (tag_id, tag_slot), see +// trace_to_tilegraph), so the table still pairs them. This works because the +// recorded stream is already per-iteration (the producer ran the loops) -- +// unlike a compile-time event_id. `tag_idx` (the subtile slot) is retained for +// the SRAM double-buffer model. +// +// FIXME(semantics): the per-iteration tag is still reconstructed HERE from the +// record order. The producer IR now DOES carry a per-iteration tag -- dma_fine_- +// grained emits a fresh tag memref.alloc just before each coarse load (rewiring +// its dma_wait), so successive reduction iterations allocate distinct tags -- but +// build_skeleton collapses that to one static tag_id (it DCEs the alloc and keys +// togsim.dma by the alloc's static identity), so this bridge still needs `uniq` +// to tell iterations apart at runtime. The faithful finish is to thread the +// per-iteration alloc identity through build_skeleton as an SSA tag handle on the +// togsim.dma / togsim.memory_barrier (then `uniq` here is unnecessary). +std::shared_ptr make_dma(const togsim::TraceRec& t, int64_t uniq) { + Opcode op = (t.dir == 1) ? 
Opcode::MOVOUT : Opcode::MOVIN; + std::vector tile_size(t.dims.begin(), t.dims.end()); + std::vector tile_stride(t.strides.begin(), t.strides.end()); + std::vector tag_idx{(int64_t)t.tag_slot}; + std::vector tag_stride{1}; + auto inst = std::make_shared( + op, /*compute_cycle=*/0, /*num_parents=*/0, /*dram_addr=*/t.addr, + tile_size, tile_stride, (size_t)t.elem_bits, tag_idx, tag_stride, + /*accum_tag_idx_list=*/std::vector{}); + inst->set_is_async(t.is_async != 0); + inst->set_addr_name("tag" + std::to_string(uniq), uniq); + inst->prepare_tag_key(); + return inst; +} + +// A MEMORY_BAR carrying the SAME `uniq` tag key as the async dma it gates -- the +// Core's tag table signals it at the dma's DATA-ready (resp-complete), unlike a +// raw add_child which the async dma releases at issue-complete. +std::shared_ptr make_mem_bar(const togsim::TraceRec& t, int64_t uniq) { + auto bar = std::make_shared( + Opcode::MEMORY_BAR, 0, 0, 0, + std::vector{}, std::vector{}, 0, + std::vector{(int64_t)t.tag_slot}, std::vector{1}, + std::vector{}); + bar->set_addr_name("tag" + std::to_string(uniq), uniq); + bar->prepare_tag_key(); + return bar; +} + +std::shared_ptr make_compute(const togsim::TraceRec& t) { + auto inst = std::make_shared( + Opcode::COMP, /*compute_cycle=*/(cycle_type)t.cycle, /*num_parents=*/0, + /*dram_addr=*/0, std::vector{}, std::vector{}, /*elem_bits=*/0, + std::vector{}, std::vector{}, std::vector{}); + inst->set_overlapping_cycle((cycle_type)t.overlapping); + inst->set_compute_type(t.compute_type); // route to VPU vs systolic array + return inst; +} + +} // namespace + +std::unique_ptr trace_to_tilegraph(const togsim::RunResult& run, + const std::string& name) { + using togsim::TraceRec; + auto tg = std::make_unique(name, name); + // Empty cache plan (no L2/CMEM persistence) -- append_subgraph propagates it + // to each subgraph, and DMA::is_cacheable dereferences it, so it must be a + // valid (if empty) IntervalTree rather than null. 
+ tg->init_cache_plan({}); + + std::shared_ptr sg; + std::shared_ptr tile; + // Explicit dependency DAG (sec 10): a reader depends on the last writer of each + // SRAM buffer it reads. Scoped per work-item (reset at each dispatch) -- buffers + // are work-item-local, so distinct work-items are independent (-> parallel). + std::map> last_writer; // buffer id -> producer + // An async dma is paired with its explicit memory_barrier(s) by the runtime tag + // (tag_id, tag_slot). It is 1 load : N barriers (the load happens once per + // reduction iteration; each consumer in that iteration is preceded by a wait on + // the same tag), so we track the CURRENT (most recent) load per (tag_id, + // tag_slot) -- like last_writer for a buffer -- not a FIFO. Each load gets a + // fresh `uniq` Core key, so successive reduction iterations (multi-tile-K, conv) + // never collide in the tag table; the iteration's barriers reuse that load's + // uniq. Correct because the load nest and its consumer nest run in order within + // the reduction body (no cross-iteration prefetch). Scoped per work-item. + std::map, + std::pair>> current_dma; + int64_t next_tag = 0; // mints a unique Core tag key per dma record + int cur_tile_group = -1; // work-item index, bumped per TILE_BEGIN (trace grouping) + // Async compute (matmul/preload): issued and pipelined on the systolic array; + // they do not block each other. A store then needs the drained result, so it + // FLUSHes -- waits all outstanding async compute before running (like a fence + // after async ops). No per-op completion events; one barrier before the store. 
+ std::vector> outstanding_async; + std::shared_ptr pending_bar; // last COMPUTE_BAR fence, awaited by the next store + auto is_async_compute = [](int ct) { return ct == 1 || ct == 2; }; // matmul / preload + + auto flush = [&]() { + if (sg && tile) { + sg->add_tile(tile); + tile->set_owner(sg); + tg->append_subgraph(sg); + } + sg.reset(); + tile.reset(); + last_writer.clear(); + current_dma.clear(); + next_tag = 0; + outstanding_async.clear(); + pending_bar.reset(); + }; + + // Build edges from the recorded read/write buffer sets: reader <- last writer of + // each buffer it reads (the virtual SA_WEIGHTS buffer carries preload->matmul; + // the Y_spad accumulator carries the reduction chain; the spads carry load-> + // compute). No in-order chain, no tag matching, no op heuristics. + // sec 10.7 occupancy/latency split. An edge from a systolic-array producer + // (preload=2 or matmul=1) to a matmul (1) is an OCCUPANCY dependency: the + // successor overlaps the producer on the SA pipeline, so use add_pipeline_child + // (released when the producer ISSUES). Every other edge is a LATENCY + // dependency (the consumer needs the producer's result): load->compute, + // init->matmul, matmul->store -> add_child (released at the producer's finish). 
+ const int MATMUL_CT = 1, PRELOAD_CT = 2; + auto link = [&](std::shared_ptr inst, + const std::vector& reads, + const std::vector& writes) { + for (int64_t b : reads) { + auto it = last_writer.find(b); + if (it == last_writer.end()) continue; + int pct = it->second->get_compute_type(); + if (inst->get_compute_type() == MATMUL_CT && (pct == MATMUL_CT || pct == PRELOAD_CT)) + it->second->add_pipeline_child(inst); // SA pipeline -> occupancy (overlap) + else + it->second->add_child(inst); // data/result -> latency (full wait) + } + for (int64_t b : writes) last_writer[b] = inst; + tile->append_instuction(inst); + }; + + // --- SRAM-capacity tracking (buffer-version allocations, sec 10.x) --- + // A coarse tile = one version of its buffer; the fine DMAs that fill it share + // one allocation, freed once all the version's consumers have issued (refcount + // -> 0). NOT reset in flush(): the spad is one physical per-core resource, so a + // buffer reused by the next reduction iter / work-item is a NEW version that + // must wait for the old one to free (WAR / double-buffer). Tracked buffers are + // the DMA-loaded ones; the accumulator / virtual SA-weights are never written + // by a load, so cur_alloc has no entry and they are skipped. (v1: single-core; + // multi-core would key cur_alloc/vers by (core, buf).) 
+ int64_t next_alloc = 0; + std::map cur_alloc; // buf -> current version id + std::map open_ver; // buf -> version still accepting loads + struct Ver { std::vector> loads, readers; }; + std::map vers; + auto sram_on_load = [&](int64_t b, const std::shared_ptr& ld) { + if (!cur_alloc.count(b) || !open_ver[b]) { // a read closed it -> new version + cur_alloc[b] = next_alloc++; + open_ver[b] = true; + vers[cur_alloc[b]] = {}; + } + ld->set_sram_alloc(cur_alloc[b]); + vers[cur_alloc[b]].loads.push_back(ld); + }; + auto sram_on_read = [&](int64_t b, const std::shared_ptr& rd) { + auto it = cur_alloc.find(b); + if (it == cur_alloc.end()) return; // not a load buffer -> untracked + vers[it->second].readers.push_back(rd); + open_ver[b] = false; // next write starts a new version + }; + auto sram_finalize = [&]() { // tag only each version's LAST reader + for (auto& kv : vers) { + auto& v = kv.second; + if (v.readers.empty()) { // no consumer -> never freed: untrack + for (auto& ld : v.loads) ld->set_sram_alloc(-1); + continue; + } + v.readers.back()->add_sram_release(kv.first); // it frees the whole version on issue + } + }; + + for (const auto& t : run.trace) { + if (t.kind == TraceRec::TILE_BEGIN) { + // togsim_dispatch opened a work-item -> new subgraph (bound to its core) + + // tile. The scope runs until the matching TILE_END (the dispatch wrapper + // brackets the tile fn call), not until the next begin. 
+ flush(); + sg = std::make_shared(); + sg->set_core_id(t.core); + tile = std::make_shared(Tile::Status::INITIALIZED); + cur_tile_group++; + continue; + } + if (t.kind == TraceRec::TILE_END) { + flush(); // close the work-item explicitly (scope = the tile fn call) + continue; + } + if (!tile) continue; // defensive: ops before the first TILE_BEGIN + + if (t.kind == TraceRec::DMA) { + int64_t uniq = next_tag++; // fresh Core tag key per dma record + auto inst = make_dma(t, uniq); + inst->set_tile_group(cur_tile_group); + size_t numel = 1; // SRAM footprint (ready-tile ordering) + for (auto d : t.dims) numel *= (size_t)d; + tile->inc_required_sram_size(numel * (t.elem_bits / 8)); + if (t.dir == 1) { // STORE + if (pending_bar) { + // after a compute fence: wait it (drains the async matmuls) -- covers + // the accumulator read, so no per-buffer read edge. + pending_bar->add_child(inst); + pending_bar.reset(); + for (int64_t b : t.write_bufs) last_writer[b] = inst; + tile->append_instuction(inst); + } else { + link(inst, t.read_bufs, t.write_bufs); + } + for (int64_t b : t.read_bufs) sram_on_read(b, inst); // store frees what it drains + } else { // LOAD + tile->append_instuction(inst); + // async load: record it as the CURRENT load for this (tag_id, tag_slot) + // with its fresh uniq; the barriers in this reduction iteration reuse that + // uniq (1 load : N barriers). A new iteration's load overwrites it with a + // new uniq -> distinct tag key, no collision. last_writer = the dma for now; + // the barrier overwrites it so consumers gate on data arrival. A sync load + // has no barrier and blocks to arrival itself. + if (t.is_async) current_dma[{t.tag_id, t.tag_slot}] = {uniq, inst}; + for (int64_t b : t.write_bufs) last_writer[b] = inst; + for (int64_t b : t.write_bufs) sram_on_load(b, inst); // occupy spad + } + } else if (t.kind == TraceRec::MEMORY_BAR) { + // the explicit async-DMA sync (the original dma_wait). 
Pair with the CURRENT + // load for this (tag_id, tag_slot), reusing its uniq Core key so the dma and + // bar pair in the tag table; the dma releases the bar at issue-complete + // (add_child), then the bar parks on the tag until data-ready (resp-complete, + // set_tag_finish). Consumers of the loaded buffer then gate on the bar. + auto it = current_dma.find({t.tag_id, t.tag_slot}); + int64_t uniq = next_tag++; // fallback if unpaired + std::shared_ptr dma_inst; + if (it != current_dma.end()) { uniq = it->second.first; dma_inst = it->second.second; } + auto bar = make_mem_bar(t, uniq); + bar->set_tile_group(cur_tile_group); + if (dma_inst) dma_inst->add_child(bar); + tile->append_instuction(bar); + for (int64_t b : t.write_bufs) last_writer[b] = bar; + } else if (t.kind == TraceRec::COMPUTE) { + auto inst = make_compute(t); + inst->set_tile_group(cur_tile_group); + link(inst, t.read_bufs, t.write_bufs); + for (int64_t b : t.read_bufs) sram_on_read(b, inst); // frees the tiles it consumes + if (is_async_compute(t.compute_type)) outstanding_async.push_back(inst); + } else if (t.kind == TraceRec::COMPUTE_BAR) { + // explicit compute fence: ready once all outstanding async compute have + // ISSUED (pipeline-child release); the Core then waits the SA pipelines to + // drain before it finishes (-> the store it gates). 
+ auto bar = std::make_shared(Opcode::COMPUTE_BAR); + bar->set_tile_group(cur_tile_group); + for (auto& a : outstanding_async) a->add_pipeline_child(bar); + outstanding_async.clear(); + tile->append_instuction(bar); + pending_bar = bar; + } + } + flush(); + sram_finalize(); // readers per version are now final -> set each version's refcount + return tg; +} diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml index 6d2537d9..7fea374b 100644 --- a/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv2.yml @@ -22,3 +22,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml index f830419b..3a96b588 100644 --- a/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml +++ b/configs/systolic_ws_128x128_c1_booksim_tpuv3.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml index 1a8c60f6..41e267b6 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv2.yml @@ -25,3 +25,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml index ff976784..397f0fb7 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml index 2ed1bb12..f080fc69 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_half.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml index 1bcc9bb3..f89661b8 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv3_timing_only.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 8 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml index 39d195b0..ca69d930 100644 --- a/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml +++ b/configs/systolic_ws_128x128_c1_simple_noc_tpuv4.yml @@ -28,3 +28,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml index bf01913b..b7b03e7a 100644 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml index 8c71c528..903ffcbc 100644 --- a/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml +++ b/configs/systolic_ws_128x128_c2_booksim_tpuv3_bw_quarter.yml @@ -34,3 +34,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml index d058f188..6a234017 100644 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3.yml @@ -28,3 +28,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml index 019a0f0f..f0546e56 100644 --- a/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml +++ b/configs/systolic_ws_128x128_c2_chiplet_tpuv3_xnuma.yml @@ -27,3 +27,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml index 348babae..08ec26ac 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv2.yml @@ -25,3 +25,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml index a0985aec..a6e073e9 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3.yml @@ -26,3 +26,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml index 166e2e25..5436b3e8 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_ils.yml @@ -29,3 +29,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml index 6119e83d..d928f9d3 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv3_partition.yml @@ -30,3 +30,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. 
+core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml index 9100c22a..dd9dfac7 100644 --- a/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml +++ b/configs/systolic_ws_128x128_c2_simple_noc_tpuv4.yml @@ -28,3 +28,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core VMEM (vector/scratchpad) size: TPUv2/v3/v4 = 16 MB. +core_spad_size_kb: 16384 diff --git a/configs/systolic_ws_8x8_c1_booksim.yml b/configs/systolic_ws_8x8_c1_booksim.yml index f46d380e..1593e148 100644 --- a/configs/systolic_ws_8x8_c1_booksim.yml +++ b/configs/systolic_ws_8x8_c1_booksim.yml @@ -23,3 +23,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB. +core_spad_size_kb: 1024 diff --git a/configs/systolic_ws_8x8_c1_simple_noc.yml b/configs/systolic_ws_8x8_c1_simple_noc.yml index 1be24b85..b2d16c6a 100644 --- a/configs/systolic_ws_8x8_c1_simple_noc.yml +++ b/configs/systolic_ws_8x8_c1_simple_noc.yml @@ -24,3 +24,6 @@ codegen_external_mapping_file: '' codegen_autotune_max_retry: 10 codegen_autotune_template_topk: 4 codegen_compiler_optimization: all + +# Per-core spad: 8x8 array, 128 KB x 8 = 1 MB. +core_spad_size_kb: 1024 diff --git a/docs/design/togsim_cpp_trace.md b/docs/design/togsim_cpp_trace.md new file mode 100644 index 00000000..9565bdfb --- /dev/null +++ b/docs/design/togsim_cpp_trace.md @@ -0,0 +1,1006 @@ +# TOGSim C++ Trace Generation — Design Proposal + +**Status:** Implemented end-to-end through the real timing Core (256^3 GEMM); see +§11 for remaining work. +**Branch:** `feature/togsim-cpp-trace` +**Scope:** Replace the timing-path TOG producer (MLIR → Python-dict → ONNX → C++ +parser) with a compiled, shape-parametric trace producer (MLIR → C++ → `.so`). 
+TOGSim's timing core is preserved. + +**Note on the sync mechanism (read before §3, §5, §9).** An earlier version of +this design synchronized an asynchronous DMA with the consumer that waits on its +data using a compile-time integer `event_id` — one id per static `togsim.dma`/ +`togsim.wait` op, paired through a heap "event buffer" of opaque handles. That +mechanism was *removed*: a single static `togsim.dma` op executes once per loop +iteration, each iteration writing a different runtime tag slot, so one +compile-time id per op cannot represent the per-iteration pairing. The current +design (ABI v11) pairs an async DMA with its sync point by the **runtime tag +slot** instead. Sections below have been rewritten to the runtime-tag model; +where a section still mentions `event_id` / event handles / `togsim_wait` / +`togsim_signal`, it is flagged as the superseded design, not current behavior. + +--- + +## 1. Motivation + +The current Tile-Operation Graph (TOG) pipeline has accumulated structural debt +that blocks where we want to go (notably dynamic shape for LLM decode / MoE): + +1. **"ONNX in name only."** TOG is serialized as ONNX, but every op is a custom + `torchsim_*` attribute. We pay ONNX's costs (rigid schema, protobuf, + stringly-typed attribute encoding) and use none of its interop value + (onnxruntime, standard ops, netron). The schema lives in three places — + Python dict (`extension_op.py`), ONNX (`AsmParser/onnx_utility.py`), C++ + (`TOGSim/.../TileGraphParser`) — and drifts. + +2. **Synchronization is ad-hoc and DMA-specific.** Completion tracking is a + counting-semaphore in disguise, but unnamed and tangled: + - `DMA.h`: `tag_table[subgraph][tag_key] -> uint32` with overloaded magic + values (`0` pending, `1` signaled, `>1` consumed-count, `-1` sparse) plus a + parallel `waiters` wait-queue. 
The `tag_key` is a hand-rolled + content-addressed vector computed from loop indices/strides (`calc_tag`), + with implicit fallbacks (push `0` when an index is missing, dedup by + silently `continue`-ing). + - A *second*, separate dependency mechanism — `Instruction::ready_counter` + + `child_inst` graph edges — handles structural ordering. + - Net: one concept ("an async op completed; a consumer may proceed") is + expressed two different ways, and the event-like one only works for DMA. + +3. **Static shape is baked in.** `build_tog._affine_for_bounds` resolves loop + bounds to constants (`_const_index_value`). The graph is fully materialized + per static shape, so dynamic shape forces recompile-per-shape — pathological + for decode (a new `seq_len` every step) and MoE (variable expert load). + +4. **Loop-flattening hackery.** Much of the roughness (`loop_end` tricks, + `calc_tag`, dedup-by-skip, magic offsets) exists only to flatten loop nests + into a static graph. + +See [Appendix A](#appendix-a-current-state-references) for file:line references. + +## 2. Key idea: trace-driven → execution-driven + +Instead of materializing a flattened graph, **TOG becomes a stream emitted by +*running* a shape-parametric producer.** The producer is C++ compiled from the +kernel's MLIR; it keeps loops as loops (with symbolic bounds) and calls a small +**event-based API**. Each API call emits one trace record = one modeled +instruction. TOGSim `dlopen`s the producer `.so`, injects a callback context +that records and times the stream. + +This directly resolves the four problems: + +| Problem | Resolution | +|---|---| +| ONNX-in-name-only / 3-place schema | The API signature is the single contract. No ONNX. | +| DMA-only, ad-hoc sync | An async DMA and the consumer that waits on its data are paired at runtime by the tile's tag slot, through the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). 
The DMA signals the tag when its data arrives; an explicit `togsim.memory_barrier` waits on it and becomes the last-writer of the loaded buffer, so consumers gate on data arrival. No content-hashed `calc_tag`, no magic values. | +| Static shape | Loop bounds flow from MLIR as-is; symbolic bounds become native loop bounds in C++, so trip count is dynamic. | +| Loop-flatten hacks | Loops stay loops; the trace is generated by executing them. `calc_tag`/dedup disappear. | + +It is *not* a dynamic hardware scheduler: control flow is still statically +emitted by the compiler. The `.so` is a deterministic **trace generator**, not a +timing model — it keeps the trace-as-data boundary, so TOGSim's timing core is +untouched. + +## 3. Core algebra + +Small, orthogonal primitives. Everything else is composition (Layer-1 helpers +like `double_buffered_loop`, not IR primitives). + +- `dma(dir, arg_id, offset, shape, is_async, tag_id, tag_slot, …)` — + `dir ∈ {LOAD, STORE}`. Returns void. A **synchronous** (non-async) DMA is + blocking: it finishes when its data arrives, and consumers depend on it + directly. An **async** DMA returns control immediately and signals its tag at + data arrival (DMA response-complete); a later `memory_barrier` is the explicit + point that waits on it. +- `compute(tile_id, dims…)` — references a fixed-size tile kernel; cost is looked + up (§6), not computed here. +- `memory_barrier(tag_id, tag_slot, write_bufs)` — the explicit async-DMA sync. + It waits until the async DMA carrying the same `(tag_id, tag_slot)` has + delivered its data, then becomes the last-writer of the loaded buffer so + consumers gate on data arrival. It is the original `memref.dma_wait` mapped + through from the source IR, not a synthesized barrier. +- `compute_barrier()` — a compute fence inserted before a store, so the store + sees the drained accumulator. 
This is the **one** remaining auto-inserted + barrier; it is marked FIXME in the code as something that should also become + explicit in the source IR later (§10.7.3). +- **Control flow lives in the producer** — ordinary `for`/`if`/`while` with + runtime bounds. Loop types (normal/parallel/accumulation/inner) and dynamic + shape are just producer loops; the emitted trace is already specialized. + +Two distinct things share the word "tag", and the design uses **both together** +as the dma↔barrier pairing key: + +- **`tag_id`** — the identity of a DMA's *tag memref*. It plus the runtime + `tag_slot` index identifies which async DMA a `memory_barrier` is waiting on. +- **`tag_slot`** — the SRAM tile slot the loaded tile occupies (the + double-buffer / SRAM-capacity index). It is *also* part of the pairing key + because each load's tile maps to its own slot. The slot is **subtile-only**: + `lower_to_vcix` writes the dma_wait tag index with a `-acc_iv` term for each + accumulation (reduction) loop var — a sentinel marking the reduction axis, not + an arithmetic offset — and `build_skeleton` strips those terms so a + `memory_barrier` waits on the same slot its async load wrote. (Mirrors legacy + `TileGraphParser`, which skips stride -1; reduction iterations are told apart + by the per-iteration tag alloc + a fresh per-record Core key in the bridge, not + by the slot.) Without the strip, the producer evaluates `-acc_iv` to a negative + slot at reduction iteration > 0 and the pairing fails on subtile + multi-tile-K. + +Pairing is done at runtime by the existing TOGSim Core tag table: the async DMA +calls `prepare_tag_key` and `set_tag_finish` (signal at data arrival), the +`memory_barrier` calls `register_tag_waiter` (wait on `(tag_id, tag_slot)`). +A synchronous DMA needs no barrier — it blocks until data arrival itself. 
+ +> **Superseded.** An earlier version used a neutral `event` completion token +> (freely allocated, not tied to memory) with `signal`/`wait`/`wait_all` +> primitives. That has been removed in favor of the runtime-tag mechanism above. + +## 4. Decisions (locked) + +| Axis | Decision | +|---|---| +| Input MLIR | Use the **given MLIR as-is**. Do not touch inductor / MLIR templates / shape plumbing. Whatever bounds the MLIR carries (const or symbolic) pass through verbatim. | +| MLIR → C++ | **EmitC dialect + `mlir-translate --mlir-to-cpp`** (upstream). | +| `.so` ↔ TOGSim | **`dlopen` + `EmitCtx` callback** (execution-driven). The ABI boundary is the main design surface. | +| `.so` role | **Timing trace only.** Functional correctness stays on the existing Spike/LLVM path. Strip every op without a timing dependency; keep loop skeleton + API ops + ops feeding bounds/addresses. | +| Compute cycle | A **separate annotation pass** reuses the existing **sample-mode** to produce a **precomputed `tile_id → cycle` table**, looked up at runtime. | +| Dynamic shape | Falls out of symbolic loop bounds in the MLIR. Per-tile cost is static (tiles are fixed-size); only trip count is dynamic. | + +## 5. Architecture + +### 5.1 Artifacts (per kernel) + +- **Trace `.so`** — compiled from the skeleton+API MLIR. Shape-parametric: + symbolic bounds become C++ function parameters. Calls the runtime API + (`togsim_dma`, `togsim_compute`, `togsim_memory_barrier`, …). +- **Cycle table** — `tile_id → cycle`, produced by the annotation pass. + +### 5.2 Pipeline (input = given MLIR) + +``` +given MLIR (affine/scf.for + memref.dma_start/dma_wait + vcix/vector compute) +│ +├── Branch A (trace): +│ C2 build_skeleton pass (reuse build_tog traversal) +│ • affine/scf.for kept, bounds as-is (symbolic preserved) +│ • dma_start → togsim.dma(... 
tag_id, %tag[%idx], is_async) +│ • dma_wait → togsim.memory_barrier(tag_id, %tag[%idx], write_bufs) +│ • compute block → togsim.compute(tile_id, dims) +│ • DCE: drop ops with no dependency to loop/address/API operands +│ → C4 togsim→emitc lowering (togsim.* → emitc.call_opaque; +│ convert-scf/arith-to-emitc; func args incl. symbolic shapes) +│ → mlir-translate --mlir-to-cpp +│ → C5 compile → trace .so (cached by kernel key) +│ +└── Branch B (cost): + C3 annotation pass over the same MLIR + • extract per-tile compute bodies, assign tile_id + • run through existing sample-mode → tile_id → cycle table + +TOGSim (C6): + dlopen(trace.so) → resolve togsim_kernel + inject EmitCtx { tag table; record sink; cost = cycle_table[tile_id] } + togsim_kernel(ctx, runtime_shape_args...) // producer runs, emits stream + → existing timing core consumes the recorded Instruction stream +``` + +### 5.3 Components + +- **C1 — `togsim` API op vocabulary.** `togsim.dma(...)` (void result, carrying + `tag_id`, the runtime tag-index operand, `is_async`), + `togsim.memory_barrier(tag_id, tag_slot, write_bufs)`, + `togsim.compute(tile_id, dims)`, `togsim.compute_barrier`. Kept *unregistered* + (like the existing `togsim.transfer`), so no C++ dialect registration; the + togsim→emitc step is a custom Python rewrite, not a registered ConversionPass. +- **C2 — `build_skeleton` pass.** Sibling to `build_tog.py`, reusing its + traversal (matmul FSM, `_dma_start_fields`, loop typing). Emits the + skeleton+API MLIR instead of TOG nodes; preserves `is_async`. The original + `memref.dma_wait` is mapped through to an explicit `togsim.memory_barrier` + carrying the DMA's `tag_id` and the runtime tag-index operand. +- **C3 — annotation pass + cycle table.** Reuses sample-mode to sample the + deterministic per-tile cycle; emits the `tile_id → cycle` table artifact. 
+- **C4 — togsim→emitc lowering.** Maps each `togsim.*` op to an + `emitc.call_opaque "togsim_*"`; lowers control flow via `convert-scf-to-emitc` + / `convert-arith-to-emitc`; func arguments (including symbolic shapes) become + C++ parameters. Then `mlir-translate --mlir-to-cpp`. +- **C5 — `.so` build.** Compile emitted `.cpp` + `togsim_runtime.h` to `.so` + via the existing toolchain; cache by kernel key. +- **C6 — TOGSim runtime + loader.** `togsim_runtime.h/.cc`: `EmitCtx` and the + `togsim_dma/compute/memory_barrier/compute_barrier/core_alloc` + implementations (compute looks up the cycle table). Loader `dlopen`s the + `.so`, calls `togsim_kernel` with runtime shape args, records the stream, feeds + the existing timing core. An async DMA and its `memory_barrier` are paired at + runtime by `(tag_id, tag_slot)` through the existing Core tag table. + +### 5.4 ABI sketch (current: v11) + +```c +// togsim_runtime.h — shared contract between emitted .cpp and TOGSim +typedef struct EmitCtx EmitCtx; + +void togsim_dma(EmitCtx*, int32_t dir, int32_t arg_id, uint64_t offset, + int32_t ndim, const int64_t* dims, const int64_t* strides, + int32_t elem_bits, int32_t is_async, + int32_t tag_id, uint64_t tag_slot, + const int64_t* read_bufs, int32_t n_read, + const int64_t* write_bufs, int32_t n_write); + +void togsim_memory_barrier(EmitCtx*, int32_t tag_id, uint64_t tag_slot, + const int64_t* write_bufs, int32_t n_write); + +void togsim_compute(EmitCtx*, uint64_t tile_id, int32_t compute_type, /* dims */ ...); +void togsim_compute_barrier(EmitCtx*); +int32_t togsim_core_alloc(EmitCtx*); + +// entry point the loader resolves: +void togsim_kernel(EmitCtx*, int64_t* shape_args, int32_t n_shape_args); +``` + +`togsim_dma` returns void (no handle). An async DMA carries `(tag_id, tag_slot)`; +the matching `togsim_memory_barrier` waits on the same pair through the Core tag +table. 
The symbols are resolved as free `extern "C"` functions: the loaded `.so` +links back into the Simulator binary (built with `ENABLE_EXPORTS`). + +> **Superseded.** v2–v10 evolved through a `togsim_event` handle type with +> `togsim_dma` returning a handle and `togsim_wait`/`togsim_signal`/ +> `togsim_wait_all` plus `togsim_event_alloc`/`togsim_event_free`. v11 removed +> all of those; see the note at the top of this doc and §9.6.1. + +## 6. Compute cost model + +The annotation pass (C3) reuses **sample-mode** to measure each tile's +deterministic cycle once and stores a **precomputed `tile_id → cycle` table**. +`togsim_compute` looks it up at runtime. + +This is consistent with dynamic shape because **tiles are fixed-size** +(`TILE_M/N/K`): the per-tile cycle is invariant; only the *number* of tiles +(loop trip count) varies, and that is handled by the symbolic loop in the `.so`. + +**Open edge case — remainder tiles.** When a dimension is not divisible by the +tile size, edge tiles are partial and have a different cycle than the table +entry. Options: pad to full-tile cost (simple, small error) vs. sample a +separate `tile_id` for the remainder. Decided at P4. + +## 7. Milestones + +- **P0** — DONE. New branch; runtime API header (C6 surface) + `togsim` op + vocabulary (C1). +- **P1** — DONE. `build_skeleton` pass (C2) on a matmul kernel; verified against + the legacy `build_tog` TOG. The async DMA's `memref.dma_wait` is mapped through + to an explicit `togsim.memory_barrier` carrying the DMA's `tag_id` and the + runtime tag-index operand; the IR verifies across sibling prefetch/compute loop + nests because the pairing is by runtime tag slot, not a cross-region SSA edge. +- **P2** — DONE. togsim→emitc (C4) + `mlir-translate` + compile (C5) → `.so` for + that kernel (static shape). 
C4 rewrites the unregistered `togsim.*`/signature + then drives the upstream `lower-affine`/`convert-*-to-emitc` passes, with a + small fold for residual `emitc.for` bound casts (see §8). Base addresses + stubbed to 0 (wired in P3). +- **P3** — DONE. TOGSim loader + runtime (C6) + cycle table (C3); runs end-to-end + through the real Simulator/Core (256^3 GEMM via `--trace_so`). Parallelism / + reduction / core dispatch design is locked in **§9** (core-transparent work + function + `togsim_core_alloc` hook). Async DMA↔consumer sync is the runtime + tag-slot mechanism (`togsim.memory_barrier`), not an event-id. +- **P4** — Symbolic bounds end-to-end on a decode-style kernel; verify trace + length scales with runtime shape; decide remainder-tile handling. +- **P5** — Migrate remaining op families (conv, SDPA, vector). + +## 8. Risks / open questions + +- **Remainder tiles vs. precomputed table** (§6) — P4. +- **ABI versioning** — RESOLVED. Free `extern "C"` symbols (the `.so` links back + into the Simulator binary via `ENABLE_EXPORTS`); `TOGSIM_ABI_VERSION` is v11. +- **togsim→emitc for unregistered ops** — must be a custom rewrite to + `emitc.call_opaque`, since unregistered ops have no registered conversion + patterns. +- **EmitC coverage** — RESOLVED (P2). C4 uses the upstream conversion passes + (`lower-affine`, `convert-scf-to-emitc`, `convert-arith-to-emitc`, + `convert-func-to-emitc`). One gap in this LLVM 20 build: + `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so + `convert-arith-to-emitc` leaves `builtin.unrealized_conversion_cast` on the + bounds (`emitc.size_t`↔`index`) that `--reconcile-unrealized-casts` cannot + fold and `mlir-to-cpp` cannot print. C4 adds a small post-pass + (`_retype_for_to_size_t`) that retypes each `emitc.for` to `!emitc.size_t` + bounds + IV (`emitc.for` accepts size_t with the explicit type) and folds the + residual index<->size_t casts. 
A size_t IV also makes the lowered *address* + arithmetic cast-free, which is what lets P3 wire real addresses (approach A): + `togsim_dma` passes `(arg_id, element offset)` where the offset is computed + from the loop IVs and lowered by `convert-arith-to-emitc`. +- **async/fire-and-forget** — `is_async` preserved on `togsim.dma`. An async DMA + signals its tag at data arrival; a sync DMA is blocking. A DMA with no matching + `memory_barrier` is fire-and-forget (nothing waits on its tag). + +## 9. P3 design: parallelism, reduction, and core dispatch (locked) + +How the trace producer expresses *which core runs what*, *what is parallel*, and +*what is a reduction* (cross-iteration dependency). This is the design for P3. + +### 9.1 Where the semantics come from + +Nothing new has to be inferred — the post-vcix `affine.for` already carries the +mapping decision the frontend made, and `build_skeleton` preserves it: + +| attribute | meaning | role | +|---|---|---| +| `outer_loop` | PARALLEL axis (e.g. GEMM m, n) | independent output tiles -> distributable across cores | +| `accumulation_loop` | REDUCTION axis (e.g. GEMM k) | partial sums into one output tile -> ordered dependency | +| `inner_loop` | tile micro-loop | within one tile | + +This matches what legacy TOGSim already does with `torchsim_loop_type` +(`TileGraphParser`: PARALLEL -> `outer_loop_idx` selects a core; ACCUMULATION -> +`accum_tag` groups dependent partials). The current gap is only that +`lower_to_emitc` (P2) *drops* these attributes when it lowers `affine.for` to +`emitc.for`, producing a flat single-stream producer. + +### 9.2 Principle: bake intrinsic, parameterize extrinsic + +Two different kinds of hardware dependence must be treated differently: + +- **Intrinsic** (vlane / vector width, `TILE_M/N/K`, systolic size) — defines the + *content and cost of each instruction*. Already baked into the IR; correct.
+- **Extrinsic** (`num_cores`) — defines only the *distribution* of an otherwise + fixed set of work-items. The tile set, the per-tile cost table + (`tile_id -> cycle`), and the DMA tile shapes are all `num_cores`-invariant. + +Therefore `num_cores` is **not** baked into the producer. The producer is +**core-count transparent**: it knows nothing about how many cores exist. + +### 9.3 Model: core-transparent work function + dispatch hook + +The producer is two functions, split at the PARALLEL/ACCUMULATION boundary: + +```c +// WORK: trace for ONE independent output tile. Core-transparent: takes the +// PARALLEL indices directly, names no core. Reduction (k) is program order -> +// the dependency is implicit (the accumulator is core-local). An async load is +// synced to its consumer by an explicit memory_barrier on the same tag slot. +void togsim_kernel_tile(EmitCtx* ctx, int64_t mi, int64_t ni, int64_t* shape) { + togsim_core_alloc(ctx); // first line: new work-item + pick core + togsim_compute(ctx, /*tile_id=*/0, ...); // acc init + for (size_t ki = 0; ki < KT; ++ki) { // REDUCTION = program order + togsim_dma(ctx, LOAD, A, offA(mi,ki), ..., /*is_async=*/1, /*tag_id=*/0, ki%D, ...); + togsim_dma(ctx, LOAD, B, offB(ki,ni), ..., /*is_async=*/1, /*tag_id=*/1, ki%D, ...); + togsim_memory_barrier(ctx, /*tag_id=*/1, ki%D, ...); togsim_compute(ctx, 1, ...); + togsim_memory_barrier(ctx, /*tag_id=*/0, ki%D, ...); togsim_compute(ctx, 2, ...); + } + togsim_dma(ctx, STORE, C, offC(mi,ni), ...); +} + +// DISPATCH: enumerate the PARALLEL domain, one call per work-item. +extern "C" void togsim_kernel(EmitCtx* ctx, int64_t* shape, int32_t n) { + size_t MT = shape[0]/256, NT = shape[1]/256; + for (size_t mi = 0; mi < MT; ++mi) + for (size_t ni = 0; ni < NT; ++ni) + togsim_kernel_tile(ctx, mi, ni, shape); +} +``` + +Reduced to two orthogonal concepts: + +- **Parallel** = each `togsim_kernel_tile` call is an independent work-item (no + tags shared across calls). 
TOGSim is free to place it on any core. +- **Reduction** = ordering *inside* one work-item: program order on its core + (no explicit barrier). The `memory_barrier`/tag-slot mechanism is only the + async-DMA → consumer data sync. +- **Core assignment** = `togsim_core_alloc(ctx)` (a runtime callback, body in + TOGSim) marks the work-item boundary and binds the following ops to a chosen + core. The producer never sees `core_id`/`num_cores`; those live only in + TOGSim's dispatch policy (round-robin / blocked / cost-aware via the cycle + table). + +The boundary callback lives at the start of each work-item; it cannot be folded +away because TOGSim cannot intercept the producer-internal work-function call -- +only `togsim_*` callbacks are visible across the `dlopen` boundary. + +> FINAL API (supersedes the `togsim_dispatch` naming used below): the boundary + +> core binding is **`int32_t togsim_core_alloc(EmitCtx*)`** (header v6). The +> producer calls it at each work-item start; the **runtime owns the core pool** +> and round-robins -- `num_cores` is NEVER baked into the producer (it is purely +> a runtime quantity). There is **no free**: a core is an assignment, not a held +> resource; the next `togsim_core_alloc` starts the next work-item. The returned +> id is discarded by the producer. This keeps the producer core-count transparent +> while making the core mapping an explicit runtime allocation. Wherever the text +> below says `togsim_dispatch`, read `togsim_core_alloc`. (Exception: the ABI v12 +> implementation-status notes in §9.4 and §9.8, where `togsim_dispatch(ctx, fn, iv, n)` +> is the actual higher-order runtime wrapper that replaced the bare +> `togsim_core_alloc` marker.) + +### 9.4 Codegen (lower_to_emitc) and ABI deltas + +- `lower_to_emitc` splits the loop nest at the PARALLEL/ACCUMULATION boundary + into two `emitc.func`: the PARALLEL loops become `togsim_kernel` (dispatcher, + passing the loop indices as args); the ACCUMULATION+INNER body becomes + `togsim_kernel_tile`, with `togsim_core_alloc(ctx)` inserted at its entry.
+- ABI additions in `togsim_runtime.h`: `int32_t togsim_core_alloc(EmitCtx*)` + (runtime owns the core pool; no `num_cores` in the producer; no free). + `togsim_kernel_tile` may stay internal (`static`) for now; export it only if a + future loader wants to own the parallel enumeration (which would also need a + `num_tiles`-style count — not required now). +- `tile_id -> cycle` table unchanged (num_cores-invariant). + +> Implementation status (P3, ABI v12): `lower_to_emitc` OUTLINES the innermost +> PARALLEL-loop body into a uniform `togsim_kernel_tile(ctx, iv, n)` func and the +> dispatcher loop hands it to `togsim_dispatch(ctx, fn, iv, n)` -- a higher-order +> runtime wrapper that round-robins a core and brackets the call with +> TILE_BEGIN/TILE_END. The work-item SCOPE is now the function call itself (not an +> implicit "ops until the next core_alloc" range), and one general dispatcher +> serves every kernel (uniform iv-array ABI). Earlier this was a single +> `togsim_kernel` with a bare `togsim_core_alloc` marker; the emitted *trace* is +> identical (one work-item bracket, then the work ops), so cycles are unchanged -- +> the outline was done to make the boundary explicit, not for timing. Address +> arithmetic is wired (approach A): each `togsim_dma` passes `(arg_id, element +> offset)` with the offset computed from the loop IVs (lowered by +> `convert-arith-to-emitc`, cast-free thanks to the size_t IV retype); the runtime +> adds the tensor base. The parallel IVs reach the tile fn through the iv array. + +### 9.5 Stance and the split-K exception + +This refines the design's "not a dynamic scheduler / static control flow": +**per-work-item trace is static and deterministic; only the work-item -> core +binding is dynamic** (decided by `togsim_core_alloc`). That is independent-task +distribution, not data-dependent control flow, and it matches a real tile +scheduler more closely. 
+ +The transparent model holds while work-items are independent (data-parallel over +output tiles). **Split-K** (a reduction split *across* cores) breaks +independence: the producer must emit `c` partials + a combine, so the +instruction stream then depends on `num_cores`, and the cross-core dependency +must be a real dataflow edge (not program order). Treat split-K as a deliberate, +scoped exception — start P3 with data-parallel only. + +### 9.6 Work-items form a DAG (barriers, cross-parallel reduction) + +Work-items are not always a flat independent set. When there is a computation +*between* parallel loops (e.g. an op at the m-level after the inner n parallel +loop), it can only run once the inner parallel region completes — a join / +barrier: + +``` +parallel for m: + parallel for n: A(m,n) # leaf work-items + B(m) # join: needs all n of this m +``` + +This needs **no new primitive**: it is the same dataflow-edge mechanism the trace +already uses (§10), just at work-item granularity. The join op declares the +leaves' output buffers as its inputs, so the bridge makes it depend on every leaf +through the last-writer-per-buffer analysis: + +``` +parallel for m: + parallel for n: A(m,n) // each writes a tile of m's intermediate buffer + B(m) // reads that buffer -> depends on all n of this m +``` + +So the general picture: **work-items form a DAG; edges are buffer producer → +consumer dependencies.** The independent data-parallel case is the degenerate +edge-less DAG; barriers, reduction-across-a-parallel-axis, and split-K are the +same DAG with real dataflow edges. (Async-DMA data arrival is the one edge that +needs an explicit `memory_barrier` on the tag slot, because the buffer write +completes only at DMA response-complete, later than the producing op's +issue — see §10.7.4.) + +> **Superseded.** An earlier version expressed these joins with a per-leaf +> completion `event` plus `togsim_wait_all`. 
Those primitives were removed; joins +> are now ordinary buffer dependencies in the dataflow DAG (§10). + +### 9.6.1 How a barrier finds its DMA: runtime tag-slot pairing (locked) + +How the explicit `togsim.memory_barrier` (lowered from `memref.dma_wait`) finds +*which* `togsim.dma` instance's data it must wait for. The hard case is a +reduction loop: one static `togsim.dma` op executes once per iteration, each +iteration loading a different tile into a different runtime tag slot. The pairing +must therefore key on a *runtime* value, not a compile-time one. + +The locked model: pair by the **runtime tag slot**, using the existing TOGSim +Core tag table. + +- **A DMA carries `(tag_id, tag_slot)`.** `tag_id` is the compile-time identity + of the DMA's tag memref (which logical channel — e.g. A-load vs B-load). + `tag_slot` is the *runtime* tag index `%tag[%idx]`, i.e. the SRAM tile slot + the loaded tile occupies this iteration. Together they uniquely name this + iteration's load. +- **An async DMA signals; the barrier waits.** At DMA response-complete (the + moment data has actually arrived in SRAM), the runtime calls + `set_tag_finish(tag_id, tag_slot)`. The matching `togsim.memory_barrier` + carries the same `(tag_id, tag_slot)`; it calls `register_tag_waiter` and is + woken at that signal. The barrier then becomes the **last-writer** of the + loaded SRAM buffer (`write_bufs`), so every consumer that reads the buffer + gates on data arrival through the ordinary dataflow-edge analysis (§10). +- **A synchronous DMA needs no barrier.** It is blocking — it finishes at data + arrival itself, and consumers depend on it directly. +- **Reduction iterations do not collide.** Because `tag_slot` is the runtime + index, iteration `i`'s DMA and iteration `i`'s barrier share a slot that is + distinct from (or correctly reused after) other iterations — exactly the + per-iteration pairing a compile-time id could not express. 
The + double-buffer/pipeline depth is the slot's lifetime, owned by the Core's tag + table. + +**What this drops vs legacy `tag_table`:** no `calc_tag` content-hash, no magic +values (`0`/`1`/`-1`/`>1`), no FIFO, no in-order assumption. The pairing key is +`(tag_id, tag_slot)`, both carried explicitly on the trace ops. + +> Status: IMPLEMENTED (ABI v11). `build_skeleton` maps `memref.dma_wait` to +> `togsim.memory_barrier` and tags `togsim.dma` with `tag_id` + the runtime +> tag-index operand; `lower_to_emitc` lowers both; the runtime pairs them via +> `prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`. Verified bad=0 on the +> 256^3 GEMM. (All current fixtures have tag memref size 1, i.e. single-buffer; +> deeper double-buffer pipelines exercise more slots but use the same key.) +> +> **Superseded.** ABI v5–v10 used a dynamically minted `togsim_event` handle +> parked in a heap "event buffer" (`togsim_event_alloc`/`togsim_event_free`), +> with `togsim_dma` returning the handle and `togsim_wait(handle)` consuming it. +> That mechanism — and the earlier static `event_id` it replaced — could not +> represent per-iteration reduction pairing and was removed in v11 in favor of +> the runtime tag slot above. + +### 9.7 Execution / simulation model: trace generation (not co-execution) + +The producer is a **pure trace (DAG) generator**: running its loops *emits* the +ordered op stream + dependency edges. It never computes cycles, models hardware, +or schedules. Two consequences pin the model: + +- **What is an edge vs. what blocks.** Data dependencies (buffer producer → + consumer edges, plus the async-DMA `memory_barrier` on its tag slot) are + recorded *edges* — the producer does not block on them. The only thing that + ever blocks the producer is *resource backpressure* (finite cores, + double-buffer / SRAM slots, DMA-queue depth), and that is pure flow control, + not timing semantics. 
+- **Cores, double-buffering, DRAM/NoC are the timing core's job — reused, not + reimplemented.** TOGSim's timing core already models all of this when it + consumes the legacy TOG (Appendix A: `tag_table` double-buffer sync, + `num_cores`). The producer stays oblivious; depths/counts are consumer-side + config. + +Consumption is staged via a swappable **sink** behind the callbacks, so the +choice does not touch the producer or the ABI: + +| | sink | threads | when | +|---|---|---|---| +| **P3** | *materializing* — callbacks append to the timing core's input; reuse its existing scheduler/timing | none | static shape; like-for-like cycle-equivalence vs `build_tog` | +| **P4+** | *streaming* — callbacks push to a bounded queue; the producer runs as a fiber/coroutine and blocks on backpressure; the DES loop advances time, frees resources, resumes it | producer fiber | only when dynamic-shape trace size makes full materialization impractical | + +This is **not** timing co-execution: even the streaming sink only blocks the +producer on resource flow-control, never on timing-resolved data events. It is +the lazy/streamed realization of the same trace model. Decision: **do P3 with +the materializing sink (no threads); defer streaming to P4 as a sink swap.** The +single forward-compat requirement is that the callback sink is an interface. + +### 9.8 P3 task list + +1. DONE. `togsim_runtime.h` + `togsim_runtime.cc`/`togsim_loader.h`: C6 runtime + (`EmitCtx`) + `dlopen` loader (`run_producer`), materializing sink. Callees: + `togsim_core_alloc` (runtime core pool), `togsim_dma` (records a tile load/ + store, signals its tag at data arrival), `togsim_compute` (cycle-table lookup), + `togsim_memory_barrier` (waits on the matching `(tag_id, tag_slot)`), + `togsim_compute_barrier`. +2. DONE (single-buffer).
`lower_to_emitc`: OUTLINES the work-item body into + `togsim_kernel_tile(ctx, iv, n)` + a `togsim_dispatch` call at the work-item + boundary (ABI v12; was a bare `togsim_core_alloc` marker), lowers + `togsim.memory_barrier`, and reads `loop_type`. (Two-function outline DONE; + trace identical.) +3. DONE. Real tile addresses wired (approach A): build_skeleton keeps the DRAM + index operand on `togsim.dma`; lower_to_emitc passes `(arg_id, offset)` and + `convert-arith-to-emitc` lowers the offset (size_t IV retype makes it + cast-free). Verified on 1024^3 GEMM: per-tile offsets are correct + (A[m,k]=m*1024+k, B[k,n]=k*1024+n). +4. PARTIAL. C3 cycle table: `cycle_table.py` builds `tile_id -> (cycle, + overlapping_cycle)` from a per-tile `cycle_list`, with `overlapping_cycle = + max(cycle - offset[type], 0)` (the legacy formula) and a JSON sidecar dump. + Remaining (folds into task 5): feed it the gem5 sample-mode `cycle_list` + already computed in `extension_codecache` (reused -> both paths stay + cycle-consistent), and have `togsim_compute` set BOTH cycle and + overlapping_cycle on the Instruction. +5. DONE. C6 runtime + loader: `TOGSim/src/togsim_runtime.cc` + + `togsim_loader.h` implement the producer ABI and `run_producer` -- dlopen the + `.so`, run `togsim_kernel` against an `EmitCtx`, and record a `TraceRec` stream + (the materializing sink): each dma resolves `base[arg_id] + offset*elem_bytes` + and signals its tag at data arrival, each compute looks up the cycle table, + core_alloc round-robins the core. Verified standalone on the 256^3 GEMM: + addresses/cycles resolved correctly. DONE (sec 10, 10.7.4): the recorded + stream is fed into the existing timing core (Core/Simulator) -- TraceRec maps + to `Instruction` (compute_cycle + overlapping_cycle, dataflow-buffer deps + + runtime-tag barriers).
+ +Legacy path: the ONNX-TOG producer (`run_tog` -> `tog_generator` -> ONNX -> +C++ `TileGraphParser`) is marked DEPRECATED in place (comments in +`extension_codecache.py` and `tog_generator.py`) but kept live -- it must not +break during the transition. It is retired only once this trace pipeline is +stable. The cycle measurement (`cycle_list`, `x_offset`/`w_offset`) is shared, +so the two paths stay cycle-consistent meanwhile. + +### 9.9 Task-5 completion roadmap: TraceRec -> Core (DONE; see §10) + +> **Status: implemented.** This roadmap is retained for context. The dependency +> model it sketches (a per-`togsim_wait`-handle RAW edge) was *superseded* during +> implementation by the explicit dataflow-DAG model in §10: edges come from SRAM +> last-writer-per-buffer plus the vcix preload/matmul FSM, and async-DMA data +> arrival is gated by an explicit `togsim.memory_barrier` paired on the runtime +> `(tag_id, tag_slot)` (§10.7.4) — not by a returned event handle. Read the +> bullets below as the original target shape, with that one substitution. + +Grounded by reading `Instruction.h`, `Core.cc`, `TileGraphParser.h/.cc`, +`Simulator.cc`. + +**Target architecture (legacy, reused):** `ONNX -> TileGraphParser -> TileGraph +(TileLoopNode / TileMemoryNode / TileMemoryWaitNode / TileComputeNode) -> +Simulator distributes Tiles to Cores -> Core runs Instructions`. We replace only +the front: build the same `TileGraph` / `Instruction`s from the recorded +`TraceRec` stream, then hand it to the existing `Simulator`. + +**Mapping (TraceRec -> Instruction):** `Instruction(opcode, compute_cycle, +num_parents, dram_addr, tile_size, tile_stride, elem_bits, tag_idx_list, +tag_stride_list, accum_tag_idx_list)`; `ready_counter = num_parents`. +- DMA load/store -> `MOVIN`/`MOVOUT`: `dram_addr = TraceRec.addr`, `tile_size`/ + `tile_stride`/`elem_bits` from the dma, `tag_idx_list = {tag_slot}` (the + SRAM-slot key), `is_async` set. compute_cycle 0. 
+- COMPUTE -> `COMP`: `compute_cycle = TraceRec.cycle`, + `set_overlapping_cycle(TraceRec.overlapping)`, `set_compute_type(...)`. +- Dependency (RAW): a compute depends on its loads through the SRAM + last-writer-per-buffer analysis (§10); for an async load the last-writer is the + `togsim.memory_barrier` paired on the load's runtime `(tag_id, tag_slot)`, so + the compute's `ready_counter` only clears once the data has arrived (§10.7.4). +- SRAM double-buffer / capacity (WAR): the existing Core enforces it through the + tag mechanism (`register_tag`/`set_tag_finish`/`mark_tag_used`, DMA.h) keyed by + `tag_idx_list`; our `(arg_id, tag_slot)` is that key. Reduction grouping -> + `accum_tag_idx_list` (the accumulation-loop index). + +**Build/wiring:** compile the bridge into TOGSim (it needs the conan deps; +include flags are in `TOGSim/build/compile_commands.json`, notably +`-D_GLIBCXX_USE_CXX11_ABI=0` and the `/root/.conan/data/{robin-hood,spdlog,fmt, +yaml-cpp,boost}` include dirs). Add `togsim_runtime.cc` + the bridge to +`TOGSim/CMakeLists.txt`. Either (a) build `TileGraph`/`Tile` nodes from TraceRec +(maximal reuse of `Simulator`'s tile distribution + Core), or (b) build the +`Instruction` DAG directly and drive a single Core. (a) is closer to legacy and +gives multi-core for free. + +**Cycle-table feed:** reuse the gem5 `cycle_list` already computed in +`extension_codecache` (so both paths stay cycle-consistent); pass it + +`x_offset`/`w_offset` to `cycle_table.build_cycle_table`, dump the sidecar, and +have the loader populate `EmitCtx.cyc/ovl`. + +**Validation:** same post-vcix fixture through both paths; compare the +`Simulator`'s total cycles / DRAM traffic. Start with the 256^3 GEMM (static +shape, single-buffer), then multi-tile / double-buffer kernels. 
+ +This is a focused C++ integration (TOGSim build + TileGraph construction), not a +small increment -- best executed as its own push; all the producer-side inputs +(addresses, cycles, handles, core, tag_slot) are already in the trace. + +## Appendix A: current-state references + +- `TOGSim/include/DMA.h:27-115` — `tag_table` (overloaded `0/1/-1/>1`) + + `waiters`; `register_tag` / `set_tag_finish` / `register_tag_waiter` / + `mark_tag_used` (= init / signal / wait / consume). +- `TOGSim/src/Core.cc:118-140, 214-324` — async-DMA signal path and the `BAR` + wait/consume path over the tag table. +- `TOGSim/include/Instruction.h:40-48, 104-117` — `ready_counter` / `child_inst` + (the second, separate dependency mechanism) and the tag fields. +- `PyTorchSimFrontend/mlir/passes/build_tog.py` — `TogBuilder.print_operation` + dispatch (`affine.for` / `memref.dma_start` / `memref.dma_wait` / `vcix.*`); + `_affine_for_bounds` (constant-bound resolution → static shape). +- `PyTorchSimFrontend/mlir/passes/__init__.py`, + `PyTorchSimFrontend/mlir/passes/lower_to_llvm.py` — in-process Python MLIR pass + orchestration via the bindings; the functional Spike/LLVM path (unchanged). +- `PyTorchSimFrontend/mlir/mlir_gemm_template.py` — kernel template emitting the + `affine.for` nest + `linalg.matmul` + `togsim.transfer` DMA ops. + +## 10. Explicit dependency-edge trace (revised dependency model) + +Supersedes the in-order / runtime-tag approach for expressing dependencies. The +trace is an explicit dataflow DAG: every op declares the producers of the data it +consumes; the consumer (Core) does all resource scheduling. Reached after finding +that (a) flat in-order over-serializes parallel tiles, (b) the current TOG pass +does NO dependency analysis (it emits a lexical loop tree + tags resolved at +runtime by the C++ tag_table), and (c) compute I/O is collapsed away by +build_skeleton, so dependencies must be recovered before the collapse. 
+ +### 10.1 Representation + +The dependency edge is "consumer reads the buffer that producer wrote". As +landed (ABI v9 onward; see STATUS "sec 10 explicit-edge bridge"), each op +declares the **SRAM buffer ids** it reads and writes (`read_bufs` / `write_bufs`); +the bridge builds the Instruction DAG by **last-writer per buffer**, scoped per +work-item. There is no SSA event token threaded by the producer and no event +handle returned by an op. + +- The edge source is data, not order: an op that reads buffer `b` gets an edge + from whatever op most recently wrote `b`. +- No in-order chain, no runtime tag content-hash, no op-pattern heuristics. +- Resource scheduling -- SA round-robin, double-buffer (<=N in flight), SRAM -- + stays entirely in the Core. The trace never reasons about SRAM occupancy or + timing; it only states producer->consumer order. +- One exception: an **async** DMA's write completes only at data arrival (DMA + response-complete), later than its issue, so its last-writer edge is routed + through an explicit `togsim.memory_barrier` that waits the load's runtime + `(tag_id, tag_slot)` (§10.7.4). A synchronous DMA is blocking and needs no + barrier. + +> The sketch below uses an `out_ev = op(ctx, in_events[])` SSA notation to +> *illustrate* the edges; it predates the landed `read_bufs`/`write_bufs` form +> and is no longer the literal ABI. Read `in={…}` as "reads these buffers". 
+ +Producer C++ form (events threaded like SSA; loop-carried = a reassigned var): + + for mi, ni: // PARALLEL: independent tiles + ev acc = compute(ctx, INIT, in={}); + for ki: // REDUCTION: loop-carried acc + ev a = dma_load(ctx, A[mi,ki], in={}); + ev b = dma_load(ctx, B[ki,ni], in={}); + ev w = compute(ctx, PRELOAD, in={b}); + acc = compute(ctx, MATMUL, in={a,w,acc}); // new acc event each iter + dma_store(ctx, C[mi,ni], in={acc}); + +The INIT dependency reaches every accumulate transitively through the acc chain +(INIT -> mm_k0 -> mm_k1 -> store); each node only needs edges to its immediate +producers. Different (mi,ni) -> separate acc chains -> independent -> parallel. + +### 10.2 Two dependency sources (both available pre-collapse in the TOG pass) + +A single "SRAM access" analysis is necessary but NOT sufficient -- verified on the +GEMM post-vcix: + +| dependency | source | visible in SRAM? | +|---|---|---| +| load -> compute (DMA writes X_spad/W_spad, preload/matmul read) | SRAM last-writer per (buffer, slot) | yes | +| accumulator chain (INIT writes Y_spad; the drain/epilogue read-modify-writes Y_spad; store reads it) | SRAM last-writer on Y_spad | yes | +| **preload -> matmul** (preload loads weights into the systolic-array registers; matmul consumes them) | **vcix opcode FSM** (op1=preload pairs with the following op0=matmul; build_tog already tracks this via `current_preload_node`) | **no -- SA-internal, not a memref access** | + +So the analysis derives edges from (1) SRAM (buffer, slot) last-writer for loads +and the accumulator, and (2) the vcix preload/matmul pairing for the SA-weight +dependency. The slot is a concrete value at run time (the producer runs the +loops), so matching is by value -- no static affine-overlap math. + +Key facts (256^3 GEMM, post-vcix): SRAM buffers are %0=X_spad(A), %1=W_spad(B), +%2=Y_spad(acc/out). 
matmul (vcix op0) reads %0 only; preload (vcix op1) reads %1; +the matmul does NOT read %1 (weights come from the SA), which is exactly why a +memref-only analysis lets it run before the weight load -- the preload->matmul +edge must come from the FSM. The accumulation is the epilogue's `transfer_read +%2 + addf + transfer_write %2`, which IS SRAM-visible. + +### 10.3 Components changed (as landed) + +- TOG pass (`build_skeleton` + `dep_analysis`, on post-vcix before collapse): per + op, the read/write SRAM buffer ids + the preload->matmul pairing (folded as a + virtual `SA_WEIGHTS` buffer) -> the read/write buffer sets. +- ABI (`togsim_runtime.h`): `togsim_dma`/`togsim_compute` carry + `read_bufs`/`write_bufs`; an async DMA also carries `(tag_id, tag_slot)` for the + `togsim.memory_barrier` pairing. No `in_events[]`, no returned event, no + `event_id`/handle-buffer mechanism. +- `lower_to_emitc`: emits the buffer-id arrays on each op (and lowers + `togsim.memory_barrier`). +- bridge: builds the Instruction DAG by last-writer per buffer (`add_child`); + no in-order chain, no runtime tag content-hash. +- Core: unchanged (ready_counter DAG + SA pipeline + double-buffer already exist). + +### 10.4 Open decisions + +- Reduction timing: model the acc chain as completion-serial (conservative, + simple) first; SA-pipelined (matches legacy's overlap) — RESOLVED via the + occupancy/latency split (§10.7). +- Buffer-id lifetime: the last-writer map is scoped per work-item (reset at each + `togsim_core_alloc`). + +### 10.5 Known issue: preload concurrency not bounded by #systolic-arrays + +Observed in the --trace_so run (256^3 GEMM): 4 PRELOADs execute concurrently +(issue ~1028, finish ~1119-1122), but with num_systolic_array_per_core = 2 at +most 2 should overlap, and two preloads on the same SA should serialize (one +weight register file per array). 
Cause: a preload's overlapping_cycle equals its +compute_cycle (91 == 91), so its occupancy (compute - overlapping) is ~0 and the +Core's SA compute pipeline accepts unbounded back-to-back preloads. + +This is a PRE-EXISTING Core SA-model property, NOT introduced by the trace +pipeline: the legacy build_tog path shows the same -- its 4 preloads issue at +1215-1218 and finish 1306-1309 (4 concurrent). So it is not a trace-vs-legacy +regression, but it is a real hardware-fidelity gap: the model should cap +concurrent preloads at the systolic-array count and serialize same-SA preloads on +the single weight buffer. Track separately from the trace work (affects both +paths equally). + +### 10.6 Known issue: accumulator dependency over-serializes the reduction + +Observed in the --trace_so run: consecutive matmuls run 396 cycles apart (fully +serial: issue 1120, 1516, 1912, ...), but physically matmuls that accumulate into +the same output should PIPELINE on the systolic array (the partial sums stream +through; consecutive matmuls overlap by overlapping_cycle, ~128 effective). They +should NOT wait for the previous matmul to complete. + +Cause: the explicit-edge bridge builds a hard completion edge (add_child) for the +Y_spad accumulator read-modify-write, so matmul_k1 waits for matmul_k0's +finish_instruction -> when it issues, k0 is already done -> the overlapping_cycle +window is empty -> no pipeline. This is the mechanism behind the 4888 vs legacy +2095 gap (legacy has NO inter-matmul edges, so its matmuls pipeline on 2 SAs: +finishes 1704,1707 | 1832,1835 = +128 within an SA, +3 across SAs). + +So the accumulator (Y_spad) dependency is a PIPELINED/ordering dependency, not a +completion barrier. add_child cannot express that. Fix direction: do not create a +matmul->matmul completion edge through the accumulator -- the accumulation order +is preserved implicitly by same-SA issue order + the SA pipeline +(overlapping_cycle), exactly as legacy does.
Keep the real barriers: load->compute, and +last-matmul->store (the store needs the final accumulator). The asymmetry (a +matmul consuming Y pipelines; the store consuming Y waits) is the crux to model -- +likely "do not barrier when the consumer is a same-unit pipelined compute". + +Related to the same root as 10.5 (the SA/compute-pipeline occupancy model): both +are about modeling the systolic array's streaming/pipelined execution rather than +treating each compute as an atomic completion. + +### 10.7 Occupancy/latency split for pipelined computes (design + prototype) + +Idea (keeps add_child uniform): give each compute two completion points instead of +one. A systolic-array op occupies its unit for occupancy = compute_cycle - +overlapping_cycle (the initiation interval, ~128 for the matmul) and its result is +ready at latency = compute_cycle (~395). Then add_child releases: + - a same-unit pipelined successor (next matmul, accumulator RMW) at OCCUPANCY + -> it starts ~128 later -> pipeline; + - a result consumer (the store reads the drained accumulator) at LATENCY + -> it waits for the full drain (tail). +So a single add_child mechanism stays, but the release point depends on whether +the edge is an occupancy-dependency (same-unit pipeline) or a latency-dependency +(reads the result). This also fixes 10.5: a preload then occupies its SA for its +occupancy, so concurrent preloads are naturally capped at the SA count +(provided the preload has a non-zero occupancy -- see the 10.5 note in 10.7.2). + +Prototype (bridge stopgap, committed): skip the matmul->matmul accumulator edge +(treat it as pipelined, not a barrier); keep every other edge. Result on 256^3 +GEMM: matmuls now issue back-to-back (1120-1127) and finish pipelined on 2 SAs +(1515,1516 | 1643,1644 | 1771,1772 | 1899,1900 = +128 within an SA, +1 across), +exactly like legacy. Total 4888 -> 2501 (vs legacy 2095 / 2608-incl-store; our +matmuls finish at 1900 vs legacy 2091 -- our load chain is shorter). This +confirms the accumulator dependency is pipelined.
The clean replacement is the +occupancy/latency split above in the Core so add_child stays uniform and the +bridge needs no matmul-specific skip. + +#### 10.7.1 preload->matmul is also an occupancy dependency (preload fully overlaps) + +The preload->matmul edge is the SAME kind as matmul->matmul: a same-SA pipeline +(occupancy) dependency, not a latency barrier. A preload's overlapping_cycle +equals its compute_cycle (91 == 91), so its occupancy = compute - overlapping = 0 +-- it fully overlaps. With the occupancy/latency split, the matmul (successor) +released at the preload's OCCUPANCY (= preload issue + 0) starts immediately, so +the preload's 91-cycle latency is entirely hidden under the matmul. + +In the current prototype the preload->matmul edge is still an add_child barrier +(only matmul->matmul was skipped), so the matmul issues at 1120 -- right after the +preload finishes at ~1119 -- paying the full 91. The bridge cannot cleanly skip +preload->matmul (skipping it outright loses the ordering: the matmul could be +ready before the preload and reach the SA without weights). So preload-overlap is +another reason the proper fix is the Core occupancy/latency split (10.7), which +releases the matmul at the preload's occupancy (0) while keeping the issue order. + +Net: the Core occupancy/latency split resolves three notes at once -- 10.5 +(concurrent preloads capped at SA count via preload occupancy), 10.6 (matmuls +pipeline), 10.7.1 (preload fully overlaps) -- all instances of "model the SA as a +pipeline (occupancy + latency) instead of atomic completion". + +#### 10.7.2 Occupancy/latency split: implemented + POC result + +Implemented uniformly: Instruction gains add_pipeline_child / release_pipeline_ +children; the Core releases an op's pipeline children when it ISSUES (enters the +SA pipeline), and its normal children at finish. 
The bridge classifies edges: a +preload/matmul -> matmul edge is occupancy (add_pipeline_child), everything else +is latency (add_child). No matmul-specific skip heuristic. + +256^3 GEMM result: preloads issue 1028-1031, matmuls issue 1032-1039 (right after +the preloads ISSUE, not after they finish at ~1119 -> preload fully overlaps), and +matmuls finish pipelined on 2 SAs (1427,1428 | 1555,1556 | 1683,1684 | 1811,1812 += +128 within an SA, +1 across). Total 4888 -> 2501 (matmul-skip) -> 2413 +(occupancy/latency). Legacy is 2095 (matmul completion; our matmuls finish at 1812 +vs legacy 2091 -- shorter load chain -- and our 2413 includes the store). + +Note on 10.5 (preload concurrency): NOT fixed by this alone. A preload's +overlapping_cycle == compute_cycle, so its occupancy is 0 -> it does not hold the +SA -> 4 preloads still issue concurrently (1028-1031). Capping concurrent preloads +at the SA count needs the preload to have a non-zero occupancy reflecting the +weight-load time (a cycle-model input), separate from this edge-release change. + +#### 10.7.3 Explicit compute fence: implemented (COMPUTE_BAR), BAR -> MEMORY_BAR + +The compute fence is now a first-class trace entity, not a bridge-internal edge: + - togsim_ops: `togsim.compute_barrier`; ABI v10 adds `togsim_compute_barrier(ctx)`. + - build_skeleton emits a `togsim.compute_barrier` before each store DMA; lower_to_emitc + lowers it; the runtime records a COMPUTE_BAR TraceRec. + - The two barrier kinds are now named distinctly: Opcode::BAR -> Opcode::MEMORY_BAR + (the DMA/tag memory barrier, unchanged) and a new Opcode::COMPUTE_BAR. + - Core: COMPUTE_BAR finishes only once ALL compute pipelines drain (every systolic + array + the VPU empty); until then it stays in the ready queue (re-checked each + cycle). Its ready_counter is gated (pipeline-child of the outstanding async + computes) so it is only evaluated after they have ISSUED into the pipeline. 
+ - bridge: a COMPUTE_BAR record -> a COMPUTE_BAR Instruction (pipeline-child of the + outstanding async matmuls); the following store add_child's the fence. + +256^3 GEMM: trace shows `... matmul x N -> COMPUTE_BAR -> STORE`; the COMPUTE_BAR +instruction finishes at 1813 (after the SAs drain, last matmul ~1812), the store +issues at 1814. Total 2414 (matches the implicit-flush 2413 + the 1-cycle fence). +Multiple SAs handled (drains all _sa_compute_pipeline[*]). 7 python tests pass. + +#### 10.7.4 load->compute uses MEMORY_BAR (async DMA data wait); fixes a real bug + +Bug found: a consumer reading an async-loaded buffer ran BEFORE the data arrived +(preload issued @1028 but its weight load W finished @1131). Cause: a raw +add_child on an async DMA fires at the load's ISSUE-complete (program flow), not +its DATA-ready (resp-complete) -- the async DMA signals data only via the tag +table (set_tag_finish at resp-complete). So the buffer-edge model alone cannot +gate compute on async-loaded data. + +Fix (symmetric with COMPUTE_BAR): route async load -> compute through a MEMORY_BAR +that carries the load's tag. The load registers the tag at issue; the MEMORY_BAR +(made ready after the load issues, via add_child) parks on the tag and is woken at +resp-complete; consumers depend on the MEMORY_BAR (last_writer[buf] = bar). So the +memory-arrival notification (set_tag_finish) connects to compute via the existing +tag mechanism -- now explicit in the trace as a MEMORY_BAR instruction. + +256^3 GEMM: preload now issues @1132 (after W resp-done @1131), correct. Total +2414 (buggy/optimistic) -> 2518 (correct: compute waits the slow weight load). +Both barriers are explicit and symmetric: MEMORY_BAR (DMA tag, resp-complete) for +load->compute, COMPUTE_BAR (SA pipeline drain) for compute->store. + +## 11. Remaining work + next-session handoff + +### 11.1 Status + +PR #267 (feature/togsim-cpp-trace -> develop). 
The trace pipeline runs end-to-end +through the REAL Simulator/Core on a 256^3 GEMM via `--trace_so`, with an explicit +dataflow dependency model (SRAM last-writer + vcix FSM) and two explicit barriers: +MEMORY_BAR for async load->compute data (paired to its DMA by the runtime +`(tag_id, tag_slot)` tag slot) and COMPUTE_BAR for the SA drain before a store. +The async-DMA sync is the runtime tag slot, NOT a compile-time event-id (ABI +bumped to v11; the event-id / event-handle / wait/signal design was removed). +Legacy ONNX-TOG path kept + DEPRECATED. All togsim python tests pass; TOGSim +builds. + +**Validation (256^3 GEMM, real gem5 cycle table):** through the real Core the +trace path totals **2518 cycles** vs the legacy path's **2698** on the same +table. The earlier 10.x notes (with a stub table) report different absolute +numbers; 2518-vs-2698 is the current real-table figure. + +### 11.2 Remaining work (priority order) + +1. **Cycle-equivalence closure.** Characterize/close the trace-vs-legacy gap on the + 256^3 GEMM with the SAME gem5 cycle_list. Sub-items 2-3 are the main drivers. +2. **Preload concurrency cap (sec 10.5).** 4 preloads run concurrently though there + are 2 SAs, because a preload's occupancy is 0 (overlapping_cycle == compute). + Give the preload a non-zero occupancy (the weight-load time) so concurrent + preloads are capped at the SA count. Pre-existing in BOTH paths. +3. **Robust gem5 cycle_list wiring.** The extension_codecache `TORCHSIM_DUMP_TRACE_SO=1` + hook dumps trace.so + trace_cycles.tsv from the real cycle_list, but is flaky + under concurrent compiles (saw cycle_list==[] once). Make it robust (or force a + single-thread compile), so `--trace_so --cycle_table` uses real per-tile cycles. +4. **Parallel output tiles / multi-core.** One dispatch per work-item today; for + distributing independent output tiles across cores, emit a dispatch per parallel + (m_sub, n_sub) tile. 
The inner sub-tile loops are currently unlabeled (only the + macro loops carry subtile/accumulation), so the axis role must be recovered. +5. **Cleanup.** The obsolete WAIT/SIGNAL trace records and the event-handle + buffer are dropped (v11). COMPUTE_BAR logs finish twice (cosmetic). The + preload node mis-attributes an X_spad read (build_tog `_steal_leading_transfer_read`) + -> a harmless extra edge. +6. **P5 op coverage.** Only GEMM is exercised. Extend to conv / SDPA / vector / pool. +7. **P4.** Symbolic/dynamic shape; streaming sink (coroutine, alloc-blocks). +8. **Two-function outline** (togsim_kernel_tile) -- DONE (ABI v12). The work-item + body is outlined into a uniform `togsim_kernel_tile(ctx, iv, n)` and run via the + higher-order `togsim_dispatch` wrapper (round-robin core + TILE_BEGIN/TILE_END); + the work-item scope is now the function call. Trace/cycles identical to the old + single-function `togsim_core_alloc` form. One general dispatcher serves every + kernel. +9. **Retire the legacy ONNX-TOG path** once the trace path is stable. + +### 11.3 Next-session context + +- Worktree `/workspace/PyTorchSim-cpptrace`, branch `feature/togsim-cpp-trace`, + PR #267 -> develop. The branch is rebased ONTO develop (the retire-floormod base + was dropped -- develop already has it). `source .envrc` in the worktree. +- Build TOGSim: submodules are init'd; `cd TOGSim/build && cmake .. -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`. + The Simulator target has ENABLE_EXPORTS (so a dlopen'd .so resolves the togsim_* + callbacks); togsim_runtime.cc + togsim_trace_bridge.cc are picked up by the src glob. +- Run the trace path: + `python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc <postvcix.mlir> --so trace.so [--emit-cpp x.cpp]` + then `bin/Simulator --config <config> --trace_so trace.so [--cycle_table cyc.tsv] [--log_level trace]`.
+- Get a post-vcix fixture: a real torch.compile GEMM with `TORCHSIM_DUMP_MLIR_IR=1 + pytorchsim_functional_mode=False` writes `outputs/<dir>/..._sample_postvcix.mlir`. + Real cycle data + legacy reference: add `TORCHSIM_DUMP_TRACE_SO=1` to also dump + trace.so + trace_cycles.tsv in `outputs/<dir>/` (see 11.2 #3). (Prior /tmp + fixtures are ephemeral -- regenerate.) +- Env (.envrc): gem5 `/gem5/release/gem5.opt`, spike `/release/bin/spike`, + LLVM `/riscv-llvm/bin`. +- Tests: `TOGSIM_SKELETON_FIXTURE=<postvcix.mlir> pytest tests/test_togsim_{skeleton,emitc,runtime}.py`. + These are NOT in the CI allowlist (`.github/workflows/pytorchsim_test.yml`) -- register them to gate CI. +- Key files: passes `build_skeleton.py`, `lower_to_emitc.py`, `dep_analysis.py`, + `cycle_table.py`, `togsim_ops.py`; `TOGSim/include/{togsim_runtime.h, togsim_loader.h, togsim_trace_bridge.h}`, + `TOGSim/src/{togsim_runtime.cc, togsim_trace_bridge.cc}`; `Core.cc`/`Instruction.{h,cc}` + (COMPUTE_BAR + MEMORY_BAR rename); `main.cc` (--trace_so); `extension_codecache.py` + (TORCHSIM_DUMP_TRACE_SO hook). +- Local-only backups of the pre-squash/pre-rebase 28-commit history: tag + `pr-backup-ccfea43e`, branch `backup-presquash-3cfd4a3f` (NOT pushed). diff --git a/docs/design/togsim_cpp_trace_HANDOFF.md b/docs/design/togsim_cpp_trace_HANDOFF.md new file mode 100644 index 00000000..23f642bb --- /dev/null +++ b/docs/design/togsim_cpp_trace_HANDOFF.md @@ -0,0 +1,191 @@ +# Handoff — TOGSim C++ Trace Generation + +Continuation notes for picking this work up in a fresh session. Read alongside +the full design: [`togsim_cpp_trace.md`](./togsim_cpp_trace.md) and the snapshot +[`togsim_cpp_trace_STATUS.md`](./togsim_cpp_trace_STATUS.md). + +## Goal (one line) + +Replace the timing-path TOG producer (MLIR -> Python-dict -> ONNX -> C++ parser) +with a compiled, shape-parametric trace producer (MLIR -> EmitC -> C++ -> `.so`); +TOGSim's timing core is preserved.
+ +## Current state (one paragraph) + +The trace pipeline is implemented end-to-end and runs through the REAL +Simulator/Core on a 256^3 GEMM (`--trace_so`). Dependencies are an explicit +dataflow DAG (SRAM last-writer per buffer + the vcix preload/matmul FSM). An +asynchronous DMA is synced to the consumer of its data by the **runtime tag +slot** `(tag_id, tag_slot)` through an explicit `togsim.memory_barrier` (lowered +from the source `memref.dma_wait`); a sync DMA is blocking. ABI is **v11**. An +earlier design used a compile-time `event_id` / heap event handle with +`wait`/`signal`; it was removed because one static DMA op runs once per loop +iteration into a different tag slot, which a compile-time id cannot pair per +iteration. **Validation:** on the 256^3 GEMM with the real gem5 cycle table, the +trace path totals **2518 cycles** vs the legacy path's **2698** through the real +Core; all togsim python tests pass; TOGSim builds. + +## Branch + +- Work branch: `feature/togsim-cpp-trace` (PR #267 -> develop) + +## Status + +| Milestone | State | +|---|---| +| P0 — ABI header + op vocabulary | DONE (ABI evolved to v11) | +| P1 — `build_skeleton` pass | DONE, verified — runs on a real GEMM fixture, module verifies, compute grouping + dma/barrier counts match the legacy `build_tog` TOG. | +| P2 — togsim -> emitc -> cpp -> .so | DONE — `lower_to_emitc.py` builds EmitC, `mlir-translate` -> C++, `g++ -shared` -> `.so`; validated by build/symbol checks and a dlopen run harness. | +| P3 — TOGSim loader + runtime + cycle table; real-Core run | DONE — runs end-to-end through the real Simulator/Core (256^3 GEMM, `--trace_so`). Runtime tag-slot pairing (ABI v11, `togsim.memory_barrier`), explicit dataflow DAG (read/write_bufs last-writer + vcix FSM), real tile addresses, cycle_table. `togsim_runtime.cc`/`togsim_loader.h`/`togsim_trace_bridge.cc` feed TraceRec into the real Core. Cycle comparison vs legacy on the real gem5 table: trace 2518 vs legacy 2698. 
Legacy ONNX-TOG path DEPRECATED in place, kept live. | +| P4 — symbolic-bound dynamic shape, streaming sink | not started | +| P5 — op-family migration (conv/SDPA/vector) | not started | + +### Async-DMA sync: runtime tag slot (current), event-id (removed) + +The original P1 threaded the dma->wait dependency as an SSA `!togsim.event` +value, which fails `module.verify()` on a software-pipelined kernel (the +`togsim.dma` sits in the prefetch loop nest, its consumer in a sibling compute +nest, so the value does not dominate its use). An intermediate fix used a +compile-time `event_id` attribute (later a heap-allocated event handle). Both +were **removed**: one static `togsim.dma` op executes once per loop iteration +into a *different* runtime tag slot, so a compile-time id (one per static op) +cannot pair iteration i's DMA with iteration i's wait. + +Current mechanism (ABI v11): `togsim.dma` carries `tag_id` (its tag-memref +identity) plus the runtime tag-index operand `%tag[%idx]` and returns void. The +source `memref.dma_wait` is mapped through to an explicit +`togsim.memory_barrier {tag_id, write_bufs}` carrying the runtime tag index. At +runtime an async DMA and its barrier are paired by `(tag_id, tag_slot)` through +the existing Core tag table (`prepare_tag_key`/`set_tag_finish`/ +`register_tag_waiter`): the DMA signals at data arrival, the barrier waits, and +the barrier becomes the loaded buffer's last-writer so consumers gate on +arrival. (The one remaining auto-inserted barrier is `togsim.compute_barrier`, +the compute fence before a store — marked FIXME to become explicit later.) + +### P2 decisions + +* **ABI v11 (runtime tag slot).** `togsim_dma` returns void and carries + `(is_async, tag_id, tag_slot, read_bufs, write_bufs)`. The + `togsim_memory_barrier(tag_id, tag_slot, write_bufs)` is the explicit + async-DMA sync. No `event_id`, no event handle, no `wait`/`signal`. 
+* **C4 drives the upstream EmitC conversion passes** (it does not hand-build + EmitC). It only does the parts upstream cannot: rewrite the *unregistered* + `togsim.*` ops to `emitc.call_opaque` and rewrite the kernel signature to the + ABI form. Then it runs, in-process (`mlir.passmanager`), + `func.func(lower-affine), convert-scf-to-emitc, convert-arith-to-emitc, + convert-func-to-emitc`. One local fixup: in this LLVM 20 build + `convert-scf-to-emitc` emits `emitc.for` with `index` bounds, so + `convert-arith-to-emitc` (constants -> `!emitc.size_t`) leaves + `unrealized_conversion_cast` on the bounds that nothing folds and + `mlir-to-cpp` can't print (design sec 8 risk). `_fold_for_bound_casts` + rewrites those bound constants to `index`-typed `emitc.constant`, clearing + the casts. (`emitc.for` *does* accept `size_t` bounds with an explicit + `: !emitc.size_t`, but keeping the bounds `index` avoids retyping the IV.) +* **Addresses (wired in P3, approach A):** `togsim_dma` passes `(arg_id, element + offset)` with the offset computed from the loop IVs; the runtime adds the + tensor base. `togsim.compute` is keyed by `tile_id` for cost. + +## Files (key) + +- `TOGSim/include/togsim_runtime.h` — extern "C" ABI v11 (`togsim_dma`, + `togsim_memory_barrier`, `togsim_compute`, `togsim_compute_barrier`, + `togsim_core_alloc`, `togsim_kernel` entry, `TOGSIM_ABI_VERSION`, opaque + `EmitCtx`). +- `PyTorchSimFrontend/mlir/passes/togsim_ops.py` — single source of truth for the + skeleton+API MLIR vocabulary (op names, attr keys, op->callee map). +- `PyTorchSimFrontend/mlir/passes/build_skeleton.py` + `dep_analysis.py` — the P1 + pass + dependency analysis (reuse build_tog's `TogBuilder`/`_build`; map + dma_start->togsim.dma, dma_wait->togsim.memory_barrier, attach read/write_bufs; + use-based DCE). 
+- `TOGSim/src/togsim_runtime.cc`, `TOGSim/include/togsim_loader.h`, + `TOGSim/src/togsim_trace_bridge.cc` — C6 runtime, dlopen loader, and the bridge + that feeds the recorded TraceRec stream into the real Core. +- `tests/test_togsim_skeleton.py` — `test_togsim_ops_contract` (runs anywhere) + + `test_build_skeleton_on_fixture` (gated on bindings + a fixture). +- `PyTorchSimFrontend/mlir/passes/lower_to_emitc.py` — the P2/C4 pass: skeleton + module -> EmitC `togsim_kernel` -> C++ (`mlir-translate`) -> `.so` (`g++`). + Entry points: `lower_to_emitc(module)`, `build_trace_so(postvcix_path, so)`, + and a `__main__` CLI (`--so`, `--emit-cpp`, `--include-dir`). +- `tests/test_togsim_emitc.py` — `test_build_trace_so` (EmitC + symbol checks) + + `test_trace_so_runs` (dlopen the `.so` against a stub runtime, run it). Gated + on bindings + `mlir-translate` + a C++ compiler + the fixture. + +## Reproduce P1 + P2 (one GEMM kernel) + +```bash +# 1. post-vcix fixture: compile a GEMM (needs the built PyTorchSimDevice .so). +export pytorchsim_functional_mode=False +python tests/ops/gemm/test_matmul.py +FIX=$(find "${TORCHSIM_DUMP_PATH:-.}" -name '*_postvcix.mlir' | head -1) +# build_skeleton/lower_to_emitc only need the .mlir + bindings, not torch, so a +# fixture compiled in any worktree is fine. + +# 2. P1: skeleton+API MLIR. +python -m PyTorchSimFrontend.mlir.passes.build_skeleton "$FIX" --out /tmp/skel.mlir +# stderr: "skeleton: compute=.. dma=.. memory_barrier=.." + +# 3. P2: skeleton -> EmitC -> C++ -> .so (reads skel from $FIX via build_skeleton). +python -m PyTorchSimFrontend.mlir.passes.lower_to_emitc "$FIX" \ + --so /tmp/trace.so --emit-cpp /tmp/trace.cpp +nm -D /tmp/trace.so | grep togsim # togsim_kernel = T; togsim_dma/memory_barrier/compute = U + +# 4. 
tests +TOGSIM_SKELETON_FIXTURE="$FIX" python -m pytest \ + tests/test_togsim_skeleton.py tests/test_togsim_emitc.py -q +``` + +Note: `mlir-opt`/`mlir-translate` live in `$TORCHSIM_LLVM_PATH` but are not on +`$PATH`; `lower_to_emitc` resolves `mlir-translate` from `TORCHSIM_LLVM_PATH`. + +## Next steps (P3 is done; remaining work) + +The producer is wired into TOGSim and runs through the real Core (trace 2518 vs +legacy 2698 on the 256^3 GEMM). The parallelism / reduction / core-dispatch +design is in `togsim_cpp_trace.md` §9. Summary: the producer is core-transparent +(knows nothing about `num_cores`); it enumerates parallel output-tile work-items +and calls `togsim_core_alloc` at each work-item boundary. Parallel = independent +work-items; reduction = program order inside one work-item; core binding = the +`togsim_core_alloc` runtime callback (policy lives in TOGSim). Async-DMA data +sync = the runtime `(tag_id, tag_slot)` via `togsim.memory_barrier`. `num_cores` +is extrinsic so it is never baked; vlane/tile sizes are intrinsic and stay baked. +Split-K is a deferred exception. + +Remaining (priority order; full list in STATUS §7 and design §11.2): + +- **SRAM tile lifecycle (double-buffer throttle).** `togsim.dma` carries + `tag_slot` (the SRAM slot key); the consumer must use it to throttle in-flight + loads to the buffer depth on multi-tile / double-buffered kernels. +- **Preload concurrency cap (design §10.5).** Give a preload a non-zero occupancy + (its weight-load time) so concurrent preloads are capped at the SA count. + Pre-existing in BOTH paths. +- **Per-output-tile dispatch / multi-core.** One `togsim_core_alloc` per + work-item today; distribute independent output tiles across cores. +- **Robust gem5 cycle_list wiring.** The extension_codecache + `TORCHSIM_DUMP_TRACE_SO=1` hook is flaky under concurrent compiles. +- **P5 op coverage** (conv/SDPA/vector) and **P4** (symbolic shape, streaming + sink), then **retire the legacy ONNX-TOG path**. 
+ +Full design: `togsim_cpp_trace.md` §5-11. + +## Environment requirements (for the new session) + +- MLIR Python bindings importable (`import mlir.ir`). They ship with the LLVM + build at `${TORCHSIM_LLVM_PATH%/bin}/python_packages/mlir_core`; the CI docker + image `ghcr.io/psal-postech/torchsim-ci` has them. `passes/__init__` also + derives the path from `TORCHSIM_LLVM_PATH`. +- `pytest` to run the test files directly (`pip install pytest` if absent). +- `mlir-translate` (in `$TORCHSIM_LLVM_PATH`) and a host C++ compiler (`g++`/ + `$CXX`) for the P2 `.so` path. +- TOGSim build (for `--trace_so`): `cd TOGSim/build && cmake .. + -DCMAKE_BUILD_TYPE=Release && make -j$(nproc)`. The Simulator target has + ENABLE_EXPORTS so a dlopen'd `.so` resolves the `togsim_*` callbacks. +- When iterating on passes, clear the codegen caches (`$TORCHSIM_DUMP_PATH`, + default `outputs/`) between runs — see CLAUDE.md "Codegen changes are sticky". + +## Verification that already passes anywhere (sanity) + +```bash +python -m py_compile PyTorchSimFrontend/mlir/passes/build_skeleton.py \ + PyTorchSimFrontend/mlir/passes/togsim_ops.py tests/test_togsim_skeleton.py +# contract test (no bindings needed): see test_togsim_ops_contract +``` diff --git a/docs/design/togsim_cpp_trace_STATUS.md b/docs/design/togsim_cpp_trace_STATUS.md new file mode 100644 index 00000000..ebf05701 --- /dev/null +++ b/docs/design/togsim_cpp_trace_STATUS.md @@ -0,0 +1,226 @@ +# TOGSim C++ Trace Generation — Status Report + +Branch: `feature/togsim-cpp-trace`. Design of record: `togsim_cpp_trace.md` (esp. +§9); continuation notes: `togsim_cpp_trace_HANDOFF.md`. This file is a snapshot of +progress. + +## 1. Goal + +Replace the timing-path TOG producer (`MLIR -> Python dict -> ONNX -> C++ +TileGraphParser`) with a compiled, shape-parametric trace producer +(`MLIR -> skeleton -> EmitC -> C++ -> .so`). TOGSim's timing core is preserved; +only the producer of its input changes. 
The key idea: do not flatten the TOG; +instead **run** a compiled C++ producer that emits the trace as a stream of API +calls. + +Each API call emits one trace record = one modeled instruction, fed to the +existing timing Core. Dependencies are an explicit dataflow DAG (SRAM +last-writer per buffer + the vcix preload/matmul FSM). An asynchronous DMA is +synced to the consumer of its data by the **runtime tag slot** `(tag_id, +tag_slot)` through an explicit `togsim.memory_barrier` (ABI v11). An earlier +design used a compile-time `event_id` / event handle with `wait`/`signal`; that +was removed because one static DMA op runs once per loop iteration into a +different tag slot, which a single compile-time id cannot pair per iteration. + +## 2. Pipeline + +``` +post-vcix .mlir (torch.compile output) + | build_skeleton.py + dep_analysis.py (P1) keep loops; + | memref.dma_start -> togsim.dma(tag_id, %tag[%idx], is_async, read/write_bufs); + | memref.dma_wait -> togsim.memory_barrier(tag_id, tag_slot, write_bufs); + | compute block -> togsim.compute; DCE the rest + v +skeleton+API MLIR + | lower_to_emitc.py (P2/C4) togsim.* -> emitc.call_opaque; ABI signature; drive upstream + | lower-affine/convert-*-to-emitc; _retype_for_to_size_t fixups + v +EmitC --mlir-translate--> C++ --g++ -shared--> trace.so + | TOGSim loader (C6): dlopen + EmitCtx callbacks + v + TraceRec stream (materializing sink) + | togsim_trace_bridge.cc -> existing Core timing + v + cycles / DRAM traffic (real Core) +``` + +Side artifact: cycle table `tile_id -> (cycle, overlapping_cycle)` (cycle_table.py). + +## 3. 
Milestones + +| | State | +|---|---| +| P0 ABI header + togsim vocabulary | DONE (ABI evolved to v11) | +| P1 build_skeleton | DONE, verified (compute/dma/barrier match legacy TOG) | +| P2 lower_to_emitc -> .so | DONE (real GEMM .so built and run) | +| P3 loader/runtime + cycle table + real-Core run | DONE (runs end-to-end through the real Simulator/Core; below) | +| P4 symbolic/dynamic shape, streaming sink | TODO | +| P5 op-family migration (conv/SDPA/vector) | TODO | + +P3 detail: + +| | State | +|---|---| +| ABI (core_alloc, runtime tag pairing, dma address) | DONE (v11) | +| work-item boundary (togsim_core_alloc) | DONE | +| real tile DRAM addresses (approach A) | DONE, verified on 1024^3 | +| cycle_table builder (cycle + overlapping) | DONE | +| async DMA <-> consumer sync (runtime tag slot, memory_barrier) | DONE | +| explicit dataflow DAG (read/write_bufs last-writer) | DONE | +| C6 runtime + dlopen loader (materializing) | DONE | +| TraceRec -> existing Core timing feed | DONE (runs end-to-end through real Core) | +| cycle comparison vs build_tog (real gem5 table) | DONE: trace 2518 vs legacy 2698 | +| SRAM tile lifecycle / preload-occupancy refinements | partial (see §7) | + +### TraceRec -> Core: now running end-to-end + +`TOGSim/src/togsim_trace_bridge.cc` (`trace_to_tilegraph`) + a `--trace_so` mode +in `main.cc` feed the recorded trace into the REAL Simulator/Core. The producer +`.so` is `dlopen`'d (the Simulator is built with ENABLE_EXPORTS so the `.so` +resolves the `togsim_*` callbacks back into the binary), its trace recorded, then +bridged to a `TileGraph`: one `TileSubGraph` per work-item (core_alloc marker) +bound to its core, one `Tile` of MOVIN/MOVOUT/COMP/MEMORY_BAR/COMPUTE_BAR +`Instruction`s. 
Dependency edges are built by **last-writer per SRAM buffer** +(`read_bufs`/`write_bufs`); an async load's last-writer is the MEMORY_BAR paired +to it by the runtime `(tag_id, tag_slot)` (so a consumer waits for actual data +arrival), and a COMPUTE_BAR drains the systolic-array pipeline before a store. +Build it (`cd TOGSim/build && cmake .. && make`) and run: +`bin/Simulator --config CONFIG.json --trace_so gemm_trace.so`. + +### Cycle comparison vs legacy build_tog (256^3 GEMM, real gem5 table) + +Ran the same kernel through the legacy path (torch.compile -> gem5 -> build_tog +-> Simulator) and the trace path (the same post-vcix IR -> trace .so + the SAME +gem5 cycle_list -> --trace_so), both through the REAL Core. extension_codecache +has an opt-in TORCHSIM_DUMP_TRACE_SO=1 hook that dumps trace.so + trace_cycles.tsv +from the same cycle_list/offsets (best-effort, never breaks the legacy path); +compute-unit routing uses compute_type and the tag key uses a per-tensor addr_id +(set_addr_name(arg_id)+prepare_tag_key) so A and B don't collide on tag_slot 0. + +**Result: the trace path totals 2518 cycles vs the legacy path's 2698 on the +same gem5 cycle table.** All togsim python tests pass; TOGSim builds. Compute +work and DRAM traffic match; the remaining difference is scheduling (the +explicit dataflow DAG plus the occupancy/latency SA-pipeline model overlap +differently than legacy's per-iteration BARs). + +**Subtile + multi-tile-K now runs** (256x512x256 forced to 128x128 subtiles, 2 +K-tiles: 5774 cycles, no crash). This needed `build_skeleton` to strip the +`-acc_iv` accumulation marker from the dma_wait tag index so the memory_barrier +slot stays subtile-only and pairs with its load (see §3, `tag_slot`); before the +strip the producer evaluated `-acc_iv` to a negative slot at the 2nd K-tile and +TOGSim aborted with "Key does not exist in ... tag table". + +## 4. 
Components + +- `build_skeleton.py` + `dep_analysis.py` — in-place reduction of post-vcix to + "loop skeleton + togsim.* API"; `memref.dma_wait` mapped through to an explicit + `togsim.memory_barrier`; read/write SRAM buffer ids attached; reuses legacy + `TogBuilder` traversal. +- `lower_to_emitc.py` — skeleton -> EmitC by driving the upstream conversion + passes plus `_retype_for_to_size_t` (clears residual index<->size_t casts). + `togsim_dma` carries `(tag_id, runtime tag-index, is_async, read/write_bufs)` + and returns void; `togsim_memory_barrier` carries `(tag_id, tag_slot, + write_bufs)`; `togsim_core_alloc` inserted at the work-item boundary. +- `cycle_table.py` — `tile_id -> (cycle, overlapping)`, overlapping + `= max(cycle - offset[type], 0)` (legacy formula); JSON sidecar. +- `TOGSim/src/togsim_runtime.cc` + `TOGSim/include/togsim_loader.h` — C6 runtime + and `run_producer` (dlopen -> togsim_kernel -> records TraceRec). dma resolves + `base[arg] + offset*elem_bytes` and signals its tag at data arrival; the + matching memory_barrier waits the `(tag_id, tag_slot)`; compute looks up the + cycle table; core_alloc round-robins a runtime core pool. +- `TOGSim/src/togsim_trace_bridge.cc` — bridges the recorded TraceRec stream into + the existing `TileGraph`/`Instruction` form for the real Core. +- `TOGSim/include/togsim_runtime.h` — producer ABI v11. + +## 5. Locked design decisions + +1. **Trace is a DAG, not a time order.** The consumer (existing Core) schedules + per-core timelines from: op kind -> hardware unit, SRAM-buffer last-writer -> + data dependency, same-core -> serial (reduction accumulate), SRAM slot -> + capacity. Emission order != execution order. +2. **Async-DMA sync = runtime tag slot.** A `togsim.dma` carries `(tag_id, + tag_slot)`; the matching `togsim.memory_barrier` (lowered from the source + `memref.dma_wait`) waits on the same pair through the existing Core tag table + (`prepare_tag_key`/`set_tag_finish`/`register_tag_waiter`). 
The DMA signals at + data arrival; the barrier becomes the loaded buffer's last-writer so consumers + gate on arrival. A sync DMA is blocking (no barrier). This replaced an earlier + `event_id` / heap event-handle design, which could not pair a DMA op with its + wait per loop iteration (one static op, a different tag slot each iteration). + No `calc_tag` content-hash, no magic values, no FIFO. +3. **Core = runtime allocation.** `togsim_core_alloc` returns a core id (no free). + `num_cores` is never baked into the producer -- it is the runtime pool size. + A work-item's reduction stays on one core (sticky); different work-items get + different cores -> multi-core. +4. **Intrinsic baked / extrinsic parametric.** vlane / tile sizes / systolic + define instructions (baked); num_cores only distributes (runtime). +5. **Execution model:** P3 materializing (run producer to completion -> record -> + feed existing Core); P4 streaming (coroutine, alloc-blocks on resources). +6. **Double-buffer = resource constraint.** Producer emits everything (no skew); + capacity is the consumer's throttle. Requires SRAM tile lifecycle + (alloc/free) in the trace -- the currently missing piece. + +## 6. Verification (reproducible) + +- togsim python tests pass: skeleton (contract + fixture), emitc (build + dlopen + run), cycle_table, runtime. TOGSim builds. +- 256^3 GEMM: core_alloc -> dma(tag_id, tag_slot) -> memory_barrier(tag_id, + tag_slot) -> compute; addresses A/B/C resolved (offset 0, single tile). +- 1024^3 GEMM: per-tile addresses correct (A[m,k]=m*1024+k -> 0,256,512; + B[k,n]=k*1024+n -> 0,262144,524288). +- End-to-end through the real Core (256^3 GEMM, real gem5 table): trace 2518 + cycles vs legacy 2698. +- Legacy ONNX-TOG path untouched (comment-only diff), marked DEPRECATED, kept as + the comparison reference. + +## 6b. 
Reference timer (early sanity check; superseded by the real Core feed) + +`togsim::simulate(RunResult, TimingParams)` (togsim_runtime.cc) was an early +standalone scheduler that timed the recorded TraceRec to prove the stream is +sufficient to be timed: per core a DMA-engine timeline (DMAs serialize, overlap +compute), a compute timeline (serial = reduction accumulate, with the `finish = +prev.finish + cycle - overlapped` pipeline overlap of Core.cc), and data deps. +It is NOT the production Core (no DRAM/NoC/L2 contention). It has since been +superseded: the recorded stream is now bridged into the real Tile/TileGraph -> +Core (see §3, and the 2518-vs-2698 result above). Retained here as context. + +## 7. Remaining work (priority order) + +1. DONE. Map TraceRec -> existing TOGSim Core Instructions (Tile/TileGraph, + compute_cycle+overlapping, dataflow-buffer deps + runtime-tag barriers) and + run through the real Core. Result: trace 2518 vs legacy 2698 on the same gem5 + table. +2. SRAM tile lifecycle in the trace (double-buffer throttle). togsim_dma carries + `tag_slot` (the lowered SRAM tag index = the slot key the existing Core's + Instruction.tag_idx needs); 0 for single-buffer kernels. Remaining: the + consumer must use it to throttle in-flight loads to the buffer depth. The + SRAM-buffer key is effectively (arg_id, tag_slot) since each load's DRAM + tensor maps to its spad. +3. Preload concurrency cap / preload occupancy (design doc §10.5): give a preload + a non-zero occupancy so concurrent preloads are capped at the SA count. + Pre-existing in BOTH paths. +4. (later) deeper double-buffer pipelines (more tag slots), two-function outline, + P4 streaming, symbolic shape, P5 op coverage (conv/SDPA/vector). + +## 8. Risks / open + +- SRAM lifecycle (double-buffer throttle) not yet implemented -- central to + double-buffer/capacity accuracy on multi-tile kernels. 
+- LLVM 20 emitc constraints absorbed: emitc.for index bounds; old + subscript-returns-element model; arith.divui/remui not lowerable -> core id is + a runtime allocation (which became a design improvement). + +### Explicit dataflow-edge dependency model: implemented + +The dependency model is an explicit dataflow DAG, not in-order or runtime-tag +content-hashing. `togsim_dma`/`togsim_compute` carry read_bufs/write_bufs (SRAM +buffer ids; a virtual SA_WEIGHTS buffer folds the preload->matmul edge). +dep_analysis + build_skeleton attach them; lower_to_emitc emits them; the runtime +records them; the bridge builds the Instruction DAG by last-writer per buffer, +scoped per work-item. The one runtime-paired edge is the async-DMA data wait, +routed through an explicit `togsim.memory_barrier` keyed on `(tag_id, tag_slot)` +(see design doc §10.7.4). The systolic-array pipeline uses the occupancy/latency +split (§10.7), so accumulating matmuls pipeline rather than serialize. + +Net (256^3 GEMM, real gem5 table, real Core): trace 2518 vs legacy 2698. +Per-output-tile dispatch for multi-core distribution is the next refinement +(today one dispatch per work-item). diff --git a/scripts/trace_timeline.py b/scripts/trace_timeline.py new file mode 100644 index 00000000..5cf9608b --- /dev/null +++ b/scripts/trace_timeline.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +"""Convert a TOGSim `--log_level trace` log into a Chrome Trace Event JSON that +opens in Perfetto (https://ui.perfetto.dev) or chrome://tracing as an interactive +timeline (Gantt). + +Each instruction becomes one duration slice, grouped per core (pid). Lanes: + dram-rd -- loads crossing the DRAM bus (read bandwidth) + dram-wr -- stores crossing the DRAM bus (write bandwidth) + sa / sa0.. -- COMP compute_type 1 (matmul) / 2 (preload) + vector -- COMP compute_type 0 (vector) +Time unit = core cycles. Barriers (MEMORY_BAR/COMPUTE_BAR) are not drawn. 
"""Convert a TOGSim `--log_level trace` log into a Chrome Trace Event JSON that
opens in Perfetto (https://ui.perfetto.dev) or chrome://tracing as an interactive
timeline (Gantt).

Each instruction becomes one duration slice, grouped per core (pid). Lanes:
  dram-rd    -- loads crossing the DRAM bus (read bandwidth)
  dram-wr    -- stores crossing the DRAM bus (write bandwidth)
  sa / sa0.. -- COMP compute_type 1 (matmul) / 2 (preload)
  vector     -- COMP compute_type 0 (vector)
Time unit = core cycles. Barriers (MEMORY_BAR/COMPUTE_BAR) are not drawn.

A load bar runs from the op's first DRAM response (DRAM_RESP_FIRST, logged by
the Core -- so it captures data moving even while still injecting) to data-ready
(DRAM_RESP_DONE); a store bar runs from issue to finish. Bars are serialized per
direction so each is one visible bar (packed row = saturated bus). A compute
slice's width is its occupancy (compute_cycle - overlapping_cycle).

Usage:
    bin/Simulator --config CONFIG --trace_so TRACE_SO --cycle_table CYCLE_TABLE \
        --log_level trace 2>&1 | python scripts/trace_timeline.py -o timeline.json
    # or
    python scripts/trace_timeline.py trace.log -o timeline.json
Then drag timeline.json into https://ui.perfetto.dev .
"""
import argparse
import json
import re
import sys

# [cycle][Core C][TAG ][INST_ID=N] OPCODE (detail...)
_LINE = re.compile(
    r"\[(\d+)\]\[Core (\d+)\]\[([A-Z_]+)\s*\](?:\[INST_ID=(-?\d+)\])?\s*(\w+)?(.*)")

# Opcode -> lane table consulted only by `_lane`. `to_chrome` does not use it:
# it splits DMA traffic into the dram-rd / dram-wr lanes itself.
_LANE = {"MOVIN": "dma", "MOVOUT": "dma"}
# Opcodes that never become a slice (barriers and tile markers).
_HIDE = {"MEMORY_BAR", "COMPUTE_BAR", "TILE_BEGIN", "TILE_END"}
_CT_NAME = {0: "vector", 1: "matmul", 2: "preload"}

# Perfetto/catapult reserved color names; slices are tinted by tile (= the
# togsim_dispatch work-item / output tile) so one tile's ops share a color across
# lanes/cores. 16 names so a core's tiles (which stride by num_cores) stay
# distinct -- an 8-name palette collapsed to 4 colors per core under 2-core
# even/odd assignment.
_TILE_PALETTE = ["good", "bad", "terrible", "yellow", "olive", "rail_response",
                 "rail_load", "rail_animation", "rail_idle", "thread_state_running",
                 "thread_state_runnable", "thread_state_iowait",
                 "thread_state_uninterruptible", "generic_work", "startup",
                 "vsync_highlight_color"]

_DMA_SHORT = {"MOVIN": "MVIN", "MOVOUT": "MVOUT"}

# Detail-string field extractors, compiled once instead of on every call.
_TILE_COLOR_RE = re.compile(r"\btile=(\d+)")       # non-negative tiles only
_TILE_RE = re.compile(r"\btile=(-?\d+)")
_COMPUTE_TYPE_RE = re.compile(r"compute_type=(\d+)")
_ADDR_NAME_RE = re.compile(r"addr_name=(\w+)")
_COMPUTE_CYCLE_RE = re.compile(r"compute_cycle=(\d+)")
_OVERLAP_CYCLE_RE = re.compile(r"overlapping_cycle=(\d+)")
_SA_RE = re.compile(r"\bsa=(-?\d+)")
_SA_LANE_RE = re.compile(r"sa(\d*)\Z")              # lane names "sa", "sa0", "sa1", ...


def _tile_color(detail):
    """Return the palette color for the `tile=N` field, or None if absent.

    Only a non-negative tile number matches, so a negative tile gets no color.
    """
    m = _TILE_COLOR_RE.search(detail or "")
    return _TILE_PALETTE[int(m.group(1)) % len(_TILE_PALETTE)] if m else None


def _tile_of(detail):
    """Return the `tile=N` field as a string, or "?" when it is missing."""
    m = _TILE_RE.search(detail or "")
    return m.group(1) if m else "?"


def _compute_type(detail):
    """Return the integer `compute_type=N` field, or -1 when it is missing."""
    m = _COMPUTE_TYPE_RE.search(detail or "")
    return int(m.group(1)) if m else -1


def _label(opcode, detail):
    """Build the slice name shown in the timeline for one instruction."""
    if opcode == "COMP":
        return f"T{_tile_of(detail)} {_CT_NAME.get(_compute_type(detail), 'comp')}"
    # DMA: keep each load's OWN identity (addr_name) so the input/weight/K-panel
    # loads stay distinct; tile is conveyed by color (and args), not the name.
    m = _ADDR_NAME_RE.search(detail or "")
    who = m.group(1) if m else "?"
    return f"{who} (T{_tile_of(detail)} {_DMA_SHORT.get(opcode, opcode)})"


def _lane(opcode, detail):
    """Return the coarse lane ("vector", "sa" or "dma") for an opcode.

    Kept for callers outside this module; `to_chrome` does its own routing.
    """
    if opcode == "COMP":
        return "vector" if _compute_type(detail) == 0 else "sa"
    return _LANE.get(opcode, "dma")


def parse(lines):
    """Collect per-instruction timestamps from trace-log lines.

    Args:
        lines: iterable of log lines (an open file works).

    Returns:
        dict keyed by `(core, inst_id)`. Each record holds the opcode, the
        detail string, and the cycles of the INST_ISSUED, INST_FINISHED,
        DRAM_RESP_DONE ("resp"), DRAM_RESP_FIRST ("first_resp") and
        ASYNC_DMA_ISSUE ("dma_issue") events; a field stays None when its
        event was never logged. Lines that do not match the log format, or
        that carry no INST_ID / opcode word, are ignored.
    """
    insts = {}
    for ln in lines:
        m = _LINE.search(ln)
        if not m:
            continue
        cyc, core, tag, iid, opcode, detail = m.groups()
        if iid is None or opcode is None:
            continue
        cyc, core, iid = int(cyc), int(core), int(iid)
        r = insts.setdefault((core, iid), {
            "core": core, "iid": iid, "opcode": opcode, "detail": detail,
            "issued": None, "finished": None, "resp": None, "dma_issue": None,
            "first_resp": None})
        # Only a line naming the same opcode may refresh the detail string.
        if not r["opcode"] or r["opcode"] == opcode:
            r["opcode"] = opcode
            if detail.strip():
                r["detail"] = detail
        if tag == "INST_ISSUED" and r["issued"] is None:
            r["issued"] = cyc
        elif tag == "INST_FINISHED":
            r["finished"] = cyc
        elif tag == "DRAM_RESP_DONE":
            r["resp"] = cyc
        elif tag == "DRAM_RESP_FIRST" and r["first_resp"] is None:  # first data arrived
            r["first_resp"] = cyc
        elif tag == "ASYNC_DMA_ISSUE":  # all requests injected (engine done)
            r["dma_issue"] = cyc
    return insts


def _occ(detail):
    """(compute_cycle, overlapping_cycle) from a COMP detail string."""
    cc = _COMPUTE_CYCLE_RE.search(detail or "")
    ov = _OVERLAP_CYCLE_RE.search(detail or "")
    return (int(cc.group(1)) if cc else 0, int(ov.group(1)) if ov else 0)


def _lane_sort_indices(lane_names):
    """Give every lane a unique thread_sort_index.

    dram-rd=0, dram-wr=1, sa/sa0=2, saK=2+K, and vector after the last SA lane
    (7 unless SA lanes reach that far). Returns `(mapping, fallback)` where
    `fallback` sorts after every known lane.
    """
    sa_rank = {}
    for name in lane_names:
        m = _SA_LANE_RE.match(name)
        if m:
            sa_rank[name] = 2 + int(m.group(1) or 0)
    vector_idx = max([7] + [rank + 1 for rank in sa_rank.values()])
    order = {"dram-rd": 0, "dram-wr": 1, "vector": vector_idx}
    order.update(sa_rank)
    return order, vector_idx + 1


def to_chrome(insts, num_sa=1):
    """Turn parsed instruction records into a Chrome Trace Event document.

    Each hardware unit is modeled as a server and its ops are replayed so real
    idle gaps (bubbles) show and slices don't nest:
      dram-rd : MOVIN  -- bar = [first DRAM response, data-ready].
      dram-wr : MOVOUT -- bar = [issued, finished].
      vector  : COMP type 0 -- 1 VPU.
      sa      : other COMP -- each op on the SA the Core reports (`sa=` field);
                round-robin by issue order if the field is absent or negative.
    DMA bars are serialized per direction in completion order. A compute slice's
    width is compute_cycle - overlapping_cycle (at least 1), starting when the
    unit picks it up: start = max(issue, unit_free).

    Args:
        insts: mapping as returned by `parse`.
        num_sa: systolic arrays per core; more than one SA (configured or
            reported by the log) splits the lane into sa0, sa1, ...

    Returns:
        dict with "traceEvents" (duration slices followed by process/thread
        metadata) and "displayTimeUnit".
    """
    by_core = {}
    for r in insts.values():
        op = r["opcode"]
        if op in _HIDE:
            continue
        units = by_core.setdefault(r["core"], {"dma": [], "vector": [], "sa": []})
        if op == "COMP":
            unit = "vector" if _compute_type(r["detail"]) == 0 else "sa"
        else:
            unit = "dma"
        units[unit].append(r)

    events, lanes, cores = [], set(), set()

    def add(core, lane, ts, dur, name, r):
        lanes.add((core, lane))
        cores.add(core)
        args = {"inst_id": r["iid"], "tile": _tile_of(r["detail"]),
                "issued": r["issued"], "first_data": r["first_resp"],
                "finished": r["finished"], "data_ready": r["resp"]}
        am = _ADDR_NAME_RE.search(r["detail"] or "")
        if am:
            args["addr"] = am.group(1)
        ev = {"name": name, "cat": lane, "ph": "X", "ts": ts,
              "dur": max(dur, 1), "pid": core, "tid": lane, "args": args}
        cname = _tile_color(r["detail"])
        if cname:
            ev["cname"] = cname
        events.append(ev)

    def issue_key(r):
        return r["issued"] if r["issued"] is not None else 0

    nsa = max(num_sa, 1)

    def sa_of(r, i):
        # Prefer the SA the Core logged; older logs lack the field.
        m = _SA_RE.search(r["detail"] or "")
        return int(m.group(1)) if (m and int(m.group(1)) >= 0) else (i % nsa)

    for core, u in sorted(by_core.items()):
        # DMA data crossing the DRAM bus, split by direction (reads and writes are
        # asymmetric). A LOAD's data comes back on the response, so its bar runs
        # [first DRAM response, data-ready]. A STORE's data goes out with the
        # request (fire-and-forget; its acks arrive after it has finished), so its
        # bar runs [issued, finished]. Serialized per direction so each op is one
        # visible bar: a packed row = the bus is saturated, gaps = it is idle.
        for lane, op, sk, ek in (("dram-rd", "MOVIN", "first_resp", "resp"),
                                 ("dram-wr", "MOVOUT", "issued", "finished")):
            free = 0
            rows = [r for r in u["dma"] if r["opcode"] == op
                    and r[sk] is not None and r[ek] is not None and r[ek] > r[sk]]
            for r in sorted(rows, key=lambda r: r[ek]):
                start = max(r[sk], free)
                free = max(r[ek], start + 1)
                add(core, lane, start, free - start, _label(r["opcode"], r["detail"]), r)

        # VPU: one server; slice = occupancy (compute_cycle - overlapping_cycle).
        free = 0
        for r in sorted(u["vector"], key=issue_key):
            if r["issued"] is None:
                continue
            cc, ov = _occ(r["detail"])
            dur = max(cc - ov, 1)
            start = max(r["issued"], free)
            free = start + dur
            add(core, "vector", start, dur, "vector", r)

        # SA: each systolic array is one server.
        rows = sorted(u["sa"], key=issue_key)
        sa_ids = [sa_of(r, i) for i, r in enumerate(rows)]
        max_sa = max([nsa] + [s + 1 for s in sa_ids])
        sa_free = [0] * max_sa
        for r, s in zip(rows, sa_ids):
            if r["issued"] is None:
                continue
            cc, ov = _occ(r["detail"])
            dur = max(cc - ov, 1)
            start = max(r["issued"], sa_free[s])
            sa_free[s] = start + dur
            lane = "sa" if max_sa == 1 else f"sa{s}"
            add(core, lane, start, dur, _label(r["opcode"], r["detail"]), r)

    for c in sorted(cores):
        events.append({"name": "process_name", "ph": "M", "pid": c, "tid": 0,
                       "args": {"name": f"Core {c}"}})
    # Unique per-lane indices keep the metadata order deterministic: `lanes` is a
    # set, so tied sort keys would otherwise surface in hash order.
    order, fallback = _lane_sort_indices({lane for _, lane in lanes})
    for c, lane in sorted(lanes, key=lambda cl: (cl[0], order.get(cl[1], fallback))):
        events.append({"name": "thread_name", "ph": "M", "pid": c, "tid": lane,
                       "args": {"name": lane}})
        events.append({"name": "thread_sort_index", "ph": "M", "pid": c, "tid": lane,
                       "args": {"sort_index": order.get(lane, fallback)}})
    return {"traceEvents": events, "displayTimeUnit": "ns"}


def main(argv):
    """CLI entry point: read a trace log (file or stdin), write the JSON timeline.

    Args:
        argv: full argument vector including the program name.

    Returns:
        0 on success (used as the process exit status).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("input", nargs="?", help="trace log file (default: stdin)")
    ap.add_argument("-o", "--out", default="timeline.json")
    ap.add_argument("-s", "--num-sa", type=int, default=1,
                    help="systolic arrays per core (num_systolic_array_per_core); "
                         ">1 splits into sa0..saN-1 lanes")
    a = ap.parse_args(argv[1:])
    if a.input:
        # Close the log as soon as it has been parsed.
        with open(a.input) as fh:
            insts = parse(fh)
    else:
        insts = parse(sys.stdin)
    trace = to_chrome(insts, a.num_sa)
    with open(a.out, "w") as fh:
        json.dump(trace, fh)
    n = sum(1 for e in trace["traceEvents"] if e["ph"] == "X")
    sys.stderr.write(f"wrote {a.out}: {n} slices -> open in https://ui.perfetto.dev\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
def _mlir_translate():
    """Path where `mlir-translate` is expected: $TORCHSIM_LLVM_PATH, else /usr/bin."""
    llvm_bin = os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin")
    return os.path.join(llvm_bin, "mlir-translate")


def _tools_ready():
    """True when the MLIR bindings, `mlir-translate` and the C++ compiler exist."""
    if importlib.util.find_spec("mlir") is None:
        return False
    if not os.path.isfile(_mlir_translate()):
        return False
    return shutil.which(_CXX) is not None


def _fixture():
    """Return the post-vcix fixture path, skipping the test when it is unset/missing."""
    path = os.environ.get("TOGSIM_SKELETON_FIXTURE")
    usable = bool(path) and os.path.isfile(path)
    if not usable:
        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
    return path
@pytest.mark.skipif(not _tools_ready(),
                    reason="need mlir bindings + mlir-translate + C++ compiler")
def test_build_trace_so():
    """Build the trace .so and check the EmitC text and the dynamic symbol table."""
    fix = _fixture()
    # The symbol check shells out to `nm`; skip cleanly instead of erroring with
    # FileNotFoundError on hosts without binutils.
    if shutil.which("nm") is None:
        pytest.skip("need binutils `nm` for the symbol-table check")
    sys.path.insert(0, str(_ROOT))
    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4

    with tempfile.TemporaryDirectory() as d:
        so = os.path.join(d, "trace.so")
        emitc_text = c4.build_trace_so(fix, so)
        assert os.path.isfile(so)

        # EmitC form: one entry func; dma/memory_barrier/compute and the
        # per-work-item dispatch wrapper appear as call_opaque targets.
        assert "emitc.func" in emitc_text
        assert ("@%s" % c4.ENTRY) in emitc_text
        for hook in ("togsim_dma", "togsim_memory_barrier", "togsim_compute",
                     "togsim_dispatch"):
            assert ('emitc.call_opaque "%s"' % hook) in emitc_text, hook

        # Symbol table: entry exported (defined, text), runtime hooks undefined
        # so the TOGSim loader resolves them at dlopen.
        nm = subprocess.run(["nm", "-D", so], capture_output=True, text=True)
        # A failed `nm` would otherwise surface as a misleading symbol mismatch.
        assert nm.returncode == 0, nm.stderr
        syms = {}
        for ln in nm.stdout.splitlines():
            parts = ln.split()
            if len(parts) >= 2:
                syms[parts[-1]] = parts[-2]
        assert syms.get("togsim_kernel") == "T", nm.stdout
        assert syms.get("togsim_dma") == "U", nm.stdout
        assert syms.get("togsim_dispatch") == "U", nm.stdout
        assert syms.get("togsim_memory_barrier") == "U", nm.stdout


@pytest.mark.skipif(not _tools_ready(),
                    reason="need mlir bindings + mlir-translate + C++ compiler")
def test_trace_so_runs():
    """Run the producer .so under the stub-runtime harness and check its counters."""
    fix = _fixture()
    sys.path.insert(0, str(_ROOT))
    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4

    with tempfile.TemporaryDirectory() as d:
        so = os.path.join(d, "trace.so")
        c4.build_trace_so(fix, so)

        harness_cpp = os.path.join(d, "harness.cpp")
        harness_bin = os.path.join(d, "harness")
        with open(harness_cpp, "w") as fh:
            fh.write(_HARNESS)
        # -rdynamic so the harness's togsim_* are visible to the dlopened .so.
        build = subprocess.run(
            [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
             harness_cpp, "-o", harness_bin, "-ldl"],
            capture_output=True, text=True)
        assert build.returncode == 0, build.stderr

        run = subprocess.run([harness_bin, so], capture_output=True, text=True)
        assert run.returncode == 0, run.stdout + run.stderr
        out = run.stdout.strip()
        assert out.startswith("TRACE "), out
        counts = dict(kv.split("=") for kv in out.split()[1:])
        # The producer ran and emitted a real trace, with >=1 work-item (core alloc).
        assert int(counts["core"]) >= 1
        assert int(counts["dma"]) >= 1
        assert int(counts["compute"]) >= 1
        # Async loads are synced by explicit memory barriers, each carrying a
        # valid (non-negative) tag_id that pairs it with its dma.
        assert int(counts["membar"]) >= 1, out
        assert int(counts["bad"]) == 0, out
def _mlir_translate():
    """Path where `mlir-translate` is expected: $TORCHSIM_LLVM_PATH, else /usr/bin."""
    llvm_bin = os.environ.get("TORCHSIM_LLVM_PATH", "/usr/bin")
    return os.path.join(llvm_bin, "mlir-translate")


def _tools_ready():
    """True when the MLIR bindings, `mlir-translate`, the compiler and the runtime source exist."""
    if importlib.util.find_spec("mlir") is None:
        return False
    if not os.path.isfile(_mlir_translate()):
        return False
    if shutil.which(_CXX) is None:
        return False
    return _RUNTIME.is_file()


def _fixture():
    """Return the post-vcix fixture path, skipping the test when it is unset/missing."""
    path = os.environ.get("TOGSIM_SKELETON_FIXTURE")
    usable = bool(path) and os.path.isfile(path)
    if not usable:
        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")
    return path
# NOTE(review): the `#include <...>` targets and the `std::vector<...>` element
# types in the two C++ drivers below were reconstructed from how the values are
# used (compared against uint64_t / int arrays, printed with %d / %lu). The
# angle-bracket contents were missing, which cannot compile -- confirm the exact
# types against togsim_loader.h.
_MAIN = r'''
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>
#include "togsim_loader.h"
using namespace togsim;
int main(int argc, char** argv) {
  uint64_t bases[3] = {0x1000, 0x2000, 0x3000};
  int64_t cyc[3] = {100, 200, 300};
  int64_t ovl[3] = {0, 200, 172};
  RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, 1);
  if (!r.ok) { printf("run failed\n"); return 2; }
  int ndisp=0, nd=0, nc=0, nm=0, fail=0;
  std::vector<uint64_t> dma_a; std::vector<int> dma_arg, dma_dir;
  std::vector<std::pair<int, uint64_t>> async_tags;  // (tag_id, tag_slot) of async dmas
  for (auto& t : r.trace) {
    if (t.kind == TraceRec::TILE_BEGIN) ndisp++;  // one per work-item
    else if (t.kind == TraceRec::DMA) {
      nd++; dma_a.push_back(t.addr);
      dma_arg.push_back(t.arg_id); dma_dir.push_back(t.dir);
      if (t.is_async) async_tags.push_back({t.tag_id, t.tag_slot});
    } else if (t.kind == TraceRec::COMPUTE) {
      nc++;
      int64_t want = (t.tile_id < 3) ? cyc[t.tile_id] : -1;
      if (t.cycle != want) { printf("compute %lu cyc %ld!=%ld\n",
          (unsigned long)t.tile_id, (long)t.cycle, (long)want); fail++; }
    } else if (t.kind == TraceRec::MEMORY_BAR) {
      nm++; bool ok=false;
      for (auto& k : async_tags) if (k.first==t.tag_id && k.second==t.tag_slot) ok=true;
      if (!ok) { printf("membar tag (%d,%lu) pairs no async dma\n",
          t.tag_id, (unsigned long)t.tag_slot); fail++; }
    }
  }
  const uint64_t exp[3] = {0x1000, 0x2000, 0x3000};
  const int ea[3] = {0,1,2}, ed[3] = {0,0,1};
  for (int i = 0; i < nd && i < 3; ++i)
    if (dma_a[i]!=exp[i] || dma_arg[i]!=ea[i] || dma_dir[i]!=ed[i]) {
      printf("dma[%d] addr=%#lx arg=%d dir=%d\n", i,
             (unsigned long)dma_a[i], dma_arg[i], dma_dir[i]); fail++;
    }
  printf("dispatch=%d dma=%d compute=%d membar=%d fail=%d\n", ndisp, nd, nc, nm, fail);
  printf(fail ? "RESULT FAIL\n" : "RESULT PASS\n");
  return fail ? 1 : 0;
}
'''


def _compile_and_run(workdir, src_name, bin_name, driver_src, so):
    """Compile a C++ driver together with the runtime source and run it on `so`.

    Writes `driver_src` to `<workdir>/<src_name>`, compiles it with `_CXX`
    alongside `_RUNTIME` into `<workdir>/<bin_name>`, asserts that the build
    succeeded (the compiler's stderr is the failure message), then executes the
    binary with `so` as its only argument.

    Returns the `subprocess.CompletedProcess` of the run, with stdout/stderr
    captured as text.
    """
    src_path = os.path.join(workdir, src_name)
    bin_path = os.path.join(workdir, bin_name)
    with open(src_path, "w") as fh:
        fh.write(driver_src)
    build = subprocess.run(
        [_CXX, "-std=gnu++17", "-O2", "-rdynamic", "-I", str(_INCLUDE),
         src_path, str(_RUNTIME), "-o", bin_path, "-ldl"],
        capture_output=True, text=True)
    assert build.returncode == 0, build.stderr
    return subprocess.run([bin_path, so], capture_output=True, text=True)


@pytest.mark.skipif(not _tools_ready(),
                    reason="need mlir bindings + mlir-translate + C++ compiler + runtime")
def test_runtime_loads_and_records():
    """Build the trace `.so` from the fixture and check the `_MAIN` driver's verdict.

    The driver must print "RESULT PASS", exit with status 0, and its
    `dispatch=...` summary line must report at least one dma, at least one
    compute, and zero failures.
    """
    fix = _fixture()
    sys.path.insert(0, str(_ROOT))
    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4

    with tempfile.TemporaryDirectory() as d:
        so = os.path.join(d, "trace.so")
        c4.build_trace_so(fix, so)

        run = _compile_and_run(d, "main.cpp", "runtime_test", _MAIN, so)
        out = run.stdout
        assert "RESULT PASS" in out, out + run.stderr
        assert run.returncode == 0, out
        # at least the GEMM's 3 dmas were recorded with resolved addresses.
        summary = next(ln for ln in out.splitlines() if ln.startswith("dispatch="))
        counts = dict(kv.split("=") for kv in summary.split())
        assert int(counts["dma"]) >= 1
        assert int(counts["compute"]) >= 1
        assert int(counts["fail"]) == 0


_SIM_MAIN = r'''
#include <cstdint>
#include <cstdio>
#include "togsim_loader.h"
using namespace togsim;
int main(int argc, char** argv) {
  uint64_t bases[3] = {0x1000, 0x2000, 0x3000};
  int64_t cyc[3] = {100, 200, 300};
  int64_t ovl[3] = {0, 200, 172};
  RunResult r = run_producer(argv[1], nullptr, 0, bases, 3, cyc, ovl, 3, 1);
  if (!r.ok) { printf("run failed\n"); return 2; }
  TimingParams p; p.dma_latency = 100;
  SimResult s = simulate(r, p);
  // serial baseline: no overlap at all.
  uint64_t serial = 0;
  for (auto& t : r.trace) {
    if (t.kind == TraceRec::DMA) serial += p.dma_latency;
    else if (t.kind == TraceRec::COMPUTE) serial += (uint64_t)t.cycle;
  }
  printf("SIM total=%lu compute=%d dma=%d serial=%lu\n",
         (unsigned long)s.total_cycle, s.n_compute, s.n_dma, (unsigned long)serial);
  // The trace is schedulable into cycles; overlap (dma||compute, compute
  // pipelining) makes it no worse than the fully-serial baseline.
  bool ok = s.total_cycle > 0 && s.n_compute > 0 && s.total_cycle <= serial;
  printf(ok ? "RESULT PASS\n" : "RESULT FAIL\n");
  return ok ? 0 : 1;
}
'''


@pytest.mark.skipif(not _tools_ready(),
                    reason="need mlir bindings + mlir-translate + C++ compiler + runtime")
def test_simulate_produces_cycles():
    """Build the trace `.so` from the fixture and check the `_SIM_MAIN` driver's verdict.

    The driver itself compares the simulated total against a fully-serial
    baseline; this test only requires it to print "RESULT PASS" and exit 0.
    """
    fix = _fixture()
    sys.path.insert(0, str(_ROOT))
    from PyTorchSimFrontend.mlir.passes import lower_to_emitc as c4

    with tempfile.TemporaryDirectory() as d:
        so = os.path.join(d, "trace.so")
        c4.build_trace_so(fix, so)
        run = _compile_and_run(d, "sim.cpp", "sim_test", _SIM_MAIN, so)
        assert "RESULT PASS" in run.stdout, run.stdout + run.stderr
        assert run.returncode == 0, run.stdout


# diff --git a/tests/test_togsim_skeleton.py b/tests/test_togsim_skeleton.py
# new file mode 100644
# index 00000000..56601966
# --- /dev/null
# +++ b/tests/test_togsim_skeleton.py
# @@ -0,0 +1,184 @@
"""Tests for the C++ trace-generation front-end pieces (docs/design/togsim_cpp_trace.md).

Two layers:

* `test_togsim_ops_contract` runs anywhere (no MLIR bindings, no torch). It pins
  the skeleton+API vocabulary (`togsim_ops.py`) and checks it stays in lockstep
  with the runtime ABI header (`togsim_runtime.h`) -- the single thing most
  likely to silently drift.
* `test_build_skeleton_on_fixture` exercises the real `build_skeleton` pass, and
  is skipped unless the MLIR bindings are importable AND a post-vcix `.mlir`
  fixture is supplied via the `TOGSIM_SKELETON_FIXTURE` env var. (A valid
  build_tog-compatible fixture is hard to hand-write reliably; point this at a
  kernel dump from a real run.)
"""
import os
import importlib.util
import pathlib

import pytest

_ROOT = pathlib.Path(__file__).resolve().parents[1]
_OPS_PY = _ROOT / "PyTorchSimFrontend" / "mlir" / "passes" / "togsim_ops.py"
_HEADER = _ROOT / "TOGSim" / "include" / "togsim_runtime.h"


def _load_togsim_ops():
    """Load `togsim_ops.py` straight from its file path and return the module.

    The module is executed from `_OPS_PY` via an importlib spec rather than a
    package import.
    """
    spec = importlib.util.spec_from_file_location("togsim_ops", _OPS_PY)
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


def test_togsim_ops_contract():
    """Check that the `togsim_ops` vocabulary matches the text of `togsim_runtime.h`."""
    ts = _load_togsim_ops()
    header = _HEADER.read_text()

    # Every op maps to a callee, and every callee is the header's free function.
    assert set(ts.EMITC_CALLEE) == set(ts.OP_NAMES)
    for callee in ts.EMITC_CALLEE.values():
        assert callee in header, f"{callee} missing from togsim_runtime.h"

    # Entry point symbol agrees with the header.
    assert ts.ENTRY_SYMBOL == "togsim_kernel"
    assert ts.ENTRY_SYMBOL in header

    # Runtime callee emitted directly by lower_to_emitc: the work-item dispatch
    # wrapper. (The outlined tile fn TILE_SYMBOL is producer-generated.)
    assert ts.DISPATCH_CALLEE in header

    # Direction enum agrees with the header's togsim_dma_dir.
    assert (ts.DIR_LOAD, ts.DIR_STORE) == (0, 1)
    assert "TOGSIM_DMA_LOAD = 0" in header
    assert "TOGSIM_DMA_STORE = 1" in header


def _mlir_available():
    """Return True when the `mlir` Python package can be found."""
    return importlib.util.find_spec("mlir") is not None


@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
def test_build_skeleton_on_fixture():
    """Run `build_skeleton` on the fixture and check the printed module text.

    Skipped when TOGSIM_SKELETON_FIXTURE is unset or does not name a file.
    """
    fixture = os.environ.get("TOGSIM_SKELETON_FIXTURE")
    if not fixture or not os.path.isfile(fixture):
        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")

    import sys
    sys.path.insert(0, str(_ROOT))
    from PyTorchSimFrontend.mlir.passes import build_skeleton

    import mlir.ir as ir
    ctx = ir.Context()
    ctx.allow_unregistered_dialects = True
    with ctx:
        module = ir.Module.parse(pathlib.Path(fixture).read_text(), ctx)
        report = build_skeleton.build_skeleton(module)
        out = str(module)

    # The data-movement ops are gone; the API ops took their place.
    assert "memref.dma_start" not in out
    assert "memref.dma_wait" not in out
    assert "togsim.dma" in out
    assert "togsim.memory_barrier" in out  # the explicit async-DMA sync (was dma_wait)
    assert "event_id" not in out  # static pairing replaced by the runtime tag
    # Loop skeleton is preserved.
    assert ("affine.for" in out) or ("scf.for" in out)
    assert module.operation.verify()
    print(report)


@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
def test_strip_accum_terms_drops_reduction_marker():
    """Regression: the dma_wait tag index built by lower_to_vcix carries a `-d_i`
    term for each accumulation (reduction) loop var -- a sentinel marker, not an
    offset. build_skeleton must drop those so a memory_barrier waits on the same
    subtile slot the async load wrote; otherwise the producer evaluates `-acc_iv`
    to a negative slot at reduction iteration > 0, the recorded barrier slot
    diverges from the load slot, and TOGSim aborts with "Key does not exist in ...
    tag table" on subtile + multi-tile-K. See docs/design/togsim_cpp_trace.md and
    legacy TileGraphParser.cc (which skips stride -1 for the same reason)."""
    import sys
    sys.path.insert(0, str(_ROOT))
    from PyTorchSimFrontend.mlir.passes import build_skeleton as bs

    import mlir.ir as ir
    ctx = ir.Context()
    ctx.allow_unregistered_dialects = True
    with ctx, ir.Location.unknown(ctx):
        module = ir.Module.parse(
            "func.func @k() {\n"
            "  %r = arith.constant 1 : index\n"  # stand-in reduction iv
            "  %a = arith.constant 0 : index\n"  # subtile dim 1
            "  %b = arith.constant 0 : index\n"  # subtile dim 2
            "  return\n"
            "}", ctx)
        block = module.body.operations[0].regions[0].blocks[0]
        consts = [op.results[0] for op in block.operations if op.name == "arith.constant"]
        anchor = [op for op in block.operations if op.name == "func.return"][0]
        r, a, b = consts

        def neg_dims(val):
            # Dim positions that `_neg_coeff_dim` reports among the flattened
            # addends of the affine.apply map that defines `val`.
            amap = ir.AffineMapAttr(val.owner.attributes["map"]).value
            return [p for p in (bs._neg_coeff_dim(s) for s in bs._flatten_add(amap.results[0]))
                    if p is not None]

        # #map8-style: -d0 (reduction) + d1 + d2 floordiv 2.
        d0, d1, d2 = (ir.AffineDimExpr.get(i) for i in range(3))
        expr = d0 * -1 + d1 + ir.AffineExpr.get_floor_div(d2, 2)
        with ir.InsertionPoint(anchor):
            apply = ir.Operation.create(
                "affine.apply", results=[ir.IndexType.get()], operands=[r, a, b],
                attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(3, 0, [expr]))})
        tag_in = apply.results[0]
        assert neg_dims(tag_in) == [0]  # the reduction marker is present

        tag_out = bs._strip_accum_terms(ctx, tag_in, anchor)
        assert tag_out is not tag_in  # a new, reduced apply was emitted
        out_map = ir.AffineMapAttr(tag_out.owner.attributes["map"]).value
        assert out_map.n_dims == 2  # the reduction dim was dropped
        assert neg_dims(tag_out) == []  # no reduction marker remains
        assert list(tag_out.owner.operands) == [a, b]  # only the subtile operands survive

        # No-op: an index with no reduction marker is returned unchanged.
        plain = d0 + d1
        with ir.InsertionPoint(anchor):
            papply = ir.Operation.create(
                "affine.apply", results=[ir.IndexType.get()], operands=[a, b],
                attributes={"map": ir.AffineMapAttr.get(ir.AffineMap.get(2, 0, [plain]))})
        pin = papply.results[0]
        assert bs._strip_accum_terms(ctx, pin, anchor) is pin

        assert module.operation.verify()


@pytest.mark.skipif(not _mlir_available(), reason="MLIR Python bindings not installed")
def test_cycle_table_on_fixture():
    """Build a cycle table from synthetic per-tile cycles and check each row.

    Skipped when TOGSIM_SKELETON_FIXTURE is unset or does not name a file.
    """
    fixture = os.environ.get("TOGSIM_SKELETON_FIXTURE")
    if not fixture or not os.path.isfile(fixture):
        pytest.skip("set TOGSIM_SKELETON_FIXTURE to a post-vcix kernel .mlir")

    import sys
    sys.path.insert(0, str(_ROOT))
    from PyTorchSimFrontend.mlir.passes import build_skeleton, cycle_table

    import mlir.ir as ir
    ctx = ir.Context()
    ctx.allow_unregistered_dialects = True
    with ctx:
        module = ir.Module.parse(pathlib.Path(fixture).read_text(), ctx)
        build_skeleton.build_skeleton(module)
        types = cycle_table._compute_types(module)
        # synthetic per-tile cycles (gem5 sample-mode is reused at P3 task 5).
        cyc = [10 * (i + 1) for i in range(len(types))]
        x_off, w_off = 4, 0
        table = cycle_table.build_cycle_table(module, cyc, x_off, w_off)

    assert len(table) == len(types) >= 1
    # cycle is carried verbatim; overlapping_cycle follows the legacy formula.
    for (cy, ov), t, raw in zip(table, types, cyc):
        assert cy == raw
        if t == cycle_table.VECTOR_COMPUTE:
            assert ov == 0
        else:
            off = w_off if t == cycle_table.MATMUL_PRELOAD else x_off
            assert ov == max(raw - off, 0)