diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f6336fc..9b7139a3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,8 +5,53 @@ on: branches: [mob] pull_request: branches: [mob] + workflow_dispatch: + +env: + # Stable synthetic key both metrics jobs record under (metrics.db keys runs + # by commit_sha+host) -- decoupled from the Pi's actual hostname so the + # cloud-recorded codesize/compile-time rows and the Pi-recorded perf rows + # land on the SAME run instead of two separate per-host rows. + METRICS_HOST: armv8m-metrics jobs: + # Builds the cross compiler once and shares it (via artifact) with + # build-and-measure and rp2350-perf below, so metrics never repeats this + # build. build-and-test does NOT consume this artifact: `make test` has + # `cross` as a prerequisite that reaches through object files and + # checksum/fp-libs/PCH stamp files (Makefile:206-234), not just the final + # binary, so dropping in a pre-built armv8m-tcc wouldn't save it a + # recompile -- make would just rebuild the missing intermediates anyway. 
+ build: + runs-on: ubuntu-latest + permissions: + packages: read + container: + image: ghcr.io/matgla/tinycc-armv8m:latest + options: --user root + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Build cross compiler + run: ./configure --enable-cross --enable-O2 --debug && make cross -j$(nproc) + + - name: Upload tcc build + uses: actions/upload-artifact@v4 + with: + name: tcc-cross-build-${{ github.sha }} + path: | + armv8m-tcc + armv8m-libtcc1.a + retention-days: 1 + build-and-test: runs-on: ubuntu-latest permissions: @@ -25,11 +70,135 @@ jobs: submodules: recursive - name: Configure - run: ./configure --enable-cross --enable-O2 + # --debug enables CONFIG_TCC_DEBUG so the compiler supports -dump-ir; + # without it the frontend types/ tests skip (they need IR dumps). + run: ./configure --enable-cross --enable-O2 --debug - name: Build and test shell: bash + env: + # Write a JUnit report from every pytest run (the final ir_tests run + # overwrites it last) so the failure collector knows which tests failed. + PYTEST_ADDOPTS: "--junitxml=/tmp/ci-junit.xml" run: | virtualenv .venv source .venv/bin/activate - make test -j$(nproc) + # `shell: bash` runs with -eo pipefail, so a failing make still fails + # the step even though its output is teed to a log we upload on failure. 
+ make test -j$(nproc) 2>&1 | tee /tmp/make-test.log + + - name: Collect failure artifacts + if: failure() + shell: bash + env: + MAKE_TEST_LOG: /tmp/make-test.log + PYTEST_JUNIT_XML: /tmp/ci-junit.xml + run: | + source .venv/bin/activate 2>/dev/null || true + bash scripts/collect_ci_failure_artifacts.sh "$PWD/ci-failure-artifacts" + + - name: Upload failure artifacts + if: failure() + uses: actions/upload-artifact@v4 + with: + name: make-test-failure-artifacts + path: ci-failure-artifacts.tar.gz + retention-days: 14 + if-no-files-found: warn + + # Code size + compile time: no RP2350 board needed, so this reuses the + # `build` job's artifact on a regular (fast) GitHub-hosted runner instead + # of rebuilding (as it used to) or running on the shared Pi. + build-and-measure: + needs: build + runs-on: ubuntu-latest + permissions: + packages: read + container: + image: ghcr.io/matgla/tinycc-armv8m:latest + options: --user root + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Mark workspace as a safe git directory + # actions/checkout runs on the runner host and configures safe.directory + # there, but `run:` steps in a container job execute as a different + # user/HOME inside the container -- that config never reaches it, so any + # git command run from a `run:` step (e.g. metrics/record.py) fails with + # "detected dubious ownership" on the bind-mounted repo. 
+ run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Download tcc build + uses: actions/download-artifact@v4 + with: + name: tcc-cross-build-${{ github.sha }} + + - name: Make tcc executable + run: chmod +x armv8m-tcc + + - name: Record codesize + compile time + run: | + python3 metrics/record.py --db /tmp/metrics-scratch.db --rev HEAD \ + --no-correctness --jobs "$(nproc)" --host "$METRICS_HOST" \ + --trigger "${{ github.event_name }}" + + - name: Upload metrics scratch db + uses: actions/upload-artifact@v4 + with: + name: metrics-scratch-${{ github.sha }} + path: /tmp/metrics-scratch.db + retention-days: 1 + + # RP2350 hardware perf: the only part of this workflow that actually needs + # the board, so it's the only part still pinned to the self-hosted Pi. + rp2350-perf: + needs: build-and-measure + runs-on: [self-hosted, rpi5, pimoroni_pico_plus2] + timeout-minutes: 90 + concurrency: + group: metrics-rpi5 + cancel-in-progress: false + env: + METRICS_DB: /var/lib/tcc-metrics/metrics.db + PERF_HOST: 127.0.0.1 + PERF_IDENTITY: /home/runner/.ssh/id_rp + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + submodules: recursive + fetch-depth: 0 + + - name: Download tcc build + uses: actions/download-artifact@v4 + with: + name: tcc-cross-build-${{ github.sha }} + + - name: Download metrics scratch db + uses: actions/download-artifact@v4 + with: + name: metrics-scratch-${{ github.sha }} + path: /tmp/downloaded-metrics + + - name: Make tcc executable + run: chmod +x armv8m-tcc + + - name: Record perf (import codesize/compile time from the cloud build) + run: | + python3 metrics/record.py --db "$METRICS_DB" --rev HEAD --no-correctness \ + --import-codesize-from /tmp/downloaded-metrics/metrics-scratch.db \ + --jobs "$(nproc)" --host "$METRICS_HOST" --trigger "${{ github.event_name }}" \ + --perf-host "$PERF_HOST" --perf-identity "$PERF_IDENTITY" + + - name: Gate + if: ${{ vars.METRICS_GATE_ENABLED == 'true' }} + run: 
python3 metrics/gate.py --db "$METRICS_DB" --rev HEAD --host "$METRICS_HOST" --strict diff --git a/.gitignore b/.gitignore index 6f465adc..23207a48 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,14 @@ tests/hello tests/tests2/fred.txt libtcc.dylib build/ +build_backend/ +build_libtcc_api/ +build_tccgen/ +build_tccopt/ +build_tccelf/ +build_tccpp/ +build_tcctools/ +build_tccyaff/ rootfs/ __pycache__/ tests/ir_tests/qemu/mps2-an505/newlib_build/ @@ -97,8 +105,20 @@ tests/ir_tests/dump_ir.txt tests/ir_tests/dump.txt tests/ir_tests/dump_fine.txt tests/ir_tests/dump_ir_fine.txt +tests/ir_tests/build/ +armv8m-tcc.debug .aider* .claude .cache scripts/.disasm_cache.json scripts/.disasm_cache.pending.json + +# Python test artifacts +__pycache__/ +.pytest_cache/ +*.pyc +tests/fuzz/results/* +tests/fuzz/fuzz_triage_repros/ +/tests/fuzz/.sweep_cache/ +tests/unit/arm/armv8m/build* + diff --git a/.vscode/settings.json b/.vscode/settings.json index 9e26dfee..b8b58871 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1 +1,3 @@ -{} \ No newline at end of file +{ + "cmake.sourceDirectory": "/home/mateusz/repos/tinycc/tests/benchmarks/libs/pico-sdk" +} \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..62c8a3b4 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,51 @@ +# AGENTS.md + +Guidance for autonomous coding agents working in this repository (TinyCC fork +targeting ARMv8-M). Read this first, then `CLAUDE.md` for the full project +overview, build commands, and architecture. + +## Build & test (always current) + +```bash +make cross -j$(nproc) # build armv8m-tcc (rebuild after EVERY edit) +make test -j16 # IR test suite (primary gate) +python3 scripts/diff_olevels.py --seeds 0-5000 --require-qemu # fuzz self-consistency +``` + +Style: `-std=c11 -Wunused-function -Werror` (treat warnings as build failures). +Function-body brace on its own line; see `.clang-format` and `CLAUDE.md`. 
+ +## Debugging an optimizer miscompilation + +When a fuzz seed diverges between O-levels (tcc -O0 correct, -O1/-O2 wrong), +follow **`docs/debugging_fuzz_divergences.md`** end-to-end: + +1. `scripts/bisect_opt.py --seed N --high=-O1` — QEMU-confirms the culprit + knob(s) and flags the exact IR line where a memory read is misfolded to a + constant, naming the pass group and the gated pass functions. +2. Write a **regression test first** (`tests/ir_tests/NN_fuzz_<name>.c` + + `.expect`, registered in `tests/ir_tests/test_qemu.py`); confirm it fails + before the fix and passes after. +3. Fix, rebuild, re-run the IR suite + a fuzz sweep; confirm zero *new* + divergences. + +Ground truth oracle is `gcc -m32 -funsigned-char` (ARM ABI: unsigned char, +32-bit long). Sweep/triage infrastructure is documented in +`docs/fuzz_triage_guide.md`. + +## Conventions for changes + +- **Never commit without a regression test** for a bug fix — verbatim or reduced + repro under `tests/ir_tests/`, expected output in a `.expect` file. +- New IR opcode → lowering in `arm-thumb-gen.c` + test. New asm instruction → + builder in `arm-thumb-opcodes.c` + token + parser + test. +- IR internals live in `ir/` (included via `ir/ir.h`); the public IR interface + is `tccir.h`. Internal IR functions are `ir_<module>_<name>()`. +- Don't commit the temporary `TCC_SKIP_SSA*` env-var bisection gates (see the + triage guide); they are investigation-only scaffolding. + +## Don't + +- Don't disable ASan/leak checks to "fix" a failure; investigate the root cause. + (ASan is ON by default; `./configure --disable-asan` for fast builds only.) +- Don't commit secrets, force-push, or create empty commits. 
diff --git a/CLAUDE.md b/CLAUDE.md index d6e0f105..4fec65ca 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,7 +10,7 @@ This is a specialized fork of **TinyCC (Tiny C Compiler)** targeting **ARMv8-M** ```bash # One-time setup -./configure +./configure # AddressSanitizer is ON by default; ./configure --disable-asan for fast/production builds make download-gcc-tests # optional: sparse-fetch GCC torture tests (~16 MB, not the full gcc repo) # Build ARMv8-M cross compiler @@ -140,6 +140,13 @@ Build uses `-std=c11 -Wunused-function -Werror`. ## Debug Logging +For debugging **optimizer miscompilations** found by the differential fuzzer +(tcc -O0 correct, -O1/-O2 wrong), see +[`docs/debugging_fuzz_divergences.md`](docs/debugging_fuzz_divergences.md) — the +end-to-end workflow built around `scripts/bisect_opt.py` (QEMU-confirmed culprit +knob + the exact IR line where a memory read is misfolded to a constant). +`docs/fuzz_triage_guide.md` covers the sweep/triage infrastructure. + Unified logging system defined in `log.h`. Each scope is a compile-time switch: ```bash diff --git a/Makefile b/Makefile index 937648f4..5ba3a1fd 100644 --- a/Makefile +++ b/Makefile @@ -46,9 +46,15 @@ CFLAGS += $(CPPFLAGS) -std=c11 -Wunused-function -Wno-declaration-after-statemen VPATH = $(TOPSRC) $(TOPSRC)/arch -LTCC = $(TOP)/$(LIBTCC) -# Enable extra runtime-debug features (not for release builds). -# This is intentionally controlled by configure's --debug (CONFIG_debug=yes). -ifeq ($(CONFIG_debug),yes) +# Dump-IR support: the -dump-ir / -dump-ir-passes options and the per-pass IR +# dumps they drive (all guarded by CONFIG_TCC_DEBUG, which in this fork gates +# nothing but the IR-dump feature). Enabled by default so IR tooling and the +# frontend golden-IR tests work with a plain `make cross`. The dump calls are +# no-ops unless -dump-ir is passed, so this has no effect on generated code. 
+# +# For a smaller "minimal" release binary without the dump-IR machinery, build +# with CONFIG_minimal=yes (e.g. `make cross CONFIG_minimal=yes`). +ifneq ($(CONFIG_minimal),yes) CFLAGS += -DCONFIG_TCC_DEBUG endif @@ -164,6 +170,15 @@ CHECKSUM_CMD = $(shell command -v sha256sum 2>/dev/null || command -v md5sum 2>/ # proceed while still keeping ASan instrumentation. ifeq ($(CONFIG_asan),yes) SAN_ENV = LSAN_OPTIONS=detect_leaks=0 ASAN_OPTIONS=detect_leaks=0 +# Leak detection (LSan) is enabled by default for `make test`: every compiler +# invocation runs the at-exit leak check, so any leak in tcc surfaces as a +# non-zero exit. Note tcc (like most compilers) intentionally does not free +# everything on exit, so known pre-existing leaks will fail here too; override +# by exporting your own [AL]SAN_OPTIONS (e.g. detect_leaks=0) to opt out. +# The nested fp-libs build (SAN_ENV above) keeps leak detection off so the +# build can still complete. +export LSAN_OPTIONS ?= detect_leaks=1 +export ASAN_OPTIONS ?= detect_leaks=1 endif @@ -338,18 +353,26 @@ endif gcc -DC2STR $(filter %.c,$^) -o c2str.exe && ./c2str.exe $< $@ # target specific object rules -$(X)%.o : %.c $(LIBTCC_INC) +# (depend on config.mak so toggling build flags — e.g. ASan via +# ./configure [--disable-asan] — forces a recompile instead of silently +# relinking stale, differently-instrumented objects) +$(X)%.o : %.c $(LIBTCC_INC) config.mak $S$(CC) -o $@ -c $< $(addsuffix ,$(DEFINES) $(CFLAGS)) # Architecture library — built by nested Makefile TARGET_ARCH_NAME = $($T_ARCH) $(ARCH_LIB): FORCE @mkdir -p $(dir $(ARCH_LIB)) + @# Build flags changed (e.g. ASan toggled via configure)? Drop stale objects + @# since the nested arch Makefile only tracks source timestamps, not flags. 
+ @if [ -f "$(ARCH_LIB)" ] && [ config.mak -nt "$(ARCH_LIB)" ]; then \ + rm -f $(dir $(ARCH_LIB))*.o "$(ARCH_LIB)"; \ + fi $S$(MAKE) --no-print-directory -C arch ARCH=$(TARGET_ARCH_NAME) \ TOP=$(CURDIR) BUILD_DIR=$(CURDIR)/$(dir $(ARCH_LIB)) \ CC="$(CC)" AR="$(AR)" CFLAGS="$(CFLAGS)" DEFINES="$(DEFINES)" -$(X)ir/%.o : ir/%.c $(LIBTCC_INC) +$(X)ir/%.o : ir/%.c $(LIBTCC_INC) config.mak @mkdir -p $(dir $@) $S$(CC) -o $@ -c $< $(addsuffix ,$(DEFINES) $(CFLAGS)) @@ -486,8 +509,16 @@ config.mak: PYTHON ?= python3 PYTEST ?= pytest -# Pytest parallel workers: make test J=16 → pytest -n 16 (default: auto) +# Pytest parallel workers: make test J=16 → pytest -n 16 (default: auto). +# J=1 disables xdist entirely so logs are sequential. J ?= auto +PYTEST_XDIST ?= -n $(J) +ifeq ($(J),1) +PYTEST_XDIST = +endif + +# Cross compiler used by pytest test suites. +CROSS_COMPILER = $(CURDIR)/armv8m-tcc # If set to 1, wrap compiler invocations with valgrind to detect memory errors. # Usage: make test VALGRIND=1 @@ -509,6 +540,7 @@ IRTESTS_REQUIREMENTS := $(IRTESTS_DIR)/requirements.txt IRTESTS_VENV_STAMP := $(VENV_DIR)/.irtests-requirements.stamp PCH_BENCHMARK_SCRIPT := $(IRTESTS_DIR)/benchmark_pch.py PCH_PREPARE_SCRIPT := $(IRTESTS_DIR)/prepare_pch.py +GOLDEN_IR_COMPILER ?= $(TOP)/armv8m-tcc.debug NEWLIB_DIR := $(IRTESTS_DIR)/qemu/mps2-an505/newlib_build/arm-none-eabi/newlib NEWLIB_LIBC_A := $(NEWLIB_DIR)/libc.a @@ -606,9 +638,9 @@ test-asm: cross test-venv TEST_OBJCOPY="arm-none-eabi-objcopy"; \ export TEST_CC TEST_COMPARE_CC TEST_OBJDUMP TEST_OBJCOPY; \ if [ "$(USE_VENV)" = "1" ]; then \ - "$(VENV_PY)" -m pytest --tb=short -q -n $(J) .; \ + "$(VENV_PY)" -m pytest --tb=short -q $(PYTEST_XDIST) .; \ else \ - $(PYTEST) --tb=short -q -n $(J) .; \ + $(PYTEST) --tb=short -q $(PYTEST_XDIST) .; \ fi # Check that cross-compilation produces no unexpected warnings or errors. 
@@ -648,13 +680,90 @@ warn-check: armv8m-tcc$(EXESUF) patch-newlib if [ "$$fail" -ne 0 ]; then exit 1; fi @echo "------------ warn-check: passed ------------" +# run frontend coverage tests +# Fast, QEMU-free preprocessor / type-system / diagnostic golden tests. +test-frontend: cross + @echo "------------ frontend tests ------------" + @if [ "$(USE_VENV)" = "1" ]; then \ + cd $(TOP)/tests/frontend && "$(VENV_PY)" -m pytest -q --compiler=$(CROSS_COMPILER); \ + else \ + cd $(TOP)/tests/frontend && $(PYTEST) -q --compiler=$(CROSS_COMPILER); \ + fi + +# run linker/object coverage tests +# Fast, QEMU-free readelf/objdump golden tests. +test-linker: cross + @echo "------------ linker tests ------------" + @if [ "$(USE_VENV)" = "1" ]; then \ + cd $(TOP)/tests/linker && "$(VENV_PY)" -m pytest -q; \ + else \ + cd $(TOP)/tests/linker && $(PYTEST) -q; \ + fi + +# run debug-info coverage tests +# Fast, QEMU-free DWARF/STAB readelf tests. +test-debug: cross + @echo "------------ debug-info tests ------------" + @if [ "$(USE_VENV)" = "1" ]; then \ + cd $(TOP)/tests/debug && "$(VENV_PY)" -m pytest -q; \ + else \ + cd $(TOP)/tests/debug && $(PYTEST) -q; \ + fi + +# run runtime-library coverage tests +# Host-native soft-FP tests plus cross-compiled runtime-helper reference tests. +test-runtime: cross + @echo "------------ runtime-library tests ------------" + @if [ "$(USE_VENV)" = "1" ]; then \ + cd $(TOP)/tests/runtime && "$(VENV_PY)" -m pytest -q --compiler=$(CROSS_COMPILER); \ + else \ + cd $(TOP)/tests/runtime && $(PYTEST) -q --compiler=$(CROSS_COMPILER); \ + fi + +# run self-host bootstrap gate +# Compile-only smoke test always runs; FAT-drive round-trip skips if YasOS env is missing. 
+test-selfhost: cross + @echo "------------ self-host bootstrap gate ------------" + @if [ "$(USE_VENV)" = "1" ]; then \ + cd $(TOP)/tests/selfhost && "$(VENV_PY)" -m pytest -q --compiler=$(CROSS_COMPILER); \ + else \ + cd $(TOP)/tests/selfhost && $(PYTEST) -q --compiler=$(CROSS_COMPILER); \ + fi + # run IR tests via pytest (preferred) -test: cross test-aeabi-host test-asm warn-check test-venv test-prepare download-gcc-tests ut +.PHONY: test-ir +test-ir: cross test-venv test-prepare download-gcc-tests @echo "------------ ir_tests (pytest) ------------" @if [ "$(USE_VENV)" = "1" ]; then \ - cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -s -n $(J) --durations=10; \ + cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -s -v $(PYTEST_XDIST) -m "not golden_ir" --durations=10; \ + else \ + cd $(IRTESTS_DIR) && $(PYTEST) -s -v $(PYTEST_XDIST) -m "not golden_ir" --durations=10; \ + fi + +# container target: runs the full test suite (all test-* targets below) +.NOTPARALLEL: test test-full test-all +test: cross test-aeabi-host test-asm warn-check test-venv test-prepare download-gcc-tests ut test-frontend test-linker test-debug test-runtime test-selfhost test-ir + @echo "------------ test suite complete ------------" + +# Fully sequential test run: disables pytest-xdist too, for the cleanest logs. +.PHONY: test-sequential +test-sequential: + @+$(MAKE) --no-print-directory test J=1 + +# run golden IR snapshot tests explicitly. +# These require a compiler built with CONFIG_TCC_DEBUG because -dump-ir-passes +# is intentionally a debug/diagnostic interface. Set GOLDEN_IR_COMPILER to a +# specific debug binary, or leave it unset to use the runner's fallback search. 
+test-golden-ir: test-venv + @echo "------------ golden IR snapshot tests ------------" + @compiler_arg=""; \ + if [ -x "$(GOLDEN_IR_COMPILER)" ]; then \ + compiler_arg="--compiler $(GOLDEN_IR_COMPILER)"; \ + fi; \ + if [ "$(USE_VENV)" = "1" ]; then \ + cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -s $(PYTEST_XDIST) -m "golden_ir" --require-dump-ir $$compiler_arg test_golden_ir.py; \ else \ - cd $(IRTESTS_DIR) && $(PYTEST) -s -n $(J) --durations=10; \ + cd $(IRTESTS_DIR) && $(PYTEST) -s $(PYTEST_XDIST) -m "golden_ir" --require-dump-ir $$compiler_arg test_golden_ir.py; \ fi # legacy tests (kept for reference) @@ -692,9 +801,9 @@ distclean: clean test-tests2: cross test-venv @echo "------------ tests2 test suite ------------" @if [ "$(USE_VENV)" = "1" ]; then \ - cd $(TOP)/tests && "$(VENV_PY)" run_tests.py --tests2 -v -n $(J); \ + cd $(TOP)/tests && "$(VENV_PY)" run_tests.py --tests2 -v $(PYTEST_XDIST); \ else \ - cd $(TOP)/tests && $(PYTEST) -v -m tests2 --tb=short -n $(J) tests/tests2/; \ + cd $(TOP)/tests && $(PYTEST) -v -m tests2 --tb=short $(PYTEST_XDIST) tests/tests2/; \ fi # download GCC torture tests @@ -711,9 +820,9 @@ test-gcc-torture-compile: cross test-venv test-prepare download-gcc-tests PYTEST_TIMEOUT=""; \ fi; \ if [ "$(USE_VENV)" = "1" ]; then \ - cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -m "gcc_compile" --tb=short -n $(J) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ + cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -m "gcc_compile" --tb=short $(PYTEST_XDIST) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ else \ - cd $(IRTESTS_DIR) && $(PYTEST) -m "gcc_compile" --tb=short -n $(J) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ + cd $(IRTESTS_DIR) && $(PYTEST) -m "gcc_compile" --tb=short $(PYTEST_XDIST) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ fi # run GCC torture execute tests only (via ir_tests framework) @@ -725,9 +834,9 @@ test-gcc-torture-execute: cross test-venv test-prepare download-gcc-tests PYTEST_TIMEOUT=""; \ fi; \ if [ "$(USE_VENV)" = "1" ]; then \ - 
cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -m "gcc_execute" --tb=short -n $(J) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ + cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -m "gcc_execute" --tb=short $(PYTEST_XDIST) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ else \ - cd $(IRTESTS_DIR) && $(PYTEST) -m "gcc_execute" --tb=short -n $(J) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ + cd $(IRTESTS_DIR) && $(PYTEST) -m "gcc_execute" --tb=short $(PYTEST_XDIST) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ fi # run full GCC torture tests (compile + execute via ir_tests framework) @@ -739,9 +848,9 @@ test-gcc-torture: cross test-venv test-prepare download-gcc-tests PYTEST_TIMEOUT=""; \ fi; \ if [ "$(USE_VENV)" = "1" ]; then \ - cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -m "gcc_torture" --tb=short -n $(J) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ + cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -m "gcc_torture" --tb=short $(PYTEST_XDIST) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ else \ - cd $(IRTESTS_DIR) && $(PYTEST) -m "gcc_torture" --tb=short -n $(J) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ + cd $(IRTESTS_DIR) && $(PYTEST) -m "gcc_torture" --tb=short $(PYTEST_XDIST) $$PYTEST_TIMEOUT test_gcc_torture_ir.py; \ fi # run full test suite (IR + GCC torture compile-only) @@ -761,10 +870,22 @@ test-valgrind: ut: $(MAKE) -C tests/unit run +# pipeline pass coverage ledger: compares PASS/PASS_GATED names in +# ir/opt_pipeline.c + SSA_RUN names against UT_COVERS markers and golden-IR +# directories. 89/89 (100%) reached 2026-07-01 (see docs/plan_ut_next_steps.md); +# --strict now hard-fails on any regression. +check-pass-coverage: + @python3 tests/unit/check_pass_coverage.py --strict + +# gcov line/branch coverage report for the unit tests (requires gcovr). +# Renders HTML + text under tests/unit//build/coverage/. 
+ut-coverage: + $(MAKE) -C tests/unit coverage + ut-clean: $(MAKE) -C tests/unit clean -.PHONY: all cross fp-libs clean test test-valgrind test-aeabi-host test-legacy test-tests2 test-gcc-torture test-gcc-torture-compile test-gcc-torture-execute test-full test-all rebuild-newlib download-gcc-tests tar tags ETAGS doc distclean install uninstall ut ut-clean FORCE +.PHONY: all cross fp-libs clean test test-ir test-sequential test-valgrind test-aeabi-host test-legacy test-tests2 test-gcc-torture test-gcc-torture-compile test-gcc-torture-execute test-full test-all test-frontend test-linker test-debug test-runtime test-selfhost test-golden-ir rebuild-newlib download-gcc-tests tar tags ETAGS doc distclean install uninstall ut ut-coverage ut-clean check-pass-coverage FORCE # Container image settings (auto-detect docker or podman) DOCKER_REGISTRY ?= ghcr.io @@ -825,7 +946,11 @@ help: @echo " $(wordlist 1,8,$(TCC_X))" @echo " $(wordlist 9,99,$(TCC_X))" @echo "make test" + @echo " run the full test suite (test-ir + test-asm + warn-check + ut + ...)" + @echo "make test-ir" @echo " rebuild + initialize GCC testsuite + run pytest in tests/ir_tests" + @echo "make test-sequential" + @echo " same as make test, but runs pytest sequentially for clean logs" @echo "make rebuild-newlib" @echo " wipe and rebuild newlib used by ir_tests/qemu (mps2-an505)" @echo "make test-legacy" diff --git a/NEXT_SESSION_PROMPT.md b/NEXT_SESSION_PROMPT.md new file mode 100644 index 00000000..f042a210 --- /dev/null +++ b/NEXT_SESSION_PROMPT.md @@ -0,0 +1,58 @@ +# Next-session prompt — tinycc O1/O2 fuzz miscompile hunt + +Continue the tinycc O1/O2 differential-fuzz miscompile hunt in +`/home/matgla/repos/yasos.zig/libs/tinycc` (branch `heapOverflowBug`). START by reading the +memory file `yasos-tinycc-fuzz-divergence-playbook` (the per-seed investigate→fix→regression-test +loop); it and the per-seed memories auto-load via MEMORY.md. 
+ +## Golden rules +- tcc -O0 is the trusted oracle; ground truth = `gcc -m32 -funsigned-char` (unsigned char, 32-bit long). +- After EVERY compiler edit: `make cross -j$(nproc)`. +- Single repro: copy `seedN.c` into `tests/ir_tests/`, then `python run.py -c seedN.c --cflags="-O1"` + (grep `checksum=`). Reproduce/confirm with `python3 scripts/diff_olevels.py --seed N --require-qemu`. + +## Workflow per seed +1. Reproduce; note failing level + correct O0 checksum. +2. Bisect the culprit pass with `TCC_DISABLE_PASS=<name>` (works for opt-pipeline AND `ssa:` + passes now). Names: `grep PASS_GATED ir/opt_pipeline.c` and the RUN_SSA list in + `ir/regalloc.c` / `ir/opt/ssa_opt.c`. +3. **TRIGGER ≠ ROOT**: the `-fno-*` knob (esp. `-fno-const-prop`) is usually a TRIGGER — a *sound* + pass (const-prop of a genuine constant, a sound DSE) reshapes the IR and exposes a downstream + bug. Several passes "fixing" it when disabled = enablers; keep bisecting to the pass that + CREATES the wrong value. +4. Pinpoint within a pass via a STABLE skip-by-vreg/offset knob (vregs persist across pass + iterations; instruction indices do NOT) + a debug log, then bisect. Proven knobs: + REDKILL_KEEPVAR (redundant_var_assign), CPA_SKIP_DEST (cprop), SR_KEEPOFF (store_redundant), + SLF_SKIP_DEST (sl_forward). SSA phi nodes are NOT shown by `-dump-ir`; dump + `ctx->ssa->block_phis` manually if a phi is involved. +5. **Do NOT printf to isolate the divergent variable** — it perturbs opt and misattributes. Map a + VAR to a source var by its init constant in the earliest IR dump + (`run.py --dump-ir-passes=all --cc-output`). +6. Fix conservatively. Add `tests/ir_tests/NN_fuzz_<name>.c` + `.expect` (the gcc value), register + in `TEST_FILES` in `tests/ir_tests/test_qemu.py`. PROVE fail-unfixed (toggle the fix to `if(0)`), + pass fixed at O0/O1/O2/Os. +7. 
Validate: `python -m pytest test_qemu.py -n 16 -q`; diff_olevels sweep over triage seeds (no NEW + divergences); then `make test -j16` MUST be green (unit tests + self-host gate + IR pytest). + Commit (end message with the Co-Authored-By line). + +## Done this session (committed, all make-test-green) +- `d528cd9d`: 2137 + 8425 (ARM `fuse_store_src_through_add_imm` load hoist across store); + 2657 (`load_cse` runtime stack-indexed store invalidation). Tests 210, 211. +- `020964a3`: 2698 + 5689 + 8300 + 8606 (`cprop_assign` lost-copy into loop back-edge phi). Test 212. +- `ec9128db`: 2874 (`store_redundant` constant-index LOAD_INDEXED read eviction). Test 213. + +## Next target — seed 3210 +`-O1`, O0=`a720d0d4` vs O1/O2=`2c0f55a4`. Read memory `yasos-tinycc-seed3210-slforward-open`. +Localized to `sl_forward` (NOT store_redundant), load T194 / dest 536871106 (sl_forward-time i=198): +skipping its forward fixes it, but the stack-local forward-match never fires there → subtler +mechanism. Re-add SLF_SKIP_DEST, dump IR with the skip (minimal-correct) vs buggy and diff T194's +region; or log every forward-commit site in `tcc_ir_opt_sl_forward` (ir/opt_memory.c) with the dest. + +## Other open seeds (each likely a separate root) +4482, 5656, 6214, 6447, 9403 (`-fno-const-prop` O1); 4193, 4594, 7918 (no knob); +6951 (`-fno-jump-threading`); 8985 (`-fno-loop-unroll`); +8078 (COMPILE_CRASH: `STORE operand produced MACH_OP_NONE`). Triage table: `fuzz_triage_2000_10000.md`. + +## Gotcha +`tests/benchmarks/libs/pico-sdk` is an external checkout — never `git add` it. Commit with +`git add -A -- ':!tests/benchmarks/libs/pico-sdk'`. 
diff --git a/arch/arm/ssa_opt_arm.c b/arch/arm/ssa_opt_arm.c index 53249077..1902a256 100644 --- a/arch/arm/ssa_opt_arm.c +++ b/arch/arm/ssa_opt_arm.c @@ -10,6 +10,7 @@ #define USING_GLOBALS #include "ir.h" +#include "opt_xform.h" #include "ssa_opt.h" #include "ssa_opt_arm.h" @@ -89,13 +90,21 @@ int ssa_gen_arm_fuse_mul_add_to_mla(IRSSAOptCtx *ctx, int instr_idx) return 0; /* Place the MLA at the ADD's position. By SSA dominance, MUL's inputs and - * the accumulator are all defined before the ADD, so this is always valid. - * Placing the MLA at the MUL's position would require the accumulator to - * dominate the MUL — that's the rarer case. */ + * the accumulator are all defined before the ADD, so this is always valid + * for register operands. Placing the MLA at the MUL's position would + * require the accumulator to dominate the MUL — that's the rarer case. */ IROperand add_dest = tcc_ir_op_get_dest(ir, add_q); IROperand mul_src1 = tcc_ir_op_get_src1(ir, mul_q); IROperand mul_src2 = tcc_ir_op_get_src2(ir, mul_q); + /* A MUL source that reads memory would be re-read at the ADD's site; + * any store to that location in between changes the loaded value + * (volatile fuzz seed 5053: `vv11 = st.f0 * u5` before a loop that + * updates st.f0, product consumed after the loop). */ + if ((ir_xform_operand_reads_memory(mul_src1) || ir_xform_operand_reads_memory(mul_src2)) && + (add_q->is_jump_target || !ir_xform_range_preserves_memory(ir, instr_idx, add_idx))) + return 0; + /* Allocate fresh pool space for the MLA's 4 operands (dest, src1, src2, * accum). Reusing the ADD's operand_base would clobber the next * instruction's operands at base+2 and base+3. */ @@ -857,6 +866,46 @@ int ssa_gen_arm_fuse_store_src_through_add_imm(IRSSAOptCtx *ctx, int instr_idx) if (abs_imm > 4095) return 0; + /* Unlike the LOAD variant (which rewrites the load op in place), this fuses + * the deref *source* of a STORE by turning the address-computing ADD into the + * LOAD_INDEXED — i.e. 
the load is RELOCATED upward from this STORE to the + * ADD's definition site. That hoist is only sound when nothing between the + * two positions can write the loaded memory or divert control flow. GVN can + * CSE the address so the defining ADD sits before a later store to the same + * slot (fuzz seed 2137: `arr[i]` read, `arr[i]=v`, then an unrolled re-read of + * arr[i] whose address was CSE'd back to the first read's LEA) — the hoisted + * load would then read the pre-store value. Bail on any intervening memory + * clobber or control-flow op (the latter also restricts the hoist to a single + * straight-line basic block). */ + { + int didx = vi->def_instr; + if (didx >= instr_idx) + return 0; + for (int j = didx + 1; j < instr_idx; j++) { + switch (ir->compact_instructions[j].op) { + case TCCIR_OP_STORE: + case TCCIR_OP_STORE_INDEXED: + case TCCIR_OP_STORE_POSTINC: + case TCCIR_OP_FUNCCALLVAL: + case TCCIR_OP_FUNCCALLVOID: + case TCCIR_OP_BLOCK_COPY: + case TCCIR_OP_INLINE_ASM: + case TCCIR_OP_VLA_ALLOC: + case TCCIR_OP_SETJMP: + case TCCIR_OP_LONGJMP: + case TCCIR_OP_NL_SETJMP: + case TCCIR_OP_NL_LONGJMP: + case TCCIR_OP_JUMP: + case TCCIR_OP_JUMPIF: + case TCCIR_OP_IJUMP: + case TCCIR_OP_SWITCH_TABLE: + return 0; + default: + break; + } + } + } + IROperand lea_dest = tcc_ir_op_get_dest(ir, dq); /* Update btype to match the loaded value (the LEA dest was a pointer-typed * INT32; after fusion it holds the loaded value). 
*/ diff --git a/arm-link.c b/arm-link.c index dd222b2b..db0da90b 100644 --- a/arm-link.c +++ b/arm-link.c @@ -496,10 +496,13 @@ ST_FUNC void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, imm12 = val & 0xfff; imm4 = (val >> 12) & 0xf; x = (imm4 << 16) | imm12; - if (type == R_ARM_THM_MOVT_ABS) - write32le(ptr, read32le(ptr) | x); - else - add32le(ptr, x); + /* The Thumb variants are handled by the separate R_ARM_THM_MOVT_ABS / + R_ARM_THM_MOVW_ABS_NC case below, so `type` here is always one of the + two ARM (A32) relocations -- never R_ARM_THM_MOVT_ABS. A stray + `if (type == R_ARM_THM_MOVT_ABS)` check used to guard this add and was + therefore dead code (see docs/bugs.md #10). add32le matches upstream + tinycc's handling of these relocations. */ + add32le(ptr, x); } return; case R_ARM_MOVT_PREL: diff --git a/arm-thumb-asm.c b/arm-thumb-asm.c index 626a3325..5c74cdcb 100644 --- a/arm-thumb-asm.c +++ b/arm-thumb-asm.c @@ -2891,14 +2891,14 @@ uint32_t thumb_parse_special_register(int token) { return 0x10; } - else if (strstr(buffer, "basepri") != NULL) - { - return 0x11; - } else if (strstr(buffer, "basepri_max") != NULL) { return 0x12; } + else if (strstr(buffer, "basepri") != NULL) + { + return 0x11; + } else if (strstr(buffer, "faultmask") != NULL) { return 0x13; diff --git a/arm-thumb-gen.c b/arm-thumb-gen.c index 791f93df..19eb092c 100644 --- a/arm-thumb-gen.c +++ b/arm-thumb-gen.c @@ -225,6 +225,11 @@ int vararg_push_size = 0; /* bytes pushed for variadic r0-r3 save (16 or 0 * (right below pushed regs), so locals are addressed relative to * allocated_stack_size (without pad): * FP + frame_offset = SP + allocated_stack_size + frame_offset. */ +/* Bytes the real run's scratch PUSHes have currently moved SP below its + * steady-state position (see get_scratch_reg_with_save). Defined after the + * scratch bookkeeping state below. 
*/ +static int scratch_push_sp_bias(void); + static inline int fp_adjust_local_offset(int frame_offset, int is_param) { if (is_param) @@ -233,8 +238,12 @@ static inline int fp_adjust_local_offset(int frame_offset, int is_param) if (!tcc_state->need_frame_pointer && frame_offset <= 0) { /* Convert FP-relative (negative) to SP-relative (positive). - * FP + frame_offset = SP + allocated_stack_size + frame_offset. */ - return allocated_stack_size + frame_offset; + * FP + frame_offset = SP + allocated_stack_size + frame_offset. + * A scratch PUSH inside the current instruction has moved SP down; + * without the bias every access in the push window reads/writes 4 + * bytes low per active push (struct_byval fuzz seed 6105: LDR of a + * by-value field between push {r0} and pop {r0}). */ + return allocated_stack_size + scratch_push_sp_bias() + frame_offset; } if (frame_offset < 0 && callee_push_size > 0) @@ -852,6 +861,21 @@ typedef struct CodeGenDryRunState static CodeGenDryRunState dry_run_state; +/* Bytes the real run's scratch PUSHes have currently moved SP below its + * steady-state position. Derived from the push bookkeeping so it can never + * drift from the actual PUSH/POP pairing (including deferred pops). The dry + * run never emits pushes, so its bias is always 0. */ +static int scratch_push_sp_bias(void) +{ + if (dry_run_state.active) + return 0; + int bias = 0; + for (int i = 0; i < scratch_push_count; i++) + if (scratch_push_type[i] == 1) + bias += 4; + return bias; +} + /* Separate literal pool for dry-run mode to avoid modifying the real pool. * This allows accurate code size tracking without affecting the real pass. 
*/ static ThumbLiteralPoolEntry *dry_run_literal_pool = NULL; @@ -1532,7 +1556,7 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) scratch_save_slot < (ir->scratch_save_size / 4)) { int frame_offset = ir->scratch_save_base + (scratch_save_slot * 4); - int sp_offset = allocated_stack_size + frame_offset; + int sp_offset = allocated_stack_size + scratch_push_sp_bias() + frame_offset; if (!store_word_to_base(reg_to_save, R_SP, sp_offset, 0)) tcc_error("compiler_error: scratch save STR failed (offset %d)", sp_offset); result.reg = reg_to_save; @@ -1601,7 +1625,7 @@ static void restore_scratch_reg(ScratchRegAlloc *alloc) if (scratch_save_slot > 0) scratch_save_slot--; int frame_offset = ir->scratch_save_base + (scratch_save_slot * 4); - int sp_offset = allocated_stack_size + frame_offset; + int sp_offset = allocated_stack_size + scratch_push_sp_bias() + frame_offset; if (!load_word_from_base(alloc->reg, R_SP, sp_offset, 0)) tcc_error("compiler_error: scratch restore LDR failed (offset %d)", sp_offset); alloc->saved = 0; @@ -1661,19 +1685,23 @@ static void restore_all_pushed_scratch_regs(void) return; } - /* Restore in reverse order */ + /* Restore in reverse order. scratch_push_count is trimmed as each entry + * is restored so scratch_push_sp_bias() sees only the still-active pushes + * while emitting the LDRs below. 
*/ for (int i = scratch_push_count - 1; i >= 0; i--) { int reg = scratch_push_stack[i]; - LOG_SCRATCH("auto-restoring r%d (push order %d, type %d)", reg, i, scratch_push_type[i]); - if (scratch_push_type[i] == 2) + int type = scratch_push_type[i]; + LOG_SCRATCH("auto-restoring r%d (push order %d, type %d)", reg, i, type); + scratch_push_count = i; + if (type == 2) { /* Saved to scratch area: restore via LDR */ TCCIRState *ir = tcc_state->ir; if (scratch_save_slot > 0) scratch_save_slot--; int frame_offset = ir->scratch_save_base + (scratch_save_slot * 4); - int sp_offset = allocated_stack_size + frame_offset; + int sp_offset = allocated_stack_size + scratch_push_sp_bias() + frame_offset; if (!load_word_from_base(reg, R_SP, sp_offset, 0)) tcc_error("compiler_error: scratch auto-restore LDR failed (offset %d)", sp_offset); } @@ -2733,6 +2761,26 @@ static void th_literal_pool_reserve_upcoming_bytes(int upcoming_bytes) th_literal_pool_generate(); } +static int th_literal_pool_would_flush_for(int upcoming_bytes) +{ + int pool_count = dry_run_state.active ? dry_run_literal_pool_count : thumb_gen_state.literal_pool_count; + + if (!thumb_gen_state.generating_function || pool_count == 0) + return 0; + + return thumb_gen_state.code_size + pool_count * 4 + upcoming_bytes >= 1020; +} + +/* Count of conditioned instructions still pending inside an IT/ITE/... block, + * tracked by ot() purely for literal-pool flush suppression. Kept separate + * from mov_equiv_it_pending, which mov_equiv_reset_all() may zero mid-block. + * While this is non-zero a pool flush would land INSIDE the IT block: the + * flush emits its pool + B.W skip-branch BEFORE the bytes of the op being + * emitted, so the branch would occupy a conditioned slot, inherit the IT + * condition, and the opposite arm would fall through into pool data and + * execute it (fuzz ptr seed 5759: O2 HardFault). 
*/ +static int pool_flush_it_pending; + int is_valid_opcode(thumb_opcode op) { return (op.size == 2 || op.size == 4); @@ -2904,15 +2952,6 @@ int ot(thumb_opcode op) if (op.size == 0) return op.size; - /* DEBUG: emit-stream trace for the 90_struct miscompile. Same compiler + - * identical stable allocation ⇒ device and QEMU emit identical opcode streams - * up to the silicon-divergent branch; diffing this trace pinpoints the first - * differing emitted instruction (and its IR index). Real-run only. */ - if (!dry_run_state.active && funcname && - !strcmp((const char *)funcname, "test_init_struct_from_struct") && tcc_state && tcc_state->ir) - fprintf(stderr, "EMIT i=%d ind=0x%x op=0x%x sz=%d\n", tcc_state->ir->codegen_instruction_idx, (unsigned)ind, - (unsigned)op.opcode, op.size); - /* Detect instructions that write to R9 when it's reserved for GOT pointer. * Exclude push/pop/stmdb/ldmia which legitimately save/restore R9. */ if (text_and_data_separation && !allow_r9_write) @@ -3068,6 +3107,36 @@ int ot(thumb_opcode op) imm_cache_reset_all(); } + /* Literal-pool flush safety around IT blocks. Call-site reservations + * (th_literal_pool_reserve_upcoming_bytes) cover the block's CODE bytes, + * but a conditioned arm that materializes a large constant + * (load_full_const) grows the pool AFTER the reservation was checked, so + * the threshold can still trip mid-block. Track the architectural IT + * window here and (a) never flush while an op is conditioned, (b) flush + * BEFORE the IT opcode itself if the worst-case block — 4 code bytes plus + * an 8-byte pool entry per conditioned instruction — could hit the + * threshold, so the deferred flush of (a) never overshoots the LDR-literal + * range. Runs in both passes so dry-run and real layouts stay identical. 
*/ + int op_in_it_block = 0; + if (thumb_gen_state.generating_function) + { + if (pool_flush_it_pending > 0) + { + op_in_it_block = 1; + pool_flush_it_pending--; + } + else + { + int it_len = mov_equiv_it_block_length(op); + if (it_len > 0) + { + if (thumb_gen_state.code_size + op.size + thumb_gen_state.literal_pool_count * 4 + 12 * it_len >= 1020) + th_literal_pool_generate(); + pool_flush_it_pending = it_len; + } + } + } + /* Dry run: don't emit actual opcodes, but still track code size and * handle literal pool generation to ensure code addresses match real pass. */ if (dry_run_state.active) @@ -3080,7 +3149,7 @@ int ot(thumb_opcode op) * code size including the literal pool, so that ind matches * between dry-run and real pass. */ const int max_offset = thumb_gen_state.code_size + thumb_gen_state.literal_pool_count * 4; - if (max_offset >= 1020) + if (max_offset >= 1020 && !op_in_it_block) { th_literal_pool_generate(); } @@ -3095,7 +3164,7 @@ int ot(thumb_opcode op) thumb_gen_state.code_size += op.size; // 16-bit encoding for ldr should be efficient const int max_offset = thumb_gen_state.code_size + thumb_gen_state.literal_pool_count * 4; - if (max_offset >= 1020) + if (max_offset >= 1020 && !op_in_it_block) { th_literal_pool_generate(); } @@ -3744,6 +3813,14 @@ ST_FUNC int tcc_gen_machine_try_strd_imm_spill(int64_t val1, int64_t val2, return 0; MachineCodegenContext ctx = {0}; + /* Materializing the immediates may PUSH the scratch register(s) when FP is + * omitted and no scratch-save area is reserved, lowering SP by 4 per push. + * The STRD destination is SP-relative, so an uncompensated offset would write + * the pair 4*pushes bytes below the intended slot — the array/struct + * initializer then lands at the wrong offset and later reads return stale + * data (fuzz seed 12057). Snapshot the push stack so we can measure the SP + * shift after acquiring the registers and fold it into the offset. 
*/ + int spc_before = scratch_push_count; MachineOperand op1 = {.kind = MACH_OP_IMM, .u.imm.val = val1}; int r1 = mach_ensure_in_reg(&ctx, &op1, 0); int r2; @@ -3757,6 +3834,25 @@ ST_FUNC int tcc_gen_machine_try_strd_imm_spill(int64_t val1, int64_t val2, mach_release_all(&ctx); return 0; } + /* Account for any real SP-lowering pushes (type 1) done above. Saves routed + * to a reserved scratch area (type 2) keep SP stable and need no adjustment. + * The shift only affects an SP-relative base; an FP base is unperturbed. */ + if (base_reg == R_SP) { + int sp_shift = 0; + for (int s = spc_before; s < scratch_push_count && s < 128; s++) + if (scratch_push_type[s] == 1) + sp_shift += 4; + if (sp_shift) { + /* Only the positive (above-SP) local case is safe to compensate by simple + * addition; a negative (below-SP) offset combined with the shift is rare + * and not worth special-casing — fall back to per-element stores. */ + if (sign || abs_off + sp_shift > 1020) { + mach_release_all(&ctx); + return 0; + } + abs_off += sp_shift; + } + } const uint32_t puw = sign ? 4u : 6u; ot_check(th_strd_imm((uint32_t)r1, (uint32_t)r2, (uint32_t)base_reg, abs_off, puw)); mach_release_all(&ctx); @@ -5505,6 +5601,31 @@ static void thumb_emit_data_processing_mop32(const MachineOperand *src1, const M } } + /* Shift-by-0 identity: on ARM, LSR/ASR with immediate field 0 means + * shift-by-32 (yielding 0 / sign-extend), NOT shift-by-0. Fold x >> 0 + * to a plain MOV Rd, Rm so the semantics are correct regardless of + * whether the optimizer managed to simplify the IR. */ + if (!dest_sets_flags && barrel_shift == 0 && + (op == TCCIR_OP_SHR || op == TCCIR_OP_SAR || op == TCCIR_OP_ROR) && + src2->kind == MACH_OP_IMM && !src2->needs_deref && !src2->is_64bit && + (uint32_t)src2->u.imm.val == 0) + { + int dest_reg = mach_get_dest_reg(&mctx, dest, 0); + uint32_t excl = thumb_is_hw_reg(dest_reg) ? 
(1u << (uint32_t)dest_reg) : 0; + int src1_reg = mach_ensure_in_reg(&mctx, src1, excl); + ot_check_mov_reg((uint32_t)dest_reg, (uint32_t)src1_reg, flags, + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + if (dest->kind != MACH_OP_NONE) + { + const bool needs_wb = dest->kind == MACH_OP_SPILL || dest->kind == MACH_OP_PARAM_STACK || + (dest->kind == MACH_OP_REG && (dest->needs_deref || dest->u.reg.r0 == (int)PREG_REG_NONE)); + if (needs_wb) + mach_writeback_dest(dest, dest_reg); + } + mach_release_all(&mctx); + return; + } + /* UXTB/UXTH fast path: AND with #0xFF or #0xFFFF → UXTB/UXTH. * 16-bit encoding (2 bytes) vs 32-bit AND immediate (4 bytes). */ if (op == TCCIR_OP_AND && !dest_sets_flags && barrel_shift == 0 && @@ -5863,6 +5984,14 @@ static void mach_mod_mop(MachineCodegenContext *ctx, const MachineOperand *src1, int dest_reg = mach_get_dest_reg(ctx, dest, 0); uint32_t excl = thumb_is_hw_reg(dest_reg) ? (1u << (uint32_t)dest_reg) : 0; + /* Pre-exclude src2's physical register so that materializing src1 (which may + * need a scratch when it is an immediate or a deref) does not clobber src2's + * value before the divide reads it — same guard as mach_regonly_binop_mop. + * Without it an immediate dividend's scratch load could land on the divisor's + * register (random-C O1 wrong-code, seed 151: `K % (lr|1)` divisor clobbered). */ + if (src2->kind == MACH_OP_REG && !src2->needs_deref && thumb_is_hw_reg(src2->u.reg.r0)) + excl |= (1u << (uint32_t)src2->u.reg.r0); + /* 2. Ensure src1 in a register. */ int src1_reg = mach_ensure_in_reg(ctx, src1, excl); if (thumb_is_hw_reg(src1_reg)) @@ -6580,18 +6709,23 @@ ST_FUNC void tcc_gen_machine_mla_mop(MachineOperand src1, MachineOperand src2, M /* Pre-exclude registers directly referenced by REG operands so that scratch * allocations for other operands (e.g. immediates) cannot clobber them. 
+ * A dereferenced operand's r0 is its POINTER register — it must survive + * until that operand's load is emitted, so it is excluded exactly like a + * plain value register (ptr fuzz seed 59549: src2's spill reload picked the + * deref-accumulator's pointer register as scratch, and the accumulator then + * dereferenced the just-loaded multiplicand value → wild-address fault). * The pre-allocated DEST register must be excluded too: if a source load * grabs it as a saved scratch (push/pop), the restoring pop after the MLA * overwrites the just-computed result. */ uint32_t live_regs = 0; - if (src1.kind == MACH_OP_REG && !src1.needs_deref) + if (src1.kind == MACH_OP_REG && src1.u.reg.r0 >= 0 && src1.u.reg.r0 < 16) live_regs |= (1u << (uint32_t)src1.u.reg.r0); - if (src2.kind == MACH_OP_REG && !src2.needs_deref) + if (src2.kind == MACH_OP_REG && src2.u.reg.r0 >= 0 && src2.u.reg.r0 < 16) live_regs |= (1u << (uint32_t)src2.u.reg.r0); - if (accum.kind == MACH_OP_REG && !accum.needs_deref) + if (accum.kind == MACH_OP_REG && accum.u.reg.r0 >= 0 && accum.u.reg.r0 < 16) live_regs |= (1u << (uint32_t)accum.u.reg.r0); - if (dest.kind == MACH_OP_REG && !dest.needs_deref && - dest.u.reg.r0 != (int)PREG_REG_NONE) + if (dest.kind == MACH_OP_REG && + dest.u.reg.r0 != (int)PREG_REG_NONE && dest.u.reg.r0 >= 0 && dest.u.reg.r0 < 16) live_regs |= (1u << (uint32_t)dest.u.reg.r0); int src1_reg = mach_ensure_in_reg(&ctx, &src1, live_regs); @@ -6757,6 +6891,13 @@ ST_FUNC int tcc_gen_machine_mlal_accum_mop(MachineOperand src1, MachineOperand s s2.is_64bit = false; uint32_t excl = (1u << (uint32_t)rd_lo) | (1u << (uint32_t)rd_hi); + /* Pre-exclude both sources' registers (a deref operand's r0 is its pointer + * register) so ensuring one source cannot grab the other's register as a + * spill-reload scratch — same clobber class as tcc_gen_machine_mla_mop. 
*/ + if (s1.kind == MACH_OP_REG && s1.u.reg.r0 >= 0 && s1.u.reg.r0 < 16) + excl |= (1u << (uint32_t)s1.u.reg.r0); + if (s2.kind == MACH_OP_REG && s2.u.reg.r0 >= 0 && s2.u.reg.r0 < 16) + excl |= (1u << (uint32_t)s2.u.reg.r0); int rn = mach_ensure_in_reg(&ctx, &s1, excl); if (thumb_is_hw_reg(rn)) excl |= (1u << (uint32_t)rn); @@ -7088,8 +7229,13 @@ ST_FUNC void tcc_gen_machine_setif_mop(MachineOperand src, MachineOperand dest, uint32_t excl = thumb_is_hw_reg(lo_reg) ? (1u << (uint32_t)lo_reg) : 0u; int hi_reg = mach_get_dest_reg(&mctx, &dst_hi, excl); - /* Emit ITE sequence for lo word. */ - th_literal_pool_reserve_upcoming_bytes(6); + /* Emit ITE sequence for lo word. Reserve the WHOLE atomic ITE+movs block so + * a literal-pool flush never lands between the IT and its conditioned movs. + * A high register (R8-R12) dest forces the 4-byte mov.w (T2) encoding, so the + * worst case is ITE(2) + 3*mov.w(4) = 14 bytes — NOT 6 (which only covers the + * 2-byte movs of a low-reg dest). Under-reserving split the ITE and ran the + * fall-through into the literal pool (seed 89 O1 HardFault). */ + th_literal_pool_reserve_upcoming_bytes(14); ot_check(th_it(cond, ite_mask)); /* ITE — two conditioned instructions */ ot_check(th_mov_imm(lo_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); ot_check(th_mov_imm(lo_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); @@ -7103,7 +7249,11 @@ ST_FUNC void tcc_gen_machine_setif_mop(MachineOperand src, MachineOperand dest, { int dest_reg = mach_get_dest_reg(&mctx, &dest, 0); - th_literal_pool_reserve_upcoming_bytes(6); + /* Reserve the whole ITE+2-movs block: a high-register dest (R8-R12) uses the + * 4-byte mov.w (T2) encoding, so the worst case is ITE(2) + 2*mov.w(4) = 10 + * bytes, not 6. Under-reserving let a literal-pool flush split the ITE and + * run the fall-through into the pool (seed 89 O1 HardFault). 
*/ + th_literal_pool_reserve_upcoming_bytes(10); ot_check(th_it(cond, ite_mask)); /* ITE — two conditioned instructions */ ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); @@ -9408,6 +9558,7 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s /* MOV-coalescing cache is per-function: register live ranges don't * cross function boundaries. */ mov_equiv_reset_all(); + pool_flush_it_pending = 0; TCCIRState *ir = tcc_state->ir; /* Determine if LR needs saving */ @@ -12000,6 +12151,13 @@ static int can_narrow_backward_branch(int32_t target_ir, int is_conditional, int if (offset >= 0) return 0; + /* If emitting the narrow branch would first flush a pending literal pool, + * the branch source moves forward after this range check. A borderline + * T1/T2 branch can become out of range by the time backpatching runs, and + * th_patch_call() cannot widen an already-emitted 16-bit branch in place. */ + if (th_literal_pool_would_flush_for(2)) + return 0; + return is_conditional ? branch_fits_t1(offset) : branch_fits_t2(offset); } @@ -12399,7 +12557,7 @@ ST_FUNC void tcc_gen_machine_block_copy_mop(TCCIRState *ir, IROperand dest, IROp * Compute dest address into r0 BEFORE pushing lr, since the address is * sp-relative and pushing changes sp. The BL to memcpy clobbers lr, * so we must save/restore it for leaf functions whose prologue didn't. */ - if (size >= 64) + if (size >= TCCIR_BLOCK_COPY_MEMCPY_MIN_BYTES) { tcc_machine_addr_of_stack_slot(R0, frame_offset, 0 /* not param */); tcc_machine_load_constant(R1, PREG_REG_NONE, symref->addend, 0, sym); diff --git a/bisect_pass.sh b/bisect_pass.sh new file mode 100755 index 00000000..c941be08 --- /dev/null +++ b/bisect_pass.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Bisect which TCC_DISABLE_PASS fixes each failing seed. 
+# Usage: ./bisect_pass.sh SEED OLEVEL GOOD (SEED: test source path relative to the repo root; OLEVEL: cflags passed to run.py, e.g. -O2; GOOD: hex digits of the known-good checksum) +SEED="$1" +OLEVEL="$2" +GOOD="$3" + +cd tests/ir_tests || exit 1 + +run() { + TCC_DISABLE_PASS="$1" python run.py -c "../../$SEED" --cflags="$OLEVEL" 2>/dev/null | grep -o 'checksum=[0-9a-f]*' +} + +PASSES="uninit_ub uninit_dom_ret dce const_prop const_var_prop global_init symref_prop global_sl_fwd const_prop_tmp const_agg_fold known_bits neg_chain_cse add_reassoc redundant_assign string_calls self_copy_elim value_tracking cmp_expr_fold self_arith cmp_offset_fold branch_fold switch_collapse stack_nonnull setif_fuse stack_bool or_bool setif_or_taut var_tmp_fwd var_to_tmp nonneg_fold float_branch vrp single_val_tmp float_narrow deref_fwd fusion_mla deref_indexed disp_fusion copy_prop chain_fold pair_reorder postinc bool_simplify sl_forward bf_insert_extract cmp_field_fuse const_cascade branch_fold_2x jump_thread elim_fallthru kb_cascade branch_cleanup dead_vla_struct alloca_load_fwd zero_vla byte_store_merge store_redundant dse dead_static_store dead_var_store dead_addrvar dead_trail_addrvar dead_alloca_vreg dead_local_slot dead_lea_store dead_temp_local inplace_arith global_base_share orphan_cmp inf_loop_simpl dead_pre_inf return_reuse entry_store esp_cleanup" + +echo "=== baseline ($OLEVEL, no disable) ===" +NONE=$(run "") +echo "$NONE (good=$GOOD)" + +echo "=== bisecting passes ===" +for p in $PASSES; do + R=$(run "$p") + if [ "$R" != "$NONE" ]; then + MATCH="" + [ "$R" = "checksum=$GOOD" ] && MATCH=" *** FIXES (matches good) ***" + echo "$p -> $R$MATCH" + fi +done diff --git a/bugs/01-const-prop-tmp-missing-divmod-folds.md b/bugs/01-const-prop-tmp-missing-divmod-folds.md deleted file mode 100644 index 51e45e3b..00000000 --- a/bugs/01-const-prop-tmp-missing-divmod-folds.md +++ /dev/null @@ -1,56 +0,0 @@ -# 01 — `const_prop_tmp` does not fold IMOD/UMOD/DIV/UDIV/PDIV - -**Status:** FIXED in this branch ([ir/opt_constprop.c:4340-4378](../ir/opt_constprop.c#L4340-L4378)) -**Severity:** Medium — blocks bigger cascades, not a miscompile. 
- -## Symptom - -`const_prop_tmp`'s two-immediate fold table in [ir/opt_constprop.c:4294-4353](../ir/opt_constprop.c#L4294-L4353) covers -`ADD`/`SUB`/`AND`/`OR`/`XOR`/`SHL`/`SHR`/`SAR`/`ROR`/`MUL`/`UMULL`/`UBFX` -but **not** integer division/remainder. After propagation, an op like -`T11 <-- #-13 IMOD #61` stays in the IR with both operands as immediates -and never folds to `T11 <-- ASSIGN #-13`. - -## Repro - -`tests/gcctestsuite/.../gcc.c-torture/execute/bitfld-1.c` at `-O2`. The -"AFTER LOOP ROTATION" dump shows: - -``` -0008: T11 <-- #-13 IMOD #61 -0009: CMP T11,#-13 -0010: JMP to 13 if "==" -0011: FUNCPARAMVOID FUNCPARAMVOID #131072 -0012: CALL GlobalSym(1137) CALL #131072 ; abort() -``` - -`T11` should fold to `#-13`, `CMP` to a tautology, JMP to unconditional, -and `abort()` to dead code that DCE removes. - -## Why it matters - -Beyond the static fold itself, this stalls **all downstream cascades**: -the `CALL abort()` between a stack STORE and a later stack read keeps -`sl_forward` from forwarding the stored value (it conservatively assumes -a call may clobber memory). Without the fold, the call stays, and the -read-after-store chain never collapses. - -## Fix - -Extend the fold switch with: - -```c -case TCCIR_OP_DIV: -case TCCIR_OP_PDIV: -case TCCIR_OP_UDIV: -case TCCIR_OP_IMOD: -case TCCIR_OP_UMOD: -``` - -each handling `v2 == 0` (and `INT64_MIN / -1` for the signed variants) by -setting `ok = 0` so the fold is skipped on UB inputs. - -## Related - -- [[02]] — without `known_bits`, the operands of these IMOD/UMODs would never *become* both-immediate in the first place. Both bugs together gate the bitfld-1 cascade. -- [[04]] — even after this fold fires, the downstream cleanup needs the pipeline to keep iterating. 
diff --git a/bugs/02-shl-shr-fold-unequal-amounts.md b/bugs/02-shl-shr-fold-unequal-amounts.md deleted file mode 100644 index 87317956..00000000 --- a/bugs/02-shl-shr-fold-unequal-amounts.md +++ /dev/null @@ -1,59 +0,0 @@ -# 02 — `SHL N → SHR M` peephole only handles `N == M` - -**Status:** WORKED AROUND via [ir/opt_knownbits.c](../ir/opt_knownbits.c) -**Severity:** Medium — large class of missed folds on bitfield reads. - -## Symptom - -The peephole at [ir/opt_constprop.c:1436-1475](../ir/opt_constprop.c#L1436-L1475) handles only the -byte-/half-cast pattern `SHL #N → SHR #N → AND #mask`: - -```c -if (shl_amt != shr_amt || shl_amt <= 0 || shl_amt >= 32) - continue; -``` - -The bitfield-extract idiom uses **unequal** amounts: - -- 7-bit unsigned bitfield at bit position 7: `SHL #18 → SHR #25` -- 7-bit signed bitfield at bit position 0: `SHL #25 → SAR #25` - -These never collapse. They also can't be folded by `const_prop_tmp` alone -because the source value usually isn't fully constant — only specific bit -ranges are (from a preceding `(x AND mask) OR const` insert). - -## Repro - -bitfld-1's chain after the insert sequence: - -``` -T5 = (...) OR #115 ; bits 0..6 = 115 (= -13 in 7b sign) -T9 = T5 SHL #18 -T10 = T9 SHR #25 ; expect: bits 7..13 of T5 = 61 -T14 = T5 SHL #25 -T15 = T14 SAR #25 ; expect: bits 0..6 sign-ext = -13 -``` - -`const_prop` can fold neither chain. The whole abort-test ladder stays alive. - -## Workaround - -Added [ir/opt_knownbits.c](../ir/opt_knownbits.c) — a known-bits lattice (per-temp -and per-stack-slot `known_zero`/`known_one` masks). It propagates through -`AND`/`OR`/`XOR`/`SHL`/`SHR`/`SAR` and rewrites the op to `ASSIGN imm` -when all 32 bits become known. This covers the bitfield extract because -the relevant bits of `T5` are forced known by the preceding inserts even -though `T5`'s full value is not. 
- -## A simpler, narrower alternative - -For the unequal-shift peephole alone, generalize the existing fold: -when `shl_amt <= shr_amt`, replace with `(x >> (M - N)) & ((1 << (32 - M)) - 1)` -(`SHR` + `AND`). This won't help when the source value is partially known -but not constant — the cascade still needs known-bits — so the workaround -went the more general route. - -## Related - -- [[01]] — even when known_bits folds the SHL/SHR chain to a constant, the downstream IMOD needs the IMOD fold to also fire. -- [[04]] — and the resulting dead `abort()` call needs the pipeline to iterate so `sl_forward` can forward the stack store to subsequent reads. diff --git a/bugs/03-dead-local-slot-missing-lea-deref.md b/bugs/03-dead-local-slot-missing-lea-deref.md deleted file mode 100644 index 455fdde6..00000000 --- a/bugs/03-dead-local-slot-missing-lea-deref.md +++ /dev/null @@ -1,79 +0,0 @@ -# 03 — `dead_local_slot_elim` ignores STOREs via LEA temp deref - -**Status:** FIXED in this branch via new pass [ir/opt_dead_lea_store.c](../ir/opt_dead_lea_store.c) -**Severity:** Medium — leaves dead bitfield writes after upstream chains collapse. - -## Symptom - -`dead_local_slot_elim` ([ir/opt_memory.c:4406-4441](../ir/opt_memory.c#L4406-L4441)) -only NOPs STOREs whose `dest` operand is a **direct** `StackLoc[X]` form: - -```c -if (q->op != TCCIR_OP_STORE) continue; -IROperand dest = tcc_ir_op_get_dest(ir, q); -if (irop_get_tag(dest) != IROP_TAG_STACKOFF) continue; -if (!dest.is_local || irop_get_vreg(dest) != -1) continue; -``` - -It silently skips the equally common temp-deref form: - -``` -T0 <-- Addr[StackLoc[-4]] -T0***DEREF*** <-- T2 [STORE] -``` - -The `live[]` collection at [ir/opt_memory.c:4273-4342](../ir/opt_memory.c#L4273-L4342) has the same -asymmetry — temp-deref reads aren't registered either, so even the -elimination logic that *does* fire is working from an incomplete picture -of which slots are live. 
- -## Repro - -bitfld-1 after the [[02]] workaround folds all the bitfield extractors — -the IR collapses to just the two bitfield-insert STOREs: - -``` -0007: R0(T3)***DEREF*** <-- R2(T5) [STORE] ; never read again -0008: RETURNVALUE #0 -``` - -`dead_local_slot_elim` walks past those STOREs (dest tag != STACKOFF), -the stack frame stays, the bitfield computation stays. Final size: -15 instructions vs GCC's 2. - -## Fix - -New pass [ir/opt_dead_lea_store.c](../ir/opt_dead_lea_store.c): - -1. Identify single-def TEMPs whose RHS is `Addr[StackLoc[Y]]` - (single-def required so the slot mapping is stable; lval dests are - skipped from the def count — that's the gotcha from [[05]]). -2. Resolve both STORE dests and lval-source reads through that map, - so the temp-deref form participates in liveness. -3. Eliminate a STORE whose byte range is never read by a later instruction. - -Conservative bails: any IJUMP / SETJMP / INLINE_ASM / VLA in the function, -any non-mem* CALL, any escape of the address to a VAR/PARAM or untracked -TEMP, any mem* `PARAM1` (the source side) with unknown size or unknown -source. The existing `dead_local_slot_elim` does similar tameness work -for the direct-stack-ref form — extending its 1500-line implementation -to also recognize the temp-deref shape was deemed higher risk than a -narrower companion pass. - -## Why both passes? - -The two forms cover different upstream sources: - -- Direct `STORE StackLoc[X]` form arises after `sl_forward` canonicalizes - a `LEA + STORE T_DEREF` pair — `dead_local_slot_elim` handles these. -- Temp-deref `STORE T0_DEREF` form survives when `sl_forward` doesn't - canonicalize (the LEA temp is reused, has multi-use shape, etc.). - The new pass handles these. - -A future refactor could unify both into one pass with a slot-resolver -helper, but the current split keeps each pass small and obviously sound. 
- -## Related - -- [[02]] — without `known_bits` the downstream reads of the slot don't go away, so this pass would correctly leave the STOREs alive. -- [[05]] — gotcha that bit the first attempt at this pass. diff --git a/bugs/04-memory-pipeline-trigger-stall.md b/bugs/04-memory-pipeline-trigger-stall.md deleted file mode 100644 index ebf7879c..00000000 --- a/bugs/04-memory-pipeline-trigger-stall.md +++ /dev/null @@ -1,86 +0,0 @@ -# 04 — `memory_passes` group stalls when its trigger returns 0 mid-cascade - -**Status:** WORKED AROUND via the `kb_cascade` compound pass in [ir/opt_pipeline.c](../ir/opt_pipeline.c) -**Severity:** Medium — limits how far a single pipeline run can drive a chain reaction. - -## Symptom - -`pipeline_run_group` ([ir/opt_pipeline.c:63-118](../ir/opt_pipeline.c#L63-L118)) iterates a pass -group until the *trigger* pass returns 0: - -```c -if (group->trigger_idx >= 0) { - int tch = trigger->run(ctx); - ... - if (tch <= 0) break; -} -``` - -The `memory_passes` group uses `sl_forward` as its trigger -([ir/opt_pipeline.c:220-232](../ir/opt_pipeline.c#L220-L232)). Once `sl_forward` exhausts the -*currently visible* forwarding opportunities, the group exits — even if -other passes in the group (or future iterations) would create new -opportunities for it. - -## Repro - -bitfld-1, iteration 1 of `memory_passes`: - -1. `sl_forward` — forwards stored value into the *first* chain's - re-read. Returns >0. Group continues. -2. `const_cascade`, `known_bits`, `branch_fold_2x`, `dce`, - `elim_fallthru` — together they fold the first chain, kill its - `abort()`, NOP the now-trivial JMP-to-next. - -Iteration 2: - -3. `sl_forward` re-runs on the cleaned-up IR. With the `abort()` call - gone, it *could now* forward the stack store across to the **next** - chain's read. 
But its analysis returns 0 because the changes from - step 2 haven't been re-discovered as new forwarding sites in this - iteration's pre-scan, **or** sl_forward's incremental check decides - there's nothing new. Group exits. The other three chains never fold. - -End state: only the first of four `abort()` chains is eliminated. - -## Workaround - -A compound pass `kb_cascade` ([ir/opt_pipeline.c:150-169](../ir/opt_pipeline.c#L150-L169)) loops the -relevant subset internally to a fixed point: - -```c -for (int i = 0; i < 8; i++) { - ch += tcc_ir_opt_known_bits(ir); - ch += tcc_ir_opt_const_prop_tmp(ir); - ch += tcc_ir_opt_branch_folding(ir); - tcc_ir_opt_dce(ir); - ch += tcc_ir_opt_eliminate_fallthrough(ir); - tcc_ir_opt_compact_nops(ir); - ch += tcc_ir_opt_sl_forward(ir); - if (!ch) break; -} -``` - -It's added at the end of `memory_passes`. With this, all four bitfld-1 -chains cascade in a single pipeline step. - -## Better fix (deferred) - -The trigger mechanism is a useful optimization (skip the group when -nothing's primed it), but it should be triggered by *any* pass returning -> 0, not specifically the indexed trigger. Two options: - -1. Change `pipeline_run_group` to compute `round_changes` from the full - group and re-iterate while `round_changes > 0`, falling back to the - trigger only as a first-iteration gate. -2. Promote `sl_forward` out of the trigger slot, run the group based on - `round_changes` like the trigger-less groups already do. - -Either change affects every group, so it needs a wider sweep to verify -no group depends on the early-exit behavior. The narrow `kb_cascade` -workaround sidesteps that risk. - -## Related - -- [[02]] — the cascade only matters because `known_bits` *can* fold the chain heads; the trigger stall hid that we needed to. -- [[01]] — the chain head's IMOD fold is what creates the dead `abort()` whose removal lets `sl_forward` continue. 
diff --git a/bugs/05-var-param-stackoff-encoding.md b/bugs/05-var-param-stackoff-encoding.md deleted file mode 100644 index 32ab33d8..00000000 --- a/bugs/05-var-param-stackoff-encoding.md +++ /dev/null @@ -1,73 +0,0 @@ -# 05 — VAR/PARAM operands carry `tag=STACKOFF` for their spill slot - -**Status:** DOCUMENTED (footgun, not a bug per se) -**Severity:** Low for existing code; High for new pass authors. - -## What surprised me - -When a VAR or PARAM is referenced via its potential stack-spill encoding, -the operand has: - -- `tag == IROP_TAG_STACKOFF` -- `is_local == 1` -- `is_lval == 1` -- `vreg_type != 0` (the originating VAR/PARAM index) -- `u.imm32` = the spill-slot offset (which may collide with offsets of - real, distinct stack allocations) - -This is **indistinguishable** from a real direct stack reference like -`StackLoc[-4]` (which has `vreg_type == 0`) on every field *except* -`vreg_type`. - -A new pass that filters operands with: - -```c -if (op.tag == IROP_TAG_STACKOFF && op.is_local && op.is_lval) { /* stack ref */ } -``` - -will silently treat a VAR's spill encoding as if it were a real slot. -If the pass also tracks per-stack-slot state (e.g. known-bits) and a -real STORE happens to write the *same offset*, it will load that state -when the VAR is read — and miscompile. - -## How it bit me - -`opt_knownbits.c`'s first cut treated `tag=STACKOFF, is_lval, is_local` -as a direct stack read. On -`tests/.../gcc.c-torture/execute/20040313-1.c`, a `V0` variable holding -`d = 0` was encoded as `StackLoc[-4100], vreg_type=VAR, pos=0`. The -array `t[1025]` happened to start at the same offset `-4100`, with -`t[0] = 1024` stored to it shortly before `d`'s read. The pass loaded -the `t[0]` known-bits value (1024) as if it were `d`'s value, computed -`d << 2 = 4096`, and folded that into a downstream address — turning -`t[d=0]` into `t[1024]`. Tests that depended on `d == 0` corrupted at -runtime. 
- -## Suggested check for new passes - -When treating a `STACKOFF` operand as a real stack slot reference: - -```c -if (op.tag == IROP_TAG_STACKOFF && op.is_local && op.is_lval && - op.vreg_type == 0) /* MUST: no vreg attached */ -{ - /* genuine direct StackLoc[X] ref */ -} -``` - -`vreg_type == 0` (no vreg) is the only encoding for a true direct stack -reference. Anything else is a vreg-backed pseudoreg whose offset field -is metadata about *where it would spill*, not where the program reads -from. - -## Where this would help - -A short comment in [tccir_operand.h](../tccir_operand.h) at the IROperand definition -documenting this case would have saved hours. The existing -`dead_local_slot_elim` already gets it right (it filters -`irop_get_vreg(op) != -1`), but the convention isn't called out -anywhere I could find. - -## Related - -- [[03]] — the same encoding gotcha affects the new dead-LEA-store pass; it uses the same `vreg_type == 0` guard. diff --git a/bugs/06-tu-summary-store-indexed-is-lval.md b/bugs/06-tu-summary-store-indexed-is-lval.md deleted file mode 100644 index 0d8275ee..00000000 --- a/bugs/06-tu-summary-store-indexed-is-lval.md +++ /dev/null @@ -1,56 +0,0 @@ -# 06 — `collect_tu_func_summary` missed STORE_INDEXED / STORE_POSTINC writes when `is_lval` was cleared - -**Status:** FIXED in this branch ([ir/opt.c:822-844](../ir/opt.c#L822-L844)) -**Severity:** Medium — silently prevented end-of-TU dead-static-store elimination. - -## Symptom - -`tcc_ir_collect_tu_func_summary` recorded a write to a static global only -when the STORE dest carried both `is_sym=1` and `is_lval=1`: - -```c -if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || - q->op == TCCIR_OP_STORE_POSTINC) { - IROperand dest = tcc_ir_op_get_dest(ir, q); - if (dest.is_sym && dest.is_lval) { ... 
} // <-- too strict -} -``` - -But `disp_fusion` may have cleared `is_lval` on the *base* operand of -`STORE_INDEXED` / `STORE_POSTINC` (see comment in [ir/opt_fusion.c:1925-1928](../ir/opt_fusion.c#L1925-L1928): *"disp_fusion clears -is_lval on STORE_INDEXED's base, so the is_lval test alone would -mis-classify it as a redef"*). Result: writes to a static global through -an indexed/postinc form were silently dropped from the summary. - -## Repro - -`tests/gcctestsuite/.../gcc.c-torture/compile/pr25483.c` at `-O2`. -`decode_init` writes `mdct_win[j] = (int)(d * 3)` inside a loop. After -fusion, the IR contains: - -``` -0019: GlobalSym(1182) <-- R0(T6) STORE_INDEXED R6(T7) -``` - -The summary collector saw `dest.is_lval=0` (cleared by `disp_fusion`) and -skipped the entry, so `mdct_win` never appeared in `static_writes`. -Without that record, [[08]]'s `tcc_ir_tu_analyze_dead_statics` could not -mark `mdct_win` as `tu_no_readers` and `decode_init` was never -re-optimized. - -## Fix - -Relax the `is_lval` check specifically for the indexed/postinc forms — -their dest *is* the memory write target regardless of the flag: - -```c -int dest_is_write_target = - dest.is_sym && - (dest.is_lval || q->op == TCCIR_OP_STORE_INDEXED || - q->op == TCCIR_OP_STORE_POSTINC); -``` - -## Related - -- [[07]] — the same is_lval over-restriction affected `dead_static_store_elim` itself. -- [[08]] — the late_reopt mechanism that this summary feeds. diff --git a/bugs/07-dead-static-store-unfused-temp-deref.md b/bugs/07-dead-static-store-unfused-temp-deref.md deleted file mode 100644 index f5e74898..00000000 --- a/bugs/07-dead-static-store-unfused-temp-deref.md +++ /dev/null @@ -1,88 +0,0 @@ -# 07 — `dead_static_store_elim` missed the pre-fusion `T = ADD(SYMREF, …); *T = v` form - -**Status:** FIXED in this branch ([ir/opt_memory.c:5336-5440](../ir/opt_memory.c#L5336-L5440)) -**Severity:** Medium — pass was effectively a no-op for static-array writes. 
- -## Symptom - -`dead_static_store_elim` looked for the *post-fusion* shape only: - -```c -IROperand dest = tcc_ir_op_get_dest(ir, q); -if (!dest.is_sym || !dest.is_lval) continue; -``` - -i.e. it required the STORE dest itself to be a `SYMREF` operand. But -during the IR optimization pipeline, the canonical form of a static-array -write is still: - -``` -T_addr = ADD(SYMREF, scaled_index) ; or LEA / ASSIGN of SYMREF -*T_addr = value ; STORE through TEMP, dest=lval TEMP -``` - -The fusion from "TEMP-DEREF STORE" to "STORE_INDEXED with SYMREF base" -runs during machine_op / codegen translation, **after** the late_cleanup -pass group has already run. So in practice, the pass never matched a -real-world write to a file-scope static array — it was only fixing -direct `static_int = 0` style scalar writes. - -## Repro - -`tests/gcctestsuite/.../gcc.c-torture/compile/pr25483.c` at `-O2`: - -```c -static int mdct_win[8]; -int decode_init(double d) { - int j; - for (j = 4; j; j--) { d *= 0.5; mdct_win[j] = (d * 3); } -} -``` - -IR in the late_cleanup phase (pre-codegen): - -``` -0011: T3 <-- V0 SHL #2 -0012: T4 <-- GlobalSym(1182) ADD T3 ; T4 = &mdct_win[j] -0018: T4***DEREF*** <-- T6 [STORE] ; *T4 = (int)(d*3) -``` - -`dest=T4` is a TEMP, not a SYMREF, so the pass skipped the STORE even -though `mdct_win` was correctly marked `tu_no_readers`. - -## Fix - -Add an indirect-resolution helper that, when dest is a single-def lval -TEMP, traces back to the TEMP's defining `ADD`/`LEA`/`ASSIGN` and pulls -the SYMREF from `src1`: - -```c -static Sym *dss_resolve_store_dest_sym(TCCIRState *ir, IRQuadCompact *q, - int store_idx) { - IROperand dest = tcc_ir_op_get_dest(ir, q); - if (dest.is_sym) { ... handle direct form ... } - if (q->op != TCCIR_OP_STORE || !dest.is_lval) return NULL; - /* TEMP-DEREF: trace back to single-def ADD/LEA/ASSIGN of SYMREF */ - ... 
-} -``` - -Constraints kept tight to stay sound: single-def TEMP only, no other -defs anywhere in the function, src1 must be a non-lval SYMREF. - -## Why it matters (cascade) - -NOPing the STORE alone is small; the win is what DCE drops afterward. -For pr25483, NOPing the STORE_INDEXED to `mdct_win` lets DCE remove the -chain feeding it: - -- `T6 = CALL __aeabi_d2iz(T5)` — pure aeabi call, result now dead -- `T5 = CALL __aeabi_dmul(d, 3.0)` — pure aeabi call, result now dead -- `T3 = SHL V0, 2` and `T4 = ADD(mdct_win, T3)` — address dead - -Final result: 30 instructions → 16 instructions for `decode_init`. - -## Related - -- [[06]] — companion is_lval over-restriction in the summary collector. -- [[08]] — without late_reopt firing at all, this pass wouldn't run on pr25483 regardless. diff --git a/bugs/08-late-reopt-gated-on-inline-fns.md b/bugs/08-late-reopt-gated-on-inline-fns.md deleted file mode 100644 index f21a10c9..00000000 --- a/bugs/08-late-reopt-gated-on-inline-fns.md +++ /dev/null @@ -1,110 +0,0 @@ -# 08 — `gen_late_reopt_functions` only iterated `inline_fns`, locking out non-auto-inline functions - -**Status:** FIXED in this branch ([tccgen.c:29381-29453](../tccgen.c#L29381-L29453)) -**Severity:** Medium — entire end-of-TU dead-static-store mechanism silently skipped most candidate functions. - -## Symptom - -`gen_late_reopt_functions` walks `tcc_state->inline_fns` and re-compiles -entries with `func_late_reopt` set: - -```c -for (i = 0; i < s->nb_inline_fns; ++i) { - fn = s->inline_fns[i]; - sym = fn->sym; - if (!sym->type.ref->f.func_late_reopt) continue; - ... begin_macro(compile_ts, 1); next(); gen_function(sym); ... -} -``` - -It requires `fn->func_str` (the saved token stream) to replay-compile. -Tokens are saved only when the function takes one of the inline-related -paths in `decl()` — specifically when `sym->type.t & VT_INLINE` is set -or `auto_inline_sig_ok(sym)` returns 1. 
- -`auto_inline_sig_ok` rejects: -- `double` / `long double` parameters or return type (via `auto_inline_type_ok` enum) -- struct *parameters* in non-static functions -- `_Complex` types -- unnamed parameters -- VLA parameters -- vector types -- structs > 16 bytes - -Any function matching one of these signatures fell through to the plain -`else { gen_function(sym); }` branch with **no token preservation**. -At end-of-TU, those functions could not be re-compiled even when -`tcc_ir_tu_analyze_dead_statics` marked their writes as dead. - -## Repro - -`tests/gcctestsuite/.../gcc.c-torture/compile/pr25483.c`: - -```c -static int mdct_win[8]; -int decode_init(double d) { /* double param → auto_inline_sig_ok = 0 */ - int j; - for (j = 4; j; j--) { d *= 0.5; mdct_win[j] = (d * 3); } -} -``` - -`mdct_win` has no readers in the TU — TU analysis correctly flagged it -`tu_no_readers` and `decode_init` as `func_late_reopt`. But -`decode_init` was never in `inline_fns`, so `gen_late_reopt_functions` -silently skipped it. Output: 30 instructions vs GCC's 1. 
- -## Fix - -In `decl()`'s "regular function definition" `else` branch, when -`opt_dead_store` is enabled, take the same save+replay path that the -auto-inline TOO-LARGE branch uses: - -```c -if (tcc_state->opt_dead_store) { - struct InlineFunc *fn = tcc_malloc(...); - fn->sym = sym; - skip_or_save_block(&fn->func_str); - int body_len = fn->func_str->len; - if (body_len <= 512) { - dynarray_add(&tcc_state->inline_fns, &tcc_state->nb_inline_fns, fn); - /* replay-compile */ - begin_macro(compile_ts, 1); next(); gen_function(sym); end_macro(); - if (!sym->type.ref->f.tu_static_writer) { - /* not a writer — drop tokens, detach so gen_inline_functions skips */ - fn->sym = NULL; tok_str_free(fn->func_str); - } - } else { - /* body too large to retain — still need to replay-compile from the - * saved stream because skip_or_save_block consumed the tokens */ - begin_macro(fn->func_str, 1); next(); gen_function(sym); end_macro(); - } -} -``` - -For `tu_static_writer` entries that weren't flagged for late_reopt -(their statics turned out to have readers), the *existing* -`gen_inline_functions` walk re-emits the body anyway — overwriting -only the symbol's `st_value` and leaving the first emission's bytes -as orphan in `.text`. That re-emission is desirable: it produces a -more optimized body once all auto-inline candidates have had their -flags finalized. Do *not* attempt to detach those entries from -`inline_fns` to suppress the re-emit — doing so leaves you with the -sub-optimized first emission (regression observed on -`tests/tests2/55_lshift_type.c`, main grew 532 → 1459 instructions). - -Also gate the "function might return no value" warning on -`!ir_late_reopt_phase` so the second compile doesn't double-emit it. - -## Why it matters (cascade) - -Pairs with [[06]] (summary collector now records the write) and [[07]] -(late_cleanup pass can now NOP the unfused TEMP-DEREF STORE). The three -together close pr25483's gap from 30 instructions to 16. 
Further wins -beyond that need a pure-loop elimination pass (the remaining -`__aeabi_dmul` calls into `d`, but `d`'s final value is never observed -— GCC reaches `bx lr` by recognizing the whole loop is dead). - -## Related - -- [[06]] — write summary collector fix. -- [[07]] — DSE pass fix to match the unfused store form. diff --git a/bugs/README.md b/bugs/README.md deleted file mode 100644 index 67b4f7e2..00000000 --- a/bugs/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Bug Reports - -Issues observed in the TCC IR optimizer during the bitfld-1 gap-closure work -(2026-05). Each report stands alone; cross-references use `[[NN]]` style. - -| # | Title | Status | -|----|---------------------------------------------------------------|------------| -| 01 | `const_prop_tmp` does not fold `IMOD`/`UMOD`/`DIV`/`UDIV`/`PDIV` with two-immediate operands | FIXED | -| 02 | `SHL N → SHR M` peephole only handles `N == M`; misses bitfield-extract (`N != M`) | WORKED AROUND | -| 03 | `dead_local_slot_elim` ignores STOREs through a LEA temp (`T = Addr[StackLoc[X]]; STORE T***DEREF***`) | FIXED | -| 04 | `memory_passes` group stalls when its trigger (`sl_forward`) returns 0 mid-cascade | WORKED AROUND | -| 05 | VAR/PARAM operands carry `tag=STACKOFF` for their potential spill slot; conflated with direct stack refs in new passes | DOCUMENTED | -| 06 | `collect_tu_func_summary` missed STORE_INDEXED / STORE_POSTINC writes when `is_lval` was cleared | FIXED | -| 07 | `dead_static_store_elim` only matched post-fusion SYMREF dest; missed the pre-fusion TEMP-DEREF form | FIXED | -| 08 | `gen_late_reopt_functions` only iterated `inline_fns`, locking out functions failing `auto_inline_sig_ok` | FIXED | - -Statuses: -- **FIXED**: a code change in this commit/branch resolves it. -- **WORKED AROUND**: the underlying limitation is still present; mitigated by an additional pass or extra pipeline pass. -- **DOCUMENTED**: footgun that bit a new pass author; recorded for next person. 
diff --git a/configure b/configure index 09ba2b30..70abb093 100755 --- a/configure +++ b/configure @@ -172,6 +172,8 @@ for opt do ;; --enable-asan) confvars_set asan ;; + --disable-asan) asan_disabled=yes + ;; --enable-ubsan) confvars_set ubsan ;; --enable-lsan) confvars_set lsan @@ -195,6 +197,12 @@ for opt do esac done +# AddressSanitizer is enabled by default for this fork; opt out with +# --disable-asan (e.g. for fast production / firmware builds). +if test "$asan_disabled" != "yes"; then + confvars_has asan || confvars_set asan +fi + show_help() { cat << EOF Usage: configure [options] @@ -223,7 +231,8 @@ Advanced options (experts only): --extra-ldflags= specify linker options [$LDFLAGS] --debug include debug info with resulting binaries - --enable-asan enable AddressSanitizer (ASan) + --enable-asan enable AddressSanitizer (ASan) [default] + --disable-asan disable AddressSanitizer (ASan) --enable-ubsan enable UndefinedBehaviorSanitizer (UBSan) --enable-lsan enable LeakSanitizer (LSan) --enable-O0 disable optimizations (GCC -O0) diff --git a/docs/bug2_derived_iv_prompt.md b/docs/bug2_derived_iv_prompt.md new file mode 100644 index 00000000..fa0e78a1 --- /dev/null +++ b/docs/bug2_derived_iv_prompt.md @@ -0,0 +1,169 @@ +# Next-session prompt: fix bug #2 (`transform_derived_iv` derived-IV strength reduction) + +> **RESOLVED 2026-07-02 — kept for historical reference.** The pass is fixed +> and re-enabled; see `docs/bugs.md` #2 for the full write-up. The actual root +> cause was NOT in the transform (its IR output was correct): the miscompile +> came from `tcc_ir_opt_cmp_stack_addr_fold`'s stack-address resolver crossing +> control-flow merge points (deleting the single-trip varargs9 loop's only +> exit test). Fixes: merge-sound `ir_resolve_stack_addr_value_ex` +> (ir/opt_constprop.c), a taint-based escape analysis replacing `feeds_mem` +> over the FULL loop body (`sr_div_value_stays_in_regs`), and removal of the +> unreachable/unsound shared-pointer path. 
All acceptance criteria below were +> met (torture 11201, primary 1908, ut 2342, golden 21, fuzz 0–2000 clean; +> regression test `tests/ir_tests/258_derived_iv_strength_reduction.c`). + +Paste the block below into a fresh session. It is self-contained; everything a +new agent needs to pick this up cold is here. It reflects what was learned in +the 2026-07-02 miscompile-hunting sessions (see `docs/bugs.md` #2 and the +in-code comment at the top of `transform_derived_iv` in `ir/opt_loop_utils.c`). + +--- + +## Task + +Re-enable and correctly fix **derived-IV strength reduction** +(`transform_derived_iv` in `ir/opt_loop_utils.c`), which is currently disabled +by an unconditional `return 0;` at the top of the function. It must be +re-enabled **without introducing any miscompile**. This is a *redesign* task, +not a point fix — treat it with full miscompile-hunting rigor +(`docs/debugging_fuzz_divergences.md`). Do **not** ship it unless the full +regression + a fuzz sweep are clean. + +Bug #11 sibling context: the analogous pass #7 (`tcc_ir_hoist_pure_calls`) was +fixed and re-enabled in the same era — that one had specific, self-contained +defects. #2 is harder: its index bookkeeping is fragile and it has a history of +"linker heap corruption," so budget for restructuring, not patching. + +## Where it lives + +- Function: `int transform_derived_iv(...)` in `ir/opt_loop_utils.c` (~130 lines + of rewrite logic below the disabling `return 0;`). +- Disabled by: a plain `return 0;` immediately after the out-param + initialization at the top of the function. (Do NOT use a `(void*)1` sentinel + to "disable differently" — it trips GCC `-Werror=array-bounds` on the later + `*out_ptr_vreg = ...` writes; that's why it's a plain early return.) +- Caller: `iv_strength_reduction_core()` (invoked from `ir/opt_loop.c:190,206,220`), + which does the `APPLY_SHIFT` index bookkeeping around the transform's + `out_idx_shift` / `out_postnop_origpos` / `out_stride_pos` return values. 
+- Detection of derived IVs (regular ADD-based and INDEXED forms) is in the same + file above `transform_derived_iv` (grep `Found DIV`, `Found INDEXED-DIV`, + `Found MLA-DIV`). + +## Confirmed reproduction + +`gcc.c-torture/execute/va-arg-24.c` **miscompiles at -O1** (QEMU exit code 1; +O0 and O2 pass). Steps: + +1. Remove the disabling `return 0;` (and its comment) at the top of + `transform_derived_iv`. +2. `make cross` +3. `cd tests/ir_tests && python -m pytest test_gcc_torture_ir.py -k "va-arg-24" -q` + → `va-arg-24-O1` FAILS ("Test exited with code 1"); O0/O2 pass. + +The failing loop (macro-expanded, per varargs function): +```c +for (i = x + 1; i <= 10; i++) + n[i] = va_arg (ap, int); /* n[] is a local int[11]; ap is the va_list */ +verify (..., n); /* checks n[i] == i for all i */ +``` + +## What was already root-caused (2026-07-02) + +Method: compile va-arg-24.c at -O1 with `-dump-ir`, once with the pass enabled +and once disabled, and `diff` the `=== IR AFTER OPTIMIZATIONS ===` sections +(the pass runs between "AFTER LOOP ROTATION" and "AFTER OPTIMIZATIONS"). + +Findings: + +1. **The transform fires on the array-element address.** `&n[i]` is + strength-reduced into a pointer IV: init `ptr = &n[x+1]`, stride `+4`, loop + guard `ptr <= &n[10]`. The + transformed IR *looks* structurally correct at a glance (right start address, + right stride, right trip count), yet the compiled program computes wrong + values — so the fault is in a **downstream interaction** (copy-prop / DCE + merging the address temp into the pointer and dropping a deref or the stride, + and/or the register-allocation / va_list interaction), exactly as the + in-code comment above the disable warns. + +2. **The `feeds_mem` guard is incomplete.** It is meant to skip DIVs whose + address feeds a memory access (the backend already forms efficient indexed + `LDR/STR rN,[rb,rm,LSL#k]` addressing, so nothing is lost by skipping).
But + va-arg-24's DIV is a **non-indexed address-temp ADD** — `div->use_idx` points + at `T = base + (i<<2)` (op `TCCIR_OP_ADD`, with `shl_idx` = the feeding SHL, + `stride=4`), NOT at a `STORE_INDEXED`. The `feeds_mem` scan checks whether the + ADD's dest (`ud_vr`) is the lval dest/src of a STORE/LOAD in the loop body + (`sr_vreg_is_ud_or_offset`), but va-arg-24's connection between the ADD result + and the actual store escapes that scan, so it does not bail. + +3. **Shared-path / general-path asymmetry (already fixed defensively).** The + general path bails on `STORE_INDEXED`/`LOAD_INDEXED` uses via `feeds_mem`; the + shared-pointer fast path (`shared_ptr_vreg >= 0`) rewrote + `STORE_INDEXED->STORE` / `LOAD_INDEXED->LOAD` with **no** such check. A guard + was added so both paths are consistently conservative — it is present but + **dormant** (the function still returns 0 early). Keep it. + +## Suggested approaches (pick one, or better) + +- **(A) Make the memory-feeding detection sound.** Redesign `feeds_mem` (or add a + use-def pass) so a DIV is skipped whenever its computed address value reaches + ANY dereference/store/load in the loop — including the non-indexed + address-temp ADD case va-arg-24 exercises. This preserves the pass for genuine + non-memory derived IVs (address used only in further pointer arithmetic) while + guaranteeing correctness for memory-feeding ones. Lowest-risk direction. +- **(B) Restrict scope.** Only transform DIVs whose address is provably never + dereferenced (used purely in more pointer arithmetic that is itself not a + memory address). Simpler to prove correct; may leave value on the table. +- **(C) Fix the downstream interaction.** If (A)/(B) show the transformed IR is + actually valid and the fault is later (copy-prop merging the address temp into + the pointer and dropping a deref/stride), fix that pass instead. Higher effort; + confirm with a `bisect_opt.py` run which knob actually corrupts the value. 
+ +Whichever you choose, also re-check the fragile index bookkeeping +(`use_idx` / `shl_idx` / `new_use_idx` / `out_stride_pos` / `out_postnop_origpos` +and the caller's `APPLY_SHIFT`) — the "heap corruption" history points at +off-by-one shifts when multiple DIVs / calls interleave. + +## Tools + +- `-dump-ir` flag (build already has `CONFIG_TCC_DEBUG`): dumps IR + before-opt / after-loop-rotation / after-opt per function. IR-diff + enabled-vs-disabled is the fastest way to see the exact rewrite. +- `make cross CFLAGS+='-DTCC_LOG_IV_SR=1'` for `LOG_IV_SR` tracing of the pass + (or add a temporary unconditional `fprintf(stderr, ...)` — more reliable if + the CFLAGS override drops other flags). +- `scripts/bisect_opt.py` — QEMU-confirmed culprit knob + the exact IR line + where a value is misfolded (see `docs/debugging_fuzz_divergences.md`). +- `scripts/diff_olevels.py --count N --start M` — O0/O1/O2 self-consistency + fuzz sweep. NOTE: pre-existing divergences at seeds 193, 222, 477, 555, 591 + (and 1136, 1259, 1371, 1378, 1522, 1820 in 800–2000) are **backend + literal-pool / regalloc compile-failures that fail at -O0** — unrelated to the + optimizer. Filter them by checking whether `-O0` compiled; only an O0-compiles- + but-O1/O2-diverges result implicates an optimizer change. + +## Acceptance criteria (all must hold with the pass ENABLED) + +1. `va-arg-24` passes at O0/O1/O2: + `cd tests/ir_tests && python -m pytest test_gcc_torture_ir.py -k "va-arg-24" -q` +2. Full gcc-torture IR execute suite: `python -m pytest test_gcc_torture_ir.py -q -n auto` — 0 failures (baseline 11201 pass). +3. Primary IR suite: `python -m pytest test_qemu.py -q -n auto` — 0 failures (baseline 1904 pass). +4. Host unit tests: `make ut` — 0 failures. 
Re-enable/rewrite the two disabled + tests in `tests/unit/arm/armv8m/test_opt_loop_utils.c` + (`test_transform_derived_iv_always_returns_zero`, + `test_transform_derived_iv_shared_path_also_disabled`) to assert the new + behaviour, and add a positive test that a non-memory derived IV IS reduced. +5. `scripts/diff_olevels.py --count 2000 --start 0` — no NEW divergences beyond + the pre-existing O0 backend failures listed above. +6. Add a project IR regression test under `tests/ir_tests/` (register in + `test_qemu.py` `TEST_FILES`) that reduces the va-arg-24 array-store-in-loop + pattern and would produce a wrong checksum if the DIV were mis-transformed. + (Avoid `static __attribute__((pure))` + `--gc-sections`: that pattern hits an + unrelated pre-existing "undefined symbol" linker bug.) +7. Update `docs/bugs.md` #2 to FIXED with the validation numbers, and replace the + disabling comment in `ir/opt_loop_utils.c`. + +## If it can't be made correct + +If (A)–(C) don't yield a provably-correct re-enable within scope, leave it +DISABLED (the current safe state) and record the additional findings in +`docs/bugs.md` #2 and the in-code comment — do not ship a partial fix. A +disabled missing-optimization is strictly better than a miscompile. diff --git a/docs/bugs.md b/docs/bugs.md new file mode 100644 index 00000000..a5c1223f --- /dev/null +++ b/docs/bugs.md @@ -0,0 +1,201 @@ +# Known bugs + +## Bug: tccdebug SValue pointer marker on non-pointer base types + +`tcc_debug_print_svalue()` prints a trailing `*` for `VT_LLONG` and other +non-pointer basic types whose numeric value shares bits with `VT_PTR`, +because it checks `if (vt & VT_PTR)` instead of testing the basic type with +`(vt & VT_BTYPE) == VT_PTR`. Confined to debug diagnostic output; no +compiler semantics affected. Not yet fixed. + +## Bug: `tcc_set_linker()` boolean suboptions must be last in a `-Wl,` comma chain + +`link_option()` (`libtcc.c:1268`): for a bare boolean flag (a `val` with no +`=`, e.g. 
`"Bsymbolic"`), the match loop requires `*p == '\0'` exactly +(`else if (*p) return 0;`) — it never special-cases a following comma. The +value-taking branch two lines above it (`if (*q == '=')`) explicitly +accepts `*p == ',' || *p == '='`. So `-Wl,-Bsymbolic,-rpath=/x` fails to +match `"Bsymbolic"` at all, falls through every `link_option()` check in +`tcc_set_linker()`'s if/else-if chain, and hits `unsupported linker option` +for the *entire* remaining chain even though every suboption is individually +valid. `-Wl,-rpath=/x,-Bsymbolic` (boolean flag last) or passing it alone +both work. Not yet fixed. + +Likely fix: in the bare-boolean branch of `link_option()`, accept +`*p == ','` the same way the value-taking branch does, and have +`tcc_set_linker()`'s caller advance `option` past that comma (mirroring how +it already advances past a value via `skip_linker_arg`). Regression lock +(`tests/unit/arm/armv8m/test_libtcc_options_linker.c`, +`test_wl_boolean_flag_before_value_suboption_currently_fails`) pins the +current buggy behavior — flip its assertions once fixed. + +## Bug: `_Pragma` operator is entirely unimplemented + +`tccpp.c` has no handling anywhere for the `_Pragma(string-literal)` unary +operator required by C11 6.10.9 — only the `#pragma` *directive* form is +recognized, in `pragma_parse()` (`tccpp.c:2463`); there is no +`_Pragma`/`TOK__Pragma` keyword recognition in the lexer or `tccgen.c`'s +parser at all. + +Per the standard, `_Pragma("X")` must be destringized and processed as if a +`#pragma X` directive appeared right there in the token stream — this is +what lets the common portable idiom `#define DO_PRAGMA(x) _Pragma(#x)` +conditionally emit pragmas from macros. Instead: +- Under `-E`, `_Pragma("message \"hi\"")` passes through completely + untouched instead of being rewritten to `#pragma message "hi"` (verified + against `gcc -E`, which does perform the rewrite). 
+- In a real (non-`-E`) compile, `_Pragma` is parsed as an ordinary, + unrecognized identifier: at file scope this fails with `error: identifier + expected`; inside a function body it produces `warning: implicit + declaration of function '_Pragma'` followed by `error: ';' expected`. Any + translation unit using `_Pragma` fails to compile outright. + +Regression lock: `tests/frontend/pp/14_pragma_operator_currently_unsupported.c` +pins the `-E`-mode passthrough symptom. Once `_Pragma` support is added, its +golden (`14_pragma_operator_currently_unsupported.expect`) must be updated +to the destringized/rewritten form. Not yet fixed. + +## Bugs: linker-script lexer over-eagerly swallows `.` and `*` as identifier characters + +Root cause, three manifestations below: `ld_next_token()` in `tccld.c` +lists `.`, `*` (and, separately, never adds `!`) among the +identifier-*start* characters (`isalpha(c) || c=='_' || c=='.' || +c=='*' || c=='$'`). A bare `.` or `*` in linker-script source is therefore +always lexed as `LDTOK_NAME` with `tok_buf == "."`/`"*"`, never as the raw +punctuation value — so every `if (p->tok == '.')` / `if (p->tok == '*')` +check elsewhere in the file is unreachable dead code. No fix attempted; +all three are pinned as regression tests documenting current behavior. + +### linker-script location counter `.` is silently treated as a symbol named "." + +`tccld.c`: `ld_next_token()` / `ld_parse_primary()` / `ld_parse_sections()` / +`ld_parse_output_section_contents()` + +Because `.` never lexes as the raw char `'.'` (46), the location-counter +read in `ld_parse_primary()` and the location-counter *assignment* handling +in `ld_parse_sections()`/`ld_parse_output_section_contents()` never trigger. +`". 
= expr;"` falls through to the generic "symbol assignment" code path +and creates/updates a symbol literally named `"."`, while +`LDScript.location_counter` never advances via script content at all — +breaking address assignment, `"_end = .;"`-style epilogue symbols, and +`os->current_offset`/`start_lc` bookkeeping. Regression pin: +`tests/unit/arm/armv8m/test_ld_script.c`, +`test_bug_location_counter_dot_is_treated_as_phantom_symbol`. + +### multiplication operator never applies in linker-script expressions + +`tccld.c`: `ld_next_token()` / `ld_parse_mul()` + +Same root cause: a standalone `*` (e.g. in `"2 * 3"`) lexes as +`LDTOK_NAME`, not the raw char `42`, so `ld_parse_mul()`'s +`while (p->tok == '*' || ...)` never fires: `"X * Y"` silently evaluates to +just `X`, and the unconsumed `"*"` token is picked up one level out and +misparsed as a brand-new top-level SECTIONS item (e.g. a bogus output +section literally named `"*"`, consuming the following number as its +address). No error is reported. Regression pin: +`tests/unit/arm/armv8m/test_ld_script.c`, +`test_bug_expr_multiplication_operator_never_applies`. + +### malformed MEMORY attribute string causes silent phantom-region corruption + +`tccld.c`: `ld_expect()` / `ld_parse_memory_attributes()` / `ld_parse_memory()` + +`ld_expect()` does not advance the token position when it reports a +mismatch, and its return value is discarded by nearly every caller. The +`'!'` invert-attribute prefix (explicitly scaffolded for in +`ld_parse_memory_attributes()`'s char-switch) can never actually lex as +part of an identifier, since `!` is absent from both the identifier-start +and identifier-continuation sets. Once it appears, the parser gets stuck +re-reporting the same mismatch and falls into the generic "skip one token +and keep looping" fallback in `ld_parse_memory()`'s outer loop, which then +misinterprets leftover stray tokens (`rx`, `ORIGIN`, `LENGTH`, ...) as +brand-new memory-region names. 
Concretely, +`MEMORY { FLASH (!rx) : ORIGIN = 0x0, LENGTH = 1K }` silently produces 4 +bogus regions (`FLASH`, `rx`, `ORIGIN`, `LENGTH`, all-zero fields) with an +overall `ld_script_parse_string()` return of 0 (success) — no crash, no +reported error. Regression pin: `tests/unit/arm/armv8m/test_ld_script.c`, +`test_bug_memory_invert_attribute_causes_phantom_regions`. + +## Bug: linker-script section-pattern parsing leaves a bogus empty leading pattern entry + +`tccld.c`: `ld_parse_section_pattern()` + +Every call unconditionally adds one `LDSectionPattern` via +`ld_add_pattern()` *before* parsing the real glob name(s) inside the +parens (apparently meant to eventually capture a leading file-pattern, +e.g. the `*` in `*(.text*)`), but never populates that entry's `.pattern` +field. Every single `*(...)`/`KEEP(...)` occurrence therefore leaves one +permanent bogus entry (`pattern==""`, `type==LD_PAT_GLOB`, `keep` = +whatever was passed in), doubling `nb_patterns` and polluting +`ld_script_dump()` output. Harmless for `ld_section_should_keep()` today +(an empty pattern can't match a non-empty section name) but a real, +observable data-structure defect. Regression pin: +`tests/unit/arm/armv8m/test_ld_script.c`, +`test_sections_output_section_dotted_with_patterns_and_keep`. Not yet fixed. + +## Bug: linker-script standard field order (`> REGION AT > LMA :PHDR`) silently drops the phdr association + +`tccld.c`: `ld_parse_sections()` + +The per-output-section suffix-clause parsing checks `'>'` (region), then +`':'` (phdr), then `"AT"` (load region) — in that fixed order, exactly once +each. Real-world scripts conventionally write +`"> REGION AT > LMA_REGION :PHDR"` (AT *before* the phdr tag); with that +ordering the `':'` check has already run (and seen `"AT"`, not `':'`) by +the time `AT > LMA_REGION` is consumed, and the trailing `:PHDR` is never +looked at again — `os->phdr_idx` silently stays `-1`, no error reported. 
+Only the non-standard `"> REGION :PHDR AT > LMA_REGION"` order works. +Regression pin: `tests/unit/arm/armv8m/test_ld_script.c`, +`test_bug_sections_standard_region_at_phdr_order_drops_phdr` (paired with +`test_sections_region_at_and_phdr_supported_order`, which shows the order +that does work). Not yet fixed. + +## Bug: `tcc_opt_get_level()` can only return 0 or 1 + +`tccopt.c`: `tcc_opt_get_level()` + +The function comment claims it "Map TCC's optimization settings to our +levels", but the implementation only inspects `tcc_state->opt_fp_offset_cache`. +It returns 1 whenever that flag is set and 0 otherwise; there is no code path +that returns 2 (or higher) to reflect `-O2`/`-O3`/`-Os`. Consequently, a caller +using this level to decide which passes to run will under-select optimizations +whenever the user requests `-O2` but the FP-offset-cache flag is off, or +over-select at `-O0` if the flag happens to be on. The real pipeline in +`ir/opt_pipeline.c` does not currently use this helper, so the bug is latent. +Regression pin: `tests/unit/arm/armv8m/test_tccopt.c`, +`test_opt_get_level_bug_comment_claims_map_but_only_reads_fp_cache`. Not yet +fixed. + +## Bug: `tccelf_delete()` frees `sym_attrs` but leaves pointer/count stale + +`tccelf.c`: `tccelf_delete()` + +`tccelf_delete()` calls `tcc_free(s1->sym_attrs)` but does not reset +`s1->sym_attrs` to NULL or `s1->nb_sym_attrs` to 0. If the same `TCCState` +is reused without being zeroed, a later `get_sym_attr(s1, index, 1)` sees +`index >= s1->nb_sym_attrs` as false (because `nb_sym_attrs` is still +non-zero), returns a pointer into the freed allocation, and writes to it. +The usual compiler teardown frees the whole `TCCState` immediately after +`tccelf_delete()`, so the bug is latent for normal usage, but it makes the +lifecycle contract unreliable for any caller that deletes ELF state and then +re-initializes the same state. 
+ +Regression pin: `tests/unit/arm/armv8m/test_tccelf.c`, +`test_tccelf_delete_leaves_sym_attrs_stale`. Not yet fixed. + +## Bug: `dwarf_emit_reg_op()` / `dwarf_loc_reg_op_len()` silently accept negative register numbers + +`tccdbg.c`: `dwarf_loc_reg_op_len()` (`tccdbg.c:2066`) and `dwarf_emit_reg_op()` (`tccdbg.c:2073`) + +Both helpers check `regno >= 0 && regno <= 31` to decide whether to use the +short `DW_OP_reg0..DW_OP_reg31` form. For negative `regno` values the check +fails, the value is then treated as an unsigned quantity, and a `DW_OP_regx` +location expression is emitted followed by a multi-byte ULEB128 encoding of +the (now huge) register number. Negative register numbers are invalid in DWARF; +the function should either assert or report an error instead of silently +emitting nonsensical location information. + +Regression pin: `tests/unit/arm/armv8m/test_tccdbg.c`, +`test_dwarf_emit_reg_op_negative_reg_encodes_as_regx` and +`test_dwarf_loc_reg_op_len_edge_cases`. Not yet fixed. + diff --git a/docs/builtin_classify_type.md b/docs/builtin_classify_type.md deleted file mode 100644 index 59acd5c1..00000000 --- a/docs/builtin_classify_type.md +++ /dev/null @@ -1,239 +0,0 @@ -# `__builtin_classify_type` Implementation Plan - -## Overview - -GCC's `__builtin_classify_type(expr)` is a compile-time builtin that returns an integer constant classifying the type of its argument expression. It is used in `<tgmath.h>` and GCC torture tests (e.g., `20040709-1.c`, `20040709-2.c`) to detect floating-point types at compile time. - -The builtin evaluates at **compile time only** — the argument expression is parsed for its type but **never emitted as code** (similar to `sizeof`). 
- -## GCC Type Classification Values - -| Value | GCC Enum Constant | Type Category | -|-------|---------------------------|--------------------------------------| -| 0 | `no_type_class` | void | -| 1 | `integer_type_class` | integer types (char, short, int, long, long long, _Bool, enum) | -| 2 | `char_type_class` | **not used in C** (only C++ plain `char`) | -| 3 | `enumeral_type_class` | **not used in C** (C enums → integer) | -| 4 | `boolean_type_class` | **not used in C** (C _Bool → integer) | -| 5 | `pointer_type_class` | pointer types | -| 6 | `reference_type_class` | **C++ only** — references | -| 7 | `offset_type_class` | **C++ only** — pointer-to-member | -| 8 | `real_type_class` | float, double, long double | -| 9 | `complex_type_class` | _Complex float/double/long double | -| 10 | `function_type_class` | function types (bare function, not pointer-to-function) | -| 11 | `method_type_class` | **C++ only** — method types | -| 12 | `record_type_class` | struct | -| 13 | `union_type_class` | union | -| 14 | `array_type_class` | array types | -| 15 | `string_type_class` | **not used in C** | -| 16 | `opaque_type_class` | **not used in C** | -| 17 | `bitint_type_class` | _BitInt (GCC 14+) | -| 18 | `vector_type_class` | GCC vector types (`__attribute__((vector_size(...)))`) | - -### Key Observations for C (what TCC needs) - -In practice for C code, only these values appear: - -- **0** — `void` -- **1** — all integer types (`char`, `short`, `int`, `long`, `long long`, `_Bool`, enums) -- **5** — pointers (including pointer-to-function, arrays decay to pointers in expressions) -- **8** — `float`, `double`, `long double` -- **9** — `_Complex` types (if supported) -- **12** — `struct` -- **13** — `union` -- **14** — array types (when passed as a type, not decayed) - -Note: In GCC's C mode, `enum` maps to **1** (integer), not 3. `_Bool` also maps to **1**, not 4. 
- -## TCC Type System Mapping - -The mapping from TCC's `VT_*` type flags to GCC classification values: - -| TCC Type (`VT_BTYPE`) | TCC Flags | GCC Classification | -|-----------------------------|----------------------------------------|--------------------| -| `VT_VOID` (0) | — | 0 (void) | -| `VT_BYTE` (1) | ± `VT_UNSIGNED` | 1 (integer) | -| `VT_SHORT` (2) | ± `VT_UNSIGNED` | 1 (integer) | -| `VT_INT` (3) | ± `VT_UNSIGNED`, ± `VT_ENUM` | 1 (integer) | -| `VT_LLONG` (4) | ± `VT_UNSIGNED` | 1 (integer) | -| `VT_PTR` (5) | without `VT_ARRAY` | 5 (pointer) | -| `VT_PTR` (5) | with `VT_ARRAY` | 14 (array) | -| `VT_FUNC` (6) | — | 10 (function) | -| `VT_STRUCT` (7) | without `VT_UNION` high bits | 12 (record/struct) | -| `VT_STRUCT` (7) | with `VT_UNION` high bits (`IS_UNION`) | 13 (union) | -| `VT_FLOAT` (8) | without `VT_COMPLEX` | 8 (real) | -| `VT_DOUBLE` (9) | without `VT_COMPLEX` | 8 (real) | -| `VT_LDOUBLE` (10) | without `VT_COMPLEX` | 8 (real) | -| `VT_FLOAT` (8) | with `VT_COMPLEX` | 9 (complex) | -| `VT_DOUBLE` (9) | with `VT_COMPLEX` | 9 (complex) | -| `VT_LDOUBLE` (10) | with `VT_COMPLEX` | 9 (complex) | -| `VT_BOOL` (11) | — | 1 (integer) | -| any with `VT_VECTOR` | — | 18 (vector) *optional* | - -## Implementation Steps - -### Step 1: Add Token Definition - -In `tcctok.h`, add near the other `__builtin_*` tokens (~line 190): - -```c -DEF(TOK_builtin_classify_type, "__builtin_classify_type") -``` - -### Step 2: Add Classification Helper Function - -In `tccgen.c`, add a static helper that maps a `CType` to the GCC integer: - -```c -/* GCC __builtin_classify_type return values (C mode) */ -#define GCC_TYPE_CLASS_VOID 0 -#define GCC_TYPE_CLASS_INTEGER 1 -#define GCC_TYPE_CLASS_POINTER 5 -#define GCC_TYPE_CLASS_REAL 8 -#define GCC_TYPE_CLASS_COMPLEX 9 -#define GCC_TYPE_CLASS_FUNCTION 10 -#define GCC_TYPE_CLASS_STRUCT 12 -#define GCC_TYPE_CLASS_UNION 13 -#define GCC_TYPE_CLASS_ARRAY 14 -#define GCC_TYPE_CLASS_VECTOR 18 - -static int gcc_classify_type(CType 
*type) -{ - int bt = type->t & VT_BTYPE; - int t = type->t; - - switch (bt) { - case VT_VOID: - return GCC_TYPE_CLASS_VOID; - - case VT_BYTE: - case VT_SHORT: - case VT_INT: - case VT_LLONG: - case VT_BOOL: - return GCC_TYPE_CLASS_INTEGER; - - case VT_PTR: - if (t & VT_ARRAY) - return GCC_TYPE_CLASS_ARRAY; - return GCC_TYPE_CLASS_POINTER; - - case VT_FUNC: - return GCC_TYPE_CLASS_FUNCTION; - - case VT_STRUCT: - if (IS_UNION(t)) - return GCC_TYPE_CLASS_UNION; - return GCC_TYPE_CLASS_STRUCT; - - case VT_FLOAT: - case VT_DOUBLE: - case VT_LDOUBLE: - if (t & VT_COMPLEX) - return GCC_TYPE_CLASS_COMPLEX; - return GCC_TYPE_CLASS_REAL; - - default: - return GCC_TYPE_CLASS_INTEGER; /* fallback */ - } -} -``` - -### Step 3: Add Parser Case in `unary()` - -In the `unary()` function in `tccgen.c`, add a case alongside the other `TOK_builtin_*` cases (near `TOK_builtin_constant_p`): - -```c -case TOK_builtin_classify_type: - parse_builtin_params(1, "e"); /* nc=1: nocode, "e": one expression */ - n = gcc_classify_type(&vtop->type); - vtop--; - vpushi(n); - break; -``` - -Key details: -- **`nc=1`** — increments `nocode_wanted` so the argument expression is parsed but no code is generated (just like `sizeof`). -- **`"e"`** — parse one expression argument. -- After parsing, inspect `vtop->type` to get the type, pop it, and push the integer constant result. 
- -### Step 4: Add Test - -Create `tests/ir_tests/NN_builtin_classify_type.c`: - -```c -#include <stdio.h> - -struct S { int x; }; -union U { int x; float f; }; - -int main(void) -{ - int i = 0; - float f = 0.0f; - double d = 0.0; - int *p = &i; - struct S s; - union U u; - int arr[4]; - void (*fp)(void); - - printf("%d\n", __builtin_classify_type(i)); /* 1 - integer */ - printf("%d\n", __builtin_classify_type(f)); /* 8 - real */ - printf("%d\n", __builtin_classify_type(d)); /* 8 - real */ - printf("%d\n", __builtin_classify_type(p)); /* 5 - pointer */ - printf("%d\n", __builtin_classify_type(s)); /* 12 - struct */ - printf("%d\n", __builtin_classify_type(u)); /* 13 - union */ - printf("%d\n", __builtin_classify_type(0)); /* 1 - integer */ - printf("%d\n", __builtin_classify_type(0.0)); /* 8 - real */ - printf("%d\n", __builtin_classify_type((char)0)); /* 1 - integer */ - return 0; -} -``` - -Corresponding `.expect` file: -``` -1 -8 -8 -5 -12 -13 -1 -8 -1 -``` - -### Step 5: Verify GCC Torture Tests - -After implementation, verify the two GCC torture tests that use this builtin pass: -```bash -cd tests/ir_tests -python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20040709-1.c --cflags="-O1" -python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20040709-2.c --cflags="-O1" -``` - -## Edge Cases & Notes - -1. **Array vs pointer**: `__builtin_classify_type(arr)` where `arr` is `int[4]` — GCC returns 5 (pointer) because the expression `arr` decays to a pointer. However `__builtin_classify_type((int[4]){})` on a compound literal that hasn't decayed should return 14 (array). In practice, since TCC parses the argument as an expression, array-to-pointer decay will already have occurred, so this should naturally return 5 for array names — matching GCC behavior. - -2. **Function vs function pointer**: `__builtin_classify_type(main)` — the function name decays to a function pointer, so GCC returns 5 (pointer). 
This should work naturally. - -3. **String literals**: `__builtin_classify_type("hello")` — the string literal is `char[6]` which decays to `char*`, so returns 5 (pointer). - -4. **No side effects**: The argument must not generate any code. The `nocode_wanted` flag via `parse_builtin_params(1, ...)` handles this. - -5. **`_Complex` types**: If/when TCC supports `_Complex`, the `VT_COMPLEX` flag check ensures correct classification (value 9). - -6. **`VT_VECTOR` types**: Optionally return 18 for GCC vector types if `VT_VECTOR` is set. This is a GCC 14+ addition and low priority. - -## Files to Modify - -| File | Change | -|--------------|-----------------------------------------------------| -| `tcctok.h` | Add `TOK_builtin_classify_type` token definition | -| `tccgen.c` | Add `gcc_classify_type()` helper + `case` in `unary()` | - -## Estimated Effort - -Small — ~30 lines of code across 2 files, plus test file. The implementation is entirely compile-time (no IR or codegen changes needed). diff --git a/docs/codegen_dry_run_opt.md b/docs/codegen_dry_run_opt.md deleted file mode 100644 index adf1fc85..00000000 --- a/docs/codegen_dry_run_opt.md +++ /dev/null @@ -1,159 +0,0 @@ -# Codegen dry-run optimisation plan - -Two complementary optimisations to reduce compilation time on memory-constrained -hardware (4–6 MB for TCC). - ---- - -## Option A — Skip dry-run for scratch-conflict-free functions - -### Rationale - -The dry-run serves three purposes: - -1. Scratch tracking — fills `dry_insn_scratch[]` / `dry_insn_saves[]`, feeds Phase-3 fixup. -2. LR-in-prologue detection — `tcc_gen_machine_dry_run_get_lr_push_count()`. -3. Branch offset analysis — `branch_opt_analyze()` selects 16-bit vs 32-bit encodings. - -If scratch pushes are provably impossible, purposes 1 and 2 are no-ops and the -dry-run can be skipped entirely. Purpose 3 falls back to conservative 32-bit -encodings (already the default fallback), costing 2 bytes per branch — acceptable. 
- -### Condition - -ARM has r0–r12 = 13 allocatable integer registers; scratch needs at most 2 -simultaneously. If there are always ≥2 free integer registers and ≥2 free VFP -registers at every program point, no push/pop can occur. - -```c -int can_skip_dry_run = - __builtin_popcountll(ir->ls.dirty_registers) <= 11 && - __builtin_popcountll(ir->ls.dirty_float_registers) <= 14; // 16 s-regs available -``` - -Evaluated once, just before the two-pass loop in `tcc_ir_codegen_generate`. - -### What changes when skipping - -| Concern | Effect | -|---|---| -| `dry_insn_scratch[]` / `dry_insn_saves[]` | Stay zero (`tcc_mallocz`) — correct | -| Phase-3 fixup loop | Sees all-zero saves — no-op, safe to run or skip | -| LR in prologue | No scratch push → no LR push; `leaffunc` already set correctly | -| Branch optimizer | `branch_opt_analyze` not called → 32-bit fallback for all branches | -| Prologue emission | Uses `ir->ls.dirty_registers` + `stack_size` directly — both available | - -### Loop structure change - -```c -// still call branch_opt_init so get_encoding returns the 32-bit fallback cleanly -tcc_gen_machine_branch_opt_init(); - -int pass_start = can_skip_dry_run ? 1 : 0; -for (int pass = pass_start; pass < 2; pass++) -{ - ... -} -``` - -When `pass_start == 1`, emit the prologue at the point where it was previously -emitted inside the dry-run finalisation block (just before the real-run starts). - ---- - -## Modified Option B — Cache decoded operands, reuse in real-run - -Only active when Option A did **not** fire. - -### Rationale - -Every instruction goes through `decode_mop_args` → `machine_op_from_ir` (interval -table lookups, register resolution) **twice** — once in the dry-run, once in the -real-run. Caching the dry-run results eliminates the second decode pass. - -Only `dest`, `src1`, `src2` are cached (3 slots × 24 bytes = 72 bytes/instruction). -`scale` and `accum` operands (indexed memory ops, MLA) are rare and re-decoded in -the real-run. 
- -### Memory cost - -`3 × sizeof(MachineOperand) × N` on a 32-bit host: - -| Instructions | Memory | -|---|---| -| 50 | 3.6 KB | -| 100 | 7.2 KB | -| 500 | 36 KB | - -### Allocation - -```c -// allocated before the two-pass loop, only when !can_skip_dry_run -MachineOperand *mop_cache = tcc_malloc(3 * ir->next_instruction_index * sizeof(MachineOperand)); -// layout: [3*i+0] = dest, [3*i+1] = src1, [3*i+2] = src2 -``` - -### Dry-run: fill cache - -After every `DECODE(...)` call in the dry-run instruction loop: - -```c -mop_cache[3*i+0] = a.dest; -mop_cache[3*i+1] = a.src1; -mop_cache[3*i+2] = a.src2; -``` - -### After dry-run: decide whether cache is valid - -Phase-3 fixup mutates the interval table when `any_fixup != 0`. - -```c -int use_mop_cache = !any_fixup; -if (!use_mop_cache) { - tcc_free(mop_cache); - mop_cache = NULL; -} -``` - -### Real-run: use cache via wrapper macro - -```c -#define DECODE(...) (use_mop_cache \ - ? cached_mop_args(mop_cache, i, (MopSpec){__VA_ARGS__}, \ - ir, cq, &src1_ir, &src2_ir, &dest_ir, has_incoming_jump) \ - : decode_mop_args(ir, cq, &src1_ir, &src2_ir, &dest_ir, i, \ - has_incoming_jump, (MopSpec){__VA_ARGS__})) -``` - -`cached_mop_args` reads dest/src1/src2 from the cache and re-calls -`machine_op_from_ir` only for `scale` and `accum` when the spec requests them. - -### Teardown - -```c -tcc_free(mop_cache); // after real-run ends; safe when NULL (tcc_free checks) -``` - ---- - -## Combined control flow - -``` -can_skip_dry_run == 1 - Option A fires: single pass (pass=1 only), no cache, 32-bit branches, - prologue emitted immediately before real-run. - -can_skip_dry_run == 0 - Option B active: two passes, mop_cache allocated. 
- any_fixup == 0 → cache reused in real-run - any_fixup != 0 → cache freed, normal decode in real-run -``` - ---- - -## Files to modify - -| File | Change | -|---|---| -| `ir/codegen.c` | Condition check, `pass_start`, prologue placement, cache alloc/fill/use/free | -| `arm-thumb-gen.c` | Ensure `branch_opt_init` is safe to call without a subsequent `branch_opt_analyze` | diff --git a/docs/complex/DESIGN_DECISIONS.md b/docs/complex/DESIGN_DECISIONS.md deleted file mode 100644 index de07d87e..00000000 --- a/docs/complex/DESIGN_DECISIONS.md +++ /dev/null @@ -1,247 +0,0 @@ -# Complex Number Support - Design Decisions - -This document records key design decisions for the complex number implementation. - -## Decision 1: Type Representation - -### Option A: New VT_BTYPE values -Add `VT_CFLOAT` (15) and `VT_CDOUBLE` (16) as new basic types. - -**Pros:** -- Clean separation of complex types -- Easy type checking with simple bit tests -- Follows pattern of other fundamental types - -**Cons:** -- Requires changing VT_BTYPE mask if we exceed 16 types -- Need to update all switch statements on VT_BTYPE - -### Option B: VT_COMPLEX flag -Add a `VT_COMPLEX` flag bit that combines with `VT_FLOAT`/`VT_DOUBLE`. - -**Pros:** -- No new basic types needed -- Natural composition of properties - -**Cons:** -- More complex type checking logic everywhere -- May conflict with existing flag bits - -### Decision: Option A (New VT_BTYPE values) -**Rationale:** Complex types are distinct fundamental types in C99. The explicit approach is cleaner and less error-prone. - -**CRITICAL REQUIREMENT:** Must expand VT_BTYPE mask from 0x000f to 0x001f (4 bits → 5 bits) to accommodate VT_CDOUBLE = 16. - -**Implementation steps:** -1. Change `#define VT_BTYPE 0x000f` to `0x001f` in `tcc.h` -2. Audit all code that uses VT_BTYPE (estimated ~50-100 locations) -3. Verify no conflicts with other flag bits (VT_UNSIGNED, VT_ARRAY, etc.) -4. 
Run full test suite to catch regressions - -**Alternative if mask expansion too risky:** Fall back to Option B (VT_COMPLEX flag) - ---- - -## Decision 2: IR Representation - -### Option A: Native complex operations -Add `TCCIR_OP_CADD`, `TCCIR_OP_CMUL`, etc. - -**Pros:** -- Backend can optimize complex operations -- Cleaner IR representation - -**Cons:** -- More IR opcodes to implement in backend -- Optimization passes need to understand complex semantics - -### Option B: Lower to scalar operations -Complex `a + b` becomes operations on real and imag parts separately. - -**Pros:** -- Reuses existing IR operations -- No new opcodes needed -- Optimization passes work automatically - -**Cons:** -- Loses semantic information early -- Backend can't optimize as effectively - -### Decision: Option B (Lower to scalar operations) -**Rationale:** Simpler implementation, leverages existing optimizer. Can revisit if complex optimization becomes critical. - ---- - -## Decision 3: Register Allocation - -### Option A: Treat as 64/128-bit value -Use 2 or 4 registers as a single unit. - -**Pros:** -- Natural for moves and copies -- Consistent with struct passing - -**Cons:** -- Register allocator needs to reserve consecutive registers -- Complex to handle spilling - -### Option B: Split into real/imag components -Allocate separate vregs for real and imaginary parts. - -**Pros:** -- Simpler register allocation -- Better register utilization - -**Cons:** -- More vregs created -- Need to track pairing - -### Decision: Option A (Treat as unit) -**Rationale:** Aligns with AAPCS which treats complex as unit. Simpler code generation. - ---- - -## Decision 4: Complex Division Implementation - -### Option A: Inline expansion -Generate full instruction sequence for division. 
- -**Pros:** -- No function call overhead -- Better for optimization - -**Cons:** -- Many instructions (~20+ for software FP) -- Code bloat - -### Option B: Runtime library call -Call `__divsc3` (float) or `__divdc3` (double). - -**Pros:** -- Smaller code -- Library handles edge cases (NaN, Inf) - -**Cons:** -- Function call overhead -- Dependency on libgcc or libtcc1 - -### Decision: Hybrid approach -- **VFP targets:** Inline for float complex, call runtime for double complex -- **Software FP:** Always call runtime - ---- - -## Decision 5: `__real__` and `__imag__` Support - -### Option A: GCC extensions only -Support only when `-std=gnu99` or extensions enabled. - -### Option B: Always support -Treat as always available (like GCC does). - -### Decision: Option B (Always support) -**Rationale:** These operators are essential for complex number programming and widely expected. Newlib's complex.h relies on them. - ---- - -## Decision 6: Complex Constants - -### Option A: Native lexer support -Parse `1.0fi` directly in lexer. - -**Pros:** -- Cleaner -- Better error messages - -**Cons:** -- More lexer changes - -### Option B: Preprocessor macro -Define `__fic(x)` macro that constructs complex. - -**Pros:** -- Simpler implementation - -**Cons:** -- Doesn't match user expectations -- Won't work with newlib's `I` macro - -### Decision: Option A (Native support) -**Rationale:** The `1.0fi` syntax is standard C99. Must support directly. - ---- - -## Decision 7: Complex Comparison Operators - -C99 specifies that complex types only support `==` and `!=` (equality comparison). - -### Decision: Follow C99 strictly -- `==` and `!=` : Compare both real and imaginary parts -- `<`, `>`, `<=`, `>=` : Compile error - -**Note:** May need special handling in parser to give clear error for ordered comparison of complex. 
- ---- - -## Decision 8: VFP vs Software FP Code Paths - -### Decision: Conditional code generation in arm-thumb-gen.c - -```c -if (arch_config->has_fpu) { - /* Generate VFP instructions */ -} else { - /* Call runtime functions or use integer ops */ -} -``` - -The runtime functions (e.g., `__addsf3`, `__mulsf3`) are already provided by libtcc1 or newlib. - ---- - -## Open Questions - -1. **Struct-based vs Native Implementation:** Should we reconsider lowering `_Complex float` to `struct { float __re; float __im; }` early in compilation? This would: - - Reuse all existing struct handling (ABI, codegen, etc.) - - Require minimal type system changes - - Lose some type information for diagnostics - - Need special-case handling for `__real__`/`__imag__` - - **Recommendation:** Prototype both approaches in Phase 0 and measure implementation effort. - -2. **VT_BTYPE mask expansion risk:** Expanding from 0x000f to 0x001f affects core type system. What's the blast radius? - - How many places use VT_BTYPE? - - Do any flags rely on bit 4 being available? - - Performance impact of 5-bit vs 4-bit mask? - -3. **Long double complex:** On ARM, `long double` is same as `double`. Should `long double complex` be: - - Same as `double complex` (same VT_CDOUBLE) - - Distinct type (new VT_CLDOUBLE = VT_CDOUBLE alias) - - **Recommendation:** Same type, simpler implementation. - -4. **Complex integers:** C99 doesn't support `_Complex int`, but GCC has extension. Should we support it? - - **Phase 1:** Reject with clear error - - **Future:** Add if users request - -5. **Complex bit-fields:** GCC rejects these. We should too, but when? Parse time or later? - **Recommendation:** Parse time, clearer error message. - -6. **Type-generic math:** `<tgmath.h>` macros need to dispatch to complex functions. How to handle this without `_Generic`? (May defer until `_Generic` fully working.) - -7. **Implicit conversion to bool:** What should `if (complex_var)` do? 
- - Error (safest) - - True if non-zero (real OR imag != 0) - - True if real != 0 (discard imag) - - **C99 spec:** Allows conversion to bool (6.3.1.2) - non-zero if either part non-zero. - ---- - -## Change Log - -| Date | Decision | Notes | -|------|----------|-------| -| TBD | Type representation | Chose Option A (new VT_BTYPE) | -| TBD | IR representation | Chose Option B (lower to scalar) | -| TBD | Register allocation | Chose Option A (treat as unit) | diff --git a/docs/complex/FIX_PLAN.md b/docs/complex/FIX_PLAN.md deleted file mode 100644 index c66bd1f4..00000000 --- a/docs/complex/FIX_PLAN.md +++ /dev/null @@ -1,271 +0,0 @@ -# Complex Numbers Fix Plan - -**Created:** 2026-02-26 -**Goal:** Fix all complex float arithmetic (add/sub/mul/div) end-to-end - -## Root Cause Analysis - -The complex implementation has correct type system (Phase 1) and IR encoding (Phase 2), -but Phase 3 (code generation) has multiple bugs that cause infinite loops at runtime. - -### Bug 1: Parameters/variables not marked as complex -- **Location:** `tccgen.c:800-834` -- **Problem:** `tcc_ir_vreg_type_set_complex()` is never called for parameter or variable - vregs. The register allocator treats them as single-register floats (LS_REG_TYPE_INT) - instead of register pairs (LS_REG_TYPE_COMPLEX_FLOAT). -- **Evidence:** Debug output shows `reg_type=0` for complex params instead of `reg_type=5`. - -### Bug 2: Incoming register assignment ignores complex -- **Location:** `ir/codegen.c:365` -- **Problem:** `int is_64bit = interval && (interval->is_double || interval->is_llong);` - does NOT check `interval->is_complex`. Complex function params get assigned single - registers (r0, r1) instead of register pairs (r0:r1, r2:r3). -- **Evidence:** IR dump shows `src1: pr0=0 pr1=31` — pr1=31 is PREG_REG_NONE. 
- -### Bug 3: Complex variable initialization doesn't zero imaginary part -- **Location:** `tccgen.c` (gen_cast_s) + `arm-thumb-gen.c` (store handler) -- **Problem:** `_Complex float a = 1.0f;` generates `V0 <-- #1065353216 [ASSIGN]` — - a single scalar assignment. The imaginary part (second 4 bytes) is uninitialized. -- **Expected:** Should store {1.0f, 0.0f} = two 4-byte values. - -### Bug 4: Stack corruption in thumb_process_complex_op -- **Location:** `arm-thumb-gen.c:~4665` -- **Problem:** After `th_pop(pop_mask)`, the code does - `th_add_imm(R_SP, R_SP, 4, ...)` for single-register case. But pop already - adjusts SP, so this corrupts the stack by 4 bytes. - -### Bug 5: Complex mul/div IR generation missing -- **Location:** `ir/core.c:1168` -- **Problem:** `tcc_ir_gen_f()` only handles FADD/FSUB for complex, not FMUL/FDIV. - Mul/div fall through to scalar FP path which treats complex as a single float. - -### Bug 6: Complex mul codegen has clobbering issues -- **Location:** `arm-thumb-gen.c` (thumb_process_complex_mul) -- **Problem:** `gen_softfp_mul_call()` tries to save results in r2-r5, but each - `__aeabi_fmul` call clobbers r0-r3. The function also has a broken pop sequence - that stores r6 to stack[0] then pops r0-r3, expecting r0 to get the real result, - but the imag result was already moved to r1 before the pop. - -### Bug 7: Complex div codegen has register ordering issues -- **Location:** `arm-thumb-gen.c` (thumb_process_complex_div) -- **Problem:** When source registers overlap with r0-r3 (common case), the - sequential mov instructions can clobber values before they're read. - -### Bug 8: Debug fprintf in production code -- **Location:** Multiple files -- **Problem:** Many `fprintf(stderr, "DEBUG ...")` statements in hot paths. 
- ---- - -## TODO List - -- [ ] Fix 1: Mark param/var vregs as complex (`tccgen.c:800-834`) -- [ ] Fix 2: Fix incoming register assignment (`ir/codegen.c:365`) -- [ ] Fix 3: Handle real-to-complex initialization -- [ ] Fix 4: Fix stack corruption in `thumb_process_complex_op` -- [ ] Fix 5: Add FMUL/FDIV to complex IR generation (`ir/core.c`) -- [ ] Fix 6: Rewrite `thumb_process_complex_mul` -- [ ] Fix 7: Fix register ordering in `thumb_process_complex_div` -- [ ] Fix 8: Remove all debug fprintf statements -- [ ] Verify: `make cross` builds -- [ ] Verify: `50_complex_types.c` passes -- [ ] Verify: `51_complex_arith.c` passes (all 4 ops) -- [ ] Verify: `make test -j16` no regressions -- [ ] Update `IMPLEMENTATION_STATUS.md` - ---- - -## Implementation Details - -### Fix 1: Mark param/var vregs as complex - -**File:** `tccgen.c` lines 800-834 - -After the existing `is_float(type->t)` blocks for both params and variables, add: - -```c -/* Mark complex parameters - needs register pairs */ -if (type->t & VT_COMPLEX) - tcc_ir_vreg_type_set_complex(tcc_state->ir, vreg); -``` - -Two locations: -1. Line ~804: After param float marking (inside `if (r & VT_PARAM)`) -2. Line ~828: After variable float marking (inside else branch) - ---- - -### Fix 2: Fix incoming register assignment - -**File:** `ir/codegen.c` line 365 - -Change: -```c -int is_64bit = interval && (interval->is_double || interval->is_llong); -``` -To: -```c -int is_64bit = interval && (interval->is_double || interval->is_llong || interval->is_complex); -``` - -This ensures complex params are assigned register pairs (r0:r1, r2:r3) in -`tcc_ir_set_incoming_arg_registers()`, and that `argno` advances by 2. - ---- - -### Fix 3: Handle real-to-complex initialization - -**File:** `arm-thumb-gen.c` — store handler for complex types - -When storing a scalar value to a complex variable (VT_COMPLEX flag set), the store -handler must: -1. Store the scalar value as the real part (at offset +0) -2. 
Store zero (0x00000000) as the imaginary part (at offset +4 for float) - -This can be detected when the destination is marked complex but the source is a -scalar constant or single-register value. - -Alternatively, in `tccgen.c` `gen_cast_s()` around line 4005: -- Detect `(dbt & VT_COMPLEX) && !(sbt & VT_COMPLEX)` -- Just propagate VT_COMPLEX to vtop so the ASSIGN IR instruction carries the flag -- The codegen store for ASSIGN with complex dest and scalar src generates two stores - ---- - -### Fix 4: Fix stack corruption in thumb_process_complex_op - -**File:** `arm-thumb-gen.c` around line 4665 - -Delete this block: -```c -if (pop_count == 1) - ot_check(th_add_imm(R_SP, R_SP, 4, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); -``` - -`th_pop()` already adjusts SP by `4 * popcount(pop_mask)`. Adding 4 more corrupts -the stack frame. - ---- - -### Fix 5: Add FMUL/FDIV to complex IR generation - -**File:** `ir/core.c` in `tcc_ir_gen_f()` around line 1168 - -Change: -```c -if (is_complex_op && (ir_op == TCCIR_OP_FADD || ir_op == TCCIR_OP_FSUB)) -``` -To: -```c -if (is_complex_op && (ir_op == TCCIR_OP_FADD || ir_op == TCCIR_OP_FSUB || - ir_op == TCCIR_OP_FMUL || ir_op == TCCIR_OP_FDIV)) -``` - -The codegen already has `thumb_process_complex_mul` and `thumb_process_complex_div` -for FMUL/FDIV dispatch in `tcc_gen_machine_fp_op`. This fix ensures the IR -generation path creates the right instruction with complex-typed operands. - ---- - -### Fix 6: Rewrite thumb_process_complex_mul - -**File:** `arm-thumb-gen.c` - -Current implementation has fundamental issues with register clobbering across -soft-float calls. Rewrite strategy: - -``` -(a+bi) * (c+di) = (ac-bd) + i(ad+bc) -``` - -Safe approach using stack for all intermediates: -1. Push all 4 input components (a, b, c, d) to stack -2. Compute ac: load a,c from stack -> call __aeabi_fmul -> push result -3. Compute bd: load b,d from stack -> call __aeabi_fmul -> push result -4. 
Compute ad: load a,d from stack -> call __aeabi_fmul -> push result -5. Compute bc: load b,c from stack -> call __aeabi_fmul -> push result -6. Real = ac - bd: load ac,bd from stack -> call __aeabi_fsub -> push result -7. Imag = ad + bc: load ad,bc from stack -> call __aeabi_fadd -> push result -8. Pop real,imag results -> move to dest registers -9. Clean up stack - -Key fix: Do NOT try to keep intermediate results in r2-r6. Every __aeabi call -clobbers r0-r3, and saving/restoring callee-saved registers (r4-r6) adds -complexity. Use the stack for all intermediates — it's simpler and correct. - -Stack layout for intermediates (growing down from current SP): -``` -[sp+20] = d (imag of src2) -[sp+16] = c (real of src2) -[sp+12] = b (imag of src1) -[sp+ 8] = a (real of src1) -[sp+ 4] = intermediate results (reused) -[sp+ 0] = intermediate results (reused) -``` - ---- - -### Fix 7: Fix register ordering in thumb_process_complex_div - -**File:** `arm-thumb-gen.c` - -The `__divsc3(float a, float b, float c, float d)` call expects: -- r0 = a (real of numerator) -- r1 = b (imag of numerator) -- r2 = c (real of denominator) -- r3 = d (imag of denominator) - -Problem: if src registers ARE r0-r3 (which they typically are since params arrive -in r0:r1 and r2:r3), the sequential mov instructions clobber values: -```c -if (s1_r != R0) mov R0, s1_r; // might clobber s2_r if s2_r == R0 -if (s1_i != R1) mov R1, s1_i; // might clobber s2_i if s2_i == R1 -``` - -Fix: Push all source values to stack first, then pop into r0-r3 in correct order. -Or use careful ordering analysis to determine safe mov sequence. - -Simpler fix: Since complex params typically arrive in r0:r1 and r2:r3, which is -exactly the __divsc3 argument order, check if registers already match and skip -moves. For the general case, save to stack and reload. 
- ---- - -### Fix 8: Remove debug fprintf - -**Files to clean:** -- `arm-thumb-gen.c` — Remove fprintf in `thumb_process_complex_op`, `thumb_process_complex_mul`, `thumb_process_complex_div`, `tcc_gen_machine_fp_op` -- `ir/core.c` — Remove fprintf in `tcc_ir_put` (2 locations) and `tcc_ir_gen_f` -- `ir/live.c` — Remove fprintf in `tcc_ir_live_intervals_compute` -- `ir/pool.c` — Remove fprintf in `tcc_ir_pool_add` -- `ir/vreg.c` — Remove fprintf in `tcc_ir_vreg_type_set_complex` and `tcc_ir_vreg_type_get` -- `tccir_operand.c` — Remove fprintf in `svalue_to_iroperand` -- `tccgen.c` — Remove the large debug block before `tcc_ir_liveness_analysis` (~line 11900) -- `tccls.c` — Remove fprintf in `tcc_ls_add_live_interval` - ---- - -## Verification Plan - -```bash -# 1. Build -make clean && make cross - -# 2. Type system test (should already pass) -cd tests/ir_tests && python run.py -c 50_complex_types.c - -# 3. Arithmetic test (the main fix target) -cd tests/ir_tests && python run.py -c 51_complex_arith.c - -# 4. Full regression suite -make test -j16 -``` - -Expected 51_complex_arith.c output: -``` -add: 4.0 + 0.0i -sub: -2.0 + 0.0i -mul: 3.0 + 0.0i -div: 3.0 + 0.0i -OK: All basic complex arithmetic tests passed! -``` diff --git a/docs/complex/GETTING_STARTED.md b/docs/complex/GETTING_STARTED.md deleted file mode 100644 index 6d348027..00000000 --- a/docs/complex/GETTING_STARTED.md +++ /dev/null @@ -1,255 +0,0 @@ -# Complex Number Support - Getting Started Guide - -This guide helps you get started implementing complex number support in TinyCC. 
- -## Prerequisites - -Before starting, ensure you have: -- Working TinyCC build environment -- ARM cross-compiler (`arm-none-eabi-gcc`) for comparison -- Python 3 with pytest for testing - -```bash -# Verify build works -make clean && make cross -j$(nproc) - -# Verify tests run -make test-venv -make test-prepare -cd tests/ir_tests && python run.py -c 01_hello_world.c -``` - -## IMPORTANT: Read This First - -**⚠️ CRITICAL:** Before starting Phase 1, you MUST complete Phase 0 (Research) to make a fundamental design decision. The current VT_BTYPE mask (0x000f) only supports values 0-15, but we need value 16 for VT_CDOUBLE. - -**Two paths forward:** -1. **Expand VT_BTYPE mask** to 0x001f (requires auditing ~50-100 code locations) -2. **Use struct-based approach** (map complex to struct early, simpler but loses type info) - -See README.md Phase 0 for details. - -## Quick Start: Phase 1 (Type System) - -**Prerequisites:** Phase 0 complete, design decision made. - -### Step 1: Expand VT_BTYPE Mask (if chosen) - -Edit `tcc.h` around line 1000: - -```c -/* BEFORE: */ -#define VT_BTYPE 0x000f /* mask for basic type */ - -/* AFTER: */ -#define VT_BTYPE 0x001f /* mask for basic type (expanded for complex) */ -``` - -**Then run tests:** -```bash -make clean && make cross -j$(nproc) -make test -j16 # Verify no regressions -``` - -### Step 2: Add Type Constants - -Edit `tcc.h` around line 1185: - -```c -#define VT_BOOL 11 /* ISOC99 boolean type */ -/* 12 is available for future use */ -#define VT_QLONG 13 /* 128-bit integer */ -#define VT_QFLOAT 14 /* 128-bit float */ -#define VT_CFLOAT 15 /* float _Complex */ -#define VT_CDOUBLE 16 /* double _Complex (requires VT_BTYPE=0x001f) */ -``` - -### Step 3: Update Parser - -Edit `tccgen.c` function `parse_btype()`. 
Find the `TOK_COMPLEX` case around line 5886: - -**Current:** -```c -case TOK_COMPLEX: - tcc_error("_Complex is not yet supported"); -``` - -**Change to:** -```c -case TOK_COMPLEX: - complex_modifier = 1; /* Track that we saw _Complex */ - next(); - break; -``` - -Then modify the `TOK_FLOAT` and `TOK_DOUBLE` cases to check this flag. - -### Step 4: Add Type Helpers - -Edit `tcctype.h`: - -```c -static inline int tcc_is_complex_type(int t) -{ - int bt = t & VT_BTYPE; - return (bt == VT_CFLOAT || bt == VT_CDOUBLE); -} -``` - -### Step 5: Test - -Create minimal test: - -```c -/* test_complex.c */ -#include <stdio.h> - -int main(void) -{ - _Complex float cf; - _Complex double cd; - - printf("sizeof(cf) = %d\n", (int)sizeof(cf)); - printf("sizeof(cd) = %d\n", (int)sizeof(cd)); - return 0; -} -``` - -Compile: -```bash -./armv8m-tcc -c test_complex.c -o test_complex.o -arm-none-eabi-objdump -h test_complex.o -``` - -**Success:** No compilation error, object file created. - -## Debugging Tips - -### Enable Parser Debug - -```bash -make clean -make CFLAGS+='-DPARSE_DEBUG' cross 2>&1 | head -100 -``` - -### View IR Output - -```bash -./armv8m-tcc -dump-ir -c test_complex.c -``` - -### Compare with GCC - -```bash -# See what GCC generates -arm-none-eabi-gcc -O1 -S -mcpu=cortex-m33 test_complex.c -o test_complex.s -cat test_complex.s -``` - -### Use GDB - -```bash -# Compile with debug info -./armv8m-tcc -g -c test_complex.c -o test_complex.o - -# Debug the compiler itself -gdb ./armv8m-tcc -(gdb) break parse_btype -(gdb) run -c test_complex.c -``` - -## Common Issues - -### Issue: "_Complex is not yet supported" still appears - -**Cause:** Parser not reaching your new code or token not recognized. - -**Debug:** -```c -case TOK_COMPLEX: - fprintf(stderr, "DEBUG: Found TOK_COMPLEX\n"); /* Add this */ - complex_modifier = 1; - next(); - break; -``` - -### Issue: Wrong sizeof results - -**Cause:** Type size function not updated. 
- -**Fix:** Update `tcc_get_basic_type_size()` in `tcctype.h`: - -```c -case VT_CFLOAT: - return 8; -case VT_CDOUBLE: - return 16; -``` - -### Issue: IR shows wrong types - -**Cause:** IROperand encoding not handling complex. - -**Fix:** Add to `tccir_operand.c` functions that map VT_ to IROP_BTYPE_. - -## Testing Your Changes - -### Create Test File - -```bash -cd tests/ir_tests -cat > 50_complex_types.c << 'EOF' -#include <stdio.h> - -int main(void) -{ - _Complex float cf; - _Complex double cd; - - if (sizeof(cf) != 8) { - printf("FAIL: sizeof(float _Complex) = %d, expected 8\n", (int)sizeof(cf)); - return 1; - } - if (sizeof(cd) != 16) { - printf("FAIL: sizeof(double _Complex) = %d, expected 16\n", (int)sizeof(cd)); - return 1; - } - printf("OK\n"); - return 0; -} -EOF - -echo "OK" > 50_complex_types.expect -``` - -### Run Test - -```bash -python run.py -c 50_complex_types.c -``` - -**Expected:** Test compiles and outputs "OK". - -## Next Steps - -After Phase 1 works: - -1. Move to Phase 2: IR support (straightforward type encoding) -2. Phase 3: Code generation (most work, start with load/store) -3. Phase 4-8: Incrementally add features - -See `README.md` for full phase descriptions and `IMPLEMENTATION_CHECKLIST.md` for detailed tasks. - -## Resources - -- C99 Standard: Section 6.2.5 (Types), 7.3 (Complex arithmetic) -- ARM AAPCS: Procedure Call Standard for ARM Architecture -- GCC Complex Docs: https://gcc.gnu.org/onlinedocs/gcc/Complex.html - -## Getting Help - -If stuck: -1. Check existing type implementations (VT_FLOAT, VT_DOUBLE) for patterns -2. Compare with GCC output -3. Add debug prints to understand flow -4. 
Check IR dump to see where things go wrong diff --git a/docs/complex/IMPLEMENTATION_CHECKLIST.md b/docs/complex/IMPLEMENTATION_CHECKLIST.md deleted file mode 100644 index a1864bb3..00000000 --- a/docs/complex/IMPLEMENTATION_CHECKLIST.md +++ /dev/null @@ -1,331 +0,0 @@ -# Complex Number Support - Implementation Checklist - -Use this checklist to track implementation progress. - -## Legend -- [ ] Not started -- [-] In progress -- [x] Complete - ---- - -## Phase 0: Research and Preparation - -### 0.1 ABI Research -- [x] Read ARM AAPCS §4.1.2 (composite types) -- [x] Study GCC complex handling: `gcc -fdump-tree-gimple test.c` -- [x] Study Clang LLVM IR: `clang -S -emit-llvm test.c` -- [x] Document exact register allocation for soft-float and VFP - -### 0.2 VT_BTYPE Decision -- [x] Count all uses: `grep -r "VT_BTYPE" *.c *.h | wc -l` -- [x] Identify code that relies on mask being 0x000f -- [x] **Decision Made:** Use VT_COMPLEX flag (bit 20) instead of expanding mask -- [x] Document decision in DESIGN_DECISIONS.md - -### 0.3 ABI Compatibility Test -- [-] Write GCC-compiled complex function -- [-] Call from TCC and verify result -- [ ] Test reverse direction (TCC → GCC call) -- [ ] Document any ABI incompatibilities - ---- - -## Phase 1: Type System Foundation ✅ MOSTLY COMPLETE - -### 1.1 Type Constants -- [x] Add `VT_COMPLEX` flag to `tcc.h` (bit 20, 0x00100000) -- [x] Verify no conflicts with other flags - -### 1.2 Parser Changes -- [x] Modify `TOK_COMPLEX` handling in `parse_btype()` (`tccgen.c`) -- [x] Handle `float _Complex` -> `VT_FLOAT | VT_COMPLEX` -- [x] Handle `double _Complex` -> `VT_DOUBLE | VT_COMPLEX` -- [x] Handle `_Complex float` (reversed order) -- [x] Handle `_Complex double` (reversed order) -- [x] Handle `__complex__` GCC extension - -### 1.3 Type Helper Functions -- [x] Add `tcc_is_complex_type()` to `tcctype.h` -- [x] Add `tcc_complex_base_type()` to `tcctype.h` -- [x] Add `tcc_is_complex_float()` helper -- [x] Add `tcc_is_complex_double()` helper - 
-### 1.4 Type Size/Alignment -- [x] Update `tcc_get_basic_type_size()` for complex (8 for CFLOAT, 16 for CDOUBLE) -- [x] Verify alignment: 4-byte for CFLOAT, 8-byte for CDOUBLE -- [x] Check struct layout with complex members - -### 1.5 Type Checking Updates -- [x] Find all `switch (bt)` on VT_BTYPE -- [x] Update type checking for VT_COMPLEX flag -- [x] Update `tcc_type_to_string()` for complex type names - -### 1.6 Type Conversion Support -- [x] Update `tcc_convert_type()` for real → complex -- [x] Update `tcc_convert_type()` for complex → real (discard imag) -- [x] Update `tcc_convert_type()` for complex → complex (widen/narrow) -- [x] Update `tcc_convert_type()` for integer → complex -- [x] Implement explicit cast: `(_Complex float)expr` -- [-] Handle complex to bool conversion (C99 6.3.1.2) - -### 1.7 Testing -- [x] Create `tests/ir_tests/50_complex_types.c` -- [x] Create `tests/ir_tests/50_complex_types.expect` -- [x] Test passes: `./run.py -c 50_complex_types.c` - ---- - -## Phase 2: IR Support ✅ COMPLETE - -### 2.1 IR Operand Type Encoding -- [x] Add `is_complex` field to `IROperand` in `tccir_operand.h` -- [x] Update encoding in `svalue_to_iroperand()` -- [x] Update decoding in `iroperand_to_svalue()` - -### 2.2 IR Type Mapping -- [x] Ensure VT_COMPLEX flag maps to `is_complex` in IROperand -- [x] Ensure `is_complex` restores VT_COMPLEX flag - -### 2.3 IR Dump Output -- [x] Verify `-dump-ir` shows correct complex types -- [x] Add type name for complex in IR debug output - -### 2.4 Testing -- [x] Run `./armv8m-tcc -dump-ir -c test.c` and verify output - ---- - -## Phase 3: Code Generation 🚧 PARTIAL - -### 3.1 Complex Value Representation -- [x] Document register pair usage (r0/r1 for CFLOAT) -- [x] Document register quad usage (r0-r3 for CDOUBLE) -- [x] VFP register usage documented (s0/s1 for CFLOAT, d0/d1 for CDOUBLE) - -### 3.2 Load Operations -- [x] Implement CFLOAT load (2 consecutive loads) -- [x] Implement CDOUBLE load (4 consecutive loads or 2 double 
loads) -- [x] Handle stack-based complex values - -### 3.3 Store Operations -- [x] Implement CFLOAT store (2 consecutive stores) -- [x] Implement CDOUBLE store -- [x] Handle stack frame allocation for complex locals - -### 3.4 Move Operations -- [x] Implement CFLOAT register-to-register move -- [x] Implement CDOUBLE register-to-register move - -### 3.5 Addition/Subtraction -- [x] Software FP: CFLOAT add (call `__addsf3` x2) -- [x] Software FP: CDOUBLE add (call `__adddf3` x2) -- [x] `thumb_process_complex_op()` implemented - -### 3.6 Multiplication -- [ ] Software FP: Call `__mulsf3` twice + `__subsf3` + `__addsf3` -- [ ] VFP: Inline VMUL + VSUB + VADD sequence -- [ ] Implement in `thumb_process_complex_op()` or new function - -### 3.7 Division -- [ ] Software FP: Call `__divsc3`/`__divdc3` runtime function -- [ ] VFP: Implement inline or call runtime -- [ ] Handle edge cases (division by zero) - -### 3.8 Negation -- [ ] Software FP: Negate both parts -- [ ] VFP: VNEG.F32/VNEG.F64 both parts - -### 3.9 Register Allocator Updates -- [x] Ensure consecutive register allocation for complex -- [x] Handle spilling of complex values to stack -- [x] Update live range tracking for register pairs - -### 3.10 Testing -- [-] Create `tests/ir_tests/51_complex_arith.c` -- [x] Addition test passes -- [x] Subtraction test passes -- [ ] Multiplication test passes -- [ ] Division test passes - ---- - -## Phase 4: Real/Imaginary Accessors 🚧 PARTIAL - -### 4.1 Keywords -- [x] Add `TOK_REAL` (`__real__`) to `tcctok.h` -- [x] Add `TOK_IMAG` (`__imag__`) to `tcctok.h` - -### 4.2 Parser Support -- [x] Parse `__real__` unary expression -- [x] Parse `__imag__` unary expression -- [x] Generate code to extract real part -- [x] Generate code to extract imaginary part - -### 4.3 L-value Support -- [ ] Allow `__real__ x = value;` (assignment) -- [ ] Allow `__imag__ x = value;` (assignment) -- [ ] Support address-of: `&__real__ x` - -### 4.4 Testing -- [ ] Create 
`tests/ir_tests/53_complex_accessors.c` -- [ ] Read tests pass -- [ ] Write tests pass -- [ ] Address-of tests pass - ---- - -## Phase 5: Complex Constants ❌ NOT STARTED - -### 5.1 Lexer Changes -- [ ] Parse `i` suffix on float constants -- [ ] Parse `if` suffix (imaginary float) -- [ ] Parse `i` after regular float (e.g., `1.0i`) -- [ ] Handle `fi` suffix for float imaginary - -### 5.2 Constant Creation -- [ ] Create zero real + imaginary value representation -- [ ] Store in data section -- [ ] Handle static initialization - -### 5.3 _Complex_I Constant -- [ ] Ensure `_Complex_I` expands to `1.0fi` or similar -- [ ] Update `include/complex.h` if needed - -### 5.4 Testing -- [ ] Create `tests/ir_tests/54_complex_init.c` -- [ ] Constant initialization tests pass -- [ ] Static initialization tests pass -- [ ] CMPLX macro works - ---- - -## Phase 6: Complex Library Support ✅ COMPLETE - -### 6.1 Header File -- [x] Create `include/complex.h` -- [x] Define `complex` macro to `_Complex` -- [x] Define `_Complex_I` (placeholder until constants work) -- [x] Define `I` -- [x] Add CMPLX/CMPLXF/CMPLXL macros - -### 6.2 Basic Functions -- [x] `creal/crealf/creall` (inline implementations) -- [x] `cimag/cimagf/cimagl` (inline implementations) -- [x] `conj/conjf/conjl` (link to newlib) -- [x] `cabs/cabsf/cabsl` (link to newlib) - -### 6.3 Math Functions -- [x] All math functions link to newlib - -### 6.4 Testing -- [ ] Create `tests/ir_tests/57_complex_math.c` -- [ ] Basic function tests pass -- [ ] Math function tests pass - ---- - -## Phase 7: Calling Conventions 🚧 PARTIAL - -### 7.1 Parameter Passing -- [x] CFLOAT in r0/r1 (soft float) or s0/s1 (VFP) -- [x] CDOUBLE in r0-r3 (soft float) or d0/d1 (VFP) -- [ ] Stack parameter passing for overflow (verify) - -### 7.2 Return Values -- [x] CFLOAT return in r0/r1 or s0/s1 -- [x] CDOUBLE return in r0-r3 or d0/d1 - -### 7.3 Function Prologue/Epilogue -- [x] Correct stack frame for complex locals -- [x] Save/restore complex callee-saved 
registers - -### 7.4 Varargs (Optional) -- [ ] Decide if complex in varargs supported -- [ ] Document limitation if not supported - -### 7.5 Testing -- [ ] Create `tests/ir_tests/52_complex_calls.c` -- [ ] Pass by value tests pass -- [ ] Return value tests pass -- [ ] Nested call tests pass - ---- - -## Phase 8: Debug Information ❌ NOT STARTED - -### 8.1 DWARF Types -- [ ] Add DWARF type entry for CFLOAT -- [ ] Add DWARF type entry for CDOUBLE -- [ ] Use DW_ATE_complex_float - -### 8.2 Debug Output -- [ ] Verify `tccdbg.c` handles VT_COMPLEX -- [ ] Verify correct debug info generation - -### 8.3 Testing -- [ ] Compile with `-g` -- [ ] Verify GDB can inspect complex variables -- [ ] Verify correct values shown in debugger - ---- - -## Phase 9: Testing & Quality 🚧 IN PROGRESS - -### 9.1 Unit Tests -- [x] 50_complex_types.c passes -- [-] 51_complex_arith.c (add/sub only) -- [ ] 52_complex_calls.c -- [ ] 53_complex_accessors.c -- [ ] 54_complex_init.c -- [ ] 55_complex_compare.c -- [ ] 56_complex_edge.c -- [ ] 57_complex_math.c - -### 9.2 Negative Tests -- [ ] Complex bit-field produces error -- [ ] Ordered comparison produces error -- [ ] Clear error messages - -### 9.3 GCC Testsuite -- [ ] Identify relevant GCC tests -- [ ] Run GCC complex tests -- [ ] Document pass/fail status - -### 9.4 Regression Testing -- [-] Run full test suite: `make test -j16` -- [x] No regressions in existing tests (verified for Phases 1-2) - -### 9.5 Code Review -- [ ] Review all changes -- [ ] Check for code style compliance -- [ ] Verify comments added - ---- - -## Quick Reference: Current Status - -| Phase | Status | % Complete | -|-------|--------|------------| -| 0: Research | ✅ Done | 100% | -| 1: Type System | ✅ Done | 95% | -| 2: IR Support | ✅ Done | 100% | -| 3: Code Gen | 🚧 Partial | 50% | -| 4: Accessors | 🚧 Partial | 60% | -| 5: Constants | ❌ Not Started | 0% | -| 6: Library | ✅ Done | 90% | -| 7: Calling Conv | 🚧 Partial | 70% | -| 8: Debug Info | ❌ Not Started | 0% | -| 9: 
Testing | 🚧 In Progress | 30% | - -**Overall Completion: ~60%** - ---- - -## Next Actions (Recommended Priority) - -1. **Implement Complex Multiplication** (Phase 3) - High Impact -2. **Implement Complex Division** (Phase 3) - High Impact -3. **Add Imaginary Constant Support** (Phase 5) - High Impact -4. **Create Missing Test Files** (Phase 9) - Medium Impact -5. **Complete __real__/__imag__ L-values** (Phase 4) - Medium Impact diff --git a/docs/complex/IMPLEMENTATION_STATUS.md b/docs/complex/IMPLEMENTATION_STATUS.md deleted file mode 100644 index 6fabce7f..00000000 --- a/docs/complex/IMPLEMENTATION_STATUS.md +++ /dev/null @@ -1,272 +0,0 @@ -# Complex Number Support - Implementation Status - -**Last Updated:** 2026-02-26 - -## Summary - -Complex number support in TinyCC for ARMv8-M is **partially implemented**. Phase 1 (Type System) and Phase 2 (IR Support) are functionally complete. Phase 3 (Code Generation) has basic arithmetic working but needs completion for full compliance. - -**Recent Changes:** Implemented fixes from FIX_PLAN.md - corrected register allocation for complex parameters and IR generation for FMUL/FDIV. - -## Implementation Progress by Phase - -### Phase 1: Type System Foundation ✅ COMPLETE - -| Component | Status | Notes | -|-----------|--------|-------| -| VT_COMPLEX flag | ✅ Done | Implemented as bit 20 flag (0x00100000) | -| Parser (`TOK_COMPLEX`) | ✅ Done | `parse_btype()` handles `_Complex` keyword | -| Type helpers | ✅ Done | `tcc_is_complex_type()` etc. 
in `tcctype.h` | -| Size/alignment | ✅ Done | 8 bytes for CFLOAT, 16 for CDOUBLE | -| Type conversions | ✅ Done | Real↔Complex, widening, casting | -| `__real__`/`__imag__` | ✅ Partial | Parser recognizes, basic implementation | - -**Files Modified:** -- `tcc.h` - Added `VT_COMPLEX` flag -- `tcctok.h` - Added `TOK_REAL`, `TOK_IMAG` -- `tcctype.h` - Added complex type helper functions -- `tccgen.c` - Parser changes for complex types - -**Test Status:** `tests/ir_tests/50_complex_types.c` ✅ PASSES - ---- - -### Phase 2: IR Support ✅ COMPLETE - -| Component | Status | Notes | -|-----------|--------|-------| -| IROperand complex flag | ✅ Done | `is_complex` field added | -| Type encoding | ✅ Done | `svalue_to_iroperand()` handles complex | -| Type decoding | ✅ Done | `iroperand_to_svalue()` restores complex flag | -| IR dump output | ✅ Done | Shows complex types correctly | - -**Files Modified:** -- `tccir_operand.h` - Added `is_complex` field to `IROperand` -- `tccir_operand.c` - Encoding/decoding logic for complex types - -**Test Status:** `./armv8m-tcc -dump-ir` shows correct complex types ✅ - ---- - -### Phase 3: Code Generation 🚧 PARTIAL (Fixes Applied) - -| Component | Status | Notes | -|-----------|--------|-------| -| Value representation | ✅ Done | Register pairs for complex values | -| Load/store | ✅ Done | Consecutive memory operations | -| Addition/Subtraction | ✅ Done | `thumb_process_complex_op()` implemented | -| Multiplication | 🚧 Fixed | Rewritten with stack-based approach | -| Division | 🚧 Fixed | Uses `__divsc3` runtime call | -| Register allocator | ✅ Done | Handles register pairs | - -**Fixes Applied (from FIX_PLAN.md):** - -1. ✅ **Fix 1:** Mark param/var vregs as complex (`tccgen.c:805-807, 832-834`) -2. ✅ **Fix 2:** Fix incoming register assignment (`ir/codegen.c:365`) - added `is_complex` check -3. ⏭️ **Fix 3:** Handle real-to-complex initialization - NOT YET DONE -4. 
✅ **Fix 4:** Fix stack corruption in `thumb_process_complex_op` - removed extra SP adjustment -5. ✅ **Fix 5:** Add FMUL/FDIV to complex IR generation (`ir/core.c:1168`) -6. ✅ **Fix 6:** Rewrite `thumb_process_complex_mul` with stack-based approach -7. ✅ **Fix 7:** Fix register ordering in `thumb_process_complex_div` -8. ⏭️ **Fix 8:** Remove debug fprintf statements - NOT YET DONE - -**Files Modified:** -- `arm-thumb-gen.c` - Complex operation handling -- `ir/codegen.c` - Register assignment for complex params -- `ir/core.c` - FMUL/FDIV IR generation - -**Known Issues:** -- Complex multiplication/division still cause HardFault at runtime - needs further debugging -- Debug output still enabled (`DEBUG` macros active) - ---- - -### Phase 4: Real/Imaginary Accessors 🚧 PARTIAL - -| Component | Status | Notes | -|-----------|--------|-------| -| Keywords | ✅ Done | `TOK_REAL`, `TOK_IMAG` in `tcctok.h` | -| Parser | ✅ Done | Unary expression parsing | -| Code generation | ✅ Basic | Extraction works | -| L-value support | ❌ TODO | Assignment to `__real__ x` not complete | -| Address-of | ❌ TODO | `&__real__ x` not complete | - -**Files Modified:** -- `tcctok.h` - Token definitions -- `tccgen.c` - Parser support (lines 7097-7120) - ---- - -### Phase 5: Complex Constants ❌ NOT STARTED - -| Component | Status | Notes | -|-----------|--------|-------| -| Imaginary suffix | ❌ TODO | `1.0fi`, `2.0i` parsing | -| Constant creation | ❌ TODO | Data section storage | -| `_Complex_I` | ❌ TODO | Macro definition | - -**Blocker:** Lexer changes needed in `tccpp.c` for imaginary suffix parsing. 
- ---- - -### Phase 6: Complex Library Support 🚧 PARTIAL - -| Component | Status | Notes | -|-----------|--------|-------| -| `complex.h` header | ✅ Done | `include/complex.h` created | -| `complex` macro | ✅ Done | Maps to `_Complex` | -| `I` macro | ⚠️ Partial | Defined but `1.0fi` not working yet | -| `CMPLX` macros | ✅ Done | Compound literal versions | -| `creal/cimag` | ✅ Done | Inline implementations | -| Math functions | ✅ Deferred | Using newlib's implementations | - -**Files Created:** -- `include/complex.h` - C99 complex header (complete) - ---- - -### Phase 7: Calling Conventions 🚧 PARTIAL - -| Component | Status | Notes | -|-----------|--------|-------| -| Parameter passing | ✅ Basic | Works for simple cases | -| Return values | ✅ Basic | Works for simple cases | -| AAPCS compliance | ⚠️ Review needed | Verify against spec | -| Stack overflow | ❌ TODO | Complex on stack | -| Varargs | ❌ Deferred | Low priority | - -**Files Modified:** -- `arm-thumb-gen.c` - Call site handling -- `arm-thumb-callsite.c` - Argument passing - ---- - -### Phase 8: Debug Information ❌ NOT STARTED - -| Component | Status | Notes | -|-----------|--------|-------| -| DWARF types | ❌ TODO | Add complex float/double entries | -| GDB testing | ❌ TODO | Verify variable inspection | - -**Files to Modify:** -- `tccdbg.c` - Debug info generation - ---- - -### Phase 9: Testing 🚧 IN PROGRESS - -| Test | Status | -|------|--------| -| `50_complex_types.c` | ✅ PASS | -| `51_complex_arith.c` | 🚧 Partial (add/sub only, mul/div need debugging) | -| `52_complex_calls.c` | ❌ Not created | -| `53_complex_accessors.c` | ❌ Not created | -| `54_complex_init.c` | ❌ Not created | -| `55_complex_compare.c` | ❌ Not created | -| `56_complex_edge.c` | ❌ Not created | -| `57_complex_math.c` | ❌ Not created | - ---- - -## What Works Now - -### ✅ Type Declarations -```c -_Complex float cf; -_Complex double cd; -float _Complex cf2; /* Alternate syntax */ -``` - -### ✅ sizeof -```c -sizeof(_Complex float) /* 
Returns 8 */ -sizeof(_Complex double) /* Returns 16 */ -``` - -### ✅ Basic Arithmetic (Add/Subtract) -```c -_Complex float a = ...; -_Complex float b = ...; -_Complex float c = a + b; /* Works */ -_Complex float d = a - b; /* Works */ -``` - -### ✅ Type Conversions -```c -float f = 3.0f; -_Complex float cf = f; /* Real -> Complex */ -float g = cf; /* Complex -> Real (discards imag) */ -``` - -### ✅ complex.h Header -```c -#include <complex.h> -complex double z; /* 'complex' macro works */ -``` - ---- - -## What's Missing / Not Working - -### ❌ Complex Multiplication and Division (Partially Fixed) -```c -_Complex float c = a * b; /* Code generation rewritten but still HardFaults */ -_Complex float d = a / b; /* Code generation rewritten but still HardFaults */ -``` - -**Status:** Applied fixes from FIX_PLAN.md, but runtime issues remain. - -### ❌ Imaginary Constants -```c -_Complex float c = 1.0f + 2.0fi; /* ERROR: 'fi' suffix not recognized */ -``` - -### ❌ Full __real__/__imag__ L-value Support -```c -__real__ c = 5.0f; /* May not work */ -&__real__ c; /* May not work */ -``` - ---- - -## Next Steps (Priority Order) - -### High Priority -1. **Debug Complex Multiplication/Division** - The stack-based implementations are in place but still causing HardFaults. Need to debug the generated assembly. -2. **Remove Debug Output** - Clean up all DEBUG fprintf statements - -### Medium Priority -3. **Imaginary Constant Support** - Add `fi`/`i` suffix parsing in `tccpp.c` -4. **Complete __real__/__imag__ L-value Support** -5. **Create Missing Test Files** - Tests 52-57 - -### Low Priority -6. **Debug Information** (Phase 8) -7. **Varargs Support** (Phase 7) -8. 
**Complex Integer Types** (GCC extension) - ---- - -## Testing Commands - -```bash -# Type system test -cd tests/ir_tests -python run.py -c 50_complex_types.c - -# Check IR output -./armv8m-tcc -dump-ir -c test.c - -# Compile complex test -./armv8m-tcc -c test_complex.c -o test_complex.o -``` - ---- - -## References - -- Original Plan: `README.md` -- Design Decisions: `DESIGN_DECISIONS.md` -- Test Plan: `TEST_PLAN.md` -- Getting Started: `GETTING_STARTED.md` -- Fix Plan: `FIX_PLAN.md` diff --git a/docs/complex/IMPROVEMENTS.md b/docs/complex/IMPROVEMENTS.md deleted file mode 100644 index efd778fc..00000000 --- a/docs/complex/IMPROVEMENTS.md +++ /dev/null @@ -1,231 +0,0 @@ -# Complex Number Implementation Plan - Improvements Made - -This document summarizes improvements made to the original implementation plan. - -## Critical Issues Fixed - -### 1. **VT_BTYPE Mask Overflow (BLOCKER)** - -**Problem:** Original plan proposed `VT_CDOUBLE = 16`, but `VT_BTYPE` mask is `0x000f` (max value 15). - -**Solution:** Added clear decision point with two options: -- **Option A (Recommended):** Expand VT_BTYPE from 0x000f to 0x001f (5 bits) - - Requires auditing ~50-100 code locations - - More future-proof (supports up to 31 types) - -- **Option B (Fallback):** Use VT_COMPLEX flag bit - - More complex type checking throughout codebase - - Fallback if mask expansion too risky - -**Files Updated:** -- `README.md` §1.1 - Added critical decision point -- `DESIGN_DECISIONS.md` Decision 1 - Added implementation steps for mask expansion -- `GETTING_STARTED.md` - Added prominent warning before Step 1 -- `IMPLEMENTATION_CHECKLIST.md` - Added Phase 0.2 for VT_BTYPE audit - ---- - -## Major Additions - -### 2. **Phase 0: Research and Preparation** - -**Why Added:** Original plan jumped directly to implementation without validating approach. 
- -**New Phase 0 includes:** -- ABI research (ARM AAPCS §4.1.2) -- Study GCC/Clang implementations -- VT_BTYPE mask audit -- Prototype struct-based approach -- ABI compatibility testing -- **Decision point before committing to implementation strategy** - -**Files Updated:** -- `README.md` - Added complete Phase 0 section -- `IMPLEMENTATION_CHECKLIST.md` - Added Phase 0 tasks -- `GETTING_STARTED.md` - Added warning to complete Phase 0 first - -### 3. **Type Conversion Rules** - -**Problem:** Original plan didn't specify how type conversions work. - -**Added:** -- Real ↔ Complex conversions (C99 6.3.1.7) -- Complex ↔ Complex (widening/narrowing) -- Integer → Complex -- Explicit casts -- Complex → Bool (C99 6.3.1.2) - -**Files Updated:** -- `README.md` §1.5 - New subsection on type conversion -- `IMPLEMENTATION_CHECKLIST.md` §1.6 - Conversion implementation tasks -- `TEST_PLAN.md` - New "Type Conversion Tests" section - -### 4. **ABI Calling Convention Details** - -**Problem:** Calling convention was Phase 7 but affects design from start. - -**Added:** -- Moved AAPCS details earlier (Phase 3.0) -- Documented exact register usage for soft-float and VFP -- Clarified atomic treatment of complex values -- Stack overflow handling - -**Files Updated:** -- `README.md` §3.0 - New subsection before code generation - ---- - -## Test Coverage Improvements - -### 5. **Critical ABI Compatibility Tests** - -**Added:** -- GCC-compiled function called from TCC -- TCC-compiled function called from GCC -- Stack parameter passing tests - -**Files Updated:** -- `TEST_PLAN.md` - New "ABI Compatibility Tests" section (critical) - -### 6. **Union and Aliasing Tests** - -**Added:** -- Complex in unions -- Pointer aliasing tests -- Layout compatibility tests - -**Files Updated:** -- `TEST_PLAN.md` - New "Union and Aliasing Tests" section - -### 7. 
**Type Conversion Tests** - -**Added:** -- Real → Complex -- Complex → Real -- Widening/narrowing -- Integer conversions -- Cast operations - -**Files Updated:** -- `TEST_PLAN.md` - New "Type Conversion Tests" section - ---- - -## Design Decision Enhancements - -### 8. **Expanded Open Questions** - -**Added:** -- Question about struct-based vs native implementation -- VT_BTYPE mask expansion risk assessment -- Complex to bool conversion behavior - -**Files Updated:** -- `DESIGN_DECISIONS.md` - Expanded from 4 to 7 questions with recommendations - ---- - -## Documentation Structure Improvements - -### 9. **Clear Decision Points** - -**Before:** Plan assumed one implementation path. - -**After:** Multiple decision points with clear criteria: -1. Phase 0: Choose implementation strategy -2. Phase 1: VT_BTYPE mask size decision -3. Phase 3: Inline vs runtime for complex operations - -### 10. **Risk Callouts** - -Added prominent warnings for: -- VT_BTYPE overflow risk -- ABI compatibility requirements -- Phase 0 prerequisite - ---- - -## Summary of File Changes - -| File | Lines Added | Key Improvements | -|------|-------------|------------------| -| `README.md` | ~80 | Phase 0, VT_BTYPE fix, type conversion, AAPCS details | -| `DESIGN_DECISIONS.md` | ~40 | Mask expansion steps, expanded open questions | -| `TEST_PLAN.md` | ~100 | ABI tests, conversion tests, union tests | -| `IMPLEMENTATION_CHECKLIST.md` | ~30 | Phase 0 tasks, conversion tasks | -| `GETTING_STARTED.md` | ~20 | Critical warning, mask expansion step | -| `IMPROVEMENTS.md` | New | This document | - -**Total:** ~270 lines added/modified - ---- - -## Remaining Risks - -### High Priority -1. **VT_BTYPE mask expansion** - Could break existing code if flags conflict -2. **ABI compatibility** - Must match GCC exactly or interop fails -3. **Register allocator** - Handling register pairs may be complex - -### Medium Priority -4. **Complex division** - Mathematically complex, many edge cases -5. 
**Debug info** - DWARF generation may need updates -6. **Performance** - Inline vs runtime tradeoffs - -### Low Priority -7. **Type-generic math** - Deferred to post-MVP -8. **Complex integers** - GCC extension, low priority - ---- - -## Recommended Next Steps - -1. **Complete Phase 0** (estimated 1-2 days) - - Read ARM AAPCS carefully - - Count VT_BTYPE uses: `grep -rn "VT_BTYPE" *.c *.h | wc -l` - - Prototype struct-based approach - - Make implementation decision - -2. **If choosing mask expansion:** - - Create feature branch - - Expand VT_BTYPE to 0x001f - - Run full test suite - - Fix regressions before proceeding - -3. **If choosing struct-based:** - - Define internal complex struct type - - Map _Complex to struct in parser - - Implement __real__/__imag__ as special accessors - -4. **Implement incrementally:** - - Start with Phase 1 (types only) - - Test thoroughly before Phase 2 - - Get each phase working before next - -5. **Test ABI compatibility early:** - - Don't wait until Phase 7 - - Test calling convention after basic codegen works - ---- - -## Questions for Reviewer - -1. **VT_BTYPE expansion:** Is expanding the mask acceptable? Any known conflicts? -2. **Struct-based approach:** Should we seriously consider this as primary path? -3. **Implementation effort:** With improvements, estimate now ~3-4 weeks vs original 2-3 weeks. Acceptable? -4. **Test coverage:** Are ABI compatibility tests sufficient? -5. **Deferred features:** Agree on deferring complex integers and _Generic to post-MVP? - ---- - -## Conclusion - -The improved plan is more robust with: -- ✅ Critical VT_BTYPE issue addressed -- ✅ Phase 0 research prevents costly rework -- ✅ Type conversion rules specified -- ✅ ABI compatibility prioritized -- ✅ Test coverage expanded -- ✅ Clear decision points identified - -**Status:** Plan ready for Phase 0 implementation. 
diff --git a/docs/complex/README.md b/docs/complex/README.md deleted file mode 100644 index cef56a39..00000000 --- a/docs/complex/README.md +++ /dev/null @@ -1,556 +0,0 @@ -# Complex Number Support Implementation Plan - -This document outlines the plan for adding C99 complex number support (`_Complex`, `__complex__`, `complex.h`) to TinyCC for ARMv8-M. - -## Overview - -Complex numbers in C99 are defined as: -- `float _Complex` - 8 bytes (2 x float) -- `double _Complex` - 16 bytes (2 x double) -- `long double _Complex` - 16 bytes (2 x double, same as double _Complex on ARM) - -### Current Status (Updated: 2026-02-26) - -**Implementation is ~60% complete.** Phases 1-2 are done, Phase 3 is partially complete. - -| Phase | Status | Description | -|-------|--------|-------------| -| 1: Type System | ✅ **COMPLETE** | Type parsing, sizeof, conversions work | -| 2: IR Support | ✅ **COMPLETE** | Complex types flow through IR correctly | -| 3: Code Gen | 🚧 **PARTIAL** | Add/sub work, **mul/div missing** | -| 4: Accessors | 🚧 **PARTIAL** | `__real__`/`__imag__` parse, L-values pending | -| 5: Constants | ❌ **NOT STARTED** | `1.0fi` imaginary suffix not implemented | -| 6: Library | ✅ **COMPLETE** | `complex.h` header ready | -| 7: ABI/Calling | 🚧 **PARTIAL** | Basic calls work, edge cases pending | - -**What Works:** -```c -_Complex float cf; // ✅ Declaration -sizeof(_Complex float); // ✅ Returns 8 -_Complex float c = a + b; // ✅ Addition -_Complex float d = a - b; // ✅ Subtraction -``` - -**What's Missing:** -```c -_Complex float c = a * b; // ❌ Multiplication not implemented -_Complex float d = a / b; // ❌ Division not implemented -_Complex float c = 1.0f + 2.0fi; // ❌ Imaginary constants not implemented -``` - -**See also:** -- [Implementation Status](IMPLEMENTATION_STATUS.md) - Detailed status -- [Implementation Checklist](IMPLEMENTATION_CHECKLIST.md) - Task-by-task tracking - ---- - -## Phase 0: Research and Preparation (RECOMMENDED) - -**Goal:** Validate approach 
before major implementation. - -### 0.1 Study Existing Implementations -- Examine GCC's complex handling: `gcc -fdump-tree-all test.c` -- Check Clang IR: `clang -S -emit-llvm test.c` -- Review ARM AAPCS §4.1.2 (composite types) - -### 0.2 Verify ABI Compatibility -**Critical test:** Ensure TCC can call GCC-compiled complex functions. - -```bash -# Compile with GCC -arm-none-eabi-gcc -c complex_func.c -o gcc_complex.o - -# Call from TCC -./armv8m-tcc -c test_caller.c -o tcc_caller.o -arm-none-eabi-gcc tcc_caller.o gcc_complex.o -o test -``` - -### 0.3 Prototype struct-based approach -Test if lowering to struct early is viable: -```c -/* Quick prototype: map _Complex float to struct */ -typedef struct { float __re; float __im; } __tcc_cfloat; -``` -Compare code generation quality vs native approach. - -### 0.4 Check TCC Type System Limits -```bash -# Find all VT_BTYPE users -grep -r "VT_BTYPE" *.c *.h | wc -l -# Estimate refactoring effort for mask expansion -``` - -**Deliverable:** Decision document: struct-based vs native complex types. - ---- - -## Phase 1: Type System Foundation ✅ COMPLETE - -**Goal:** Enable parsing and representation of complex types. - -**Status:** All tasks completed. Type declarations, sizeof, and conversions work. - -### 1.1 Add Complex Type Flag -**Files:** `tcc.h` ✅ - -**Decision Made:** Use `VT_COMPLEX` flag (bit 20) instead of expanding VT_BTYPE mask. - -```c -/* Implementation: */ -#define VT_COMPLEX 0x00100000 /* Complex type flag (bit 20) */ -/* VT_FLOAT | VT_COMPLEX = float _Complex */ -/* VT_DOUBLE | VT_COMPLEX = double _Complex */ -``` - -**Rationale:** Avoids modifying core type mask, cleaner integration with existing code. 
- -**Test:** `tests/ir_tests/50_complex_types.c` passes ✅ - -### 1.2 Update Parser Type Handling -**Files:** `tccgen.c` (parse_btype) - -Replace the error with proper type handling: -```c -case TOK_COMPLEX: - /* Mark that we saw _Complex, apply when float/double is seen */ - complex_flag = 1; - next(); - break; -``` - -Then when `TOK_FLOAT` or `TOK_DOUBLE` is parsed, combine with complex flag: -```c -case TOK_FLOAT: - if (complex_flag) - u = VT_CFLOAT; - else - u = VT_FLOAT; - goto basic_type; -``` - -### 1.3 Add Type Helper Functions -**Files:** `tcctype.h` - -Add type checking utilities: -```c -static inline int tcc_is_complex_type(int t) -{ - int bt = t & VT_BTYPE; - return (bt == VT_CFLOAT || bt == VT_CDOUBLE); -} - -static inline int tcc_complex_base_type(int t) -{ - int bt = t & VT_BTYPE; - if (bt == VT_CFLOAT) return VT_FLOAT; - if (bt == VT_CDOUBLE) return VT_DOUBLE; - return bt; -} -``` - -### 1.4 Update Type Size/Alignment Functions -**Files:** `tcctype.h`, `tccgen.c` - -Update `tcc_get_basic_type_size()` and type alignment calculations: -```c -case VT_CFLOAT: - return 8; /* 2 floats */ -case VT_CDOUBLE: - return 16; /* 2 doubles */ -``` - -### 1.5 Type Conversion Rules -**Files:** `tccgen.c` (type conversion functions) - -Implement C99 conversion rules: -```c -/* Real to complex: real part = value, imag = 0 */ -float f = 1.0f; -_Complex float cf = f; /* cf = 1.0 + 0i */ - -/* Complex to real: discard imaginary part (C99 6.3.1.7) */ -_Complex float cf = 3.0f + 4.0fi; -float f = cf; /* f = 3.0 (implicit conversion) */ - -/* Complex to complex: convert components */ -_Complex float cf = 1.0f + 2.0fi; -_Complex double cd = cf; /* widen both parts */ - -/* Integer to complex */ -int x = 5; -_Complex float cf = x; /* cf = 5.0 + 0i */ -``` - -**Implementation:** -- Update `tcc_convert_type()` in `tccgen.c` -- Handle implicit conversions in assignments -- Handle explicit casts: `(_Complex float)expr` - -### 1.6 Testing (Phase 1) -Create test file 
`tests/ir_tests/50_complex_types.c`: -```c -#include <stdio.h> - -int main(void) -{ - _Complex float cf; - _Complex double cd; - - /* Check sizes */ - if (sizeof(cf) != 8) return 1; - if (sizeof(cd) != 16) return 1; - - printf("OK\n"); - return 0; -} -``` - -**Deliverable:** Parser accepts complex type declarations, sizeof works correctly. - ---- - -## Phase 2: IR Support for Complex Types ✅ COMPLETE - -**Goal:** Extend IR to represent complex values and operations. - -**Status:** Complete. Complex types flow through IR with `is_complex` flag. - -### 2.1 IROperand Complex Flag -**Files:** `tccir_operand.h`, `tccir_operand.c` ✅ - -Added `is_complex` field to `IROperand` struct: -```c -typedef struct IROperand { - /* ... existing fields ... */ - int is_complex; /* Set for complex float/double types */ -} IROperand; -``` - -Functions updated: -- `svalue_to_iroperand()` - Sets `is_complex` from `VT_COMPLEX` flag -- `iroperand_to_svalue()` - Restores `VT_COMPLEX` flag - -### 2.2 IR Operations Strategy -**Decision:** Lower complex operations to existing float ops in front-end. -- Complex add → Two float adds (real + real, imag + imag) -- Complex sub → Two float subtracts -- Complex mul/div → Component-wise operations (see Phase 3) - -### 2.3 Testing (Phase 2) -Test IR dump shows correct complex types: `./armv8m-tcc -dump-ir -c test.c` - -**Deliverable:** Complex types flow through IR with correct type information ✅ - ---- - -## Phase 3: Code Generation 🚧 PARTIAL - -**Goal:** Generate ARM Thumb-2 code for complex operations. - -**Status:** Add/Subtract implemented. 
**Multiplication and Division TODO.** - -### 3.0 ARM AAPCS Calling Convention - -**Software FP (no VFP):** -- `float _Complex`: Passed in r0 (real), r1 (imag); returned in r0, r1 -- `double _Complex`: Passed in r0-r1 (real lo/hi), r2-r3 (imag lo/hi); returned same - -**Hardware FP (VFP):** -- `float _Complex`: Passed in s0 (real), s1 (imag); returned in s0, s1 -- `double _Complex`: Passed in d0 (real), d1 (imag); returned in d0, d1 - -### 3.1 Complex Number Representation ✅ -Complex values use register pairs: -- `float _Complex`: rN (real), rN+1 (imag) or sN/sN+1 with VFP -- `double _Complex`: rN/rN+1 (real), rN+2/rN+3 (imag) or dN/dN+1 with VFP - -### 3.2 Complex Load/Store ✅ -**Files:** `arm-thumb-gen.c` - -Load/store implemented via consecutive memory operations. - -### 3.3 Complex Arithmetic Operations - -#### Addition/Subtraction ✅ -**Implementation:** `thumb_process_complex_op()` in `arm-thumb-gen.c` - -Component-wise operations: -- Software FP: Calls `__addsf3`/`__subsf3` twice -- VFP: Inline VADD.F32/VSUB.F32 - -```c -/* float _Complex add: (a+ib) + (c+id) = (a+c) + i(b+d) */ -VADD.F32 s0, s0, s2 /* real: a + c */ -VADD.F32 s1, s1, s3 /* imag: b + d */ -``` - -#### Multiplication ❌ TODO -**Formula:** `(a+ib) * (c+id) = (ac-bd) + i(ad+bc)` - -**Implementation needed:** -```c -/* Software FP: Call runtime functions */ -ac = __mulsf3(a, c); -bd = __mulsf3(b, d); -ad = __mulsf3(a, d); -bc = __mulsf3(b, c); -real = __subsf3(ac, bd); -imag = __addsf3(ad, bc); - -/* VFP: Inline sequence */ -VMUL.F32 s4, s0, s2 /* ac */ -VMUL.F32 s5, s1, s3 /* bd */ -VMUL.F32 s6, s0, s3 /* ad */ -VMUL.F32 s7, s1, s2 /* bc */ -VSUB.F32 s0, s4, s5 /* ac-bd (real) */ -VADD.F32 s1, s6, s7 /* ad+bc (imag) */ -``` - -#### Division ❌ TODO -**Formula:** `(ac+bd)/(c²+d²) + i(bc-ad)/(c²+d²)` - -**Options:** -1. Inline expansion (many instructions) -2. Call runtime: `__divsc3` (float) / `__divdc3` (double) - -**Recommendation:** Use runtime calls for software FP, inline for VFP. 
- -### 3.4 Register Allocator ✅ -**Files:** `tccls.c` - -Register allocator handles complex values as pairs with consecutive registers. - -### 3.5 Testing -- `tests/ir_tests/51_complex_arith.c` - Add/sub work ✅ -- Multiplication tests - **Need implementation** -- Division tests - **Need implementation** - ---- - -## Phase 4: Real and Imaginary Part Access - -**Goal:** Support `__real__` and `__imag__` operators (GCC extension, widely used). - -### 4.1 Add Keywords -**Files:** `tcctok.h` - -```c -DEF(TOK_REAL, "__real__") -DEF(TOK_IMAG, "__imag__") -``` - -### 4.2 Parse Real/Imag Operators -**Files:** `tccgen.c` - -Handle in expression parser: -```c -case TOK_REAL: - next(); - parse_unary(); /* parse operand */ - /* Generate code to extract real part */ - if (tcc_is_complex_type(vtop->type.t)) { - /* For float complex, just take lower 4 bytes */ - /* Mark as regular float type */ - } - break; -``` - -### 4.3 Testing (Phase 4) -Test extraction and assignment to parts. - -**Deliverable:** `__real__` and `__imag__` operators work. - ---- - -## Phase 5: Complex Constants - -**Goal:** Support imaginary constants like `1.0fi`, `2.0i`. - -### 5.1 Add Imaginary Suffix Support -**Files:** `tccpp.c` (preprocessor number parsing) - -Parse `i` or `j` suffix on floating constants (after `f` or no suffix). - -### 5.2 Create Complex Constants -**Files:** `tccgen.c` - -Generate constant complex values: -```c -/* 1.0fi -> {0.0f, 1.0f} */ -/* Store in data section as two consecutive floats */ -``` - -### 5.3 Testing (Phase 5) -Test constant initialization and usage. - -**Deliverable:** Imaginary constants work correctly. - ---- - -## Phase 6: Complex Built-in Functions - -**Goal:** Provide `` library support. 
- -### 6.1 Create complex.h Header -**Files:** `include/complex.h` - -```c -#ifndef _COMPLEX_H -#define _COMPLEX_H - -#define complex _Complex -#define _Complex_I 1.0fi -#define I _Complex_I - -/* C11 CMPLX macros */ -#define CMPLX(x, y) ((_Complex double){ x, y }) -#define CMPLXF(x, y) ((_Complex float){ x, y }) -#define CMPLXL(x, y) ((_Complex long double){ x, y }) - -/* Basic operations */ -double creal(_Complex double z); -float crealf(_Complex float z); -/* ... etc ... */ - -#endif -``` - -### 6.2 Implement Complex Functions (Runtime) -**Files:** `lib/libtcc1.c` or link with newlib - -Newlib already has complex math functions. Ensure ABI compatibility. - -### 6.3 Testing (Phase 6) -Test against newlib's complex math functions. - -**Deliverable:** `` usable, math functions work. - ---- - -## Phase 7: Calling Conventions (ABI Compliance) - -**Goal:** Ensure complex values are passed according to ARM AAPCS. - -### 7.1 AAPCS Complex Calling Convention -According to AAPCS: -- `float _Complex`: passed in r0/r1 (or s0/s1 with VFP) -- `double _Complex`: passed in r0-r3 (or d0/d1 with VFP) -- Return values in same registers - -### 7.2 Update Call Generation -**Files:** `arm-thumb-gen.c`, `tccir.c` - -Ensure complex values are: -- Split into components for argument passing -- Recombined on function entry -- Properly returned - -### 7.3 Testing (Phase 7) -Create `tests/ir_tests/52_complex_calls.c`: -```c -_Complex float add_complex(_Complex float a, _Complex float b) -{ - return a + b; -} - -int main(void) -{ - _Complex float x = 1.0f + 2.0fi; - _Complex float y = 3.0f + 4.0fi; - _Complex float z = add_complex(x, y); - /* Check result */ -} -``` - -**Deliverable:** Complex values pass correctly across function calls. - ---- - -## Phase 8: Debug Information - -**Goal:** Generate correct DWARF debug info for complex types. 
- -### 8.1 Update Debug Info Generation -**Files:** `tccdbg.c` - -Add DWARF type entries for complex: -```c -case VT_CFLOAT: - /* DW_ATE_complex_float with 8-byte size */ -case VT_CDOUBLE: - /* DW_ATE_complex_float with 16-byte size */ -``` - -### 8.2 Testing (Phase 8) -Verify GDB can inspect complex variables. - -**Deliverable:** Debug info correct, GDB shows complex values. - ---- - -## Phase 9: Comprehensive Testing - -### 9.1 Unit Tests -Create tests in `tests/ir_tests/`: - -| Test | Description | -|------|-------------| -| `50_complex_types.c` | Type sizes, alignment | -| `51_complex_arith.c` | +, -, *, / operations | -| `52_complex_calls.c` | Function arguments/returns | -| `53_complex_real_imag.c` | `__real__`, `__imag__` | -| `54_complex_const.c` | Constant initialization | -| `55_complex_comparison.c` | ==, != operators | -| `56_complex_math.c` | cabs, cexp, etc. | - -### 9.2 GCC Testsuite Integration -Identify relevant tests from `tests/gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/` - -### 9.3 Edge Cases -- Complex division by zero -- Complex NaN/Inf handling -- Mixed real/complex operations -- Complex bit-fields (should error) - ---- - -## Implementation Order Summary - -| Phase | Component | Effort | Priority | -|-------|-----------|--------|----------| -| 1 | Type System | Medium | Must have | -| 2 | IR Support | Low | Must have | -| 3 | Code Gen | High | Must have | -| 4 | Real/Imag Ops | Low | Should have | -| 5 | Constants | Medium | Should have | -| 6 | complex.h | Low | Should have | -| 7 | ABI/Calling | High | Must have | -| 8 | Debug Info | Low | Nice to have | -| 9 | Testing | High | Ongoing | - ---- - -## Technical Notes - -### Alternative: Lower to Struct Early -Instead of adding complex types throughout, could lower complex to a struct `{ T real; T imag; }` early in compilation. This would require less changes but lose type information for optimization. 
- -### VFP vs Software FP -- With VFP: Use vector instructions for complex operations -- Software FP: Use integer register pairs and software FP library - -### Complex Division -Complex division is the most complex operation. Options: -1. Inline the full calculation (many instructions) -2. Call runtime library function - -Recommendation: Call runtime for software FP, inline for VFP. - ---- - -## References - -- C99 Standard, Section 7.3 (Complex arithmetic) -- ARM AAPCS, Section 4.3 (Parameter passing) -- GCC documentation on `_Complex` and `__real__`/`__imag__` -- Newlib complex.h implementation diff --git a/docs/complex/TEST_PLAN.md b/docs/complex/TEST_PLAN.md deleted file mode 100644 index 541a621e..00000000 --- a/docs/complex/TEST_PLAN.md +++ /dev/null @@ -1,523 +0,0 @@ -# Complex Number Support - Test Plan - -## Overview - -This document defines comprehensive testing for complex number support. Tests are organized by phase and include positive tests, negative tests, and edge cases. 
- -## Test Organization - -``` -tests/ir_tests/ -├── 50_complex_types.c # Phase 1: Type system tests -├── 50_complex_types.expect -├── 51_complex_arith.c # Phase 3: Arithmetic operations -├── 51_complex_arith.expect -├── 52_complex_calls.c # Phase 7: Function calls -├── 52_complex_calls.expect -├── 53_complex_accessors.c # Phase 4: __real__, __imag__ -├── 53_complex_accessors.expect -├── 54_complex_init.c # Phase 5: Initialization -├── 54_complex_init.expect -├── 55_complex_compare.c # Equality comparison -├── 55_complex_compare.expect -├── 56_complex_edge.c # Edge cases -├── 56_complex_edge.expect -└── 57_complex_math.c # Phase 6: Math functions - └── 57_complex_math.expect -``` - -## Phase 1: Type System Tests (50_complex_types.c) - -### Test 1.1: Size and Alignment -```c -#include - -int main(void) -{ - printf("sizeof(float) = %d\n", (int)sizeof(float)); - printf("sizeof(double) = %d\n", (int)sizeof(double)); - printf("sizeof(float _Complex) = %d\n", (int)sizeof(float _Complex)); - printf("sizeof(double _Complex) = %d\n", (int)sizeof(double _Complex)); - printf("sizeof(long double _Complex) = %d\n", (int)sizeof(long double _Complex)); - return 0; -} -``` - -**Expected output:** -``` -sizeof(float) = 4 -sizeof(double) = 8 -sizeof(float _Complex) = 8 -sizeof(double _Complex) = 16 -sizeof(long double _Complex) = 16 -``` - -### Test 1.2: Type Declaration Variations -```c -_Complex float cf1; -float _Complex cf2; -_Complex double cd1; -double _Complex cd2; -__complex__ float gcf; /* GCC extension */ -``` - -### Test 1.3: Array of Complex -```c -_Complex float arr[10]; -printf("sizeof(arr) = %d\n", (int)sizeof(arr)); /* Should be 80 */ -``` - -### Test 1.4: Pointer to Complex -```c -_Complex float *p; -printf("sizeof(p) = %d\n", (int)sizeof(p)); /* Should be 4 (pointer) */ -``` - -### Test 1.5: Complex Struct Member -```c -struct S { - _Complex float c; - int x; -}; -printf("sizeof(struct S) = %d\n", (int)sizeof(struct S)); /* Should be 16 (8 + 4 + 4 pad) */ -``` - 
---- - -## Phase 3: Arithmetic Tests (51_complex_arith.c) - -### Test 3.1: Complex Addition -```c -_Complex float a = 1.0f + 2.0fi; -_Complex float b = 3.0f + 4.0fi; -_Complex float c = a + b; -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "4.0 6.0" */ -``` - -### Test 3.2: Complex Subtraction -```c -_Complex float c = a - b; -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "-2.0 -2.0" */ -``` - -### Test 3.3: Complex Multiplication -```c -/* (1+2i) * (3+4i) = (3-8) + i(4+6) = -5 + 10i */ -_Complex float c = a * b; -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "-5.0 10.0" */ -``` - -### Test 3.4: Complex Division -```c -/* (5+10i) / (1+2i) = 5 */ -_Complex float num = 5.0f + 10.0fi; -_Complex float den = 1.0f + 2.0fi; -_Complex float quot = num / den; -printf("%.1f %.1f\n", __real__ quot, __imag__ quot); /* "5.0 0.0" */ -``` - -### Test 3.5: Double Complex Operations -Same tests with `double _Complex` to verify 16-byte operations. - -### Test 3.6: Mixed Real and Complex -```c -_Complex float c = a + 5.0f; /* 5 is real, should add to real part */ -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "6.0 2.0" */ -``` - -### Test 3.7: Complex Negation -```c -_Complex float c = -a; -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "-1.0 -2.0" */ -``` - ---- - -## Phase 4: Accessor Tests (53_complex_accessors.c) - -### Test 4.1: Read Real and Imaginary -```c -_Complex float c = 3.0f + 4.0fi; -float r = __real__ c; -float i = __imag__ c; -printf("%.1f %.1f\n", r, i); /* "3.0 4.0" */ -``` - -### Test 4.2: Modify Real Part -```c -_Complex float c = 3.0f + 4.0fi; -__real__ c = 10.0f; -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "10.0 4.0" */ -``` - -### Test 4.3: Modify Imaginary Part -```c -_Complex float c = 3.0f + 4.0fi; -__imag__ c = 20.0f; -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "3.0 20.0" */ -``` - -### Test 4.4: Address of Parts -```c -_Complex float c = 3.0f + 4.0fi; -float *rp = &__real__ c; -float *ip = &__imag__ c; -*rp = 100.0f; 
-printf("%.1f\n", __real__ c); /* "100.0" */ -``` - ---- - -## Phase 5: Initialization Tests (54_complex_init.c) - -### Test 5.1: Compound Literal Initialization -```c -_Complex float c = 1.0f + 2.0fi; -``` - -### Test 5.2: Real-Only Initialization -```c -_Complex float c = 5.0f; /* Imaginary part is 0 */ -printf("%.1f %.1f\n", __real__ c, __imag__ c); /* "5.0 0.0" */ -``` - -### Test 5.3: CMPLX Macro -```c -#include -_Complex float c = CMPLXF(1.0f, 2.0f); -``` - -### Test 5.4: Static Initialization -```c -static _Complex float c = 1.0f + 2.0fi; -``` - -### Test 5.5: Array Initialization -```c -_Complex float arr[3] = {1.0f, 2.0f + 3.0fi, 4.0f}; -``` - ---- - -## Phase 7: Function Call Tests (52_complex_calls.c) - -### Test 7.1: Pass and Return Complex -```c -_Complex float add(_Complex float a, _Complex float b) -{ - return a + b; -} - -int main(void) -{ - _Complex float x = 1.0f + 2.0fi; - _Complex float y = 3.0f + 4.0fi; - _Complex float z = add(x, y); - printf("%.1f %.1f\n", __real__ z, __imag__ z); /* "4.0 6.0" */ - return 0; -} -``` - -### Test 7.2: Complex in Struct Parameter -```c -struct Pair { - _Complex float c; - int n; -}; - -void process(struct Pair p); -``` - -### Test 7.3: Complex Variadic Functions (if supported) -```c -/* Note: complex in varargs may have special requirements */ -``` - ---- - -## Comparison Tests (55_complex_compare.c) - -### Test 5.1: Equality -```c -_Complex float a = 1.0f + 2.0fi; -_Complex float b = 1.0f + 2.0fi; -_Complex float c = 3.0f + 4.0fi; -printf("%d %d\n", a == b, a == c); /* "1 0" */ -``` - -### Test 5.2: Inequality -```c -printf("%d %d\n", a != b, a != c); /* "0 1" */ -``` - -### Test 5.3: Ordered Comparison (Compile Error Test) -```c -/* This should produce compile error */ -if (a < b) { } /* error: invalid operands to binary < */ -``` - ---- - -## Edge Case Tests (56_complex_edge.c) - -### Test 6.1: Division by Zero -```c -_Complex float a = 1.0f + 2.0fi; -_Complex float zero = 0.0f + 0.0fi; -_Complex float c = a 
/ zero; -/* Should produce Inf or NaN */ -``` - -### Test 6.2: NaN Propagation -```c -/* Operations with NaN should produce NaN */ -``` - -### Test 6.3: Infinity -```c -/* Operations with Inf should follow IEEE rules */ -``` - -### Test 6.4: Very Large/Small Numbers -```c -/* Test for overflow/underflow */ -``` - -### Test 6.5: Pure Real/Pure Imaginary -```c -_Complex float real_only = 5.0f; /* 5 + 0i */ -_Complex float imag_only = 5.0fi; /* 0 + 5i */ -``` - ---- - -## Math Library Tests (57_complex_math.c) - -### Test 7.1: cabs (Absolute Value) -```c -#include -_Complex float c = 3.0f + 4.0fi; -float a = cabsf(c); -printf("%.1f\n", a); /* "5.0" */ -``` - -### Test 7.2: creal/cimag -```c -_Complex float c = 3.0f + 4.0fi; -printf("%.1f %.1f\n", crealf(c), cimagf(c)); /* "3.0 4.0" */ -``` - -### Test 7.3: conj (Conjugate) -```c -_Complex float c = 3.0f + 4.0fi; -_Complex float conj_c = conjf(c); -printf("%.1f %.1f\n", __real__ conj_c, __imag__ conj_c); /* "3.0 -4.0" */ -``` - -### Test 7.4: cexp -```c -/* e^(0 + i*pi) = -1 */ -_Complex float c = cexpf(0.0f + 3.14159265fi); -/* Should be approximately -1 + 0i */ -``` - -### Test 7.5: csqrt -```c -/* sqrt(-1) = i */ -_Complex float c = csqrtf(-1.0f + 0.0fi); -/* Should be approximately 0 + 1i */ -``` - ---- - -## Type Conversion Tests (NEW) - -### TConv 1: Real to Complex -```c -float f = 3.0f; -_Complex float cf = f; -printf("%.1f %.1f\n", __real__ cf, __imag__ cf); /* "3.0 0.0" */ -``` - -### TConv 2: Complex to Real (Implicit) -```c -_Complex float cf = 3.0f + 4.0fi; -float f = cf; /* Discard imaginary part */ -printf("%.1f\n", f); /* "3.0" */ -``` - -### TConv 3: Complex Widening -```c -_Complex float cf = 1.0f + 2.0fi; -_Complex double cd = cf; /* Widen both components */ -``` - -### TConv 4: Integer to Complex -```c -int x = 5; -_Complex float cf = x; -printf("%.1f %.1f\n", __real__ cf, __imag__ cf); /* "5.0 0.0" */ -``` - -### TConv 5: Cast Operations -```c -_Complex double cd = (_Complex double)(3.0f + 4.0fi); 
-float f = (float)(5.0 + 10.0i); /* f = 5.0 */ -``` - ---- - -## ABI Compatibility Tests (NEW - CRITICAL) - -### ABI 1: Call GCC-Compiled Function -```c -/* gcc_func.c - compiled with arm-none-eabi-gcc */ -_Complex float gcc_add(_Complex float a, _Complex float b) -{ - return a + b; -} - -/* tcc_caller.c - compiled with TCC */ -extern _Complex float gcc_add(_Complex float, _Complex float); - -int main(void) -{ - _Complex float x = 1.0f + 2.0fi; - _Complex float y = 3.0f + 4.0fi; - _Complex float z = gcc_add(x, y); - /* Verify result correct */ -} -``` - -### ABI 2: TCC Function Called by GCC -Reverse of ABI 1 - TCC implements, GCC calls. - -### ABI 3: Stack Parameter Passing -```c -/* Force parameters onto stack */ -void many_params( - int a, int b, int c, int d, /* Use r0-r3 */ - _Complex float cf); /* Must go on stack */ -``` - ---- - -## Union and Aliasing Tests (NEW) - -### Union 1: Complex in Union -```c -union U { - _Complex float cf; - float arr[2]; -}; -union U u; -u.cf = 1.0f + 2.0fi; -printf("%.1f %.1f\n", u.arr[0], u.arr[1]); /* "1.0 2.0" */ -``` - -### Union 2: Pointer Aliasing -```c -_Complex float cf = 3.0f + 4.0fi; -float *fp = (float *)&cf; -printf("%.1f %.1f\n", fp[0], fp[1]); /* "3.0 4.0" */ -``` - ---- - -## Negative Tests (Should Produce Errors) - -### NTest 1: Complex Bit-field -```c -struct S { - _Complex int x : 8; /* error: bit-field has invalid type */ -}; -``` - -### NTest 2: Ordered Comparison -```c -_Complex float a, b; -if (a < b) { } /* error: invalid operands to binary < */ -``` - -### NTest 3: Complex Integer (if not supported) -```c -_Complex int x; /* may be error or warning */ -``` - -### NTest 4: Cast to Complex Integer -```c -int x = 5; -_Complex int c = (_Complex int)x; /* error if not supported */ -``` - ---- - -## GCC Testsuite Integration - -Relevant tests from GCC c-torture suite: - -``` -tests/gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/ -├── compile/ -│ └── complex/ (if exists) -└── execute/ - └── complex/ (if 
exists) -``` - -Also check: -``` -tests/gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.dg/complex* -``` - ---- - -## Test Automation - -### Running Tests -```bash -# Individual test -cd tests/ir_tests -python run.py -c 50_complex_types.c - -# All complex tests -pytest -k "complex" -v - -# Full test suite (after full implementation) -make test -j16 -``` - -### Expected Files Format -Each `.expect` file contains expected stdout output: -``` -sizeof(float) = 4 -sizeof(double) = 8 -sizeof(float _Complex) = 8 -sizeof(double _Complex) = 16 -OK -``` - ---- - -## Success Criteria - -| Phase | Pass Criteria | -|-------|--------------| -| 1 | All type tests pass, sizeof correct | -| 2 | IR dump shows correct complex types | -| 3 | Arithmetic tests within 0.0001 tolerance | -| 4 | Accessor tests pass | -| 5 | Initialization tests pass | -| 6 | complex.h usable, basic functions work | -| 7 | Function call tests pass | -| 8 | Debug info valid (GDB check) | -| 9 | All tests pass, no regressions | - ---- - -## Performance Benchmarks (Future) - -Once basic functionality works, consider: - -1. **FFT benchmark:** Compare TCC vs GCC for DFT/FFT algorithms -2. **Matrix multiply:** Complex matrix operations -3. **Filter banks:** Digital signal processing kernels diff --git a/docs/debugging_fuzz_divergences.md b/docs/debugging_fuzz_divergences.md new file mode 100644 index 00000000..051518dc --- /dev/null +++ b/docs/debugging_fuzz_divergences.md @@ -0,0 +1,229 @@ +# Debugging a fuzz divergence (LLM playbook) + +End-to-end workflow an agent (or human) should follow when a differential-fuzz +seed produces different output at two optimization levels. This complements +`docs/fuzz_triage_guide.md` (which covers the *sweep* + triage infrastructure); +this document is the **per-bug investigation → fix → regression-test** loop. + +Golden rule: **tcc -O0 is the trusted oracle.** If O1/O2/Os disagree with O0, an +optimizer is broken. 
Ground truth is `gcc -m32 -funsigned-char` (ARM ABI: +unsigned char, 32-bit long) — never plain `gcc`. + +--- + +## 0. Before you start + +```bash +cd libs/tinycc +make cross -j$(nproc) # armv8m-tcc must be current after every edit +``` + +## 1. Recollect everything for one sweep-report seed (start here) + +The sweep reports (`fuzz_triage_*.md`) list seeds **per suite/profile**: +`ptr 5759` means seed 5759 of gen_c.py's `ptr` profile — NOT the program +`diff_olevels.py --seed 5759` (default profile) would generate. +`scripts/triage_seed.py` owns that mapping and collects the whole +investigation starting kit in one command: + +```bash +python3 scripts/triage_seed.py --suite longlong --seed 3161 +# or, for an existing repro file: +python3 scripts/triage_seed.py --file repro.c +``` + +It writes to `tests/fuzz/results/triage/<suite>_<seed>/`: + +- `seed.c` — the generated program +- `outputs.txt` — tcc signatures at `-O0/-O1/-O2/-Os` with FULL output, so a + HardFault keeps its `PC=/CFSR=/BFAR=` register dump +- `gcc_reference.txt` — `arm-none-eabi-gcc -O2` ground truth (must equal + tcc `-O0`; a mismatch is loudly flagged — suspect a gcc-bad quarantine case) +- `crash_disasm.txt` — (crash signatures only) force-thumb disassembly window + around the faulting PC of the divergent ELF +- `reduced.c` — line-granularity reduction preserving the divergence +- `bisect.txt` — `bisect_opt.py` Phase A/B/C output on the reduced repro +- `SUMMARY.md` — one-page digest (signatures, divergent level, culprit knobs) + +`--skip-reduce` / `--skip-bisect` skip the slow steps; `--olevels` narrows the +level list. Exit code: 0 consistent, 1 divergence collected, 2 infra error. + +Steps 2–3 below describe what the collector runs under the hood (and how to +re-run each piece by hand when iterating on a fix). 
+ +### Manual reproduce + confirm + +```bash +# one seed, all O-levels self-consistency: +python3 scripts/diff_olevels.py --seed N --require-qemu +# ground truth (must equal tcc -O0): +bash tests/fuzz/runseed.sh tests/fuzz/fuzz_triage_repros/seedN.c -O0 +``` + +If `diff_olevels.py` reports `DIVERGE`, note the failing level (the `high` level) +and the correct O0 checksum. + +## 2. Run the automated bisector + +```bash +python3 scripts/bisect_opt.py --seed N --high=-O1 +# or, for an existing .c repro: +python3 scripts/bisect_opt.py --file tests/fuzz/fuzz_triage_repros/seedN.c --high=-O2 +``` + +(Use `--high=-O1` with `=` — argparse needs it because the value starts with `-`.) + +The script reports two cross-checked signals: + +- **Phase A — culprit knob(s), QEMU-confirmed.** Every `-fno-<knob>` whose + removal at the failing level restores the O0 signature. *All* such knobs are + listed (a real root cause is often gated by more than one; e.g. seed 295 was + fixed by `-fno-store-load-fwd`, `-fno-const-prop`, **and** + `-fno-indexed-memory`). The most specific one is the pass that *creates* the + bad value; the others are passes that *propagate* it. +- **Phase B — the exact IR fold.** Dumps IR after every pass and flags where a + memory read (`LOAD` / `LOAD_INDEXED` / `***DEREF***`) at a stable instruction + address turns into a constant `#...` — the classic misfold signature. Prints + the before/after lines and the pass (group) name, plus, for each culprit knob, + the individual passes it gates (the functions to open in `ir/opt_*.c`). + +The intersection of "fold in group X" + "culprit knob gates pass X" is the bug +location. For seed 295 this was: fold in `entry_store_group` + `store-load-fwd` +gates `entry_store` → `ir/opt_memory.c:tcc_ir_opt_entry_store_prop`. + +**Phase B only detects memory→constant folds.** For bugs that drop a store, +rewrite control flow, or mis-thread a branch (e.g. 
seed 671, where +jump-threading dropped `arr8[0] = arr9[u5&7]` from a loop), Phase B is silent +and the script automatically falls back to **Phase C** (below). + +## 3. Read the IR + locate the code + +### Phase C — final-IR diff (the general fallback) + +After Phase A, `bisect_opt.py` automatically diffs the final optimized IR at +`high` vs `high -fno-<knob>` for the most specific culprit knob, and you can +re-run it on a reduced repro: + +```bash +python3 scripts/bisect_opt.py --file reduced.c --high=-O2 --diff-knob jump-threading +``` + +Read the diff for instructions present on the **correct** (`+`) side but absent +on the buggy (`-`) side — that is the dropped computation. (Reducing first is +important: on a full 100-line seed, O2 unrolling/rotation makes the diff too +noisy; on a 56-line reduced repro the dropped `arr8[0]=arr9[3]` store stands +out immediately.) For seed 671 this diff pinpointed the missing store in one +read, naming `ir/opt_jump_thread.c` (`tcc_ir_opt_jump_threading`). + +### Manual IR walk + +If you prefer, dump the full pass sequence directly: + +```bash +./armv8m-tcc -dump-ir-passes=all -O1 -nostdlib -mcpu=cortex-m33 -mthumb \ + -mfloat-abi=soft -ffunction-sections \ + -Itests/ir_tests/libc_includes -Itests/ir_tests/libc_imports \ + -Itests/ir_tests/libc_includes/newlib -Iinclude \ + -c repro.c -o /dev/null # > passes.txt 2>&1 +``` + +Match the `BEFORE`/`AFTER` lines from the bisector against `=== AFTER <pass> ===` +blocks. Grep the pass/group name in `ir/` to find the implementing function. + +### When Phase A finds no knob + +SSA-pipeline bugs are not gated by `-fno-*`. Follow the `TCC_SKIP_SSA` / +`TCC_SKIP_SSA2` env-var bisection in `docs/fuzz_triage_guide.md` ("When +`culprit knob = none`"). Pass names: `ssa:sccp ssa:cprop ssa:fold ssa:gvn +ssa:reassoc ssa:strength ssa:narrow ssa:dce ssa:dead_loop ...`. 
+ +### Reducing a huge repro + +```bash +python3 scripts/reduce_divergence.py tests/fuzz/fuzz_triage_repros/seedN.c \ + --low -O0 --high -O2 -o reduced.c +``` + +Line-granularity delta reduction that preserves the divergence. Use it to shrink +a 100-line fuzz seed before reading IR. + +## 4. Write the regression test FIRST + +**Do not fix the bug before the test exists.** The test must fail on the unfixed +build and pass after the fix — that is the only proof the fix is real. + +Pattern (see `tests/ir_tests/193_…199_`, `204_fuzz_entry_store_loop_overwrite`): + +1. Copy the (ideally reduced) repro to `tests/ir_tests/NN_fuzz_<name>.c` + with a header comment naming the pass, the root cause, and the fix in one + sentence. `NN` = next free number. +2. Create `tests/ir_tests/NN_fuzz_<name>.expect` containing the single + correct line, e.g. `checksum=47b835f7` (the `gcc -m32 -funsigned-char` value). +3. Register it in `TEST_FILES` in `tests/ir_tests/test_qemu.py`. +4. Confirm it **fails** on the buggy code and **passes** after the fix: + ```bash + git stash push ir/<file>.c && make cross -j$(nproc) + cd tests/ir_tests && python run.py -c NN_fuzz_<name>.c --cflags="-O1" # wrong value + git stash pop && make cross -j$(nproc) + python run.py -c NN_fuzz_<name>.c --cflags="-O1" # correct value + ``` + +## 5. Fix, then verify broadly + +```bash +make cross -j$(nproc) +# the new regression test at every level: +cd tests/ir_tests && for o in -O0 -O1 -O2; do python run.py -c NN_fuzz_<name>.c --cflags="$o"; done +# full IR suite (must stay green): +python3 -m pytest test_qemu.py -n 16 -q +# confirm no new fuzz divergences were introduced in the bug's neighbourhood: +python3 scripts/diff_olevels.py --seeds 0-5000 --require-qemu +``` + +A fix is only complete when: the new regression test passes, the full IR suite is +green, and the fuzz sweep shows **zero new** divergences (pre-existing unrelated +ones are expected — compare against `fuzz_triage_0_5000.md`). 
+ +--- + +## Pitfalls & lessons + +- **Reduce first, always.** Phase C (final-IR diff) and manual IR reading are + only readable on a *reduced* repro. At -O2 a full fuzz seed unrolls/rotates + into hundreds of lines of noise; the 56-line reduced form surfaces the single + dropped store. Run `scripts/reduce_divergence.py` before reading IR. +- **Instrument a COPY, keep the repro pristine.** The `trace(__LINE__)` technique + from `fuzz_triage_guide.md` is great for finding the first divergent + statement, but the `printf` calls perturb optimization (they prevent + unrolling/inlining), so the instrumented build can produce the *correct* + result and mask the bug (seed 671). Always instrument a throwaway copy and + keep the pristine repro for IR dumping. +- **One bug, many "fixing" knobs.** A misfolded constant (or dropped store) + flows through several later passes, so disabling any of them can mask the + symptom. The real root cause is the pass that *creates* the bad value (the one + Phase B/C flags), not the first knob Phase A reports. Cross-check the phases. +- **Entry-block stores dominate, but domination ≠ "still current".** A store in + the entry block executes before all code, but a later store (often inside a + loop, reached via the back-edge) overwrites the value. Forwarding the entry + value into a loop-interior read is wrong. This was seed 295's bug + (`entry_store_prop`). Any "entry-BB value forwarding" pass must invalidate an + offset the moment it is written after the entry block — and *not* be shielded + by "but a runtime-indexed load might read it": runtime loads read memory + directly and are unaffected by the forwarding table. +- **`-O0` is the oracle, but `char`/`long` ABI matters.** Always compare against + `gcc -m32 -funsigned-char`; plain `gcc` (signed char) makes correct ARM code + look wrong. 
+- **HardFault + MANY unrelated "fixing" knobs = backend layout bug, not an IR + misfold.** When Phase A reports half the knob list (each just shifts code + layout) and the signature is a wild `PC`/`BFAR`, stop reading IR and read the + disassembly around the stacked PC first (`crash_disasm.txt` from + `triage_seed.py`, or `arm-none-eabi-objdump -d -M force-thumb`). A PC that + lands in objdump "garbage" is execution falling into data. ptr seed 5759: a + literal-pool flush landed INSIDE an ITE block — the pool's B.W skip-branch + occupied the else-arm slot, so the then-path fell through into pool data + (fix: IT-window guard in `ot()`, test 254). Same family as seed 2987 (STRD + fuse across a jump target, test 251): the IR is fine; the emitted layout + isn't. +- **Size-sensitive tests.** A codegen-layout change can break tests like + `96_nodata_wanted` (labels-as-values / literal pools). If a "fix" breaks an + unrelated test, suspect literal-pool or branch-range regressions, not the test. diff --git a/docs/design_loop_unrolling.md b/docs/design_loop_unrolling.md deleted file mode 100644 index 191bad21..00000000 --- a/docs/design_loop_unrolling.md +++ /dev/null @@ -1,550 +0,0 @@ -# Loop Unrolling Design - -## Goal - -Unroll small constant-trip-count loops to eliminate branch overhead and enable -further optimizations (constant folding, dead code elimination). - -## Motivating Example - -```c -const char *str = "hello"; -int sum = 0; -for (int i = 0; i < 5; i++) { - sum += strlen(str); -} -``` - -After strlen folding, the IR loop body becomes `V1 = V1 + #5` repeated 5 times. -The actual optimized IR before unrolling (from dump_ir.txt): - -``` -0000: V0 <-- GlobalSym(268435461) [ASSIGN] ; str = "hello" -0001: V1 <-- #0 [ASSIGN] ; sum = 0 -0002: V2 <-- #0 [ASSIGN] ; i = 0 -0003: CMP V2, #5 ; HEADER: i < 5? 
-0004: JMP to 14 if ">=S" ; EXIT: jump past loop -0005: JMP to 11 ; jump to body (skip latch on first iter) -0006: T0 <-- V2 [ASSIGN] ; LATCH: save old i -0007: V2 <-- T0 ADD #1 ; i++ -0008: JMP to 3 ; back to header -0009: NOP -0010: NOP ; (folded PARAM — was strlen arg) -0011: NOP ; (folded CALL — strlen folded to #5) -0012: V1 <-- V1 ADD #5 ; BODY: sum += 5 -0013: JMP to 6 ; jump to latch -0014: ... ; EXIT TARGET: printf etc. -``` - -Loop structure detected by `tcc_ir_detect_loops()`: -- Backward jump: instruction 8 (`JMP to 3`) — this is the latch -- `header_idx = 3`, `start_idx = 3`, `end_idx = 8` -- Body extends to 13 via forward jump analysis (instr 5 jumps to 11, instr 13 jumps to 6) -- `preheader_idx = 2` (the `V2 <-- #0` instruction before header) - -With full unrolling, this becomes: - -``` -0001: V1 <-- #0 -0012: V1 <-- V1 ADD #5 ; iteration 0 - V1 <-- V1 ADD #5 ; iteration 1 - V1 <-- V1 ADD #5 ; iteration 2 - V1 <-- V1 ADD #5 ; iteration 3 - V1 <-- V1 ADD #5 ; iteration 4 -``` - -And the existing iterative constant propagation (Phase 1) collapses it to `V1 <-- #25`. - -## Scope - -**Full unrolling only** for loops where: -- Trip count is a compile-time constant -- Trip count <= threshold (16) -- Loop body is small (<= 32 non-NOP instructions) -- No nested loops (single-level only) -- Simple exit condition: `CMP IV, #N` followed by conditional jump -- Total expanded size: `trip_count * body_insn_count <= 128` - -Partial unrolling (unroll-by-factor) is out of scope for the initial -implementation. 
- -## Where It Fits in the Pipeline - -In `tccgen.c` (around line 23991), between dead store elimination and LICM: - -``` -Phase 4: Store-load forwarding, redundant/dead store elimination (existing, ~line 23963-23990) -Phase 5a: Loop unrolling (NEW) -Phase 5a': Re-run Phase 1 iterative const prop + DCE (NEW — collapse unrolled code) -Phase 5: LICM (existing, disabled, ~line 23992) -Phase 6: IV strength reduction (existing, ~line 24008) -``` - -The key is that loop unrolling runs **after** strlen/constant folding has -simplified the body and **before** IV strength reduction (which would be -confused by an unrolled loop). After unrolling, we re-run the Phase 1 iterative -loop so constant propagation can collapse `0 + 5 + 5 + 5 + 5 + 5 → 25`. - -## Data Structures - -No new data structures. Reuse existing ones: - -| Structure | Defined in | Used for | -|-----------|-----------|----------| -| `IRLoop` | `ir/licm.h:28` | Loop bounds: header_idx, start_idx, end_idx, preheader_idx | -| `IRLoops` | `ir/licm.h:41` | Collection of detected loops | -| `InductionVar` | `ir/opt.c:7991` | IV: vreg, init_val, step, def_idx, init_idx | - -## Algorithm — Detailed - -### Phase 1: Detect loops and find candidates - -```c -int tcc_ir_opt_loop_unroll(TCCIRState *ir) -{ - IRLoops *loops = tcc_ir_detect_loops(ir); - // Process innermost loops first (highest start_idx) - // For each loop, call try_unroll_loop() -} -``` - -For each loop, `try_unroll_loop()` performs these checks: - -#### 1a. Find the induction variable - -Reuse `find_induction_vars()` (ir/opt.c:8021). This function: -- Scans `[loop->start_idx, loop->end_idx]` for `V = V + const` pattern -- Verifies V has exactly 1 definition inside the loop -- Looks for initialization `V = #const` in preheader (up to 5 instructions back) -- Returns `InductionVar { vreg, init_val, step, def_idx, init_idx }` - -**Requirement**: exactly 1 basic IV found (multi-IV loops are too complex). - -#### 1b. 
Find the exit condition - -Scan from `loop->header_idx` forward (at most 2 instructions) for: - -``` -CMP Viv, #limit -JMP to exit_target if COND -``` - -Where: -- `Viv` is the IV vreg from step 1a -- `#limit` is an immediate constant -- `COND` is one of: `>=S` (for `i < N`), `>S` (for `i <= N`), `==` (for `i != N`) -- `exit_target > loop->end_idx` (jumps past the loop) - -Extract: `cmp_idx`, `jmpif_idx`, `exit_target`, `limit`, `cond_token`. - -#### 1c. Compute trip count - -```c -switch (cond_token) { - case TOK_GE: // >=S means loop runs while < - trip_count = (limit - init_val + step - 1) / step; // ceiling division - break; - case TOK_GT: // >S means loop runs while <= - trip_count = (limit - init_val) / step + 1; - break; - case TOK_NE: // != means loop runs until equality - if ((limit - init_val) % step != 0) return 0; // infinite loop risk - trip_count = (limit - init_val) / step; - break; -} -``` - -**Bail if**: `trip_count <= 0`, `trip_count > 16`, or `step <= 0`. - -#### 1d. Identify the body instructions - -The "body" is everything between the exit conditional jump and the back-edge -jump that is NOT: -- The CMP instruction (`cmp_idx`) -- The conditional exit JMP (`jmpif_idx`) -- The IV increment (`iv.def_idx`) -- The back-edge JMP (latch jump to header) -- NOP instructions -- The `T0 <-- V2 [ASSIGN]` preceding the IV increment (save-old-IV pattern) - -In the example IR: -``` -Body instructions to clone = { 0012: V1 <-- V1 ADD #5 } -``` - -Count them: `body_insn_count`. **Bail if** `body_insn_count > 32` or -`trip_count * body_insn_count > 128`. - -#### 1e. Check no nested loops - -Scan body for backward JMP instructions (target < source). If any found, -bail — this is a nested loop. - -#### 1f. 
Check no side effects that prevent unrolling - -Scan body for instructions that are problematic: -- `FUNCCALLVAL` / `FUNCCALLVOID` — bail (calls can have side effects) - - Exception: if we later add pure-function tracking, pure calls are OK -- `INLINE_ASM` — bail -- `SETJMP` / `LONGJMP` — bail - -**Note**: `STORE` instructions are fine to unroll — they just happen N times to -different addresses (array writes). `LOAD` too. - -### Phase 2: Emit unrolled code - -Strategy: **in-place overwrite + `insert_instr_at()` for overflow**. - -Since `insert_instr_at()` (ir/opt.c:8284) already exists and correctly updates -all jump targets, we can use it when the unrolled body doesn't fit in the -original loop's instruction slots. - -However, to avoid the index-shifting complexity entirely for the common case, -use this two-tier approach: - -#### 2a. NOP out the entire loop region - -```c -for (int i = loop->start_idx; i <= loop_actual_end; i++) - ir->compact_instructions[i].op = TCCIR_OP_NOP; -``` - -Also NOP the IV initialization in the preheader (`iv.init_idx`). - -Also NOP the forward-jump into the body (`instr 5: JMP to 11` in our example) -if it's within the loop region. - -#### 2b. Compute write positions - -Available NOP slots: count NOPs in `[loop->start_idx, loop_actual_end]`. -Needed slots: `trip_count * body_insn_count`. - -- If `needed <= available`: write in-place starting at `loop->start_idx` -- If `needed > available`: write what fits in-place, then use `insert_instr_at()` - to insert remaining instructions at `loop_actual_end + 1` - -#### 2c. Clone body instructions for each iteration - -For each iteration `k = 0 .. trip_count - 1`: - For each body instruction `orig`: - - 1. Copy the instruction: `new.op = orig.op` - 2. Copy operands from the original (read src1, src2, dest from pool) - 3. 
**Remap operands**: - - If src1/src2 references the IV vreg → replace with constant - `#(init_val + k * step)` — but only if the IV is used as a value, - not being defined - - If dest is the IV vreg → this is the IV increment, already excluded - - VAR vregs defined inside the body: for each iteration k > 0, - allocate fresh TMPs via `tcc_ir_vreg_alloc_temp(ir)` and remap - all references to them within that iteration's copy - 4. Write to the next available slot using: - ```c - ir->compact_instructions[write_pos].op = new_op; - ir->compact_instructions[write_pos].operand_base = tcc_ir_pool_add(ir, dest); - tcc_ir_pool_add(ir, src1); - tcc_ir_pool_add(ir, src2); - ``` - 5. Clear `is_jump_target` on cloned instructions - -#### 2d. Patch the entry - -The original `JMP to exit if >=S` at `jmpif_idx` was NOPed. We need the -code to flow from the preheader into the first unrolled instruction. - -Since we write the unrolled body starting at `loop->start_idx` (which is the -header), the preheader naturally falls through into it. No patching needed — -the NOP'd header is replaced by the first unrolled body instruction. - -But we need to handle the `exit_target`: make sure the last unrolled -instruction falls through to `exit_target`. If the unrolled code ends before -`exit_target`, insert `JMP to exit_target` as the final instruction. - -#### 2e. Concrete example walkthrough - -For our test case (trip_count=5, body=[`V1 <-- V1 ADD #5`]): - -Original slots 3–13 (11 slots) get NOPed. We need 5 instructions. - -Write at positions 3–7: -``` -0003: V1 <-- V1 ADD #5 ; iteration 0 -0004: V1 <-- V1 ADD #5 ; iteration 1 -0005: V1 <-- V1 ADD #5 ; iteration 2 -0006: V1 <-- V1 ADD #5 ; iteration 3 -0007: V1 <-- V1 ADD #5 ; iteration 4 -0008: NOP ; (remaining slots stay NOP) -... -0013: NOP -0014: ... ; EXIT TARGET (unchanged) -``` - -Falls through to 0014 naturally. Phase 1 re-run folds: -``` -V1 = 0; V1 = V1+5; V1 = V1+5; ... 
→ V1 = 25 -``` - -### Phase 3: Re-run constant propagation - -After unrolling, call the Phase 1 iterative loop again: - -```c -if (unrolled_count > 0) { - int iter2 = 0; - int ch2; - do { - ch2 = 0; - if (tcc_state->opt_dce) ch2 += tcc_ir_opt_dce(ir); - if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop(ir); - if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop_tmp(ir); - if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_branch_folding(ir); - } while (ch2 > 0 && ++iter2 < 10); -} -``` - -## File-by-file Implementation Plan - -### Step 1: Add flag — `tcc.h` and `libtcc.c` - -**tcc.h** (~line 1147, after `opt_iv_strength_red`): -```c -unsigned char opt_loop_unroll; /* -floop-unroll: full unroll small loops */ -``` - -**libtcc.c** (~line 1724, in flag table after `iv-strength-red`): -```c -{offsetof(TCCState, opt_loop_unroll), 0, "loop-unroll"}, -``` - -**libtcc.c** (~line 2279, in -O1 block): -```c -s->opt_loop_unroll = 1; /* Full-unroll small constant-trip-count loops */ -``` - -### Step 2: Declare API — `ir/opt.h` - -Add declarations (near the other loop optimization declarations): -```c -int tcc_ir_opt_loop_unroll(TCCIRState *ir); -int tcc_ir_opt_loop_unroll_with_loops(TCCIRState *ir, IRLoops *loops); -``` - -### Step 3: Implement — `ir/opt.c` - -Add a new section after the IV strength reduction code (~line 8570). - -**Helper: `find_loop_exit_condition()`** -```c -/* Scan from header_idx for: CMP Viv, #limit; JUMPIF exit_target COND - * Returns 1 if found, fills out_cmp_idx, out_jmpif_idx, out_limit, out_cond, - * out_exit_target. 
*/ -static int find_loop_exit_condition(TCCIRState *ir, IRLoop *loop, - int iv_vreg, - int *out_cmp_idx, int *out_jmpif_idx, - int *out_limit, int *out_cond, int *out_exit_target); -``` - -Scan instructions `[header_idx, header_idx+3]`: -- Find `CMP` where one operand is `iv_vreg` and the other is immediate -- Find `JUMPIF` immediately after the CMP -- Extract condition token from the JUMPIF -- Extract exit target (must be > loop->end_idx to be an exit) - -**Helper: `compute_trip_count()`** -```c -static int compute_trip_count(int init_val, int limit, int step, int cond_token); -``` - -Handle: -- `>=S` (generated by `i < N`): `trip_count = ceil((limit - init_val) / step)` - with `ceil(a/b) = (a + b - 1) / b` for positive values -- `>S` (generated by `i <= N`): `trip_count = (limit - init_val) / step + 1` -- Validate: `trip_count >= 0`, `(limit - init_val)` is exact multiple of step - for `!=` conditions - -**Helper: `collect_body_instructions()`** -```c -/* Collect non-control-flow, non-IV body instructions to clone. - * Returns count, fills body_indices[] array. 
*/ -static int collect_body_instructions(TCCIRState *ir, IRLoop *loop, - int iv_vreg, int cmp_idx, int jmpif_idx, int iv_def_idx, - int *body_indices, int max_body); -``` - -Walk `[loop->start_idx, loop_actual_end]`, skip: -- NOP instructions -- CMP at cmp_idx -- JUMPIF at jmpif_idx -- All JMP (unconditional) instructions -- IV increment at iv_def_idx -- ASSIGN that copies IV to a temp (pattern: `T = Viv` where T is only - used by the IV increment on the next line) - -**Main: `try_unroll_loop()`** -```c -static int try_unroll_loop(TCCIRState *ir, IRLoop *loop) -{ - InductionVar ivs[MAX_IV]; - int num_ivs = find_induction_vars(ir, loop, ivs, MAX_IV); - if (num_ivs != 1) return 0; - - InductionVar *iv = &ivs[0]; - int cmp_idx, jmpif_idx, limit, cond, exit_target; - if (!find_loop_exit_condition(ir, loop, iv->vreg, - &cmp_idx, &jmpif_idx, &limit, &cond, &exit_target)) - return 0; - - int trip_count = compute_trip_count(iv->init_val, limit, iv->step, cond); - if (trip_count <= 0 || trip_count > 16) return 0; - - int body_indices[128]; - int body_count = collect_body_instructions(ir, loop, iv->vreg, - cmp_idx, jmpif_idx, iv->def_idx, body_indices, 128); - if (body_count <= 0 || body_count > 32) return 0; - if (trip_count * body_count > 128) return 0; - - // Check no nested loops (backward jumps in body) - // Check no CALL/ASM instructions in body - - // === EMIT === - // NOP out entire loop region [start_idx .. actual_end] + IV init - // Write trip_count copies of body at start_idx - // Add JMP to exit_target at the end if needed - - return 1; -} -``` - -**Vreg remapping during clone:** - -For each body instruction being cloned for iteration k: -- Read original dest, src1, src2 -- If src1 or src2 has vreg == iv_vreg: replace with `irop_make_imm32(-1, init_val + k * step, VT_INT)` -- For VAR vregs defined in the body (not the IV): need per-iteration copies. 
- But since we use full unrolling and the accumulator pattern is `V = V + const`, - we do NOT remap — the same V is accumulated across iterations. This is correct: - ``` - V1 = V1 + 5 ; iter 0: V1 goes from 0 → 5 - V1 = V1 + 5 ; iter 1: V1 goes from 5 → 10 - ``` - -The only remapping needed is: uses of the IV as a value (e.g., `arr[i] = i` -where i appears as src). The IV definition itself is excluded from the body. - -**Writing an instruction in-place at a NOP slot:** -```c -static void write_instr_at(TCCIRState *ir, int pos, TccIrOp op, - IROperand dest, IROperand src1, IROperand src2) -{ - IRQuadCompact *q = &ir->compact_instructions[pos]; - q->op = op; - q->is_jump_target = 0; - q->operand_base = tcc_ir_pool_add(ir, dest); - tcc_ir_pool_add(ir, src1); - tcc_ir_pool_add(ir, src2); -} -``` - -This reuses the existing `tcc_ir_pool_add()` to allocate operand pool entries. -The old operand pool entries for the NOPed instructions become garbage but are -harmless (the pool only grows; it's freed when the IR block is freed). - -### Step 4: Wire into pipeline — `tccgen.c` - -At ~line 23991, after dead store elimination, before LICM: - -```c - /* Phase 5a: Loop Unrolling - fully unroll small constant-trip-count loops */ - int unrolled_count = 0; - if (tcc_state->opt_loop_unroll) - unrolled_count = tcc_ir_opt_loop_unroll(ir); - - /* Phase 5a': After unrolling, re-run iterative constant propagation + DCE - * to collapse the expanded constant arithmetic (e.g. 
0+5+5+5+5+5 → 25) */ - if (unrolled_count > 0) - { - int iter2 = 0, ch2; - do { - ch2 = 0; - if (tcc_state->opt_dce) ch2 += tcc_ir_opt_dce(ir); - if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop(ir); - if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop_tmp(ir); - if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_branch_folding(ir); - if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_value_tracking(ir); - } while (ch2 > 0 && ++iter2 < 10); - } -``` - -### Step 5: Add tests - -**Test 1**: Existing `100_pure_func_strlen.c` — verify with `--dump-ir` that -the loop is eliminated and `V1 <-- #25` appears in the optimized IR. -Update the expect file if output changes (it shouldn't — same result, less work). - -**Test 2**: New `101_loop_unroll_basic.c`: -```c -#include <stdio.h> -int main() { - int sum = 0; - for (int i = 0; i < 4; i++) sum += 10; - printf("%d\n", sum); // expect: 40 - return sum != 40; -} -``` - -**Test 3**: New `102_loop_unroll_no_unroll.c`: -```c -#include <stdio.h> -int main() { - int sum = 0; - int n = 100; - for (int i = 0; i < n; i++) sum += 1; // n not const — don't unroll - printf("%d\n", sum); - return sum != 100; -} -``` - -**Test 4**: New `103_loop_unroll_with_array.c`: -```c -#include <stdio.h> -int main() { - int arr[4]; - for (int i = 0; i < 4; i++) arr[i] = i * 10; - printf("%d %d %d %d\n", arr[0], arr[1], arr[2], arr[3]); - return 0; -} -``` - -Add all to `TEST_FILES` in `tests/ir_tests/test_qemu.py`. 
- -### Step 6: Validate - -```bash -make cross && make test -j16 # IR tests (must all pass) -make test-asm -j16 # ASM tests (no regressions) -# Optionally: -make test-gcc-torture-compile # GCC torture compile tests -``` - -## Edge Cases - -| Case | Expected behavior | -|------|-------------------| -| `for (i=0; i<0; i++)` | trip_count=0, NOP out loop, keep init values | -| `for (i=0; i<1; i++)` | trip_count=1, emit body once (no loop overhead) | -| `for (i=5; i<10; i+=2)` | trip_count=ceil(5/2)=3, emit 3 copies with IV=5,7,9 | -| `for (i=0; i<17; i++)` | trip_count=17 > threshold, skip | -| Body has `if/else` | Body contains JUMPIF → forward jumps within body. These need target remapping per iteration. Complex — bail for v1 | -| IV used after loop | Keep IV final value: `V2 = init + trip_count * step` assigned before exit | - -## Risks and Mitigations - -| Risk | Mitigation | -|------|-----------| -| Code size explosion | Conservative threshold: trip_count * body_size <= 128 | -| Instruction index corruption (like LICM bug) | Write into NOP slots — no shifting. Only use insert_instr_at() as fallback | -| Incorrect vreg remapping | Keep it simple: V accumulators aren't remapped (correct for `V=V+C`). IV uses get constant substitution. 
Fresh TMPs only for TMP vregs defined in body | -| Interactions with IV strength reduction | Unrolling eliminates the loop; IV SR detects no loops (safe) | -| Register pressure increase | Unrolled code reuses same VARs; linear scan handles spills | -| Body with internal branches | v1: bail on bodies containing JUMPIF (revisit later) | -| Operand pool growth | Pool only grows, old entries become dead — acceptable for small unrolls | diff --git a/docs/design_scalar_evolution.md b/docs/design_scalar_evolution.md deleted file mode 100644 index ed5b5008..00000000 --- a/docs/design_scalar_evolution.md +++ /dev/null @@ -1,216 +0,0 @@ -# Scalar Evolution / Loop Accumulator Optimization Design - -## Goal - -Recognize simple accumulation patterns in loops and replace them with a -closed-form computation, eliminating the loop entirely without unrolling. - -## Motivating Example - -After strlen folding, the loop: - -```c -int sum = 0; -for (int i = 0; i < 5; i++) { - sum += 5; // strlen("hello") folded to 5 -} -``` - -produces IR: - -``` -V1 <-- #0 ; sum = 0 -V2 <-- #0 ; i = 0 -loop: - CMP V2, #5 - JMP exit if >=S - V1 <-- V1 ADD #5 ; sum += 5 - V2 <-- V2 ADD #1 ; i++ - JMP loop -exit: - ... use V1 ... -``` - -Scalar evolution recognizes that `V1` has the closed form: -`V1_final = init + trip_count * stride = 0 + 5 * 5 = 25` - -The entire loop is replaced with: - -``` -V1 <-- #25 -``` - -## Relationship to Loop Unrolling - -These are complementary optimizations: - -| | Loop Unrolling | Scalar Evolution | -|---|---|---| -| Approach | Replicate body N times | Compute final value directly | -| When better | Body has side effects, memory ops | Body is pure accumulation | -| Code size | Grows with trip count | Constant (1-2 instructions) | -| Generality | Works for any small loop | Only for reducible patterns | - -Scalar evolution is strictly better when applicable, but applies to fewer cases. 
-Loop unrolling is more general and also enables scalar evolution indirectly -(by exposing constant patterns to the existing constant propagation). - -**Recommended order**: Try scalar evolution first; if it fails, fall back to -loop unrolling. - -## Scope - -**Patterns recognized** (initial implementation): - -1. **Constant accumulation**: `acc += constant` over N iterations - - Result: `acc = init + N * constant` -2. **Linear induction final value**: `i = 0; i < N; i += step` - - Result: `i_final = N` (or `init + trip_count * step`) -3. **Constant assignment in loop**: `x = constant` repeated N times - - Result: `x = constant` (one assignment) - -**Not in scope** (future work): -- Polynomial induction (`sum += i` → triangular number) -- Reduction with non-constant stride (`sum += arr[i]`) -- Floating-point accumulation (precision semantics differ) -- Multiple exit loops - -## Where It Fits in the Pipeline - -``` -Phase 1: Constant propagation + strlen folding (existing) -Phase 5a: Scalar evolution / loop replacement (NEW) -Phase 5b: Loop unrolling (for remaining loops) (NEW) -Phase 1': Re-run constant prop + DCE (collapse results) -Phase 5: LICM (existing, disabled) -Phase 6: IV strength reduction (existing) -``` - -Runs in the same slot as loop unrolling, just before it. - -## Algorithm - -### Step 1: Loop analysis - -For each detected loop (reuse `tcc_ir_detect_loops()`): - -1. Identify all **basic induction variables** (reuse `find_induction_vars()`) -2. Determine **trip count** (same as loop unrolling: constant init, limit, step) -3. Verify **single exit** from loop header - -### Step 2: Classify loop body vregs - -Scan all non-NOP instructions in the loop body. 
For each VAR vreg `V` defined -in the loop, classify it: - -- **Basic IV**: `V = V + const_step` (already identified) -- **Constant accumulator**: `V = V + const` or `V = V - const` - (where const does not depend on any loop-variant value) -- **Constant overwrite**: `V = const` (same constant every iteration) -- **Non-reducible**: anything else (memory store, function call, etc.) - -A loop is **fully reducible** if: -- Every instruction is either a NOP, an IV increment, a reducible accumulator - update, or a branch instruction (CMP/JMP) for loop control -- There are no STORE, CALL, or other side-effecting instructions - -### Step 3: Compute closed-form values - -For each reducible accumulator: - -| Pattern | Closed Form | -|---------|------------| -| `V = V + C` (accumulator) | `V_final = V_init + trip_count * C` | -| `V = V - C` | `V_final = V_init - trip_count * C` | -| `V = C` (overwrite) | `V_final = C` | -| IV `V += step` | `V_final = V_init + trip_count * step` | - -Compute `trip_count * C` at compile time (both are constants). If the result -overflows 32 bits, bail out (preserve runtime semantics). - -### Step 4: Replace loop with assignments - -1. NOP out all instructions from loop preheader through loop end -2. At the loop start position, emit: - - For each reducible VAR: `V <-- #closed_form_value` - - Fall through to the original exit target -3. If any VAR is used after the loop, make sure its final value is set - -### Step 5: Dead IV cleanup - -The IV initialization and any IV-only uses become dead. Existing DCE handles -this automatically. - -## API - -```c -/* In ir/opt.h */ - -/* Attempt to replace loops with closed-form scalar computations. - * Returns number of loops eliminated. 
*/ -int tcc_ir_opt_scalar_evolution(TCCIRState *ir); - -/* Variant using pre-detected loops */ -int tcc_ir_opt_scalar_evolution_with_loops(TCCIRState *ir, IRLoops *loops); -``` - -## Data Structures - -```c -/* Accumulator pattern found in a loop body */ -typedef struct LoopAccumulator { - int vreg; /* VAR vreg being accumulated */ - int init_val; /* Initial value (from preheader) */ - int stride; /* Constant added per iteration */ - int init_idx; /* Instruction index of initialization */ - int update_idx; /* Instruction index of accumulation in loop */ - enum { - ACCUM_ADD, /* V = V + C */ - ACCUM_SUB, /* V = V - C */ - ACCUM_ASSIGN, /* V = C (constant overwrite) */ - } kind; -} LoopAccumulator; - -#define MAX_ACCUMULATORS 8 -``` - -## Configuration - -Reuse `opt_loop_unroll` flag or add a separate `opt_scalar_evol` flag. -Enable at `-O1`. - -## Testing Strategy - -1. **Primary test**: `100_pure_func_strlen.c` - loop eliminated, sum = 25 -2. **New tests**: - - `sum += 3` over 10 iterations → sum = 30 - - `sum += i` (NOT reducible with initial impl - should fall through to - unrolling or remain as loop) - - Two accumulators in same loop: `sum1 += 2; sum2 += 3;` - - Loop with memory store in body (should NOT be eliminated) - - Trip count = 0 (loop never executes, preserve init values) - - Accumulator with negative stride: `sum -= 1` - - Overflow edge case: `sum += 0x40000000` over 8 iterations - -## Risks and Mitigations - -| Risk | Mitigation | -|------|-----------| -| Incorrect trip count for edge conditions | Handle `<`, `<=`, `!=` separately; test boundary values | -| Overflow semantics mismatch | Use 32-bit wrapping arithmetic (matches C unsigned); bail for signed overflow | -| Dead code after elimination | Existing DCE handles cleanup | -| Interaction with IV strength reduction | Eliminated loops have no IVs; SR skips them naturally | -| Missing a side effect in the loop | Conservative: any STORE/CALL/volatile makes loop non-reducible | - -## Implementation 
Steps - -1. Write `tcc_ir_opt_scalar_evolution()` in `ir/opt.c`: - a. Detect loops, find IVs, compute trip counts - b. Scan body for accumulator patterns - c. Check full reducibility (no side effects) - d. Compute closed-form values - e. Replace loop with constant assignments -2. Wire into pipeline before loop unrolling -3. Re-run Phase 1 constant prop after both passes -4. Add tests -5. Verify no regressions diff --git a/docs/fixes/20000313-1_value_tracking_addrtaken.md b/docs/fixes/20000313-1_value_tracking_addrtaken.md deleted file mode 100644 index 6f402cb8..00000000 --- a/docs/fixes/20000313-1_value_tracking_addrtaken.md +++ /dev/null @@ -1,238 +0,0 @@ -# Fix: Value Tracking Ignores Address-Taken Variables Across Calls - -**Test case**: `gcc.c-torture/execute/20000313-1.c` -**Symptom**: Exit code 1 (abort) with `-O1 -g`, passes without optimization. - -## Test Case - -```c -unsigned int buggy(unsigned int *param) -{ - unsigned int accu, zero = 0, borrow; - accu = - *param; // accu = 0xFFFFFFFF (negate 1) - borrow = - (accu > zero); // borrow = 0xFFFFFFFF - *param += accu; // *param = 1 + 0xFFFFFFFF = 0 - return borrow; -} - -int main(void) -{ - unsigned int param = 1; - unsigned int borrow = buggy(&param); - if (param != 0) abort(); // Should NOT abort - if (borrow + 1 != 0) abort(); // Should NOT abort - return 0; -} -``` - -Expected: `param == 0` after call (modified through pointer), `borrow == 0xFFFFFFFF`. - -## Root Cause - -The `tcc_ir_opt_value_tracking` pass in `ir/opt.c` (line ~919) incorrectly -constant-folds a comparison on a variable whose address was taken and passed to -a function call. 
- -### IR for `main` before optimization: - -``` -0000: V0 <-- #1 [ASSIGN] ; param = 1 -0001: T0 <-- &V0 ; take address of param -0002: PARAM0[call_0] T0 ; pass &param to buggy -0003: CALL GlobalSym(buggy) --> V1 ; call buggy(&param) -0004: CMP V0,#0 ; check if param == 0 -0005: JMP to 8 if "==" ; skip abort if true -0006: FUNCPARAMVOID #65536 -0007: CALL abort -``` - -### IR for `main` after optimization (BUGGY): - -``` -0000: V0 <-- #1 [ASSIGN] -0001: R4(T0) <-- &V0 -0002: PARAM0[call_0] R4(T0) -0003: CALL GlobalSym(buggy) --> R5(V1) -0004: NOP ; ← BUG: CMP was removed -0005: NOP ; ← BUG: JMP was removed -0006: FUNCPARAMVOID #65536 -0007: CALL abort ; ← always reached → crash -``` - -The value tracking pass sees `V0 = 1` at instruction 0000 and propagates this -constant through to instruction 0004 (`CMP V0, #0`). Since `1 != 0`, it -concludes the branch at 0005 is never taken and eliminates both the CMP and JMP -as NOPs. This causes the unconditional fall-through to `abort()`. - -**The pass ignores that V0's address was taken (`&V0`) and passed to `buggy()`, -which modifies `*param` (i.e., V0) through the pointer.** After the CALL, -V0's value is no longer known to be 1. - -## Disassembly Comparison - -### Without optimization (correct): - -```arm -; main: -10001198: movs r0, #1 ; param = 1 -1000119a: str.w r0, [r7, #-4] ; store to stack -1000119e: subs r4, r7, #4 ; r4 = &param -100011a0: mov r0, r4 -100011a2: bl buggy -100011a6: mov r5, r0 ; save borrow -100011a8: ldr.w r0, [r7, #-4] ; RELOAD param from stack -100011ac: cmp r0, #0 ; check param == 0 -100011ae: beq.w skip_abort1 -100011b2: bl abort -``` - -### With -O1 -g (broken): - -```arm -; main: -10001190: movs r0, #1 ; param = 1 -10001192: str.w r0, [r7, #-4] -10001196: subs r4, r7, #4 ; r4 = &param -10001198: mov r0, r4 -1000119a: bl buggy -1000119e: mov r5, r0 ; save borrow -100011a0: bl abort ; ALWAYS calls abort! 
CMP/branch gone -``` - -## Bug Location - -**File**: `ir/opt.c`, function `tcc_ir_opt_value_tracking` (line ~919) - -Two missing safety checks: - -### 1. Pattern 1 (line ~1019): Missing addrtaken guard on constant assignment - -```c -/* Pattern 1: Direct constant assignment: Vx <- #const */ -if (q->op == TCCIR_OP_ASSIGN && irop_is_immediate(src1)) -{ - if (dest_pos >= 0 && dest_pos <= max_vreg) - { - // BUG: No check for addrtaken! - state[dest_pos].is_constant = 1; - state[dest_pos].value = irop_get_imm64_ex(ir, src1); - } - continue; -} -``` - -The sibling pass `tcc_ir_opt_const_prop` (line ~340) correctly guards: - -```c -IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest_vr); -if (interval && interval->addrtaken) -{ - var_info[pos].def_count++; - var_info[pos].is_constant = 0; - continue; -} -``` - -### 2. Missing CALL invalidation (after line ~1108) - -The catch-all invalidation at line ~1108 only fires for instructions that -**define** a VAR vreg: - -```c -/* Any other instruction that defines a VAR vreg invalidates the constant */ -if (dest_pos >= 0 && dest_pos <= max_vreg && irop_config[q->op].has_dest) -{ - state[dest_pos].is_constant = 0; -} -``` - -But `FUNCCALLVOID` and `FUNCCALLVAL` do not define V0 — they define V1 (the -return value). V0 is modified **indirectly** through the pointer. The pass -never invalidates V0 across the call. - -## Proposed Fix - -Two changes in `tcc_ir_opt_value_tracking`: - -### Fix A: Never mark address-taken variables as constant - -At Pattern 1 (line ~1019), add the addrtaken guard before marking constant: - -```c -/* Pattern 1: Direct constant assignment: Vx <- #const */ -if (q->op == TCCIR_OP_ASSIGN && irop_is_immediate(src1)) -{ - if (dest_pos >= 0 && dest_pos <= max_vreg) - { - /* If address is taken, the variable can be modified through aliases; - * do not track it as constant. 
*/ - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest_vr); - if (interval && interval->addrtaken) - { - state[dest_pos].is_constant = 0; - } - else - { - state[dest_pos].is_constant = 1; - state[dest_pos].value = irop_get_imm64_ex(ir, src1); - } - } - continue; -} -``` - -This is the **minimal and safest fix**. If a variable's address is taken, we -simply never consider it constant, period. This matches the conservative -approach used by `tcc_ir_opt_const_prop`. - -### Fix B (belt-and-suspenders): Invalidate address-taken vars at CALLs - -After the catch-all at line ~1108, add explicit CALL handling: - -```c -/* Function calls can modify any address-taken variable through pointers */ -if (q->op == TCCIR_OP_FUNCCALLVOID || q->op == TCCIR_OP_FUNCCALLVAL) -{ - for (int v = 0; v <= max_vreg; v++) - { - if (state[v].is_constant) - { - int32_t vr = TCCIR_ENCODE_VREG(TCCIR_VREG_TYPE_VAR, v); - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, vr); - if (interval && interval->addrtaken) - state[v].is_constant = 0; - } - } -} -``` - -**Fix A alone is sufficient**, since it prevents addrtaken vars from ever -entering the constant state. Fix B is an extra safety net. - -### Also apply to Pattern 2 (line ~1023) - -The same addrtaken guard should be added to Pattern 2 (arithmetic with constant -operand) for completeness, since `Vx <- Vy + #const` could also propagate a -stale constant for an addrtaken variable. - -## Testing - -1. Verify the test passes with both `-O0` and `-O1 -g`: - ```bash - cd tests/ir_tests - python run.py -c ../gcctestsuite/.../20000313-1.c - python run.py -c ../gcctestsuite/.../20000313-1.c --cflags="-O1 -g" - ``` - -2. Run the full test suite to check for regressions: - ```bash - make test -j16 - make test-all - ``` - -## Risk Assessment - -**Low risk.** Fix A is purely conservative — it reduces the set of variables -eligible for constant folding. Any variable whose address is taken will simply -not be optimized by this pass. 
This matches the behavior already used by the -sibling `tcc_ir_opt_const_prop` pass and cannot introduce new miscompilations. diff --git a/docs/fixes/20000412-3_large_struct_implicit_decl.md b/docs/fixes/20000412-3_large_struct_implicit_decl.md deleted file mode 100644 index 54f7c0a8..00000000 --- a/docs/fixes/20000412-3_large_struct_implicit_decl.md +++ /dev/null @@ -1,310 +0,0 @@ -# Fix: Large Struct Pass-by-Value Broken for Implicitly Declared Functions - -**Test case**: `gcc.c-torture/execute/20000412-3.c` -**Symptom**: Exit code 1 (abort) with `-O0`. - -## Test Case - -```c -typedef struct { - char y; - char x[32]; -} X; /* sizeof(X) == 33 bytes */ - -int z(void) -{ - X xxx; - xxx.x[0] = xxx.x[31] = '0'; - xxx.y = 0xf; - return f(xxx, xxx); /* f() not yet declared — implicit declaration */ -} - -int main(void) -{ - int val = z(); - if (val != 0x60) - abort(); - exit(0); -} - -int f(X x, X y) -{ - if (x.y != y.y) - return 'F'; - return x.x[0] + y.x[0]; /* expected: '0' + '0' = 0x60 = 96 */ -} -``` - -Expected: `f` returns `0x60` (96). Actual: exit code 1 (abort). - -## Root Cause - -The struct `X` is 33 bytes. Per ARM AAPCS, composite types larger than 16 bytes -must be passed via **invisible reference** — the caller allocates a copy on the -stack and passes a pointer to that copy. - -### Callee side (correct) - -When `f(X x, X y)` is compiled, the compiler knows it has 33-byte struct -parameters. The IR treats `P0`/`P1` as 4-byte pointers and dereferences them: - -``` -0002: T0 <-- StackLoc[-4] [LOAD] ; reload pointer -0004: T2 <-- T0***DEREF*** [LOAD] ; dereference: x.y = *(pointer) -``` - -The generated ARM correctly uses `ldrb r2, [r0, #0]` (indirect load through -pointer). - -### Caller side (broken) - -When `z()` calls `f(xxx, xxx)`, the function `f` has **no visible prototype** -(it's declared after `z`). The compiler sees it as `FUNC_OLD` (K&R-style / -implicit declaration). 
- -The IR emits: - -``` -0009: PARAM0[call_0] StackLoc[-33] -0010: PARAM1[call_0] StackLoc[-33] -0011: CALL GlobalSym(935) --> T6 -``` - -These are raw struct values at `StackLoc[-33]`, not pointers to copies. - -The generated ARM loads the **first 4 bytes of the struct value** instead of -passing the struct's address: - -```arm -sub.w ip, r7, #33 ; ip = &xxx (address of struct on stack) -ldr.w r0, [ip] ; BUG: r0 = first 4 bytes of struct DATA -sub.w ip, r7, #33 -ldr.w r1, [ip] ; BUG: r1 = first 4 bytes of struct DATA -bl f -``` - -The callee then dereferences these garbage "pointers" (actually `0x0f303030` -or similar), causing a wrong result or crash. - -### The mismatch - -| | Caller (`z`) | Callee (`f`) | -|---|---|---| -| **Sees `f` as** | `int f()` (implicit, no param info) | `int f(X x, X y)` (33-byte struct params) | -| **Passes in r0/r1** | First 4 bytes of struct value | Expects pointers to struct copies | - -## Bug Location - -**File**: `tccgen.c`, function `gfunc_param_typed` (line ~6469) - -The AAPCS invisible-reference conversion for large structs (lines 6505–6552) -is inside the `else` branch that only executes when a proper prototype exists -(`arg != NULL`): - -```c -static void gfunc_param_typed(Sym *func, Sym *arg) -{ - func_type = func->f.func_type; - if (func_type == FUNC_OLD || (func_type == FUNC_ELLIPSIS && arg == NULL)) - { - /* default casting : only need to convert float to double */ - if ((vtop->type.t & VT_BTYPE) == VT_FLOAT) - gen_cast_s(VT_DOUBLE); - // ... other default casts ... - // *** NO large-struct handling here! *** - } - else if (arg == NULL) - { - tcc_error("too many arguments to function"); - } - else - { - // ... prototype-aware path ... - if ((type.t & VT_BTYPE) == VT_STRUCT) - { - int align, size = type_size(&type, &align); - if (size > 16) - { - /* AAPCS invisible reference: allocate temp copy, pass pointer */ - // ... mk_pointer() + gaddrof() ... 
- } - } - gen_assign_cast(&type); - } -} -``` - -The `FUNC_OLD` path (lines 6475–6493) handles only `float→double` promotion, -bitfield casts, and `VT_MUSTCAST`. It has **no handling for large structs**. - -## Proposed Fix - -Add large-struct invisible-reference handling to the `FUNC_OLD` / no-prototype -path, since the ABI convention must be followed regardless of whether a -prototype is visible. - -### Fix: Add AAPCS struct handling to the FUNC_OLD path - -In `gfunc_param_typed`, at the top of the `FUNC_OLD` branch (line ~6477), -before existing default casting: - -```c -if (func_type == FUNC_OLD || (func_type == FUNC_ELLIPSIS && arg == NULL)) -{ - /* ARM AAPCS: large structs must use invisible reference even without - * a prototype, since the ABI is a property of the callee's compiled - * code, not the caller's view of the declaration. */ - if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) - { - int align, size = type_size(&vtop->type, &align); - if (size > 16) - { - if (nocode_wanted) - return; - if (!(vtop->r & VT_LVAL)) - tcc_error("cannot pass large struct by value"); - - int temp_vr; - int tmp_loc = get_temp_local_var(size, align, &temp_vr); - - SValue dst; - memset(&dst, 0, sizeof(dst)); - dst.type = vtop->type; - dst.r = VT_LOCAL | VT_LVAL; - dst.vr = temp_vr; - dst.c.i = tmp_loc; - vpushv(&dst); - vswap(); - vstore(); - - mk_pointer(&vtop->type); - gaddrof(); - return; - } - } - - /* existing default casting: float to double, etc. */ - if ((vtop->type.t & VT_BTYPE) == VT_FLOAT) - { - gen_cast_s(VT_DOUBLE); - } - // ... -} -``` - -This duplicates the logic from the prototype-aware path (lines 6505–6552) but -uses `vtop->type` (the actual argument type) instead of `arg->type` (the -parameter type from the prototype, which doesn't exist here). - -### Alternative: Extract shared helper - -To avoid duplication, extract a helper function: - -```c -/* Convert a large struct argument to an invisible-reference pointer (AAPCS). 
- * Returns 1 if conversion was applied, 0 otherwise. */ -static int maybe_convert_large_struct_to_ref(CType *type) -{ - if ((type->t & VT_BTYPE) != VT_STRUCT) - return 0; - int align, size = type_size(type, &align); - if (size <= 16) - return 0; - if (nocode_wanted) - return 1; - if (!(vtop->r & VT_LVAL)) - tcc_error("cannot pass large struct by value"); - - int temp_vr; - int tmp_loc = get_temp_local_var(size, align, &temp_vr); - - SValue dst; - memset(&dst, 0, sizeof(dst)); - dst.type = *type; - dst.r = VT_LOCAL | VT_LVAL; - dst.vr = temp_vr; - dst.c.i = tmp_loc; - vpushv(&dst); - vswap(); - vstore(); - - mk_pointer(&vtop->type); - gaddrof(); - return 1; -} -``` - -Then call it from both paths in `gfunc_param_typed`: - -```c -if (func_type == FUNC_OLD || (func_type == FUNC_ELLIPSIS && arg == NULL)) -{ - if (maybe_convert_large_struct_to_ref(&vtop->type)) - return; - /* existing default casts ... */ -} -else -{ - type = arg->type; - type.t &= ~VT_CONSTANT; - if (maybe_convert_large_struct_to_ref(&type)) - return; - gen_assign_cast(&type); -} -``` - -## Disassembly Comparison - -### Current (broken): - -```arm -; z() calling f(): -sub.w ip, r7, #33 ; ip = &xxx -ldr.w r0, [ip] ; r0 = WRONG: loads struct bytes 0-3 -sub.w ip, r7, #33 -ldr.w r1, [ip] ; r1 = WRONG: loads struct bytes 0-3 -bl f -``` - -### Expected (after fix): - -```arm -; z() calling f(): -; allocate temp copy 1 on stack, memcpy xxx into it -; allocate temp copy 2 on stack, memcpy xxx into it -; r0 = pointer to temp copy 1 -; r1 = pointer to temp copy 2 -bl f -``` - -## Testing - -1. Verify the test passes: - ```bash - cd tests/ir_tests - python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20000412-3.c --cflags="-O0" - ``` - -2. Run the full test suite to check for regressions: - ```bash - make test -j16 - make test-all - ``` - -3. 
Also test with a prototype-visible variant to confirm no regression: - ```c - int f(X x, X y); /* forward declaration */ - int z(void) { X xxx; ... return f(xxx, xxx); } - ``` - -## Risk Assessment - -**Low risk.** The fix adds handling to a code path that previously had none for -this case. It only affects `FUNC_OLD` (implicit/K&R) calls with struct arguments -larger than 16 bytes — a narrow and well-defined scenario. The same conversion -logic already works correctly for prototype-visible calls. - -One caveat: if the callee is compiled by a different compiler that does NOT use -invisible references for large structs on `FUNC_OLD` calls, there would be an -ABI mismatch. However, GCC and Clang both follow the AAPCS regardless of -prototype visibility, so this fix aligns TCC with standard behavior. diff --git a/docs/fixes/20010122-1_builtin_return_address.md b/docs/fixes/20010122-1_builtin_return_address.md deleted file mode 100644 index 3c038e68..00000000 --- a/docs/fixes/20010122-1_builtin_return_address.md +++ /dev/null @@ -1,503 +0,0 @@ -# Fix: `__builtin_return_address` / `__builtin_frame_address` Broken on ARM Thumb-2 - -**Test case**: `gcc.c-torture/execute/20010122-1.c` -**Symptom**: Exit code 1 (abort) with `-O0 -g`. - -## Test Case Summary - -The test validates that `__builtin_return_address(0)` returns a consistent value -regardless of surrounding code (calls to `dummy()` before/after), and that -`__builtin_return_address(1)` correctly walks one frame up. 
- -```c -void NOINLINE *test1 (void) { - return __builtin_return_address(0); // leaf — no other calls -} -void NOINLINE *test2 (void) { - dummy(); - return __builtin_return_address(0); // call before -} -void NOINLINE *test3 (void) { - void *t = __builtin_return_address(0); - dummy(); - return t; // call after -} -// test4a–test6a: __builtin_return_address(1) from nested call via alloca -// main checks: test1() == test2() == test3() → abort if not -``` - -## Root Cause - -Three interrelated bugs in how `__builtin_return_address` is implemented. - -### Bug 1: Hardcoded offset `2 * PTR_SIZE` doesn't match frame layout - -`tccgen.c:7164-7176` adds `2 * PTR_SIZE = 8` to the frame pointer to locate the -saved LR. This generates IR `StackLoc[8] [LOAD]`, meaning "load from FP + 8." - -But the actual prologue (`arm-thumb-gen.c:5881-5898`) does a single push of all -registers then `mov r7, sp`, placing FP at the bottom of the push area. ARM push -stores registers in ascending register-number order, so for -`push {r4, r5, r7, r12, lr}`: - -``` -[FP + 16] = lr (r14) ← return address -[FP + 12] = r12 (alignment pad) -[FP + 8] = r7 (old FP) -[FP + 4] = r5 -[FP + 0] = r4 ← FP points here -``` - -The offset from FP to LR = `offset_to_args - 4`, which varies per function. -The hardcoded `8` is almost never correct. - -### Bug 2: Leaf functions don't save LR to stack - -`arm-thumb-gen.c:5811`: LR is only pushed for non-leaf functions. `test1` is a -leaf → LR never pushed → `StackLoc[8]` reads garbage → `test1() != test2()` → -abort. - -### Bug 3: Frame chain walk broken for level >= 1 - -For level >= 1, the code dereferences FP (`*FP`) expecting old FP. But since -FP = bottom of push area, `[FP + 0]` = lowest-numbered pushed register (e.g. -r4), NOT the saved old FP. Frame walking is impossible. - -## Fix: Standard Thumb Frame Record via Two-Phase Push - -Restructure the prologue so FP always points to a standard `{old_FP, LR}` frame -record, matching GCC's ARM Thumb convention. 
This fixes all three bugs. - -### New stack layout - -``` -Higher addresses -───────────────────────────────── - caller's stack args FP + 8 + N -───────────────────────────────── - saved LR FP + 4 ← __builtin_return_address(0) - saved r7 (old FP) FP + 0 ← *FP = parent frame pointer -═══════════════ FP (r7) ══════════ - callee-saved r11 FP - 4 ┐ - callee-saved r5 FP - 8 │ callee_push_size bytes - callee-saved r4 FP - 12 ┘ -───────────────────────────────── - locals / spills FP - callee_push_size - 4 ... -───────────────────────────────── - SP -Lower addresses -``` - -Key invariants: -- `[FP + 0]` = saved old FP (always) -- `[FP + 4]` = saved LR (always) -- `offset_to_args = 8` (always — the frame record `{r7, lr}` is exactly 8 bytes) -- Local/spill at IR offset `X` → physical address `FP + X - callee_push_size` - -### Step 1: Add `force_lr_save` flag - -**File: `tcc.h` (line ~1116)** - -Add a new flag next to `force_frame_pointer`: - -```c -uint8_t force_frame_pointer; /* required for VLA/dynamic SP even if omit_frame_pointer */ -uint8_t force_lr_save; /* __builtin_return_address needs LR saved even in leaf */ -``` - -**File: `tccgen.c` (line ~11413)** - -Reset the flag at function start, alongside `force_frame_pointer`: - -```c -tcc_state->force_frame_pointer = 0; -tcc_state->need_frame_pointer = 0; -tcc_state->force_lr_save = 0; -``` - -### Step 2: Set flags in `__builtin_return_address` handler - -**File: `tccgen.c` (line ~7143)** - -At the start of the `TOK_builtin_frame_address` / `TOK_builtin_return_address` -case, force both frame pointer and LR save: - -```c -case TOK_builtin_frame_address: -case TOK_builtin_return_address: -{ - int tok1 = tok; - tcc_state->force_frame_pointer = 1; - if (tok1 == TOK_builtin_return_address) - tcc_state->force_lr_save = 1; - // ... 
rest of handler -``` - -This ensures: -- The function gets a frame pointer (standard two-push layout) -- LR is pushed even if the function is a leaf - -### Step 3: Fix offset from `2 * PTR_SIZE` to `PTR_SIZE` - -**File: `tccgen.c` (line ~7168)** - -```c -// BEFORE: -#ifdef TCC_TARGET_ARM - vpushi(2 * PTR_SIZE); -// AFTER: -#ifdef TCC_TARGET_ARM - vpushi(PTR_SIZE); -``` - -Because `[FP + 4] = LR` in the new layout (was `[FP + 8]` assumption before). - -### Step 4: Restructure prologue - -**File: `arm-thumb-gen.c`, function `tcc_gen_machine_prolog` (line ~5794)** - -Add a new global to track the callee-saved push size: - -```c -int callee_push_size = 0; /* bytes pushed BELOW FP (callee-saved regs) */ -uint32_t callee_saved_regs = 0; /* register mask for second push */ -``` - -In `tcc_gen_machine_prolog`, replace the current single-push logic: - -```c -// ── Phase 1: Determine which registers need saving ── -uint16_t frame_regs = 0; // {r7, lr} — the frame record -uint16_t callee_regs = 0; // everything else (r4-r6, r8-r11) -int callee_count = 0; - -// Frame record: always r7; lr if non-leaf or force_lr_save -frame_regs = (1 << R_FP); -if (!leaffunc || tcc_state->force_lr_save) { - frame_regs |= (1 << R_LR); -} - -// Callee-saved: r4-r11 as determined by used_registers -for (int i = R4; i <= R11; ++i) { - if (tcc_state->text_and_data_separation && i == R9) continue; - if (i == R_FP) continue; // r7 is in frame_regs - if (used_registers & (1ULL << i)) { - callee_regs |= (1 << i); - callee_count++; - } -} -// Add R10 for nested function static chain if needed -if (extra_prologue_regs & (1u << ARM_R10)) { - if (!(callee_regs & (1u << ARM_R10))) { - callee_regs |= (1u << ARM_R10); - callee_count++; - } -} -// Pad callee-saved to even count for 8-byte alignment -if (callee_count % 2 != 0) { - callee_regs |= (1 << R12); - callee_count++; -} - -// ── Phase 2: need_frame_pointer decision ── -// (same as current logic but also force when force_lr_save is set) -if (func_var || 
tcc_state->force_lr_save) - tcc_state->need_frame_pointer = 1; -const int need_fp = (tcc_state->force_frame_pointer - || tcc_state->need_frame_pointer - || (stack_size > 0)); -tcc_state->need_frame_pointer = need_fp; - -// ── Phase 3: Emit pushes ── -if (need_fp) { - // ── Two-phase push ── - // Phase A: frame record - ot_check(th_push(frame_regs)); - ot_check(th_mov_reg(R_FP, R_SP, ...)); // mov r7, sp - // Phase B: callee-saved (below FP) - if (callee_count > 0) - ot_check(th_push(callee_regs)); - - callee_push_size = callee_count * 4; - callee_saved_regs = callee_regs; - - // offset_to_args: distance from FP to caller's stack args - // With standard frame record: always 8 (the {r7, lr} pair) - offset_to_args = 8; - - pushed_registers = frame_regs | callee_regs; // for dry-run tracking -} else { - // ── No frame pointer: single push of callee-saved + LR ── - // (same as current behavior for trivial functions) - uint16_t regs = callee_regs; - int count = callee_count; - if (!leaffunc || tcc_state->force_lr_save) { - regs |= (1 << R_LR); - count++; - } - if (count % 2 != 0) { regs |= (1 << R12); count++; } - if (count > 0) ot_check(th_push(regs)); - callee_push_size = 0; - callee_saved_regs = 0; - offset_to_args = count * 4; - pushed_registers = regs; -} - -// ── Phase 4: Allocate locals ── -if (stack_size & 7) stack_size = (stack_size + 7) & ~7; -allocated_stack_size = stack_size; -if (stack_size > 0) gadd_sp(-stack_size); -``` - -**Important**: The `extra_prologue_regs & (1u << R_LR)` check (line ~5818) for -dry-run LR discovery also needs updating. When need_fp = 1, LR is always in -`frame_regs`, so the dry-run can only add it to the non-FP case. 
- -### Step 5: Restructure epilogue - -**File: `arm-thumb-gen.c`, function `tcc_gen_machine_epilog` (line ~6190)** - -Replace the current single-pop epilogue: - -```c -ST_FUNC void tcc_gen_machine_epilog(int leaffunc) -{ - int lr_saved = pushed_registers & (1 << R_LR); - - if (tcc_state->need_frame_pointer) { - // ── Two-phase pop (mirrors two-phase push) ── - - if (callee_push_size > 0) { - // SP = FP - callee_push_size (point to callee-saved area) - // Works correctly even with alloca/VLA since FP is stable - ot_check(th_sub_imm(R_SP, R_FP, callee_push_size, ...)); - // Restore callee-saved registers - ot_check(th_pop(callee_saved_regs)); - // SP now = FP (pointing at frame record) - } else { - // No callee-saved: just restore SP from FP - ot_check(th_mov_reg(R_SP, R_FP, ...)); - } - - if (lr_saved) { - // Pop frame record: restore old FP into r7, return via PC - ot_check(th_pop((1 << R_FP) | (1 << R_PC))); - } else { - // Leaf function with frame pointer but no LR saved - ot_check(th_pop(1 << R_FP)); - ot_check(th_bx_reg(R_LR)); - } - } else { - // ── No frame pointer: existing behavior ── - if (allocated_stack_size > 0) - gadd_sp(allocated_stack_size); - if (lr_saved) { - pushed_registers |= (1 << R_PC); - pushed_registers &= ~(1 << R_LR); - ot_check(th_pop(pushed_registers)); - } else { - if (pushed_registers > 0) ot_check(th_pop(pushed_registers)); - ot_check(th_bx_reg(R_LR)); - } - } - - // Common cleanup - thumb_gen_state.generating_function = 0; - th_literal_pool_generate(); - thumb_free_call_sites(); -} -``` - -### Step 6: Adjust FP-relative local/spill offsets - -With callee-saved registers pushed below FP, all FP-relative local accesses -must account for the gap. A local at IR offset `-4` is now physically at -`FP - callee_push_size - 4`. - -**Approach**: Create a helper and apply it at every FP-relative local access -point. Do NOT adjust param accesses (those are above FP and already correct). 
- -```c -// New helper in arm-thumb-gen.c: -static inline int fp_adjust_local_offset(int frame_offset, int is_param) -{ - // Params are above FP (positive direction), no adjustment needed - // Locals/spills are below FP and must skip past callee-saved area - if (!is_param && tcc_state->need_frame_pointer) - return frame_offset - callee_push_size; - return frame_offset; -} -``` - -**Apply at these locations** (all in `arm-thumb-gen.c`): - -1. **`tcc_machine_load_spill_slot`** (line ~2104): spill slots are always locals - ```c - frame_offset = fp_adjust_local_offset(frame_offset, 0); - ``` - -2. **`tcc_machine_store_spill_slot`** (line ~2122): same - ```c - frame_offset = fp_adjust_local_offset(frame_offset, 0); - ``` - -3. **`tcc_machine_addr_of_stack_slot`** (line ~2852): has `is_param` flag - ```c - frame_offset = fp_adjust_local_offset(frame_offset, is_param); - ``` - -4. **`tcc_machine_can_encode_stack_offset_for_reg`** (line ~2080): used for - encoding checks — apply adjustment before the check - -5. **`tcc_machine_can_encode_stack_offset_with_param_adj`** (line ~2094): - applies offset_to_args for params, also needs local adjustment - -6. **IROP_TAG_STACKOFF handling** in the main codegen (line ~3244): - ```c - int frame_offset = irop_get_stack_offset(src); - // Apply callee-saved gap for locals - if (!src.is_param) - frame_offset = fp_adjust_local_offset(frame_offset, 0); - // Then apply offset_to_args for params (existing code) - if (src.is_param && frame_offset >= 0) - frame_offset += offset_to_args; - ``` - -7. **LEA operations** (line ~6450+): same pattern as IROP_TAG_STACKOFF - -8. **FP offset cache** (`get_cached_stack_addr_reg`, line ~4551): cache keys - must use adjusted offsets. Adjust before lookup: - ```c - if (!op.is_param) - frame_offset = fp_adjust_local_offset(frame_offset, 0); - if (op.is_param) - frame_offset += offset_to_args; - ``` - -9. 
**`tcc_machine_store_param_slot`** (line ~2157): already adds offset_to_args, - no local adjustment needed (it's always for params) - -10. **Parameter shuffle in prologue** (line ~5950+): accesses incoming stack - params at `offset + offset_to_args`. Since offset_to_args is now 8 (not - total push size), and these params are above the frame record, this is - correct. No change needed. - -### Step 7: Adjust variadic function handling - -**File: `arm-thumb-gen.c` (line ~5935)** - -Currently saves r0-r3 at `[FP - 16]` to `[FP - 4]`. With callee-saved below -FP, these fixed offsets collide with callee-saved registers. - -Two options: - -**Option A** (recommended): Reserve the variadic area as part of the callee-saved -region by saving r0-r3 AFTER the callee-saved push, at offsets relative to the -new SP: - -```c -// The variadic save area must be below callee-saved registers -// Adjust offsets: old [FP - 16..FP - 4] → new [FP - callee_push_size - 16..FP - callee_push_size - 4] -tcc_gen_machine_store_to_stack(R0, -callee_push_size - 16); -tcc_gen_machine_store_to_stack(R1, -callee_push_size - 12); -tcc_gen_machine_store_to_stack(R2, -callee_push_size - 8); -tcc_gen_machine_store_to_stack(R3, -callee_push_size - 4); -``` - -The `tcc_gen_machine_store_to_stack` helper stores relative to FP, so these -adjusted offsets place the saves below the callee-saved area. - -Similarly, the stack-args pointer at `[FP - 20]` becomes -`[FP - callee_push_size - 20]`, and the named-arg-bytes count at `[FP - 24]` -becomes `[FP - callee_push_size - 24]`. - -**Option B**: Include the variadic save area in the IR's stack frame (negative -offsets from `loc`), so it gets the callee_push_size adjustment automatically -via `fp_adjust_local_offset`. This requires the IR to know about variadic layout -at allocation time, which may be complex. 
- -### Step 8: Adjust static chain (nested functions) - -**File: `arm-thumb-gen.c` (line ~5912)** - -The static chain register (R10) is saved at `[FP - 4]` (CHAIN_SLOT_OFFSET). -With callee-saved below FP, adjust to `[FP - callee_push_size - 4]`. - -Search for `CHAIN_SLOT_OFFSET` or `-4` used for the chain slot and update: - -```c -// Old: -tcc_gen_machine_store_to_stack(R10, -4); // chain at [FP - 4] -// New: -tcc_gen_machine_store_to_stack(R10, -callee_push_size - 4); -``` - -Also update the `resolve_chain_base` function (line ~219) which reads the chain -at `[FP - 4]`: -```c -load_from_base_ir(out_scratch->reg, ..., callee_push_size + 4 /* abs offset */, - 1 /* sign: negative */, ...); -``` - -### Step 9: Verify `tcc_gen_machine_store_to_stack` helper - -Confirm this helper stores relative to FP (not SP). If it uses the -`need_frame_pointer ? R_FP : R_SP` pattern, it should work as-is since we're -always in the need_fp = 1 case for two-push functions. - -### Step 10: Handle dry-run codegen - -The two-pass codegen system (dry-run then real emit) discovers additional -register pushes during pass 1. Key concern: the dry-run's `lr_push_count` and -`scratch_regs_pushed` tracking must work with the new push structure. - -When the dry-run discovers LR needs saving (e.g. for a scratch push), this info -feeds into `extra_prologue_regs`. In the new layout, LR is always in the frame -record when need_fp = 1, so extra_prologue_regs only affects the no-FP case. - -Review `arm-thumb-gen.c:784-798` where `lr_saved_in_prologue` is computed and -update to match the new push structure. 
- -### Step 11: Edge case — `need_frame_pointer = 0` - -When `need_fp = 0` (very simple leaf functions, no locals, no spills): -- No two-phase push — use the existing single-push behavior -- `callee_push_size = 0` -- `offset_to_args = count * 4` (number of pushed regs × 4) -- No FP-relative accesses (no locals exist) -- `__builtin_return_address` forces need_fp = 1 (via `force_frame_pointer`) - -No changes needed for this case. - -## Testing - -```bash -# Primary test -cd tests/ir_tests -python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20010122-1.c --cflags="-O0 -g" -python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20010122-1.c --cflags="-O1 -g" - -# Full regression suites -make test -j16 # IR tests -make test-asm -j16 # Assembly tests -make test-all # IR + GCC torture -``` - -Key regression scenarios to watch: -- Variadic functions (printf, va_list) -- Nested functions with captured variables -- Functions with alloca/VLA -- Functions with many spills (large offset encoding) -- 64-bit operations (paired register spill/reload) -- Functions with no locals (need_fp = 0 path unchanged) - -## Risk Assessment - -**Medium-high risk.** This changes every function's prologue/epilogue and all -FP-relative offset calculations. The fix is architecturally correct (matches -GCC's Thumb convention), but the large surface area requires thorough testing. - -The `fp_adjust_local_offset` approach centralizes the adjustment, minimizing -the chance of missing a location. The key risk is missing an offset adjustment -site in the backend, which would manifest as accessing the wrong stack slot -(likely a callee-saved register value instead of a local variable). 
diff --git a/docs/fixes/20030914-1_long_double_param_assign.md b/docs/fixes/20030914-1_long_double_param_assign.md deleted file mode 100644 index 6634c52e..00000000 --- a/docs/fixes/20030914-1_long_double_param_assign.md +++ /dev/null @@ -1,94 +0,0 @@ -# Bug: `long double` parameter `+=` produces wrong result - -## Test case -``` -tests/gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20030914-1.c -``` - -## Symptom -`pc += pb.val[i]` has no effect when `pc` is a `long double` **parameter** — result stays at 10000.0 instead of accumulating to 10136.0. - -## Original error (may have been fixed separately) -``` -tcc_ir_vreg_live_interval: invalid vreg: -2 -``` -This no longer reproduces on current code. The remaining issue is pure runtime correctness. - -## Reproduction -```bash -cd tests/ir_tests -python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20030914-1.c --cflags="-O1" -# Exit code: 1 (abort called because f() returns 10000.0 instead of 10136.0) -``` - -## Minimal reproducer -```c -long double add_to_param(long double pc, int val) { - pc += val; // BUG: has no effect - return pc; -} -``` -- `long double` param `+=` int → **broken** (returns original value) -- `long double` local `+=` int → works fine - -## Root cause analysis (in progress) - -### IR generated for the broken case -``` -0000: PARAM0[call_0] P1 # convert val (int) to double -0001: CALL __aeabi_i2d --> T0 -0002: PARAM0[call_1] P0 # add P0 + T0 -0003: PARAM1[call_1] T0 -0004: CALL __aeabi_dadd --> T1 -0005: P0 <-- T1 [STORE] # store result back to P0 ← BUG HERE -0006: T2 <-- P0 [LOAD] # load P0 for return -0007: RETURNVALUE T2 -``` - -After register allocation: -``` -0005: R4(P0) <-- R0(T1) [STORE] # only writes low word! 
-0006: R0(T2) <-- R4(P0) [LOAD] # reads R4 (new low) + R5 (stale high) -``` - -### Disassembly confirms the bug -```asm -; Prologue: P0 (long double, 64-bit) saved to register pair -mov r4, r0 ; save P0 low word -mov r5, r1 ; save P0 high word - -; ... __aeabi_i2d and __aeabi_dadd calls ... -; Result of dadd is in (r0, r1) - -mov r4, r0 ; ← BUG: only stores low word to r4 - ; r5 (high word) is NOT updated with r1! - -; Return: -mov r0, r4 ; low word (correct - new value) -mov r1, r5 ; high word (WRONG - still original value!) -``` - -### Why it happens -The ASSIGN operation (`P0 <-- T1`) goes through `tcc_gen_machine_assign_op()` in [arm-thumb-gen.c](arm-thumb-gen.c#L6830). This function checks `irop_is_64bit(dest)` to decide whether to use the 64-bit assign path (`assign_op_64bit()`). - -**Hypothesis**: The `btype` field on the P0 destination operand is not set to `IROP_BTYPE_FLOAT64` (value 3), so `irop_is_64bit()` returns false, and the code falls through to the simple 32-bit `mov` path. - -### Debug instrumentation added -Temporary debug print added at [ir/codegen.c](ir/codegen.c) line ~1508 (TCCIR_OP_ASSIGN case) to verify the btype value at codegen time. **This needs to be built and tested.** - -## Next steps - -1. **Build with debug print** and run the test to confirm the btype value on the ASSIGN dest operand -2. **Trace where btype gets lost** — either: - - The IR generation (`tccgen.c`) doesn't set btype when creating the ASSIGN to P0 - - The register allocation pass (`tccls.c`) or fill-registers pass strips/overwrites the btype - - The operand encoding rounds trips incorrectly for parameter vregs -3. **Fix**: Ensure the `btype` is preserved as `IROP_BTYPE_FLOAT64` for `long double` parameter destinations in ASSIGN operations -4. **Verify** with the original test and the minimal reproducer -5. 
**Remove debug instrumentation** - -## Key files -- [arm-thumb-gen.c](arm-thumb-gen.c#L6726-L6870) — `assign_op_64bit()` and `tcc_gen_machine_assign_op()` -- [tccir_operand.h](tccir_operand.h#L201) — `irop_is_64bit()` checks btype -- [ir/mat.c](ir/mat.c#L671) — `tcc_ir_materialize_dest_ir()` also checks `irop_is_64bit()` -- [ir/codegen.c](ir/codegen.c#L1508) — ASSIGN dispatch (debug print added here) diff --git a/docs/fixes/omit_frame_pointer.md b/docs/fixes/omit_frame_pointer.md deleted file mode 100644 index 4d74f6ac..00000000 --- a/docs/fixes/omit_frame_pointer.md +++ /dev/null @@ -1,170 +0,0 @@ -# Plan: Omit Frame Pointer When Safe - -**Goal**: Eliminate unnecessary frame pointer (R7) setup in functions where SP -is statically known, saving 2-3 instructions per function and freeing R7 for -register allocation. - -**Current state**: GCC `-O2` omits the frame pointer for `main` in -`hello_inline.txt` (16 instructions), while TCC always emits it (20 instructions). - -## Problem - -In `arm-thumb-gen.c:6828`, the frame pointer decision is: - -```c -const int need_fp = (tcc_state->force_frame_pointer - || tcc_state->need_frame_pointer - || (stack_size > 0)); // <-- too conservative -``` - -Any function with locals or spills gets a frame pointer. The `stack_size > 0` -condition exists because **SP moves dynamically** during function calls: - -- `func_call_mop` does `gadd_sp(-stack_size)` before each call to reserve - outgoing stack args, then `gadd_sp(stack_size)` after (lines 8574-8577, - 8644-8648). -- Nested call preservation pushes R0-R3 onto the stack (lines 8566-8569). - -When SP moves, SP-relative offsets to locals become invalid. The frame pointer -provides a stable base. Without it, removing `stack_size > 0` causes widespread -test failures. 
- -## Key Insight - -The IR already pre-computes the maximum outgoing call argument area: - -- `ir->call_outgoing_size` — max bytes needed across all calls (`tccir.h:454`) -- `ir->call_outgoing_base` — frame offset of the reserved area (`tccir.h:453`) -- `ir/codegen.c:1329-1336` reserves this space in the stack frame layout - -But the backend ignores this and still does per-call dynamic SP adjustments. - -## Implementation Plan - -### Phase 1: Use Pre-Reserved Outgoing Area for Stack Args - -**Files**: `arm-thumb-gen.c` - -1. **Replace `gadd_sp(-stack_size)` with offset-based stores in `func_call_mop`** - - Currently (line 8574): `gadd_sp(-stack_size)` lowers SP, then - `store_word_to_stack(reg, stack_offset)` stores relative to the new SP. - - Change: compute `outgoing_base = ir->call_outgoing_base` (FP-relative - offset). Store stack args at `[base_reg + outgoing_base + stack_offset]` - where `base_reg` is FP or SP depending on `need_frame_pointer`. - - Remove the `gadd_sp(-stack_size)` / `gadd_sp(stack_size)` pair. - -2. **Adapt `store_word_to_stack` and `place_stack_arg_*` functions** - - These currently store at `[SP + offset]` assuming SP was already lowered. - - Change them to accept a base register + base offset, or pass the outgoing - base through the `CallGenContext`. - -3. **Handle nested call R0-R3 preservation without PUSH/POP** - - Currently `th_push(arg_regs_push_mask)` / `th_pop(...)` dynamically moves SP. - - Option A: Reserve slots for R0-R3 preservation in the frame (alongside - outgoing area). Store/load explicitly instead of push/pop. - - Option B: Move the nested-call saves to callee-saved spill slots allocated - during register allocation. (More complex, may not be needed initially.) - -### Phase 2: Remove `stack_size > 0` from Frame Pointer Decision - -**Files**: `arm-thumb-gen.c` - -4. 
**Update the `need_fp` condition** (line 6828): - ```c - const int need_fp = (tcc_state->force_frame_pointer - || tcc_state->need_frame_pointer); - ``` - The remaining conditions (`force_frame_pointer`, variadic, `force_lr_save`) - already cover the cases that truly need FP. - -5. **Verify `fp_adjust_local_offset`** (line 192): - - This adjusts local offsets by `callee_push_size` for FP-relative access. - - When FP is omitted, locals are SP-relative. The offset calculation changes: - SP points at the bottom of the frame (below outgoing area), so local offset - from SP = `stack_size + local_offset` (where `local_offset` is negative - from frame top). - - Verify that all ~15 sites using `tcc_state->need_frame_pointer ? R_FP : R_SP` - compute the correct offset in the SP case. - -### Phase 3: Account for Outgoing Area in SP-Relative Offsets - -6. **When `need_fp == 0` and `call_outgoing_size > 0`**: - - SP is at `frame_bottom - call_outgoing_size` after prologue. - - All SP-relative local accesses need an additional - `+ call_outgoing_size` offset. - - This adjustment should happen in `fp_adjust_local_offset` or at each - `base_reg` selection site. - -### Phase 4: Prologue/Epilogue Updates - -7. **Prologue** (around line 6894): - - When `need_fp == 0`: skip `MOV R7, SP` and R7 push. - - Still emit `SUB SP, #stack_size` for locals + outgoing area. - -8. **Epilogue** (around line 7298): - - When `need_fp == 0`: skip `MOV SP, R7` restore. - - Use `ADD SP, #stack_size` instead. - -## Risks and Edge Cases - -- **VLA / `alloca`**: Already covered by `force_frame_pointer = 1` in `tccgen.c`. -- **Variadic functions**: Already force FP via `func_var` check (line 6821). -- **`__builtin_return_address`**: Already forces FP via `force_lr_save` (line 6825). -- **Debug info (DWARF)**: `tccdbg.c:2969` checks `need_frame_pointer` for CFA - tracking. Needs testing — CFA may need to switch to SP-based when FP is omitted. 
-- **Nested functions / static chain**: Use R10 for chain, may reference FP for - parent frame access. Check `tcc_gen_machine_set_chain`. -- **Scratch register saves**: `get_scratch_reg_with_save` does PUSH/POP of - scratch registers mid-function. These also move SP. If these happen while - accessing locals, SP offsets break. Need to verify these never overlap with - local accesses, or track their adjustment. -- **Software FP library calls**: Lines 6025-6332 do `sub sp` for softfloat call - frames. These are internal helpers and may need the same treatment. - -## Testing Strategy - -1. `make test -j16` — IR test suite (primary) -4. Manual inspection of `hello_inline.txt` output to verify FP is omitted -5. Compare instruction counts before/after across the full test suite - -## TODO - -### Phase 1: Use Pre-Reserved Outgoing Area -- [ ] Add `outgoing_base` field to `CallGenContext` sourced from `ir->call_outgoing_base` -- [ ] Change `place_stack_arg_32bit` / `place_stack_arg_64bit` / `place_stack_arg_struct` to store at `[base_reg + outgoing_base + stack_offset]` instead of `[SP + stack_offset]` -- [ ] Remove `gadd_sp(-stack_size)` / `gadd_sp(stack_size)` from `func_call_mop` -- [ ] Replace R0-R3 nested call `th_push`/`th_pop` with explicit STR/LDR to reserved frame slots -- [ ] Remove `used_stack_size` tracking (no longer needed) -- [ ] Adapt softfloat helper call frames (lines 6025-6332) to use reserved area - -### Phase 2: Remove `stack_size > 0` Condition -- [ ] Change `need_fp` condition at line 6828 to `(force_frame_pointer || need_frame_pointer)` -- [ ] Verify all `force_frame_pointer = 1` sites in `tccgen.c` cover VLA/alloca/varargs - -### Phase 3: Fix SP-Relative Offsets -- [ ] Update `fp_adjust_local_offset` to add `call_outgoing_size` when FP is omitted -- [ ] Audit all ~15 `need_frame_pointer ? 
R_FP : R_SP` sites for correct offset math -- [ ] Handle `MACH_OP_PARAM_STACK` offset calculation (incoming args above frame) - -### Phase 4: Prologue/Epilogue -- [ ] Skip R7 push/pop and `MOV R7, SP` / `MOV SP, R7` when `need_fp == 0` -- [ ] Use `ADD SP, #stack_size` in epilogue instead of `MOV SP, R7` -- [ ] Update DWARF CFA tracking in `tccdbg.c` for SP-based frames - -### Phase 5: Edge Cases -- [ ] Audit `get_scratch_reg_with_save` PUSH/POP — verify no local access overlap -- [ ] Test nested functions / static chain with FP omitted -- [ ] Verify R9 (GOT base) save/restore in yasos text-data-separation mode - -### Phase 6: Testing -- [ ] `make test -j16` — IR tests pass -- [ ] `make test-asm -j16` — assembly tests pass -- [ ] `make test-gcc-torture-compile` — GCC torture tests pass -- [ ] Verify `hello_inline.txt` shows FP omitted for `main` -- [ ] Compare instruction count regressions across test suite - -## Expected Impact - -- Saves 2-4 instructions per non-leaf function (push/pop R7 + MOV R7,SP + MOV SP,R7) -- Frees R7 for general register allocation (significant for register pressure) -- Closer parity with GCC `-O2` output diff --git a/docs/fixes/sign_extend_cast_vreg_to_vreg.md b/docs/fixes/sign_extend_cast_vreg_to_vreg.md deleted file mode 100644 index c7117249..00000000 --- a/docs/fixes/sign_extend_cast_vreg_to_vreg.md +++ /dev/null @@ -1,118 +0,0 @@ -# Fix: 20001009-2.c — Missing sign extension + inline asm register clobber - -## Bug - -Test: `gcc.c-torture/execute/20001009-2.c` - -```c -int a = 0xff; -int c = (signed char)a; // Expected: c = -1, Actual: c = 255 -asm volatile ("" : : "r"(c)); // Clobbers register holding 'a' -if (c != -1) abort(); -``` - -Two independent bugs caused this test to fail: - -1. **Missing sign extension**: The `(signed char)` cast was silently dropped. -2. **Inline asm register clobber**: The asm constraint solver picked the - register already holding `a`, clobbering it. 
- -## Root Cause - -### Bug 1: ALLOW_SUBTYPE_ACCESS skips sign extension (tccgen.c) - -When casting from `int` to `signed char`, `gen_cast()` enters the -`ALLOW_SUBTYPE_ACCESS` path because: -- `vtop->r & VT_LVAL` is true (local variable `a` is on the stack) -- `ds <= ss` (1 byte ≤ 4 bytes) - -This optimization assumes the value is still in memory and a future -byte-sized load will naturally give sign extension. It just changes -`vtop->type.t` and skips code generation. - -This is correct for the legacy backend where values stay on the stack, -but the IR backend's register allocator promotes stack slots to registers — -the byte-load never happens. - -### Bug 2: Asm constraint solver ignores IR register allocation (arm-thumb-asm.c) - -The IR linear-scan allocator (tccls.c) and the inline asm constraint solver -(arm-thumb-asm.c) are two disconnected register-allocation worlds. The asm -solver scans r0 upward for "r" constraints and picks the first free register — -with no knowledge of which registers the IR allocator assigned to live -variables. This can pick a register already holding a live value, and the -operand load in `asm_gen_code` clobbers it. - -### Pre-existing bug: Thumb-2 push/pop encoding (arm-thumb-asm.c) - -`asm_gen_code()` used `gen_le32(0xe92d0000|regset)` for push and -`gen_le32(0xe8bd0000|regset)` for pop. For Thumb-2, 32-bit instructions -must be emitted as two 16-bit halfwords, not one 32-bit word. The -`gen_le32()` approach wrote bytes in the wrong order. - -## Fixes Applied - -### Fix 1: Disable ALLOW_SUBTYPE_ACCESS for IR mode (tccgen.c) - -```c -if (ALLOW_SUBTYPE_ACCESS && (vtop->r & VT_LVAL) && !tcc_state->ir) { -``` - -When `tcc_state->ir` is set, the ALLOW_SUBTYPE_ACCESS optimization is -skipped. The fallback SHL+SAR path generates explicit sign extension. 
- -### Fix 2: reserved_regs for asm constraint solver (multiple files) - -Added a `reserved_regs[NB_ASM_REGS]` mechanism: - -- **ir/codegen.c** (`tcc_ir_codegen_inline_asm_by_id`): Before calling - `tcc_asm_emit_inline`, iterates over all live interval arrays - (variables, temporaries, parameters) and marks physical registers of - intervals live at the current instruction index. These go into a - `reserved_regs` array. - -- **arm-thumb-asm.c** (`asm_compute_constraints`): New `reserved_regs` - parameter. After initializing `regs_allocated[]` from `clobber_regs`, - also marks reserved registers as `REG_IN_MASK | REG_OUT_MASK`. This - prevents the "r" constraint scanner from picking them. - -- **Key design**: `reserved_regs` only affects constraint allocation, NOT - `asm_gen_code` save/restore. This avoids spurious push/pop of callee-saved - registers that would corrupt output operands. - -- **tcc.h**, **tccasm.c**: Updated function signatures to thread - `reserved_regs` through `tcc_asm_emit_inline` → `asm_compute_constraints`. - Non-IR call sites pass `NULL`. - -### Fix 3: Thumb-2 push/pop encoding (arm-thumb-asm.c) - -```c -// Before (broken): -gen_le32(0xe92d0000 | regset); // push -gen_le32(0xe8bd0000 | regset); // pop - -// After (correct): -gen_le16(0xe92d); gen_le16(regset); // push: hw1, hw2 -gen_le16(0xe8bd); gen_le16(regset); // pop: hw1, hw2 -``` - -### Fix 4: parse_asm_operands initialization (tccasm.c) - -Added `op->reg = -1;` initialization in `parse_asm_operands()` so the -constraint solver correctly detects unassigned operands. 
- -## Files Modified - -| File | Change | -|------|--------| -| `tccgen.c` | Guard ALLOW_SUBTYPE_ACCESS with `!tcc_state->ir` | -| `tcc.h` | Updated signatures for `asm_compute_constraints`, `tcc_asm_emit_inline` | -| `arm-thumb-asm.c` | reserved_regs in constraint solver; Thumb-2 push/pop encoding | -| `tccasm.c` | Thread reserved_regs; `op->reg = -1` init | -| `ir/codegen.c` | Compute reserved_regs from live intervals | - -## Test Results - -- **3154 passed**, 768 xfailed, 0 failed (was 3148 passed before fix — 6 newly passing) -- All previously-regressing tests pass: pr41239, pr43560, pr45695, loop-6 -- The target test 20001009-2 passes diff --git a/docs/fuzz_triage_guide.md b/docs/fuzz_triage_guide.md new file mode 100644 index 00000000..a74c82a5 --- /dev/null +++ b/docs/fuzz_triage_guide.md @@ -0,0 +1,129 @@ +# Fuzz-sweep & triage guide + +How to enumerate, triage, and fix the remaining O1/O2 wrong-code bugs the +differential fuzzer finds, using the same workflow that cleared seeds 0–299. + +## TL;DR — the 45-minute cadence you asked for + +```bash +cd libs/tinycc +make cross -j$(nproc) # ensure armv8m-tcc is current + +# ── ~15 min: sweep + triage a wide range ─────────────────────────────── +tests/fuzz/triage_olevels.sh 0 4999 24 # LO HI JOBS -> fuzz_triage_0_4999.md +# (self-contained; a 5000-seed sweep is a few minutes on ~24 cores, then a +# quick per-seed culprit bisect on the handful that diverge) + +# ── ~30 min: iterate on fixes ────────────────────────────────────────── +# open fuzz_triage_0_4999.md, fix highest-leverage culprit groups first, +# rebuild + verify per below, run the regression gate, repeat. +``` + +`triage_olevels.sh` writes a markdown table classifying every failing seed and +bisecting a culprit pass. Reproducers land in `tests/fuzz/fuzz_triage_repros/`. +The sweep is **self-contained** — pure bash + `xargs -P` over `runseed.sh`, no +`pytest`/`pytest-xdist` dependency (so it works regardless of the active venv). 
+ +## Prerequisites + +- `make cross` built `armv8m-tcc` (rebuild after any compiler change). +- `gcc` with 32-bit multilib (`gcc -m32`) — the ground-truth oracle. +- `qemu-system-arm`, `arm-none-eabi-gcc`. +- The mps2 newlib is built on first IR-test run; if missing: + `sh tests/ir_tests/qemu/mps2-an505/build_newlib.sh`. +- `pytest` (+ `pytest-xdist` for `-n`) is needed only for the **regression + gate** below, not for the sweep. + +## Why these oracles + +- **O-level self-consistency**: `triage_olevels.sh` compiles each seed at O0, + O1, O2, Os on the *same* ARM target and flags any disagreement (the same + contract as `tests/fuzz/test_random_c_olevels.py`, but standalone). No ABI + mismatch, fully reproducible — this is the authoritative sweep. +- **Ground truth = `gcc -m32 -funsigned-char`.** ARM's ABI is *unsigned* `char` + + *32-bit* `long`; plain `gcc`/`gcc -m32` (signed char) mis-judges any program + that uses `char`, which made O0 look wrong last time. Always pass both flags. +- **tcc -O0 is (so far) always correct** — so an optimizer is to blame whenever + O1/O2/Os diverge from O0. A row classed `O0-WRONG` instead points at the front + end / libc / O0 codegen (rare; investigate separately). + +## Reading the triage report + +| column | meaning | +|--------|---------| +| `class` | `O1` / `O2` / `Os` = that level miscompiles · `…/CRASH` = HardFault/Lockup · `COMPILE_CRASH` = compiler asserted (e.g. `mach_get_dest_reg: unexpected kind 3`, seed 2966) · `O0-WRONG` = not an optimizer bug | +| `ref` | gcc -m32 -funsigned-char result (the correct value) | +| `O0..Os` | tcc output per level | +| `culprit knob` | the single `-fno-*` flag / `TCC_NO_COALESCE` that restores `ref`, or `-` if none isolates it | + +Group rows by `culprit knob` — one root cause usually covers several seeds (last +batch: 4 seeds shared `ssa_opt_dead_loop`, 2 shared `local_alu_cse`, etc.). 
+ +## Fix → verify loop (per bug) + +```bash +S=588; LVL=-O2 # from the report +python3 tests/fuzz/gen_c.py --seed $S -o /tmp/s$S.c +# 1. confirm + ground truth +gcc -m32 -funsigned-char -O2 -w /tmp/s$S.c -o /tmp/g && /tmp/g # correct value +bash tests/fuzz/runseed.sh /tmp/s$S.c $LVL # tcc value (wrong) +# 2. find the diverging statement: insert a trace after each `cs = csmix(...)`: +perl -pe 's/(cs = csmix\([^;]*\);)/$1 trace(__LINE__,cs);/g' /tmp/s$S.c > /tmp/t.c +perl -0pi -e 's/(#include <stdio.h>)/$1\nstatic void trace(int l,unsigned v){printf("L%d=%08x\\n",l,v);}/' /tmp/t.c +gcc -m32 -funsigned-char -O0 -w /tmp/t.c -o /tmp/g && /tmp/g > /tmp/ref.txt +# ...run /tmp/t.c through the mps2 makefile at $LVL, diff vs /tmp/ref.txt -> first divergent line +# 3. dump IR around it (debug build): ./armv8m-tcc -dump-ir-passes=all $LVL -c /tmp/s$S.c -o x.o 2>/dev/null +# 4. edit the implicated pass, then: +make cross -j$(nproc) +bash tests/fuzz/runseed.sh /tmp/s$S.c $LVL # == ref ? (also re-check O0/O1/O2/Os) +``` + +### When `culprit knob = none` + +The `-fno-*` flags only gate the `opt.c` pipeline. SSA-pipeline bugs (and the +SSA *rename* itself, e.g. the multidef-temp ternary) won't isolate. Temporarily +add skip gates to the two SSA drivers, rebuild, then bisect with +`TCC_SKIP_SSA="ssa:gvn"` / `TCC_SKIP_SSA2="ssa:cprop"`: + +```c +// ir/regalloc.c — RUN_SSA macro +const char *skip__ = getenv("TCC_SKIP_SSA"); +if (!(skip__ && strstr(skip__, name))) { (call); } // wrap the (call); +// ir/opt/ssa_opt.c — SSA_RUN macro +const char *skip2__ = getenv("TCC_SKIP_SSA2"); +if (!(skip2__ && strstr(skip2__, name))) changes += (call); +``` +Pass names: `ssa:var_const_fold ssa:var_forward ssa:sccp ssa:cprop ssa:fold +ssa:gvn ssa:reassoc ssa:strength ssa:narrow ssa:dce ssa:dead_loop ...`. If +*no* SSA-skip and *no* `-fno-*` flag helps but `-fno-inline-functions +-fno-inline-small-functions` does, the bug is exposed by inlining (the +multidef-temp class). 
**Remove these gates before committing.** + +## Regression gate (run before committing any fix) + +```bash +cd libs/tinycc +FUZZ_OLEVEL_SEEDS=0-299 python3 -m pytest tests/fuzz/test_random_c_olevels.py -n 16 -q # must stay 300/300 +cd tests/ir_tests +python3 -m pytest test_qemu.py test_codegen_asm.py \ + test_gcc_torture_ir.py -k "O1 or O2 or not torture" -n 8 -q # was 9063/0 +cd ../unit && make clean && make run # 1116/0 +``` +Add a regression test for each fix: a verbatim repro at `tests/ir_tests/NN_fuzz_<name>.c` ++ `.expect` (the gcc -m32 -funsigned-char value) registered in `test_qemu.py` +(see 188–195 for the pattern). **Watch for size-sensitive tests** like +`96_nodata_wanted` (labels-as-values) when a fix changes codegen layout. + +## Parallelizing the diagnosis (optional, fast) + +For a big batch, ask Claude to run the **diagnosis workflow**: one agent per +failing seed reduces + root-causes it in parallel (read-only, no rebuilds, using +this same `runseed.sh` + the knobs), returning a grouped root-cause report. That +turned the 12-seed batch around in one pass; you then apply fixes serially. + +## Current known batch (300–2999, as of this writing) + +~25 failing seeds. Notable: **2966** = `COMPILE_CRASH` +(`mach_get_dest_reg: unexpected kind 3`); **588** = O2, culprit `-fno-const-prop`. +Full list — rerun `tests/fuzz/triage_olevels.sh 300 2999 24`. +0–299 is clean (300/300). diff --git a/docs/materialization/00_overview.md b/docs/materialization/00_overview.md deleted file mode 100644 index f2e48280..00000000 --- a/docs/materialization/00_overview.md +++ /dev/null @@ -1,109 +0,0 @@ -# Materialization Refactor: Overview - -## Problem Statement - -The current materialization layer (`ir/mat.c`, `ir/codegen.c`) sits between the IR and the backend (`arm-thumb-gen.c`), creating a tangled intermediate abstraction: - -1. **Materialization duplicates backend logic.** `ir/mat.c` decides when to load spills, how to handle constants, when addresses are encodable, etc. 
But the backend *also* makes these decisions (via `load_to_reg_ir`, `get_scratch_reg_with_save`, `tcc_machine_can_encode_stack_offset`). The two layers constantly second-guess each other. - -2. **Register fill is fragile.** `ir/codegen.c:tcc_ir_fill_registers_ir()` translates allocation results back into `IROperand` flags (`is_local`, `is_llocal`, `is_lval`, `is_param`, `pr0_spilled`). This encoding is the source of most materialization bugs — a misset flag causes double-dereferences, missing loads, or wrong offsets. - -3. **Scratch register allocation happens too late.** Materialization acquires scratch registers *during* code emission. This means the backend can't plan register usage across an instruction — it discovers conflicts as it emits. - -4. **Two operand representations.** `SValue` (legacy) and `IROperand` (compact IR) both need parallel materialization paths. Every fix must be applied twice. - -5. **VT_LLOCAL (double indirection) is a symptom.** The entire VT_LLOCAL mechanism exists because materialization can't express "this value is a spilled pointer that needs dereferencing" cleanly. With backend-driven materialization, the backend simply loads what it needs. - -## Proposed Architecture - -### Core Idea - -**Operate on virtual registers throughout IR and codegen. Let the backend decide how and when to materialize physical values.** - -``` -Current: - IR → fill_registers_ir() → materialize_*_ir() → tcc_gen_machine_*_op() → emit instructions - [ir/codegen.c] [ir/mat.c] [arm-thumb-gen.c] - -Proposed: - IR → machine_op_from_ir() → tcc_gen_machine_*_op() → mach_ensure_in_reg() → emit - [ir/codegen.c, thin] [arm-thumb-gen.c] [arm-thumb-gen.c] -``` - -### Key Principles - -1. **IR operands stay virtual.** No `fill_registers()` pass. Operands carry vreg IDs and allocation metadata (physical reg or spill offset) but no `is_local`/`is_lval` rewriting. - -2. 
**Backend owns materialization.** Each instruction handler in `arm-thumb-gen.c` knows exactly what it needs: "src1 in register", "src2 as immediate or register", "dest in register, store back if spilled". No generic IR-level guessing. - -3. **Dry run determines scratch needs.** A first pass over instructions (without emitting) records what physical registers and scratch regs each instruction needs. This feeds register allocation constraints back to the allocator. *(Note: a dry-run pass already exists in `ir/codegen.c` — this phase extends it.)* - -4. **Single operand format.** Eliminate the `SValue` path entirely from codegen. All codegen works with `IROperand` + allocation metadata via `MachineOperand`. - -## Phase Summary - -| Phase | Title | Scope | Status | Details | -|-------|-------|-------|--------|---------| -| 0 | SValue Elimination | Remove SValue-based materialization from codegen | ✅ **DONE** (`e19755e6`) | [01_phase0_svalue_elimination.md](01_phase0_svalue_elimination.md) | -| 1 | MachineOperand Type | New unambiguous operand representation | ✅ **Done** — type + `machine_op_from_ir()` done; `machine_op_from_ir` decoupled from `pr0_reg` via `IROP_VREG_PHYS` encoding; 8 `MachineOperand` kinds cover all cases | [02_phase1_machine_operand.md](02_phase1_machine_operand.md) | -| 2 | Backend-Driven Materialization | Move all materialization into `arm-thumb-gen.c` | ✅ **Complete** — All convertible ops have MOP handlers; `!irop_needs_pair` guards removed for DP, ASSIGN, BOOL, LOAD, FUNC_CALL (64-bit pair sources handled via `mach_resolve_deref_64`); RETURNVALUE supports 64-bit; JUMP/JUMPIF and LEA intentionally on old path | [03_phase2_backend_materialization.md](03_phase2_backend_materialization.md) | -| 3 | Dry-Run Integration | Extend existing dry-run with constraint collection | ✅ **DONE** (`c2569883`) | [04_phase3_dry_run.md](04_phase3_dry_run.md) | -| 4 | Eliminate `ir/mat.c` | Delete IR-level materialization module | ✅ **DONE** (`bc43b639`) | 
[05_phase4_eliminate_mat.md](05_phase4_eliminate_mat.md) | -| 5 | Simplify Stack/Spill | Clean up data structures | ✅ **Done** — Phases 5b–5q ✅; `pr0_spilled`/`pr1_spilled` removed; `fill_registers_ir` deleted (~256 lines); 10 dead `_op` declarations + bodies removed (~700 lines); JUMP/JUMPIF/TRAP converted to `_mop`; `pr0_reg`/`pr1_reg` fields removed from `IROperand` (10→9 bytes); all legacy `_ir` wrappers deleted (~560 lines); `tcc_gen_mach_load_to_reg` rewritten for direct-dest loading; inline asm path fully on MOP | [06_phase5_simplify_stack.md](06_phase5_simplify_stack.md) | -| 6 | Consolidate Dispatch | Merge dry-run/real-run loops into single parameterised pass | ✅ **Done** — merged into single `for (pass=0; pass<2)` loop; `ir/codegen.c` reduced from 2106→1767 lines (−339, ~16%); extracted `ir_codegen_before_ret_peephole()`, `ir_codegen_record_scratch()`, `ir_codegen_check_scratch()`, `ir_codegen_track_scratch()` helpers | [07_phase6_consolidate_dispatch.md](07_phase6_consolidate_dispatch.md) | - -## Implementation Order and Milestones - -### Milestone 1: SValue Elimination (Phase 0) — ✅ COMPLETE -- **Scope:** ~400 lines removed from `ir/codegen.c` and `ir/mat.c` -- **Deliverable:** All codegen uses IROperand. SValue materialization functions deleted. -- **Commit:** `e19755e6 new materialization plan` - -### Milestone 2: MachineOperand + Backend Materialization (Phase 1 + Phase 2) — ✅ COMPLETE -- **Scope:** `MachineOperand` type, `machine_op_from_ir()`, and all convertible MOP handlers. 
-- **Done:** DP (ADD/SUB/CMP/SHL/SHR/SAR/AND/OR/XOR/ADC), ASSIGN (all dests), SETIF, BOOL_OR/AND, LOAD, STORE, LOAD_INDEXED, STORE_INDEXED, LOAD_POSTINC, STORE_POSTINC, IJUMP, FUNCPARAMVAL/VOID, RETURNVALUE (32-bit and 64-bit), MUL/DIV group (MUL/DIV/UDIV/IMOD/UMOD/TEST_ZERO 32-bit), MLA, UMULL, FP single-precision (FADD/FSUB/FMUL/FDIV/FNEG/FCMP/CVT_ITOF/CVT_FTOI/CVT_FTOF), VLA (VLA_ALLOC/VLA_SP_SAVE/VLA_SP_RESTORE), FUNC_CALL (32-bit and 64-bit non-complex dest), SWITCH_TABLE. -- **64-bit pair guards removed:** DP, ASSIGN, BOOL, LOAD, FUNC_CALL — `!irop_needs_pair` guards removed; 64-bit pair sources resolved by `mach_resolve_deref_64` before lo/hi splitting. -- **Intentionally on old path:** JUMP/JUMPIF (no register materialization), LEA (already single-layer), complex types, static chain, double-precision FP. -- **Key constraint resolved (Phase 5b):** `fill_registers_ir` no longer runs unconditionally at dispatch-loop top. `machine_op_from_ir` now fills its `IROperand *op` in-place (`ir_fill_op` helper at old-path `_op` sites). Double-fill is no longer possible. -- **Phase 5p complete:** `pr0_reg`/`pr1_reg` fields removed from `IROperand` (10→9 bytes). Added `irop_phys_r0()`/`irop_phys_r1()` helpers that read physical registers from interval table. `load_to_dest_ir` takes explicit `(int dest_r0, int dest_r1, IROperand src)`. All legacy `_ir` functions + `arm-thumb-asm.c` converted. `irop_init_phys_regs()` deleted. -- **Phase 5q complete:** All legacy `_ir` wrapper functions deleted (~560 lines): `load_to_dest_ir`, `store_ex_ir`, `store_ir`, `th_store_resolve_base_ir`, `irop_phys_r0`/`irop_phys_r1`, `th_store32_imm_or_reg`. `tcc_gen_mach_load_to_reg` rewritten to load directly into dest register (no scratch intermediary), fixing inline asm operand clobber regression (pr49390). 
-- **Test gate:** `make test -j16` — all tests passing - -### Milestone 3: Dry Run Integration (Phase 3) — ✅ COMPLETE -- **Scope:** Dual arrays `dry_insn_scratch[]`/`dry_insn_saves[]`, `try_reassign_scratch_conflict()` with R_FP+static_chain exclusion. -- **Deliverable:** Scratch conflicts resolved by reassigning vregs to callee-saved registers in a fixup pass. -- **Commit:** `c2569883 phase 3: enable dry-run scratch conflict fixup` - -### Milestone 4: Cleanup (Phase 4 + Phase 5 + Phase 6) — Phase 4 ✅, Phase 5 ✅, Phase 6 ✅ -- **Phase 4 done:** `ir/mat.c`, `ir/operand.c`, `ir/operand.h` deleted (`bc43b639`). `ir/machine_op.c` / `ir/machine_op.h` are the replacement. -- **Phase 5 done:** Dead `TCCStackSlot` fields removed (`0e772abb`). Header deduplication moot (`ir/operand.h` already deleted; only `tccir_operand.h` remains). Lazy fill coordination (Phase 5b) complete — unconditional dispatch-loop fills removed, `machine_op_from_ir` fills in-place, explicit `ir_fill_op` calls added at all old-path `_op` sites. -- **Phase 5c done:** FP double-precision `!irop_needs_pair` guards removed — `tcc_gen_machine_fp_mop` extended with `fp_mop_load_double_arg/do_bl/writeback_result` helpers for all FADD/FSUB/FMUL/FDIV/FNEG/FCMP/CVT_* via `__aeabi_dadd` etc. All `!ir->has_static_chain` guards removed (44 occurrences) — new `MACH_OP_CHAIN_REL` operand kind handles captured variable access via static chain. -- **Phase 5d done:** 14 dead old-path `else` branches removed. `ir/codegen.c` reduced by 440 lines (3149 → 2709). -- **Phase 5e done:** `*_before_ret` peephole converted to MOP path. 6 old-path call sites removed. -- **Phase 5f–5h done:** `machine_op_from_ir` decoupled from `fill_registers_ir`; FUNCCALL func_target → MachineOperand; LOAD spilled-dest support. -- **Phase 5i done:** LOAD/STORE `MACH_OP_NONE` fallback → `tcc_error` (proves old path dead). -- **Phase 5j done:** ~2400 lines dead `_op` backend functions deleted from `arm-thumb-gen.c`. 
-- **Phase 5k done:** Callsite arg-handling fully on MOP. `fill_arg_from_machine_op` bridge deleted. `is_complex` guards removed from FP/FUNCCALL dispatch. `fill_registers_ir` wrapped in `#ifdef TCC_REGALLOC_DEBUG`. Bug fixes: ARM_R12 base clobber in 64-bit stack arg placement; PARAM_STACK excluded from needs_deref double-indirection. -- **Phase 5l done:** `pr0_spilled`/`pr1_spilled` fields converted to `_reserved0`/`_reserved1` (1-bit each). All 9 read sites in `ir/codegen.c` + `arm-thumb-gen.c` deleted; 3 write sites removed. IROperand remains 10 bytes. -- **Phase 5m done:** `fill_registers_ir` fully deleted (~256 lines). All 6 `#ifdef TCC_REGALLOC_DEBUG` wrappers + the 2 function implementations + 3 declarations removed. `machine_op_from_ir` is now sole materialization path. -- **Phase 5n done:** 10 dead `_op` handler declarations and bodies removed (~700 lines). Includes `tcc_gen_machine_jump_op`, `tcc_gen_machine_cond_jump_op`, `tcc_gen_machine_trap_op`, etc. -- **Phase 5o done:** JUMP, JUMPIF, and TRAP fully converted to `_mop` handlers. Dispatch loop is now 100% MOP — zero `_op` calls remain. -- **Phase 5p done:** `machine_op_from_ir` decoupled from `pr0_reg` — reads interval table directly for physreg. `IROP_VREG_PHYS_VALID`/`IROP_VREG_PHYS_MASK` encoding in `u.imm32` for vreg=-1 operands. `pr0_reg`/`pr1_reg` fields removed from `IROperand` (10→9 bytes). -- **Phase 5q done:** All legacy `_ir` wrapper functions deleted (~560 lines). `tcc_gen_mach_load_to_reg` rewritten for direct-dest loading. Inline asm operand clobber regression (pr49390) fixed. -- **Phase 6 done:** Merged dry-run + real-run dispatch loops into single `for (pass=0; pass<2)` loop. `ir/codegen.c` reduced from 2106→1767 lines (−339, ~16%). See [07_phase6_consolidate_dispatch.md](07_phase6_consolidate_dispatch.md). 
-- **Current file sizes:** `ir/codegen.c`=1767, `arm-thumb-gen.c`=8055, `ir/machine_op.c`=328, `tccir_operand.h`=560, `tccir_operand.c`=844, `arm-thumb-asm.c`=3539 -- **Test gate:** `make test -j16` — 3310 passed, 79 skipped, 582 xfailed, 0 failed - -## Risk Analysis - -| Risk | Mitigation | -|---|---| -| **Breaking existing tests during migration** | Convert one instruction handler at a time; run tests after each | -| **SValue still used in parser** | SValue stays in `tccgen.c`/`tccpp.c` — we only remove it from codegen path | -| **Dry run diverges from real run** | Assert-check that dry run predictions match real emission | -| **Performance regression from two passes** | Dry run is already implemented and cheap | -| **64-bit / float edge cases** | These are already the buggiest paths; explicit MachineOperand::kind makes them clearer | - -## Review Notes - -See [review.md](review.md) for a detailed review of this plan against the actual codebase state. diff --git a/docs/materialization/01_phase0_svalue_elimination.md b/docs/materialization/01_phase0_svalue_elimination.md deleted file mode 100644 index b25c05b2..00000000 --- a/docs/materialization/01_phase0_svalue_elimination.md +++ /dev/null @@ -1,114 +0,0 @@ -# Phase 0: Eliminate SValue from Codegen Path - -> **Status: ✅ COMPLETE** — committed `e19755e6 new materialization plan` - -## Goal - -Remove the `SValue`-based materialization and register fill paths. All backend codegen uses `IROperand` exclusively. 
- -## Current State - -`ir/mat.c` has **two complete parallel APIs**: - -| SValue API (legacy) | IROperand API | -|---|---| -| `tcc_ir_materialize_value(ir, sv, result)` | `tcc_ir_materialize_value_ir(ir, op, result)` | -| `tcc_ir_materialize_const_to_reg(ir, sv, result)` | `tcc_ir_materialize_const_to_reg_ir(ir, op, result)` | -| `tcc_ir_materialize_addr(ir, sv, result, dest_reg)` | `tcc_ir_materialize_addr_ir(ir, op, result, dest_reg)` | -| `tcc_ir_materialize_dest(ir, dest, result)` | `tcc_ir_materialize_dest_ir(ir, op, result)` | -| `tcc_ir_fill_registers(ir, sv)` | `tcc_ir_fill_registers_ir(ir, op)` | - -Additionally, there's a **third wrapper layer** (`tcc_ir_mat_value`, `tcc_ir_mat_const`, `tcc_ir_mat_addr`, `tcc_ir_mat_dest`, etc.) that wraps the legacy implementations with newer result types (`TCCMatValue`, `TCCMatDest`, `TCCMatAddr`). - -`ir/codegen.c` only uses the IROperand versions (`_ir` suffix) in its main `tcc_ir_codegen_generate()` dispatch loop. The SValue versions may still be called from other paths. - -## Files Affected - -| File | Changes | -|---|---| -| `ir/mat.c` | Delete all SValue-based functions (~400 lines) | -| `ir/codegen.c` | Remove `tcc_ir_fill_registers()` (SValue version, ~170 lines) | -| `svalue.h` | No changes (SValue struct stays for parser use) | -| `tccgen.c` | No changes (parser keeps using SValue) | -| `tccir.h` | Remove `TCCMaterializedValue`/`Addr`/`Dest` SValue struct declarations | - -## Implementation Steps - -### Step 0.1: Audit SValue materialization callers - -**Action:** Find all call sites of the SValue-based materialization functions. 
- -```bash -grep -rn 'tcc_ir_materialize_value\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_materialize_const_to_reg\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_materialize_addr\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_materialize_dest\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_fill_registers\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_mat_value\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_mat_const\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_mat_addr\b' --include='*.c' --include='*.h' -grep -rn 'tcc_ir_mat_dest\b' --include='*.c' --include='*.h' -``` - -**Expected:** SValue versions are only called from `ir/codegen.c` legacy paths and possibly `arm-thumb-callsite.c`. If there are callers in `arm-thumb-gen.c`, those need conversion first. - -**Decision point:** If SValue callers exist outside `ir/codegen.c`, they must be converted to IROperand equivalents before deletion. - -### Step 0.2: Identify dead SValue code paths in codegen - -**Action:** Check if there's a legacy dispatch loop in `ir/codegen.c` that uses SValue alongside the main IROperand dispatch loop. - -Look at `ir/codegen.c` around lines 1800–2300 for a second `switch(cq->op)` block. The file has **4 occurrences** of `case TCCIR_OP_ADD:`, suggesting at least 2 distinct dispatch paths, possibly more (one for need_* classification, one for actual dispatch, potentially a legacy SValue path, and a 64-bit path). - -**Decision point:** Determine which dispatch paths are truly dead vs. conditionally active. - -### Step 0.3: Delete SValue materialization functions from `ir/mat.c` - -**Action:** Remove the following functions: - -1. `tcc_ir_materialize_value()` (L69) -2. `tcc_ir_materialize_const_to_reg()` (L186) -3. `tcc_ir_materialize_addr()` (L262) -4. `tcc_ir_materialize_dest()` (L345) -5. `tcc_ir_mat_value()` (L924) — wrapper -6. `tcc_ir_mat_const()` (L937) — wrapper -7. `tcc_ir_mat_addr()` (L950) — wrapper -8. 
`tcc_ir_mat_dest()` (L963) — wrapper -9. `tcc_ir_mat_spilled()` (L902) — if no remaining callers -10. `tcc_ir_operand_needs_dereference()` (L1071) — if SValue-only - -Also remove static helpers only used by SValue path: `mat_slot_sv()`, `mat_offset_sv()`. - -### Step 0.4: Delete `tcc_ir_fill_registers()` (SValue version) from `ir/codegen.c` - -**Action:** Remove lines ~23–189 (the SValue `tcc_ir_fill_registers` function). Keep `tcc_ir_fill_registers_ir()` (lines ~190–350). - -### Step 0.5: Remove SValue struct declarations from `tccir.h` - -**Action:** Remove `TCCMaterializedValue`, `TCCMaterializedAddr`, `TCCMaterializedDest` if no IROperand code still uses them. Check if the `_ir` functions still return these types — if so, those structs stay until Phase 4. - -**Important:** Do NOT remove `TCCMatValue`/`TCCMatAddr`/`TCCMatDest` (the newer wrapper types) if they're used by IROperand functions. - -### Step 0.6: Compile and test - -```bash -make clean && make cross -j16 -make test -j16 -``` - -**Expected:** All tests pass. This is a pure dead-code removal with no behavior change. - -## Risk Assessment - -- **Risk: Low.** This is dead code removal. The SValue functions are a legacy path. -- **Risk: Medium** if the SValue functions are still reachable through conditional compilation or runtime paths. The audit in Step 0.1 will reveal this. -- **Mitigation:** `grep` thoroughly, compile with `-Werror -Wunused-function` to catch orphaned static helpers. 
- -## Verification Checklist - -- [x] All SValue materialization callers identified and removed/converted -- [x] No `tcc_ir_materialize_value\b` (non-`_ir`) references remain -- [x] No `tcc_ir_fill_registers\b` (non-`_ir`) references remain -- [x] `make cross` compiles without warnings -- [x] `make test -j16` passes -- [x] `ir/mat.c` SValue functions deleted (later: whole file deleted in Phase 4) diff --git a/docs/materialization/02_phase1_machine_operand.md b/docs/materialization/02_phase1_machine_operand.md deleted file mode 100644 index 67599b53..00000000 --- a/docs/materialization/02_phase1_machine_operand.md +++ /dev/null @@ -1,222 +0,0 @@ -# Phase 1: New Operand Representation — `MachineOperand` - -> **Status: ✅ Done** — `MachineOperand` type and `machine_op_from_ir()` fully implemented. Used exclusively on all dispatch paths (Phases 2–5q complete). `machine_op_from_ir` takes `const IROperand *op` and reads the interval table directly — no `fill_registers_ir` dependency. `fill_registers_ir` fully deleted (Phase 5m). `pr0_reg`/`pr1_reg`/`pr0_spilled`/`pr1_spilled` removed from `IROperand` (Phases 5l + 5p). All legacy `_ir` wrapper functions deleted (Phase 5q). `IROperand` is now 9 bytes packed. - -## Goal - -Replace the overloaded `IROperand` flags with a clear machine-level operand type that the backend can interpret without ambiguity. This separates "what the IR says" from "how the backend should materialize it." - -## Current State - -`IROperand` (defined in `tccir_operand.h`, 9 bytes packed) encodes operand state. After Phases 5l–5q, the codegen-time fields (`pr0_reg`, `pr1_reg`, `pr0_spilled`, `pr1_spilled`) have been removed. 
Remaining fields: - -| Flag | Meaning | Set By | -|---|---|---| -| `is_local` | Stack-relative (frame offset in payload) | IR construction (`tccgen.c`) | -| `is_llocal` | Double indirection (spilled pointer) | IR construction (`tccgen.c`) | -| `is_lval` | Needs load through address | IR construction (`tccgen.c`) | -| `is_param` | Stack-passed function parameter | IR construction (`tccgen.c`) | -| `is_const` | Immediate constant | IR construction | -| `tag` | IROP_TAG_VREG/IMM32/STACKOFF/etc. | IR construction | - -The backend (`arm-thumb-gen.c`) must test combinations of these flags to determine what to do: -- `pr0_spilled && !is_llocal` → load from spill slot -- `is_llocal` → load pointer from spill, then dereference -- `is_local && is_lval` → load from frame address -- `is_param && pr0_spilled` → load from parameter area - -These combinations are error-prone and the source of most materialization bugs. - -## Design - -### `MachineOperand` type - -```c -/* ir/machine_op.h */ - -typedef enum { - MACH_OP_REG, /* Value in physical register(s) */ - MACH_OP_SPILL, /* Value in spill slot, needs load */ - MACH_OP_IMM, /* Immediate constant */ - MACH_OP_FRAME_ADDR, /* Address = FP + offset (address-of local) */ - MACH_OP_SYMBOL, /* Symbol reference (global/extern) */ - MACH_OP_PARAM_STACK, /* Stack-passed parameter in caller frame */ -} MachineOperandKind; - -typedef struct { - MachineOperandKind kind; - CType type; - union { - struct { int r0, r1; } reg; /* MACH_OP_REG */ - struct { int offset; int size; } spill; /* MACH_OP_SPILL */ - struct { int64_t val; } imm; /* MACH_OP_IMM */ - struct { int offset; } frame; /* MACH_OP_FRAME_ADDR */ - struct { Sym *sym; int addend; } sym; /* MACH_OP_SYMBOL */ - struct { int offset; int size; } param; /* MACH_OP_PARAM_STACK */ - } u; - int vreg; /* Original vreg (for debug/liveness queries) */ - bool needs_deref; /* Load through this address (replaces VT_LVAL) */ - bool is_64bit; /* Two-register value */ -} MachineOperand; -``` - -### 
Conversion function - -```c -/* Replaces tcc_ir_fill_registers_ir() — instead of rewriting IROperand in - * place with flag mutations, produce a clean MachineOperand. */ -MachineOperand machine_op_from_ir(TCCIRState *ir, const IROperand *op); -``` - -This single function encapsulates the entire `tcc_ir_fill_registers_ir()` logic in a pure, side-effect-free mapping. It reads the register allocation results and the operand's IR-level tags to produce one of 6 unambiguous enum variants. - -## Implementation Steps - -### Step 1.1: Create `ir/machine_op.h` - -**Action:** Create the header with the `MachineOperand` type, `MachineOperandKind` enum, and the `machine_op_from_ir()` declaration. - -**Design decisions:** -- Keep it a plain C header (no C++ features) -- Include `tccir.h` for `IROperand`, `TCCIRState` -- `CType` comes from `tcc.h` — need a forward declaration or include - -### Step 1.2: Implement `machine_op_from_ir()` in `ir/machine_op.c` - -**Action:** Port the logic from `tcc_ir_fill_registers_ir()` (ir/codegen.c lines ~190–350) into a stateless conversion function. - -The key mapping logic is: - -```c -MachineOperand machine_op_from_ir(TCCIRState *ir, const IROperand *op) -{ - MachineOperand m = {0}; - m.vreg = irop_get_position(*op); - m.is_64bit = irop_is_64bit(*op); - // Extract type from op... - - if (irop_get_tag(*op) == IROP_TAG_IMM32) { - m.kind = MACH_OP_IMM; - m.u.imm.val = irop_get_imm32(*op); - return m; - } - - // Look up register allocation for this vreg - IRLiveInterval *interval = tcc_ir_live_interval_for_vreg(ir, m.vreg); - if (!interval) { - // Constant or special operand - // ... handle IROP_TAG_STACKOFF, IROP_TAG_SYMREF, etc. 
- } - - if (op->pr0_spilled) { - if (op->is_llocal) { - // Spilled pointer that needs dereferencing - m.kind = MACH_OP_SPILL; - m.needs_deref = true; - m.u.spill.offset = /* frame offset */; - } else if (op->is_param) { - m.kind = MACH_OP_PARAM_STACK; - m.u.param.offset = /* param offset */; - } else { - m.kind = MACH_OP_SPILL; - m.u.spill.offset = /* spill slot offset */; - } - } else if (op->is_local && !op->is_lval) { - // Address-of local variable (LEA) - m.kind = MACH_OP_FRAME_ADDR; - m.u.frame.offset = /* frame offset */; - } else if (op->is_sym) { - m.kind = MACH_OP_SYMBOL; - // ... extract sym + addend - } else { - m.kind = MACH_OP_REG; - m.u.reg.r0 = op->pr0_reg; - m.u.reg.r1 = m.is_64bit ? op->pr1_reg : -1; - } - - m.needs_deref = op->is_lval && (m.kind != MACH_OP_SPILL || !op->is_llocal); - return m; -} -``` - -**Critical:** This function must produce *exactly* the same materialization decisions as the current `fill_registers_ir` + `materialize_*_ir` combination. Write test assertions that compare old vs. new. - -### Step 1.3: Unit tests for `machine_op_from_ir()` - -**Action:** Create `tests/ir_tests/test_machine_op.c` (or a pytest test) that verifies: - -1. VREG with physical register → `MACH_OP_REG` -2. VREG spilled to stack → `MACH_OP_SPILL` -3. Immediate → `MACH_OP_IMM` -4. Local variable address → `MACH_OP_FRAME_ADDR` -5. Symbol reference → `MACH_OP_SYMBOL` -6. Stack-passed parameter → `MACH_OP_PARAM_STACK` -7. Spilled pointer (is_llocal) → `MACH_OP_SPILL` with `needs_deref=true` -8. 64-bit value in register pair → `MACH_OP_REG` with both r0/r1 -9. 64-bit value partially spilled → correct handling - -### Step 1.4: Wire into codegen alongside existing path - -**Action:** In `ir/codegen.c`, after the existing `tcc_ir_fill_registers_ir()` calls, add parallel `machine_op_from_ir()` calls and assert that the resulting `MachineOperand.kind` is consistent with the old flags. 
- -```c -// Existing: -tcc_ir_fill_registers_ir(ir, &src1_ir); -// New (validation only, remove after Phase 2): -MachineOperand m_src1 = machine_op_from_ir(ir, &src1_ir_orig); -assert(validate_machine_op_vs_filled_ir(&m_src1, &src1_ir)); -``` - -This runs both paths in parallel during the transition, catching any divergence immediately. - -### Step 1.5: Integrate into build - -**Action:** Add `ir/machine_op.c` to the Makefile (specifically `TINYCC_IR_SRC` or equivalent). - -```bash -make cross -j16 && make test -j16 -``` - -## Design Rationale - -### Why not just clean up IROperand flags? - -The flags encode *allocation state* (which register, whether spilled) mixed with *semantic state* (is_local, is_lval, is_param). These concerns should be separated. `IROperand` should stay as the IR-level representation; `MachineOperand` is the backend-level view after allocation. - -### Why a separate struct instead of extending IROperand? - -`IROperand` is packed to 9 bytes for cache efficiency during IR passes. `MachineOperand` is only created during codegen (one instruction at a time) and can afford to be larger and clearer. - -### Why not just pass allocation metadata separately? - -The whole point is to avoid the "test 5 flags in combination" pattern. A single `kind` enum replaces all flag combinations. 
- -## Verification Checklist - -- [x] `ir/machine_op.h` created with `MachineOperand` type (`MACH_OP_REG`, `MACH_OP_SPILL`, `MACH_OP_IMM`, `MACH_OP_FRAME_ADDR`, `MACH_OP_SYMBOL`, `MACH_OP_PARAM_STACK`) -- [x] `machine_op_from_ir()` implemented and handles all 6 operand categories -- [x] `ir/machine_op.c` added to build (included via `libtcc.c`) -- [x] `make cross` compiles without warnings -- [x] `make test -j16` passes (no behavior change — MOP path parallel to old path) -- [x] `fill_registers_ir` removed from MOP path — ✅ done (Phase 5m: `fill_registers_ir` fully deleted) - -## Historical Notes: `fill_registers_ir` Removal - -> **All items below are resolved.** Kept for historical reference on the design decisions made during the refactor. - -### Why `fill_registers_ir` was problematic - -`fill_registers_ir` did **more** than just copy `allocation.r0` into `pr0_reg`. It also: - -1. **Transformed `is_lval`/`is_local`/`is_param` flags** — register-resident params got `is_lval` cleared; pointer-deref operands kept it. -2. **Applied VLA stack-offset deltas** — when `is_local && is_llocal && IROP_TAG_STACKOFF`, the payload offset was adjusted by `old_stackoff - interval->original_offset`. -3. **Handled struct types** — stored `interval->allocation.offset` into `op->u.s.aux_data` instead of `op->u.imm32`. -4. **Stack-passed parameter detection** — set tag to `IROP_TAG_STACKOFF` + `is_param=1` + `is_local=1` for params where `incoming_reg0 < 0 && allocation.r0 == PREG_NONE`. - -### Key discovery: non-idempotent fill - -`fill_registers_ir` was **NOT** idempotent. For `IROP_TAG_STACKOFF` operands it applied a delta `old_stackoff - interval->original_offset` to `op->u.imm32`. Calling fill twice doubled this delta → 30 test failures. This was discovered during Phase 5a (failed attempt to internalize fill inside `machine_op_from_ir`). 
- -### Resolution - -Phase 5b removed dispatch-level fills, Phase 5f rewrote `machine_op_from_ir` to read the interval table directly (taking `const IROperand *op` — no mutation), and Phase 5m deleted `fill_registers_ir` entirely. All transforms are now handled inside `machine_op_from_ir` via direct interval-table reads. diff --git a/docs/materialization/03_phase2_backend_materialization.md b/docs/materialization/03_phase2_backend_materialization.md deleted file mode 100644 index d81894c4..00000000 --- a/docs/materialization/03_phase2_backend_materialization.md +++ /dev/null @@ -1,397 +0,0 @@ -# Phase 2: Backend-Driven Materialization - -> **Status: ✅ Complete** — All convertible ops now have MOP handlers. Done: DP (ADD/SUB/CMP/SHL/SHR/SAR/AND/OR/XOR/ADC), ASSIGN (all dests), SETIF (including 64-bit pair dest), BOOL_OR/AND (including 64-bit pair sources), LOAD (including 64-bit pair), STORE, LOAD_INDEXED, STORE_INDEXED, LOAD_POSTINC, STORE_POSTINC, IJUMP, FUNCPARAMVAL/VOID, RETURNVALUE (32-bit and 64-bit), MUL/DIV group (MUL/DIV/UDIV/IMOD/UMOD/TEST_ZERO 32-bit; MLA/UMULL converted to dedicated MOP handlers), FP single-precision (FADD/FSUB/FMUL/FDIV/FNEG/FCMP/CVT_ITOF/CVT_FTOI/CVT_FTOF; doubles/complex stay on old path), VLA (VLA_ALLOC/VLA_SP_SAVE/VLA_SP_RESTORE), FUNC_CALL (32-bit and 64-bit non-complex dest; complex/static-chain stays on old path), SWITCH_TABLE. `!irop_needs_pair` guards removed for DP, ASSIGN, BOOL, LOAD, and FUNC_CALL — 64-bit pair sources handled via `mach_resolve_deref_64`. Three backend bugs fixed: (1) 64-bit reg-to-reg LOAD only copied lo half — added hi-half MOV; (2) dest/scratch register overlap in `dp_mop64`/`shift64_mop` — determine dest pair BEFORE deref resolution + pre-exclude src reg operands; (3) `MACH_OP_PARAM_STACK` double-indirection — added early return with `needs_deref=false`. JUMP/JUMPIF and LEA are intentionally left on the old path (see below). 
- -## Goal - -Move all materialization decisions into `arm-thumb-gen.c` instruction handlers, replacing the centralized `ir/codegen.c` materialize-then-dispatch pattern with per-instruction backend-driven materialization using `MachineOperand`. - -## Current State (Actual Architecture) - -The plan's original pseudocode was inaccurate. Here's what actually happens: - -### Actual current flow - -``` -ir/codegen.c::tcc_ir_codegen_generate(): - 1. Classify operand needs (need_src1_value, need_src2_value, ...) - 2. Get IROperand copies from pool - 3. Call tcc_ir_fill_registers_ir() on each operand - 4. Call tcc_ir_materialize_value_ir() / _addr_ir() / _dest_ir() as needed - 5. Call tcc_gen_machine_*_op() in arm-thumb-gen.c (which receives already-filled IROperands) - 6. Release scratch registers from materialization -``` - -### What arm-thumb-gen.c actually does - -`arm-thumb-gen.c` does **NOT** call `tcc_ir_materialize_*` or `tcc_ir_mat_*` APIs. Instead it receives the pre-filled IROperands and then: - -1. Calls `get_scratch_reg_with_save(exclude_mask)` — **66 times** across the file -2. Calls `load_to_reg_ir(reg, r1, src_operand)` — **63 times** across the file -3. Emits Thumb-2 instructions via `ot(th_xxx(...))` -4. Calls `restore_scratch_reg(&alloc)` to clean up - -So there are **two layers of materialization**: `ir/mat.c` materializes into the IROperand, then `arm-thumb-gen.c` does its own `load_to_reg_ir` on top. This is the core redundancy. - -## Proposed Pattern - -Replace the current two-layer flow with a single-layer `MachineOperand`-based pattern: - -### New `mach_*` helper functions (in `arm-thumb-gen.c`) - -| Function | Role | -|---|---| -| `mach_ensure_in_reg(ctx, op)` | If REG: return reg. If SPILL: load to scratch. If IMM: mov to scratch. If FRAME_ADDR: compute address. | -| `mach_ensure_in_reg_or_imm(ctx, op)` | For ADD/SUB/CMP: return reg or encodable Thumb immediate | -| `mach_get_dest_reg(ctx, op)` | If dest is REG: return reg. 
If SPILL: allocate scratch. | -| `mach_writeback_dest(ctx, op, reg)` | If dest was SPILL: STR reg to spill slot. | -| `mach_ensure_addr(ctx, op)` | For LOAD/STORE: compute base register + offset. | -| `mach_release_scratch(ctx)` | Free scratch registers used in this instruction. | - -### Example: TCCIR_OP_ADD — before and after - -**Before (current):** -```c -// ir/codegen.c: -tcc_ir_fill_registers_ir(ir, &src1_ir); -tcc_ir_fill_registers_ir(ir, &src2_ir); -tcc_ir_fill_registers_ir(ir, &dest_ir); -tcc_ir_materialize_value_ir(ir, &src1_ir, &mat_src1); -tcc_ir_materialize_value_ir(ir, &src2_ir, &mat_src2); -tcc_ir_materialize_dest_ir(ir, &dest_ir, &mat_dest); -// Dispatch to backend: -tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, TCCIR_OP_ADD); -// arm-thumb-gen.c::tcc_gen_machine_data_processing_op(): -// calls get_scratch_reg_with_save() and load_to_reg_ir() again! -// ir/codegen.c: -tcc_machine_release_scratch(&mat_src1.scratch); // etc. -``` - -**After (proposed):** -```c -// ir/codegen.c (thin): -MachineOperand src1 = machine_op_from_ir(ir, &raw_src1); -MachineOperand src2 = machine_op_from_ir(ir, &raw_src2); -MachineOperand dest = machine_op_from_ir(ir, &raw_dest); -// Dispatch to backend: -tcc_gen_machine_data_processing_mop(ctx, src1, src2, dest, TCCIR_OP_ADD); - -// arm-thumb-gen.c::tcc_gen_machine_data_processing_mop(): -int r_src1 = mach_ensure_in_reg(ctx, &src1); -int r_src2 = mach_ensure_in_reg_or_imm(ctx, &src2, &is_imm, &imm_val); -int r_dest = mach_get_dest_reg(ctx, &dest); - -if (is_imm) - ot(th_add_imm(r_dest, r_src1, imm_val)); -else - ot(th_add_reg(r_dest, r_src1, r_src2)); - -mach_writeback_dest(ctx, &dest, r_dest); -mach_release_scratch(ctx); -``` - -## Implementation Steps - -### Step 2.1: Define `MachineCodegenContext` - -**Action:** Add a context struct to hold per-instruction state: - -```c -typedef struct { - TCCIRState *ir; - int instruction_index; - - /* Scratch register pool for current instruction */ - int 
scratch_regs[4]; - int scratch_count; - int scratch_used; - - /* Track which physical registers are live at this point */ - uint16_t live_reg_mask; - - /* Plan mode (dry run) vs emit mode */ - bool plan_mode; -} MachineCodegenContext; -``` - -**File:** `arm-thumb-gen.c` (or a new `arm-thumb-mach.h` header) - -### Step 2.2: Implement `mach_ensure_in_reg()` - -**Action:** This wraps the existing `get_scratch_reg_with_save` + `load_to_reg_ir` pattern: - -```c -static int mach_ensure_in_reg(MachineCodegenContext *ctx, const MachineOperand *op) -{ - switch (op->kind) { - case MACH_OP_REG: - return op->u.reg.r0; - - case MACH_OP_SPILL: { - int scratch = mach_alloc_scratch(ctx, /* exclude= */ 0); - int offset = op->u.spill.offset; - // LDR scratch, [fp, #offset] - emit_ldr_spill(scratch, offset, op->u.spill.size); - if (op->needs_deref) { - // Double indirection: load pointer, then load through it - emit_ldr_indirect(scratch, scratch, 0, /* size from type */); - } - return scratch; - } - - case MACH_OP_IMM: { - int scratch = mach_alloc_scratch(ctx, 0); - emit_mov_imm(scratch, op->u.imm.val); - return scratch; - } - - case MACH_OP_FRAME_ADDR: { - int scratch = mach_alloc_scratch(ctx, 0); - emit_add_fp_offset(scratch, op->u.frame.offset); - return scratch; - } - - case MACH_OP_SYMBOL: { - int scratch = mach_alloc_scratch(ctx, 0); - emit_load_symbol_addr(scratch, op->u.sym.sym, op->u.sym.addend); - return scratch; - } - - case MACH_OP_PARAM_STACK: { - int scratch = mach_alloc_scratch(ctx, 0); - emit_ldr_param(scratch, op->u.param.offset, op->u.param.size); - return scratch; - } - } -} -``` - -**Key insight:** Each `case` here corresponds to what `ir/mat.c` currently tests with multiple flag combinations. The explicit `kind` enum makes the code self-documenting. 
- -### Step 2.3: Implement remaining `mach_*` helpers - -Implement in `arm-thumb-gen.c`: - -- `mach_ensure_in_reg_or_imm(ctx, op, &is_imm, &imm_val)` — checks if IMM value is Thumb-encodable; if so, returns the immediate; otherwise loads to scratch register. -- `mach_get_dest_reg(ctx, op)` — returns physical reg or allocates scratch for spilled dest. -- `mach_writeback_dest(ctx, op, reg)` — STR to spill slot if dest was spilled. -- `mach_ensure_addr(ctx, op)` — for LOAD/STORE, returns base register + offset pair. -- `mach_alloc_scratch(ctx, exclude_mask)` — wraps `get_scratch_reg_with_save()`. -- `mach_release_scratch(ctx)` — wraps `restore_scratch_reg()`. - -### Step 2.4: Convert instruction handlers one-by-one - -**Action:** Create `_mop` variants of each `tcc_gen_machine_*_op` function that accept `MachineOperand` instead of `IROperand`. Start with the simplest: - -**Conversion order (easiest to hardest):** - -1. `tcc_gen_machine_data_processing_op` — arithmetic (ADD, SUB, MUL, etc.) -2. `tcc_gen_machine_load_op` / `tcc_gen_machine_store_op` — memory access -3. `tcc_gen_machine_assign_op` — register moves -4. `tcc_gen_machine_return_value_op` — function return -5. `tcc_gen_machine_lea_op` — address computation -6. `tcc_gen_machine_jump_op` / `_conditional_jump_op` — control flow -7. `tcc_gen_machine_setif_op` — conditional set -8. `tcc_gen_machine_bool_op` — boolean ops -9. `tcc_gen_machine_func_call_op` — function calls (most complex) -10. `tcc_gen_machine_func_parameter_op` — parameter passing -11. `tcc_gen_machine_fp_op` — floating point -12. `tcc_gen_machine_load_indexed_op` / `_store_indexed_op` — indexed memory -13. `tcc_gen_machine_load_postinc_op` / `_store_postinc_op` — post-increment -14. `tcc_gen_machine_vla_op` — VLA operations - -**For each handler:** -1. Write `_mop` version alongside existing `_op` version -2. Update `ir/codegen.c` dispatch to call `_mop` version (passing `MachineOperand` instead of `IROperand`) -3. Run `make test -j16` -4. 
Once all callers converted, delete the old `_op` version - -### Step 2.5: Update `ir/codegen.c` dispatch loop - -**Action:** Replace the centralized materialize-then-dispatch pattern: - -```c -// BEFORE (current): -tcc_ir_fill_registers_ir(ir, &src1_ir); -tcc_ir_materialize_value_ir(ir, &src1_ir, &mat_src1); -// ... then dispatch, then release - -// AFTER: -MachineOperand src1 = machine_op_from_ir(ir, &raw_src1); -// ... then dispatch (handler does its own materialization) -``` - -The dispatch loop becomes ~50% shorter because the classify-materialize-release boilerplate is deleted. - -### Step 2.6: Handle 64-bit values - -**Special attention:** 64-bit values (long long, double) use register pairs. The `mach_ensure_in_reg()` function must return both registers: - -```c -typedef struct { - int r0; - int r1; /* -1 if not 64-bit */ -} MachRegPair; - -MachRegPair mach_ensure_in_reg_pair(MachineCodegenContext *ctx, const MachineOperand *op); -``` - -For spilled 64-bit values, this loads two words from adjacent spill slots. For register pairs, it returns both physical regs. - -## What Is Actually Implemented - -### `tcc_gen_machine_data_processing_mop()` — **DONE** - -Handles: ADD, SUB, CMP, SHL, SHR, SAR, AND, OR, XOR, ADC_GEN, ADC_USE -Condition: no static chain (`!ir->has_static_chain`); `!irop_needs_pair` guard has been removed — 64-bit pair sources are now handled via `mach_resolve_deref_64` - -The dispatch path in `ir/codegen.c` determines `use_mop_dp` **after** `fill_registers_ir` runs, then calls `machine_op_from_ir` on the already-filled operands. 
The `mach_*` helpers inside handle: -- `MACH_OP_REG` — value already in register, use directly -- `MACH_OP_SPILL` — load to scratch via `get_scratch_reg_with_save` + `load_to_reg_ir` -- `MACH_OP_IMM` — check if Thumb-encodable; if not, load to scratch -- `MACH_OP_FRAME_ADDR` — compute FP + offset into scratch - -### `tcc_gen_machine_assign_mop()` — **DONE** - -Handles: TCCIR_OP_ASSIGN (register moves, truncate, sign-extend) -Condition: no static chain (`!ir->has_static_chain`); `!irop_needs_pair` guard has been removed — 64-bit pair sources/destinations are handled via `mach_resolve_deref_64` and the existing 64-bit assign path in `tcc_gen_machine_assign_mop` - -All destination kinds supported: REG (direct), SPILL (via `mach_get_dest_reg` scratch + `mach_writeback_dest` → `tcc_machine_store_spill_slot`), PARAM_STACK (via `mach_writeback_dest` → `tcc_machine_store_param_slot`). The earlier REG-only restriction has been removed — `tcc_machine_store_spill_slot` correctly applies `fp_adjust_local_offset`, which was the original concern. - -Source operand handling covers all `MachineOperandKind` variants: -- `MACH_OP_REG` (no deref) → direct `mach_writeback_dest` (0 scratch) -- `MACH_OP_REG` (deref) → `load_from_base_ir` into dest_reg -- `MACH_OP_IMM` → `tcc_machine_load_constant` into dest_reg -- `MACH_OP_SPILL` → `tcc_machine_load_spill_slot` + optional deref -- `MACH_OP_SYMBOL` → `tcc_machine_load_constant` with sym + optional deref -- `MACH_OP_FRAME_ADDR` → `tcc_machine_addr_of_stack_slot` -- `MACH_OP_PARAM_STACK` → `load_from_base_ir` with `offset_to_args` adjustment - -A special `assign_before_ret` guard in both dry-run and real-run prevents the ASSIGN MOP path from firing when the next instruction is RETURNVALUE (to preserve the existing RETURNVALUE peephole that sets `dest_ir.pr0_reg = REG_IRET`). The guard also checks `!has_incoming_jump[i+1]` to ensure consistency between dry-run and real-run. 
- -### `tcc_gen_machine_setif_mop()` — **DONE** - -Handles: TCCIR_OP_SETIF (conditional set) -Condition: non-pair, no static chain - -Emits: MOV dest, #0; IT cond; MOV dest, #1. Uses `mach_get_dest_reg` / `mach_writeback_dest` for destination, no source operand materialization needed (reads from condition flags). - -### `tcc_gen_machine_bool_mop()` — **DONE** - -Handles: TCCIR_OP_BOOL_OR, TCCIR_OP_BOOL_AND -Condition: no static chain (`!ir->has_static_chain`); `!irop_needs_pair` guard has been removed — 64-bit pair sources are now handled - -BOOL_OR: `mach_ensure_in_reg` for both sources, ORRS into dest, then IT NE / MOV #1 / IT EQ / MOV #0. -BOOL_AND: CMP src1, #0 / IT EQ / MOV dest, #0 / CMP src2, #0 / IT EQ / MOV dest, #0 / ... (short-circuit pattern). - -For 64-bit sources: lo and hi halves are ORR'd together to produce a single 32-bit "nonzero" value before the boolean operation. - -### `tcc_gen_machine_func_call_mop()` — **DONE** - -Handles: TCCIR_OP_FUNCCALLVAL, TCCIR_OP_FUNCCALLVOID -Condition: not complex (`!dest_ir.is_complex`), no static chain; `!irop_needs_pair(dest_ir)` guard has been removed — 64-bit pair destinations are now handled - -The destination return value is a `MachineOperand dest_mop`, produced by `machine_op_from_ir(ir, &dest_ir)` in the dispatch loop. 
Internally, `handle_return_value_mop(&dest_mop, drop_value)` calls `mach_writeback_dest(&dest_mop, ARM_R0)`, which handles: -- `MACH_OP_REG` — emit `MOV dest.r0, R0` when `r0 != ARM_R0`; for 64-bit: also `MOV dest.r1, R1` -- `MACH_OP_SPILL` — call `tcc_machine_store_spill_slot(R0, offset)`; for 64-bit: also store R1 at offset+4 -- `MACH_OP_NONE` — no-op (void or drop_value) - -`func_target` and `call_id_op` were **converted to MachineOperand** in Phase 5g: -- `gcall_or_jump_mop()` replaces `gcall_or_jump_ir()`, taking `MachineOperand func_mop` instead of reading `func_target.pr0_reg` -- Pre-save logic rewritten to use `func_mop.kind`, `func_mop.u.reg.r0`, `func_mop.needs_deref` -- `thumb_build_call_layout_from_ir()` extended with `MachineOperand **out_mops` parameter (Phase 5k) - -**Architecture note:** `tcc_gen_machine_func_call_op()` was deleted in Phase 5j. All function call codegen now goes through `tcc_gen_machine_func_call_mop()`, which handles all cases including complex types and static-chain functions (via `MACH_OP_CHAIN_REL`). `handle_return_value_mop` handles both 32-bit and 64-bit dest pairs (R0+R1 writeback). - -### `mach_resolve_deref_64()` — **DONE** - -Helper added to handle `needs_deref` 64-bit source operands before lo/hi half splitting. When a source `MachineOperand` has `needs_deref=true` and `is_64bit=true`, calling `mach_make_lo_half`/`mach_make_hi_half` directly is incorrect: `mach_make_hi_half` increments the register number (R0→R1) instead of the memory offset (+4), producing bogus loads. - -`mach_resolve_deref_64` resolves this by: -1. If `!needs_deref`: returns `*op` unchanged. -2. **PARAM_STACK special case:** If `op->kind == MACH_OP_PARAM_STACK`, returns `*op` with `needs_deref=false` (for stack params, `needs_deref=true` means "value IS at this stack slot," not "pointer at this slot to follow" — treating it as double indirection was **Bug #3**, fixed here). -3. Strips `needs_deref`, gets base address register via `mach_ensure_in_reg`. 
-4. Allocates two scratch registers. -5. Loads `[base+0]` → lo_reg and `[base+4]` → hi_reg via `load_from_base_ir(..., IROP_BTYPE_INT32, ...)`. -6. Returns a clean `MACH_OP_REG` pair operand with `is_64bit=true`, `needs_deref=false`. - -Called at entry of `thumb_emit_data_processing_mop64` (for both src1 and src2) and `thumb_emit_shift64_mop` (for src1) before any lo/hi splitting. - -**Bug #2 fix — Dest/scratch register overlap:** `mach_resolve_deref_64` allocates scratch registers, which could overlap with the dest register pair when dest was determined AFTER deref resolution. Fixed by: -- (a) Determining dest register pair (via `mach_get_dest_reg_pair`) BEFORE calling `mach_resolve_deref_64`. -- (b) Pre-excluding src1/src2 register operands from the scratch pool BEFORE deref resolution (preventing scratch from overlapping src registers that haven't been loaded yet). - -**Bug #3 fix — PARAM_STACK deref:** For `MACH_OP_PARAM_STACK`, `needs_deref=true` signals "value is at this stack offset" (ARM AAPCS: 64-bit params passed at aligned stack slots for args beyond r0–r3). The deref helper was loading the 64-bit value from the stack slot, then treating that as a pointer and loading through it — double indirection. Fixed by returning early with `needs_deref=false`. - -### `MachineCodegenContext` — **NOT YET IMPLEMENTED** - -The context struct described in Step 2.1 was not needed for the data-processing ops because `arm-thumb-gen.c` uses global state (`g_insn_scratch_count`, `g_insn_scratch_saves`) for per-instruction scratch bookkeeping. If more complex handlers require per-instruction context passing, this may be added then. - -## Remaining Conversion Work - -**Conversion order (easiest to hardest):** - -1. ~~`tcc_gen_machine_data_processing_op` — ADD/SUB/CMP/SHL/SHR/SAR/AND/OR/XOR/ADC~~ ✅ Done -2. ~~`tcc_gen_machine_assign_op` — register moves / truncate / sign-extend (all dests)~~ ✅ Done -3. 
~~`tcc_gen_machine_bool_op` / `tcc_gen_machine_setif_op` — boolean and conditional set~~ ✅ Done -4. ~~`tcc_gen_machine_load_op` / `tcc_gen_machine_store_op` — memory access~~ ✅ Done -5. ~~`tcc_gen_machine_load_indexed_op` / `_store_indexed_op` — indexed memory~~ ✅ Done -6. ~~`tcc_gen_machine_load_postinc_op` / `_store_postinc_op` — post-increment~~ ✅ Done -7. ~~`tcc_gen_machine_indirect_jump_op` (IJUMP)~~ ✅ Done -8. ~~`tcc_gen_machine_func_parameter_op` (FUNCPARAMVAL/VOID)~~ ✅ Done -9. ~~`tcc_gen_machine_return_value_op` — function return (32-bit only; 64-bit stays on old path)~~ ✅ Done -10. ~~`tcc_gen_machine_data_processing_op` — MUL/DIV/UDIV/IMOD/UMOD/TEST_ZERO (32-bit; MLA/UMULL stay on old path)~~ ✅ Done -11. `tcc_gen_machine_lea_op` — **SKIP**: already handles spilled dest internally; no double-materialization; chain-tracking adds non-trivial complexity for no phase-3 benefit -12. `tcc_gen_machine_jump_op` / `_conditional_jump_op` — **SKIP**: no register materialization at all (reads `src.u.imm32` / `dest.u.imm32` directly); MOP wrapper would add zero value -13. ~~`tcc_gen_machine_func_call_op` — function calls~~ ✅ Done - - `tcc_gen_machine_func_call_mop()` handles 32-bit and 64-bit non-complex dest via `MachineOperand dest_mop`. - - `tcc_gen_machine_func_call_op()` retains its full implementation for the old path (complex, static chain). **Not a wrapper** — `handle_return_value()` (legacy with SValue compat) is only in `_op`; `handle_return_value_mop()` (32-bit and 64-bit via `MachineOperand`) is in `_mop`. - - `func_target` and `call_id_op` converted to MachineOperand (Phase 5g); callsite uses `MachineOperand **out_mops` (Phase 5k). -14. ~~`tcc_gen_machine_fp_op` — floating point (single-precision; doubles/complex stay on old path)~~ ✅ Done -15. 
~~`tcc_gen_machine_vla_op` — VLA operations~~ ✅ Done - -For each handler: write `_mop` variant, update `ir/codegen.c` to call it (with `use_mop_*` flag), run tests, then delete old `_op` variant once all callers converted. - -Once ALL handlers are on the MOP path, `fill_registers_ir` can be deleted and the dispatch loop reduces to raw operand → `machine_op_from_ir` → dispatch. - -## Verification Checklist - -- [x] `tcc_gen_machine_data_processing_mop()` implemented -- [x] `mach_ensure_in_reg()` / `mach_ensure_in_reg_or_imm()` / `mach_get_dest_reg()` / `mach_writeback_dest()` helpers implemented -- [x] `make test -j16` passes with data-processing on MOP path -- [x] ASSIGN MOP (all dests), BOOL, SETIF ops on MOP path -- [x] LOAD / STORE ops on MOP path -- [x] LOAD_INDEXED / STORE_INDEXED / LOAD_POSTINC / STORE_POSTINC ops on MOP path -- [x] IJUMP (indirect jump) on MOP path -- [x] FUNCPARAMVAL / FUNCPARAMVOID on MOP path -- [x] RETURNVALUE on MOP path (32-bit; 64-bit/static-chain stays on old path) -- [x] MUL/DIV group on MOP path (MUL/DIV/UDIV/IMOD/UMOD/TEST_ZERO 32-bit; MLA/UMULL stay on old path) -- [N/A] LEA — skipped (single-layer already, handles spilled dest, chain-tracking complexity) -- [N/A] JUMP / JUMPIF — skipped (no register materialization, no scratch allocation) -- [x] FP single-precision on MOP path (FADD/FSUB/FMUL/FDIV/FNEG/FCMP/CVT_ITOF/CVT_FTOI/CVT_FTOF; doubles/complex stay on old path) -- [x] VLA on MOP path (VLA_ALLOC/VLA_SP_SAVE/VLA_SP_RESTORE) -- [x] FUNCCALLVAL / FUNCCALLVOID on MOP path (32-bit non-pair dest; dest replaced by `MachineOperand dest_mop`; - `func_target` and `call_id_op` still passed as filled IROperands; 64-bit/complex/static-chain stays on old path) -- [x] `irop_needs_pair` guards removed for DP and ASSIGN — 64-bit pair sources handled via `mach_resolve_deref_64` - (loads `[base+0]` / `[base+4]` into scratch regs before lo/hi splitting; applied in `thumb_emit_data_processing_mop64` - for both src1/src2 and 
`thumb_emit_shift64_mop` for src1) -- [x] `irop_needs_pair` guards removed for BOOL — 64-bit pair sources handled via lo/hi ORR reduction -- [x] `irop_needs_pair` guards removed for LOAD — 64-bit pair sources handled (including reg-to-reg hi-half MOV fix) -- [x] `irop_needs_pair` guards removed for FUNC_CALL dest — 64-bit pair return values handled via `handle_return_value_mop` - (R0 + R1 writeback to dest pair); `is_complex` guard retained -- [x] Bug fix: 64-bit reg-to-reg LOAD — `tcc_gen_machine_load_mop` MACH_OP_REG non-deref case added hi-half MOV - (`src.u.reg.r1 → dest_r1`) for 64-bit register pairs -- [x] Bug fix: dest/scratch overlap in `thumb_emit_data_processing_mop64` and `thumb_emit_shift64_mop` — moved dest - register pair determination BEFORE `mach_resolve_deref_64` calls; added pre-exclusion of src1/src2 register - operands from scratch pool -- [x] Bug fix: PARAM_STACK double-indirection in `mach_resolve_deref_64` — added early return for - `MACH_OP_PARAM_STACK` with `needs_deref=false` (value IS at stack slot, not pointer to follow) -- [x] `handle_return_value_mop` supports 64-bit dest — writes R0→dest.r0 and R1→dest.r1 (or spills both) -- [x] `tcc_gen_machine_bool_mop` supports 64-bit sources — lo/hi halves ORR'd to single nonzero test -- [x] 32-bit lvalue→64-bit dest ASSIGN bug fixed — `if (src.needs_deref)` changed to `if (src.needs_deref && src.is_64bit)` - in `tcc_gen_machine_assign_mop`: when a stack parameter is a 32-bit pointer that is being widened into a 64-bit dest - register pair, `needs_deref=true` but `is_64bit=false`; without the guard this incorrectly loaded `[ptr+0]`/`[ptr+4]` - (dereferencing 64-bit content through the pointer) instead of zero-extending the pointer value itself -- [x] `fill_registers_ir` removed from dispatch loop — ✅ done (Phase 5b removed dispatch-level fills; - Phase 5f rewrote `machine_op_from_ir` to read interval table directly; Phase 5m deleted `fill_registers_ir`) -- [x] `tcc_ir_fill_registers_ir()` function 
deleted from `ir/codegen.c` — ✅ done (Phase 5m) diff --git a/docs/materialization/04_phase3_dry_run.md b/docs/materialization/04_phase3_dry_run.md deleted file mode 100644 index e4e0838e..00000000 --- a/docs/materialization/04_phase3_dry_run.md +++ /dev/null @@ -1,187 +0,0 @@ -# Phase 3: Dry-Run Integration - -> **Status: ✅ COMPLETE** — committed `bc43b639 phase 3` + `c2569883 phase 3: enable dry-run scratch conflict fixup` - -## Goal - -Extend the existing dry-run pass in `ir/codegen.c` to collect per-instruction scratch register constraints using `MachineOperand`, and feed these constraints back to the register allocator. - -## Current State (Important: Dry Run Already Exists) - -**The original plan described this as a new feature, but a dry-run pass already exists.** The current `tcc_ir_codegen_generate()` in `ir/codegen.c` already runs the backend twice: - -1. **Dry run:** Calls `tcc_gen_machine_dry_run_begin()`, runs the full dispatch loop (instruction handlers execute but `ot()` is a no-op), then calls `tcc_gen_machine_dry_run_end()`. -2. **Real run:** Restores `ind`/`loc` state and runs the dispatch loop again, this time emitting actual code. - -The dry run currently serves to: -- Compute accurate code sizes for branch offset optimization (`tcc_gen_machine_branch_opt_analyze`) -- Detect whether LR was pushed in loops (to move it to prologue instead) -- Record scratch register usage patterns - -**What's missing:** The dry run does not currently feed scratch constraints back to the register allocator. It runs *after* allocation is final. 
- -## Proposed Extension - -### Per-instruction constraint collection - -During the dry run, each `mach_ensure_in_reg()` / `mach_alloc_scratch()` call records what it needs: - -```c -typedef struct { - int instruction_index; - int scratch_regs_needed; /* how many scratch regs this instruction needs */ - int scratch_reg_hints[4]; /* preferred scratch registers (if any) */ - bool needs_pair; /* needs an even-aligned register pair */ - bool clobbers[16]; /* which physical registers this instruction clobbers */ -} InstructionConstraints; -``` - -### Constraint-aware allocation - -``` -Current flow: - liveness → allocator → dry run (for branch sizing) → real run - -Proposed flow: - liveness → allocator (initial) → dry run (collect constraints) → allocator (refined) → real run -``` - -The second allocator pass is lightweight — it only adjusts assignments where the dry run found conflicts (e.g., a vreg was allocated to a register that a specific instruction needs as scratch). - -## Implementation Steps - -### Step 3.1: Add constraint recording to `MachineCodegenContext` - -**Action:** Extend the context struct (from Phase 2) with constraint tracking: - -```c -typedef struct { - // ... existing fields from Phase 2 ... - - /* Constraint recording (dry run only) */ - InstructionConstraints *constraints; - int constraints_count; - int constraints_capacity; -} MachineCodegenContext; -``` - -In dry-run mode, `mach_alloc_scratch()` records the scratch register it chose (or would choose) into `constraints[current_instruction]`. 
- -### Step 3.2: Record constraints during dry run - -**Action:** Modify the `mach_*` helpers to record scratch usage when `ctx->plan_mode == true`: - -```c -static int mach_alloc_scratch(MachineCodegenContext *ctx, uint16_t exclude_mask) -{ - int reg; - if (ctx->plan_mode) { - // Record that this instruction needs a scratch register - ctx->constraints[ctx->instruction_index].scratch_regs_needed++; - // Still allocate (to detect conflicts), but don't emit PUSH/POP - reg = get_scratch_reg_with_save(exclude_mask); - } else { - reg = get_scratch_reg_with_save(exclude_mask); - } - return reg; -} -``` - -### Step 3.3: Feed constraints to allocator - -**Action:** After dry run, scan constraints for conflicts: - -```c -void tcc_ir_apply_scratch_constraints(TCCIRState *ir, - InstructionConstraints *constraints, - int count) -{ - for (int i = 0; i < count; i++) { - for (int c = 0; c < 16; c++) { - if (constraints[i].clobbers[c]) { - // Mark register c as unavailable at instruction i - // This creates a "clobber interval" that the allocator respects - tcc_ls_add_clobber(ir, constraints[i].instruction_index, c); - } - } - } - // Re-run allocation with clobber intervals - tcc_ls_reallocate_with_clobbers(ir); -} -``` - -**Design decision:** The second allocation pass should be *incremental* — only re-allocate vregs that conflict with newly-discovered clobbers. A full re-allocation is correct but slower. - -### Step 3.4: Verify dry-run consistency - -**Action:** Add assertions that the dry run and real run produce consistent scratch allocation: - -```c -// After each instruction in real run: -if (DEBUG_VERIFY) { - assert(ctx->current_scratch_count == constraints[i].scratch_regs_needed); -} -``` - -Any divergence indicates a bug in the constraint recording. - -### Step 3.5: Incremental rollout - -**Action:** Initially, skip the second allocator pass and just collect/log constraints. Verify that: - -1. Constraint recording doesn't change behavior -2. 
Recorded constraints match actual scratch usage -3. Performance overhead is negligible - -Then enable the constraint-aware re-allocation in a follow-up. - -## Risk Assessment - -- **Risk: Low for constraint recording.** The dry run already exists; we're just adding bookkeeping. -- **Risk: Medium for constraint-aware allocation.** Re-running the allocator requires careful handling of already-assigned registers. -- **Risk: Low for divergence.** The dry run is deterministic — if both passes use the same `MachineOperand` inputs, constraints must match. - -## What Was Actually Built - -The design diverged from the plan's proposal. The actual implementation is simpler and more effective: - -### Per-instruction arrays (replaces `InstructionConstraints` struct) - -```c -int *dry_insn_scratch; /* count of mach_alloc_scratch() calls per instruction */ -uint16_t *dry_insn_saves; /* bitmask of registers needing PUSH per instruction */ -``` - -Allocated in `tcc_ir_codegen_generate()` for `ir->next_instruction_index` entries. 
- -### Scratch recording (replaces `plan_mode` flag) - -`arm-thumb-gen.c` uses two globals reset before each instruction: -```c -static int g_insn_scratch_count; /* incremented in get_scratch_reg_with_save */ -static uint16_t g_insn_scratch_saves; /* OR'd with (1<has_static_chain` -- [x] `tcc_ls_reset_scratch_cache()` called after any fixup -- [x] Consistency check logging under `TCC_LS_DEBUG` -- [x] `make test -j16` passes (3310 tests, 0 failures) -- [x] `postmod-1` test passes at both -O0 and -O1 diff --git a/docs/materialization/05_phase4_eliminate_mat.md b/docs/materialization/05_phase4_eliminate_mat.md deleted file mode 100644 index 4fabc680..00000000 --- a/docs/materialization/05_phase4_eliminate_mat.md +++ /dev/null @@ -1,124 +0,0 @@ -# Phase 4: Eliminate `ir/mat.c` - -> **Status: ✅ COMPLETE** — committed `bc43b639 phase 4` + `0e772abb phase 5: remove dead files and dead TCCStackSlot fields` - -## Goal - -With all materialization handled by the backend (Phase 2), remove the IR-level materialization module entirely. 
- -## Current State After Phase 2 - -At this point: -- All instruction handlers use `MachineOperand` + `mach_*` helpers -- `ir/codegen.c` dispatch loop only calls `machine_op_from_ir()`, no longer calls `tcc_ir_materialize_*_ir()` -- `ir/mat.c` functions are completely unused - -## What Moves Where - -| Current `ir/mat.c` function | Replacement | -|---|---| -| `tcc_ir_materialize_value_ir()` | `mach_ensure_in_reg()` in `arm-thumb-gen.c` | -| `tcc_ir_materialize_const_to_reg_ir()` | `mach_ensure_in_reg()` (IMM case) | -| `tcc_ir_materialize_addr_ir()` | `mach_ensure_addr()` in `arm-thumb-gen.c` | -| `tcc_ir_materialize_dest_ir()` | `mach_get_dest_reg()` in `arm-thumb-gen.c` | -| `tcc_ir_storeback_materialized_dest_ir()` | `mach_writeback_dest()` in `arm-thumb-gen.c` | -| `tcc_ir_release_materialized_*_ir()` | `mach_release_scratch()` in `arm-thumb-gen.c` | -| `tcc_ir_mat_spilled_op()` / `tcc_ir_is_spilled_ir()` | `machine_op.kind == MACH_OP_SPILL` | -| `tcc_ir_operand_needs_dereference()` | `machine_op.needs_deref` | - -## What Stays in IR - -| File | Status | -|---|---| -| `ir/live.c` | Unchanged — liveness analysis | -| `ir/vreg.c` | Unchanged — virtual register tracking | -| `ir/stack.c` | Simplified — only real locals + spill slots | -| `ir/codegen.c` | Reduced to `machine_op_from_ir()` conversion + dispatch loop | -| `ir/machine_op.h` | New — `MachineOperand` type (from Phase 1) | - -## Implementation Steps - -### Step 4.1: Verify no remaining callers of `ir/mat.c` functions - -**Action:** -```bash -# These should all return 0 matches: -grep -rn 'tcc_ir_materialize_value_ir\b' --include='*.c' --include='*.h' | grep -v 'ir/mat.c' -grep -rn 'tcc_ir_materialize_const_to_reg_ir\b' --include='*.c' --include='*.h' | grep -v 'ir/mat.c' -grep -rn 'tcc_ir_materialize_addr_ir\b' --include='*.c' --include='*.h' | grep -v 'ir/mat.c' -grep -rn 'tcc_ir_materialize_dest_ir\b' --include='*.c' --include='*.h' | grep -v 'ir/mat.c' -grep -rn 
'tcc_ir_storeback_materialized_dest_ir\b' --include='*.c' --include='*.h' | grep -v 'ir/mat.c' -grep -rn 'tcc_ir_release_materialized_.*_ir\b' --include='*.c' --include='*.h' | grep -v 'ir/mat.c' -grep -rn 'tcc_ir_mat_value\b\|tcc_ir_mat_const\b\|tcc_ir_mat_addr\b\|tcc_ir_mat_dest\b' --include='*.c' --include='*.h' | grep -v 'ir/mat.c' -``` - -If any callers remain, they must be converted to use `mach_*` helpers first. - -### Step 4.2: Delete `ir/mat.c` - -**Action:** Remove the entire file (~1096 lines). - -### Step 4.3: Delete `ir/mat.h` (if it exists as a separate header) - -**Action:** Remove materialization-related declarations. Check `tccir.h` for any remaining references: - -- Remove `TCCMaterializedValue` struct -- Remove `TCCMaterializedAddr` struct -- Remove `TCCMaterializedDest` struct -- Remove `TCCMatValue` / `TCCMatAddr` / `TCCMatDest` wrapper types -- Remove function declarations for deleted functions - -### Step 4.4: Remove `ir/mat.c` from build system - -**Action:** Edit `Makefile` to remove `ir/mat.c` from source lists (look for `IR_SRC`, `TINYCC_IR_SRC`, or similar variables). - -### Step 4.5: Reduce `ir/codegen.c` - -**Action:** Remove now-dead code: - -1. Delete `tcc_ir_fill_registers_ir()` (replaced by `machine_op_from_ir()`) -2. Delete the operand classification block (the `need_src1_value`, `need_src2_value`, etc. switch) -3. Delete the centralized materialization block -4. Delete the scratch release block at the end of the dispatch loop - -The dispatch loop becomes: -```c -for each instruction: - get raw operands from pool - convert to MachineOperand via machine_op_from_ir() - dispatch to tcc_gen_machine_*_mop() handler - // (handler does its own materialization and cleanup) -``` - -**Expected:** `ir/codegen.c` reduces from ~2331 lines to ~400-600 lines. 
- -### Step 4.6: Compile and test - -```bash -make clean && make cross -j16 -make test -j16 -make test-gcc-torture-compile -``` - -## What Was Done - -### Files deleted -- `ir/mat.c` — the entire IR-level materialization module (~1096 lines) -- `ir/operand.c` — IROperand utility functions that were part of the old materialization layer -- `ir/operand.h` — header for the above - -### Replacement -- `ir/machine_op.c` + `ir/machine_op.h` — the new `MachineOperand`-based conversion module - -### Expected size reduction -`ir/codegen.c` was reduced from ~2331 to 1767 lines (Phase 5m deleted `fill_registers_ir` ~256 lines; Phase 6 consolidated dispatch loops −339 lines). - -## Verification Checklist - -- [x] `ir/mat.c` deleted -- [x] `ir/operand.c` deleted -- [x] `ir/operand.h` deleted -- [x] Build compiles without those files -- [x] `make test -j16` passes -- [x] `tcc_ir_fill_registers_ir()` deleted from `ir/codegen.c` — ✅ done (Phase 5m) -- [x] `ir/codegen.c` reduced from ~2331 to 1767 lines (Phase 5m + Phase 6 dispatch consolidation) diff --git a/docs/materialization/06_phase5_simplify_stack.md b/docs/materialization/06_phase5_simplify_stack.md deleted file mode 100644 index 3f6d59fd..00000000 --- a/docs/materialization/06_phase5_simplify_stack.md +++ /dev/null @@ -1,760 +0,0 @@ -# Phase 5: Simplify Stack and Spill Management - -> **Status: ✅ Done** — All sub-phases 5b–5q complete. All operations fully on MOP path. **Phase 5l** ✅: `pr0_spilled`/`pr1_spilled` removed from `IROperand`. **Phase 5m** ✅: `fill_registers_ir` deleted entirely (~256 lines). **Phase 5n** ✅: 10 dead `_op` function bodies + declarations removed (~700 lines). **Phase 5o** ✅: last 3 `_op` handlers converted to `_mop` — dispatch loop is 100% MOP. **Phase 5p** ✅: `pr0_reg`/`pr1_reg` fields removed from `IROperand` (10→9 bytes). Added `irop_phys_r0()`/`irop_phys_r1()` helpers that read interval table. `load_to_dest_ir` takes explicit `(int dest_r0, int dest_r1, IROperand src)`. 
All legacy `_ir` functions + `arm-thumb-asm.c` converted. `irop_init_phys_regs()` deleted. `tccir_operand.c` conversion functions updated. **Phase 5q** ✅: all legacy `_ir` wrappers deleted (~560 lines); `tcc_gen_mach_load_to_reg` rewritten for direct-dest loading; inline asm operand clobber regression (pr49390) fixed. - -## Goal - -With backend-driven materialization complete, clean up data structures that were only needed to support the old materialization layer. - -## Changes - -### 5.1: Simplify `IROperand` - -**Remove fields that are only used for materialization state encoding:** - -| Field | Current Use | Replacement | -|---|---|---| -| `pr0_spilled` | Set by `fill_registers_ir()` | `MachineOperand.kind == MACH_OP_SPILL` | -| `pr1_spilled` | Set by `fill_registers_ir()` | `MachineOperand.is_64bit && MACH_OP_SPILL` | -| `is_local` | Set by `fill_registers_ir()` | `MachineOperand.kind == MACH_OP_FRAME_ADDR` | -| `is_llocal` | Set by `fill_registers_ir()` | `MachineOperand.kind == MACH_OP_SPILL + needs_deref` | -| `is_param` | Set by `fill_registers_ir()` | `MachineOperand.kind == MACH_OP_PARAM_STACK` | - -**Note:** These fields are set by `tcc_ir_fill_registers_ir()` which is deleted in Phase 4. After Phase 4, nothing writes to these fields. Removing them shrinks `IROperand` and eliminates the possibility of stale/incorrect flag state. - -**Caution:** Verify that no IR-level pass (optimization, liveness) reads these fields. They should only be read during codegen. - -### 5.2: Remove materialization result structs - -Delete from `tccir.h` or `ir/mat.h`: - -```c -// REMOVE: -typedef struct TCCMaterializedValue { ... }; -typedef struct TCCMaterializedAddr { ... }; -typedef struct TCCMaterializedDest { ... }; -typedef struct TCCMatValue { ... }; -typedef struct TCCMatAddr { ... }; -typedef struct TCCMatDest { ... }; -``` - -### 5.3: Simplify `TCCStackSlot` - -**Remove fields that only existed for materialization decisions:** - -| Field | Purpose | Needed? 
| -|---|---|---| -| `addressable` | Told materialization layer not to spill this | **Remove** — backend decides | -| `live_across_calls` | Told materialization to use callee-saved reg | **Remove** — allocator handles this | - -Keep: `kind`, `vreg`, `offset`, `size`, `alignment` — these are fundamental to stack layout. - -### 5.4: Remove VT_LLOCAL handling from backend - -**Action:** Search `arm-thumb-gen.c` for `is_llocal` or `VT_LLOCAL` references. With `MachineOperand`, the double-indirection case is expressed as `MACH_OP_SPILL` with `needs_deref=true` — there's no separate code path. - -### 5.5: Consolidate operand headers - -**Current state:** There are two near-duplicate operand headers: -- `tccir_operand.h` (567 lines, 17-bit position) -- `ir/operand.h` (539 lines, 18-bit position) - -**Action:** Eliminate the older `tccir_operand.h` and keep only `ir/operand.h`. Update all `#include "tccir_operand.h"` to `#include "ir/operand.h"`. - -This is a maintenance hazard flagged during review — fixing it here prevents future bugs from edits to the wrong copy. - -## Implementation Steps - -### Step 5.1: Audit field usage - -```bash -# Verify these fields are only read during codegen (now deleted): -grep -rn 'pr0_spilled\|pr1_spilled' --include='*.c' --include='*.h' | grep -v 'ir/mat.c\|ir/codegen.c' -grep -rn 'is_llocal' --include='*.c' --include='*.h' | grep -v 'ir/mat.c\|ir/codegen.c' -grep -rn 'is_local' --include='*.c' --include='*.h' | grep -v 'ir/mat.c\|ir/codegen.c' -``` - -Any unexpected callers need investigation before removal. - -### Step 5.2: Remove fields from `IROperand` - -Edit `ir/operand.h` to remove `pr0_spilled`, `pr1_spilled`, `is_local`, `is_llocal`, `is_param` bitfields. - -**Note:** This changes `IROperand` layout. Since it's `__attribute__((packed))` at 10 bytes, removing 5 bits saves space and may improve cache behavior during IR passes. 
- -### Step 5.3: Remove `TCCMaterializedValue`/`Addr`/`Dest` structs - -Edit `tccir.h` to delete these struct definitions and any function declarations that reference them. - -### Step 5.4: Simplify `TCCStackSlot` - -Edit `tccir.h` or `ir/stack.h` to remove `addressable` and `live_across_calls` fields. - -### Step 5.5: Consolidate operand headers - -1. Diff `tccir_operand.h` vs `ir/operand.h` to identify differences -2. Ensure `ir/operand.h` is the superset -3. Replace all `#include "tccir_operand.h"` with `#include "ir/operand.h"` -4. Delete `tccir_operand.h` - -### Step 5.6: Compile and test - -```bash -make clean && make cross -j16 -make test -j16 -make test-gcc-torture-compile -``` - -## Expected Impact - -| Metric | Change | -|---|---| -| `IROperand` size | 10 bytes → ~9 bytes (5 bits freed) | -| Struct types deleted | 6 (3 legacy + 3 new wrapper) | -| `TCCStackSlot` fields | 2 removed | -| Duplicate headers | Consolidated (`tccir_operand.h` deleted) | -| Dead code | All VT_LLOCAL-specific code paths removed | - -## Current State (After `0e772abb`) - -### Done -- Dead `TCCStackSlot` fields removed (`addressable`, `live_across_calls` — these were never set meaningfully after Phase 0) -- `ir/operand.c`, `ir/operand.h`, `ir/mat.c` deleted (Phase 4) - -### Remaining: IROperand codegen-time flags - -The `fill_registers_ir` function is now deleted from the production path (behind `#ifdef TCC_REGALLOC_DEBUG`). `machine_op_from_ir` reads the interval table directly. 
However, the `pr0_reg`/`pr1_reg` fields remain in `IROperand` because legacy `_ir` functions still read/write them: - -| Field | Who sets it | Who reads it | Status | -|-------|------------|--------------|--------| -| `pr0_reg` / `pr1_reg` (5 bits each) | `svalue_to_iroperand()`, `irop_copy_svalue_info()`, `asm_gen_code()` | `load_to_dest_ir()` (~38 reads), `store_ex_ir()` (~10 reads), `th_store_resolve_base_ir()` (2 reads) | **Blocked:** legacy `_ir` functions + inline asm | -| `_reserved0` / `_reserved1` (1 bit each) | (unused) | (unused) | **Free** — formerly `pr0_spilled`/`pr1_spilled` (Phase 5l) | -| `is_llocal` | IR construction (`tccgen.c`) | `machine_op_from_ir()` for `needs_deref`; `tccopt.c` | **IR-semantic** — stays | -| `is_local` | IR construction (`tccgen.c`) | `machine_op_from_ir()`; `tccopt.c`; backend helpers | **IR-semantic** — stays | -| `is_param` | IR construction (`tccgen.c`) | `machine_op_from_ir()` | **IR-semantic** — stays | - -**Key insight:** `is_local`, `is_llocal`, and `is_param` are IR-semantic — set during IR construction, read during codegen. They do NOT need to be removed. Only `pr0_reg`/`pr1_reg` are pure codegen-time state that should be eliminated. - -**Remaining steps for full `pr0_reg`/`pr1_reg` removal:** -1. Convert `asm_gen_code` in `arm-thumb-asm.c` (6 writes) to use `MachineOperand` or read intervals directly -2. Convert `load_to_dest_ir`, `store_ex_ir`, `th_store_resolve_base_ir` in `arm-thumb-gen.c` (~50 reads, 3 writes) to use `MachineOperand` equivalents -3. Remove `pr0_reg : 5` and `pr1_reg : 5` from `IROperand` struct in `tccir_operand.h` -4. Also remove `_reserved0 : 1` and `_reserved1 : 1` (freed from Phase 5l) -5. Update `IROP_NONE` macro and `irop_init_phys_regs()` in `tccir_operand.h` -6. Update `svalue_to_iroperand()`, `iroperand_to_svalue()`, `irop_copy_svalue_info()` in `tccir_operand.c` -7. 
Verify `sizeof(IROperand)` — expected: 9 bytes, down from 10 - -### Remaining: `tccir_operand.h` deduplication - -Two near-identical operand headers still exist: -- `tccir_operand.h` (root, 17-bit position encoding) -- `tccir_operand.c` (root, companion) - -The `ir/` subdirectory no longer has `ir/operand.h` (deleted in Phase 4). The deduplication goal was to eliminate one copy, but since only `tccir_operand.h` remains, this is now moot — the duplication is gone. No further action needed on this item. - -## Verification Checklist - -- [x] Dead `TCCStackSlot` fields removed (`addressable`, `live_across_calls`) -- [x] `ir/mat.c`, `ir/operand.c`, `ir/operand.h` deleted -- [x] Unconditional dispatch-loop fills removed (Phase 5b) -- [x] `machine_op_from_ir` fills `IROperand *op` in-place (Phase 5b) -- [x] `ir_fill_op` at all old-path `_op` sites, dry-run and real-run (Phase 5b) -- [x] Debug trace blocks use pre-filled local copies (Phase 5b) -- [x] `ir_fill_op` removed from JUMP/JUMPIF dispatch (Phase 5c) — those ops only - read `irop_get_imm32(dest)` / `src.u.imm32` (raw immediates, never written - by `fill_registers_ir`); removing the fills is a pure elimination -- [x] SWITCH_TABLE converted to MOP via `tcc_gen_machine_switch_table_mop` (Phase 5c) - — reads only one register (`mach_ensure_in_reg`), no pr0_reg direct access -- [x] SETIF 64-bit pair dest supported in `tcc_gen_machine_setif_mop` (Phase 5c) - — `!irop_needs_pair(dest_ir)` guard removed; handler splits dest via - `mach_make_lo/hi_half`, emits `MOV lo, #0; IT cond; MOV lo, #1; MOV hi, #0` -- [x] MLA converted to MOP via `tcc_gen_machine_mla_mop` (Phase 5c) - — 4-operand MOP: src1, src2, dest, accum all via `mach_ensure_in_reg`; - accumulator read from `ir->iroperand_pool[operand_base+3]` converted with - `machine_op_from_ir`; single `th_mla` instruction; no fallback path needed -- [x] UMULL converted to MOP via `tcc_gen_machine_umull_mop` (Phase 5c) - — 64-bit dest split via `mach_make_lo/hi_half`;
src1/src2 loaded via - `mach_ensure_in_reg`; single `th_umull` instruction -- [x] `!irop_needs_pair` guard removed for BOOL (Phase 5c) — 64-bit pair sources - handled via lo/hi ORR reduction to single nonzero test value -- [x] `!irop_needs_pair` guard removed for LOAD (Phase 5c) — 64-bit pair sources/dests - handled; bug fix: MACH_OP_REG non-deref case now copies hi-half (`src.u.reg.r1 → dest_r1`) -- [x] `!irop_needs_pair` guard removed for FUNC_CALL dest (Phase 5c) — 64-bit pair return - values handled via `handle_return_value_mop` (R0+R1 writeback); `is_complex` guard retained -- [x] Bug fix: dest/scratch register overlap in `thumb_emit_data_processing_mop64` and - `thumb_emit_shift64_mop` — dest pair determined BEFORE `mach_resolve_deref_64`; - src register operands pre-excluded from scratch pool -- [x] Bug fix: PARAM_STACK double-indirection in `mach_resolve_deref_64` — added early return - for `MACH_OP_PARAM_STACK` with `needs_deref=false` -- [x] `!irop_needs_pair` guard removed for MUL (Phase 5c) — 64-bit pair supported via - `thumb_emit_mul64_mop`: UMULL for lo 64-bit product, MLA for cross-product hi bits; - 32-bit result from 64-bit source falls back to plain MUL of lo halves -- [x] `!irop_needs_pair` + `!irop_is_64bit` guards removed for TEST_ZERO (Phase 5c) — - 64-bit src handled via `mach_resolve_deref_64` + `CMP lo,#0 / IT EQ / CMP hi,#0` -- [x] `!irop_needs_pair` guard removed for DIV/UDIV/IMOD/UMOD (Phase 5c) — these are - dead guards: `tccgen.c` lowers 64-bit integer division to `__divdi3` / `__udivdi3` / - `__moddi3` / `__umoddi3` FUNCCALL IR before the backend; no 64-bit TCCIR_OP_DIV ever - reaches `tcc_gen_machine_muldiv_mop` in practice -- [x] `make test -j16` passes — 3310 passed, 0 failed (all tests) -- [x] FP double-precision `!irop_needs_pair` guards removed (Phase 5c) — `tcc_gen_machine_fp_mop` - extended with `fp_mop_load_double_arg`, `fp_mop_do_bl`, `fp_mop_writeback_result` helpers; - all FADD/FSUB/FMUL/FDIV/FNEG/FCMP/CVT_* opcodes 
handle `is_double=true` via - `__aeabi_dadd`, `__aeabi_dsub`, etc.; `!irop_needs_pair` guards removed from both - dispatch loops -- [x] `!ir->has_static_chain` guards removed from MOP dispatch (44 occurrences, Phase 5c) — - new `MACH_OP_CHAIN_REL` operand kind added (`ir/machine_op.h`, `ir/machine_op.c`); - captured variables detected in `machine_op_from_ir` via `captured_offsets_list` scan; - handled in `mach_ensure_in_reg`, `mach_writeback_dest`, `fp_mop_load_arg`, - `mach_make_hi_half`, `load_mop`, `store_mop` (32-bit and 64-bit branches) -- [x] LEA converted to MOP path (was already on MOP path in both dispatch loops) -- [x] Dead old-path `else` branches removed (Phase 5d) — 14 unreachable fallbacks - deleted from both dry-run and real-run dispatch loops; 17 unconditionally-true - `use_mop_*` flag variables eliminated; only `use_mop_fp` and `use_mop_func_call` - remain (conditional on `is_complex`); `ir/codegen.c` reduced by 440 lines - (3149 → 2709); LOAD/ASSIGN/LOAD_INDEXED `*_before_ret` peephole conditions - simplified to just the `before_ret` guard -- [x] `*_before_ret` peephole converted to MOP path (Phase 5e) — LOAD, LOAD_INDEXED, - ASSIGN `before_ret` branches now construct synthetic `MACH_OP_REG(R0/R1)` dest - and patch interval allocation instead of falling back to old `_op` path; - 6 old-path call sites eliminated from both dispatch loops; `ir/codegen.c` - 2711 lines (net +2 from new peephole logic, −730 from old-path removal) -- [x] `machine_op_from_ir` decoupled from `fill_registers_ir` (Phase 5f) — function - reads interval table directly, `const IROperand *` signature (no mutation); - `mop_fixup_subcomponent()` helper for LOAD/STORE sub-component access; - LOAD/STORE dispatch guards `mop_src.kind != MACH_OP_NONE` to fall back to - old `_op` path for operands with tag=VREG, vreg=-1 (unfilled) -- [x] FUNCCALL `func_target` converted to MachineOperand (Phase 5g) — - `tcc_gen_machine_func_call_mop` signature changed from `IROperand func_target` - to 
`MachineOperand func_mop`; pre-save logic rewritten to use `func_mop.kind`, - `func_mop.u.reg.r0`, `func_mop.needs_deref` instead of `pr0_reg`/`is_lval`; - new `gcall_or_jump_mop()` function handles MACH_OP_SYMBOL (direct BL), - MACH_OP_IMM (relative), and indirect calls via `mach_ensure_in_reg`; - `ir/codegen.c` call sites use `machine_op_from_ir(ir, &src1_ir)` for func_target, - eliminating `ir_fill_op` for both `src1_ir` and `src2_ir` on MOP path; - all 3310 tests pass -- [x] LOAD spilled-dest support (Phase 5h) — `tcc_gen_machine_load_mop` rewritten - to accept any dest kind (MACH_OP_REG, MACH_OP_SPILL, MACH_OP_PARAM_STACK) - using `mach_get_dest_reg` + `mach_writeback_dest` pattern; 64-bit spilled dest - handled via `mach_make_hi_half` + separate writeback; LOAD dispatch condition - widened from `mop_dest.kind == MACH_OP_REG` to `mop_dest.kind != MACH_OP_NONE` - in both dry-run and real-run loops; eliminates all LOAD fallbacks observed in - test suite (8 test files previously triggered spilled-dest fallback); - all 3310 tests pass -- [x] LOAD/STORE `MACH_OP_NONE` fallback converted to `tcc_error` (Phase 5i) — zero tests - triggered the fallback; converting to a compiler error proves the old `_op` path is - dead for LOAD/STORE; `ir/codegen.c` simplified by removing 4 fallback branches -- [x] Dead `_op` backend functions removed (Phase 5j) — ~2400 lines deleted from - `arm-thumb-gen.c`: `tcc_gen_machine_data_processing_op`, `tcc_gen_machine_assign_op`, - `tcc_gen_machine_load_op`, `tcc_gen_machine_fp_op`, `tcc_gen_machine_func_call_op`, - `tcc_gen_machine_return_value_op`, and supporting helpers (`fill_register_arg`, - `tcc_gen_machine_func_start_op`, `tcc_gen_machine_func_jump_op`); VREG/-1 edge case - handled in `machine_op_from_ir` (pre-assigned physical reg); FPU_NONE compile guard - added for `tcc_gen_machine_fp_mop` -- [x] Callsite arg-handling converted to MOP (Phase 5k) — `fill_arg_from_machine_op` bridge - function deleted (~90 lines); 
`thumb_build_call_layout_from_ir` updated with - `MachineOperand **out_mops` 7th parameter; `build_reg_move_64bit/32bit` and - `place_stack_arg_64bit/32bit` rewritten to take `MachineOperand *mop` instead of - `IROperand *arg`; `THUMB_ARG_MOVE_LVAL` enum variant removed (replaced by - `THUMB_ARG_MOVE_MOP` with needs_deref); `tcc_gen_machine_fp_mop` signature extended - with `int is_complex` param; `is_complex` guards removed from FP/FUNCCALL dispatch - in `ir/codegen.c` (both dry-run and real-run); `tcc_ir_fill_registers_ir` and - `ir_fill_op` wrapped in `#ifdef TCC_REGALLOC_DEBUG` (no longer called in production) -- [x] Bug fix: ARM_R12 base clobber in `place_stack_arg_64bit` (Phase 5k) — when placing - a 64-bit needs_deref operand on stack, `mach_ensure_in_reg` returned ARM_R12 as base, - then `load_from_base_ir(ARM_R12, ..., ARM_R12)` clobbered the pointer before hi-half - load; fixed by excluding `(1u << ARM_R12)` from base allocation -- [x] Bug fix: PARAM_STACK double-indirection (Phase 5k) — `needs_deref=true` on - PARAM_STACK operands (from `interval->is_lvalue`) was incorrectly treated as - pointer-to-follow; PARAM_STACK always contains the value directly in the caller's - argument area; fixed by excluding `MACH_OP_PARAM_STACK` from the `needs_deref` - path in both `place_stack_arg_64bit` and `THUMB_ARG_MOVE_MOP` handler -- [x] `pr0_spilled`/`pr1_spilled` removed from `IROperand` (Phase 5l) — replaced with - `_reserved0`/`_reserved1` to maintain 10-byte packed layout; all `.pr0_spilled` / - `.pr1_spilled` reads/writes removed from `arm-thumb-gen.c`, `ir/codegen.c`, - `tccir_operand.c`, `arm-thumb-asm.c`; 2 bits freed in packed struct -- [x] `fill_registers_ir` + `ir_fill_op` deleted from production (Phase 5m) — ~256 lines - removed from `ir/codegen.c`: function body, wrapper, `_dbg_trace_all` variable + - matching block, main debug trace block; declaration removed from `tccir.h`; - `#ifdef TCC_REGALLOC_DEBUG` vreg stats + `[RA-PEEPHOLE]` trace kept 
(independent) -- [x] 10 dead `_op` declarations + bodies removed (Phase 5n) — ~700 lines from - `arm-thumb-gen.c`: `load_indexed_op`, `store_indexed_op`, `load_postinc_op`, - `store_postinc_op`, `indirect_jump_op`, `switch_table_op`, `setif_op`, `bool_op`, - `func_parameter_op`, `vla_op`; 10 declarations from `tcc.h`; 2 dead static helpers - (`thumb_irop_has_immediate_value`, `thumb_irop_needs_value_load`) also removed -- [x] Last 3 `_op` handlers converted to `_mop` (Phase 5o) — `jump_op` → `jump_mop`, - `conditional_jump_op` → `conditional_jump_mop`, `trap_op` → `trap_mop`; dispatch - loop now 100% MOP; 5 call sites updated in dry-run + real-run loops -- [x] `machine_op_from_ir` vreg=-1 path decoupled from `pr0_reg` (Phase 5p partial) — - `IROP_VREG_PHYS_VALID` (0x100) + `IROP_VREG_PHYS_MASK` (0x1F) encoding in `u.imm32` - for IROP_TAG_VREG operands with vreg=-1; `svalue_to_iroperand()` Case 1b encodes - pinned physical register; `machine_op_from_ir()` reads `u.imm32` instead of `pr0_reg`; - Case 1 (vr >= 0) must NOT set `u.imm32` (breaks complex imaginary part access); - GCC torture test 20030222-1 fixed (inline asm 64→32 constraint load) -- [x] `pr0_reg`/`pr1_reg` removed from `IROperand` — blocked by ~50 reads in `arm-thumb-gen.c` - legacy `_ir` functions and 6 writes in `arm-thumb-asm.c` — **RESOLVED (Phase 5q):** all legacy - `_ir` functions deleted; inline asm path converted to `tcc_gen_mach_load_to_reg`/`tcc_gen_mach_store_from_reg` -- [x] `_reserved0`/`_reserved1` removed from `IROperand` — removed along with `pr0_reg`/`pr1_reg` in Phase 5p - -## Phase 5a: Failed Attempt — Internalize Fill in `machine_op_from_ir` - -### What was tried - -Added `fill_registers_ir` call inside `machine_op_from_ir` so it would be self-contained: - -```c -MachineOperand machine_op_from_ir(TCCIRState *ir, const IROperand *op) -{ - IROperand filled = *op; - tcc_ir_fill_registers_ir(ir, &filled); - op = &filled; - // ... 
rest of conversion -} -``` - -### Why it failed (30 test failures) - -`fill_registers_ir` is **NOT idempotent**. For `IROP_TAG_STACKOFF` operands, it applies: -```c -delta = old_stackoff - interval->original_offset; -op->u.imm32 += delta; -``` - -The dispatch loop already calls `fill_registers_ir` unconditionally at lines 1382–1386 (dry-run) and 2091–2095 (real-run) **before** `machine_op_from_ir` is called. Adding fill inside `machine_op_from_ir` = double-fill → delta applied twice → corrupted stack offsets → 30 GCC torture test failures. - -The sub-component access logic (pr1_reg remap for `__imag__`) was also moved into `machine_op_from_ir` during this attempt but had to be reverted — old-path 64-bit pair operands can also have `pr1_reg != NONE && u.imm32 != 0` from fill's delta calculation, which is not an `__imag__` sub-component. - -### Lesson - -Cannot add fill inside `machine_op_from_ir` without simultaneously removing all dispatch-level fills. - -## Phase 5b: Correct Approach — Coordinated Fill Removal - -Must be done as a **single coordinated change**: - -### Step 1: Remove dispatch-level fills - -Remove the 6 unconditional `tcc_ir_fill_registers_ir()` calls from the dispatch loop: -- Dry-run: lines 1382–1386 (src1, src2, dest) -- Real-run: lines 2091–2095 (src1, src2, dest) - -### Step 2: Add fill inside `machine_op_from_ir` - -Now safe because it’s the only fill — no double-application. 
- -### Step 3: Add targeted fills at old-path `_op` call sites - -For all ops that bypass the MOP path and still need filled IROperands: -- `tcc_gen_machine_data_processing_op` (64-bit pair fallback) -- `tcc_gen_machine_assign_op` (64-bit pair fallback) -- `tcc_gen_machine_func_call_op` (64-bit/complex/static-chain fallback) -- `tcc_gen_machine_load_op` / `store_op` (64-bit pair fallback) -- `tcc_gen_machine_return_value_op` (64-bit fallback) -- `tcc_gen_machine_fp_op` (double/complex fallback) -- `tcc_gen_machine_lea_op`, `jump_op`, `conditional_jump_op` (always old-path) -- All remaining old-path ops - -### Step 4: Handle LOAD/STORE sub-component fixup - -The `__imag__` pr1_reg remap (lines 1535–1555 in codegen.c) must either: -- Be computed from the raw (unfilled) operand before fill, or -- Be passed as a flag to `machine_op_from_ir` (e.g., `machine_op_from_ir_for_load()`) - -### Step 5: Handle debug traces - -The `_dbg_trace_all` and `TCC_MACH_DBG` blocks read filled operand fields (`pr0_reg`, `is_lval`, etc.). These need fill before trace, or the trace format needs updating. - -### Risk - -This is a wide-reaching change touching every old-path dispatch site. Must be done with extreme care and tested against the full GCC torture suite (3310 tests). - -## Phase 5d: Dead Old-Path Fallback Removal (COMPLETED) - -### What was done - -Removed 14 dead (unreachable) `else` branches from both the dry-run and real-run -dispatch loops in `ir/codegen.c`. These branches unconditionally used the MOP path -(their `use_mop_*` flag was always `true`) but still carried dead fallback code for -the old `_op` path. 
- -### Ops cleaned up (14 dead sites × 2 loops = 28 branches removed) - -| Op | Old flag (always true) | -|----|----------------------| -| STORE | `use_mop_store` | -| STORE_INDEXED | `use_mop_store_indexed` | -| LOAD_POSTINC | `use_mop_load_postinc` | -| STORE_POSTINC | `use_mop_store_postinc` | -| RETURNVALUE | `use_mop_ret` | -| MUL, DIV, TEST_ZERO | `use_mop_mul` | -| MLA | `use_mop_mla` | -| UMULL | `use_mop_umull` | -| DP (data processing) | `use_mop_dp` | -| IJUMP | `use_mop_ijump` | -| SETIF | `use_mop_setif` | -| BOOL | `use_mop_bool` | -| FUNCPARAM | `use_mop_func_param` | -| VLA | `use_mop_vla` | - -### Additional simplifications - -- **LOAD/ASSIGN/LOAD_INDEXED**: Removed always-true `use_mop_*` part of conditions, - kept the `*_before_ret` peephole guards (these are runtime-variable). -- **17 `use_mop_*` flag variables deleted** along with their corresponding - `switch` case assignments in both loops. -- Only **`use_mop_fp`** and **`use_mop_func_call`** remain — both are conditional - on `!is_complex` and guard the FP/FUNCCALL old-path fallbacks needed for - `_Complex` type support. - -### Results - -- `ir/codegen.c`: 3149 → 2709 lines (**−440 lines**, −14%) -- All IR tests pass -- Build clean with `-Werror` - -## Phase 5e: Convert `before_ret` Peephole to MOP Path (COMPLETED) - -### What was done - -The LOAD, LOAD_INDEXED, and ASSIGN ops each had a `*_before_ret` peephole: -when the instruction immediately precedes RETURNVALUE on the same vreg, the -old-path `_op` handler was called so it could write directly to R0. This was -the last non-complex reason these three ops fell back to the old dispatch path. - -Phase 5e converts these peephole branches to use the MOP path instead: - -1. **Patch interval allocation** — when `before_ret` is detected, the dest - vreg's `IRLiveInterval` allocation is patched to `R0` (and `R1` for 64-bit), - so subsequent MOP handlers see the return register as the physical allocation. - -2. 
**Synthetic MOP dest** — instead of calling `machine_op_from_ir(dest)`, - construct `(MachineOperand){.kind = MACH_OP_REG, .u.reg.r0 = REG_IRET, ...}` - directly. This ensures the load/assign writes straight to R0 without a - later MOV in RETURNVALUE. - -### Sites converted (6 old-path call sites × 2 loops = 12 removed) - -| Op | Dry-run | Real-run | -|----|---------|----------| -| LOAD | `tcc_gen_machine_load_op` → MOP with R0 dest | same | -| LOAD_INDEXED | `tcc_gen_machine_load_op` → MOP with R0 dest | same | -| ASSIGN | `tcc_gen_machine_assign_op` → MOP with R0 dest | same | - -### Results - -- `ir/codegen.c`: 2711 lines (net +2 from new peephole logic, −730 lines from old-path removal) -- Only `is_complex` FP/FUNCCALL guards remain as old-path dispatch -- All IR tests pass -- Build clean with `-Werror` - -## Phase 5f: Decouple `machine_op_from_ir` from `fill_registers_ir` (COMPLETED) - -### What was done - -Rewrote `machine_op_from_ir` in `ir/machine_op.c` to read the register-allocation -interval table directly instead of calling `tcc_ir_fill_registers_ir()`. The function -no longer mutates the `IROperand` — its signature changed to `const IROperand *op`. - -### Key changes - -1. **`ir/machine_op.c`**: Complete rewrite of `machine_op_from_ir`: - - Reads `IRLiveInterval` directly for register/spill/offset info - - 5 sections: (1) IMM constants, (2) SYMREF symbols, (3) concrete stack slots - (vreg < 0, is_local/is_llocal/tag=STACKOFF), (4) allocated operands via interval, - (5) MACH_OP_NONE fallback - - Handles unallocated vregs (`PREG_NONE, offset=0`) as spills - - Sub-component offset delta computed inline (replaces fill's `old_stackoff - original_offset`) - -2. **`ir/machine_op.h`**: Signature updated to `const IROperand *op` - -3. **`ir/codegen.c`**: New `mop_fixup_subcomponent()` helper for LOAD/STORE - sub-component access (e.g., `__imag__` on `_Complex float`). Previously this - was done by reading `pr1_reg`/`u.imm32` from the filled operand. - -4. 
**LOAD/STORE dispatch guards**: Both dry-run and real-run LOAD/STORE checks - now verify `mop_src.kind != MACH_OP_NONE` (LOAD) or both operands (STORE) - before entering the MOP path. Operands with tag=VREG, vreg=-1 (unfilled - temporaries) produce MACH_OP_NONE and fall back to the old `_op` path with - explicit `ir_fill_op` calls. - -### Bug found and fixed - -Operands with `tag=IROP_TAG_VREG, vreg=-1` (negative vreg sentinel encoding, not the -same as `IROP_NONE`) are not tracked by the interval table. The old code handled -them via `fill_registers_ir` which left them unchanged, and the old `machine_op_from_ir` -would produce a valid result via tag-based dispatch. The new code returns -`MACH_OP_NONE` for these, and the dispatch loop falls back to the old `_op` path. - -Section 3 was also broadened to catch `tag=IROP_TAG_STACKOFF` operands with vreg < 0 -even without `is_local`/`is_llocal` flags (raw stack offset references from struct -temporaries). - -### Results - -- `ir/machine_op.c`: `machine_op_from_ir` is now a pure query (no mutation) -- `fill_registers_ir` only called at old-path fallback sites (FP complex, - FUNCCALL complex, and MACH_OP_NONE fallback for LOAD/STORE) -- `ir/codegen.c`: ~2732 lines -- All 3310 IR tests pass, 156 asm tests pass -- Build clean with `-Werror` - -## Phase 5i: LOAD/STORE MACH_OP_NONE Fallback → tcc_error (COMPLETED) - -### What was done - -Converted the LOAD/STORE `MACH_OP_NONE` fallback branches from old `_op` path -calls to `tcc_error("compiler_error: ...")`. Zero tests in the full suite (3310 IR + -GCC torture + ASM) ever triggered these fallbacks, proving the old `_op` path is -dead for LOAD and STORE operations.
- -### Impact - -- 4 fallback branches removed from `ir/codegen.c` (2 dry-run + 2 real-run) -- Simplifies future cleanup: any regression that hits these paths will be caught - at compile time with a clear error message instead of silently using stale code - -## Phase 5j: Dead `_op` Backend Function Removal (COMPLETED) - -### What was done - -Removed ~2400 lines of dead `_op` backend functions from `arm-thumb-gen.c`. These -functions were the old IROperand-based handlers that have been fully replaced by -MOP-based handlers. With Phase 5i proving the fallbacks are unreachable, these -functions are dead code. - -### Functions deleted - -| Function | Lines | Role | -|----------|-------|------| -| `tcc_gen_machine_data_processing_op` | ~350 | Old DP handler (ADD/SUB/CMP/etc.) | -| `tcc_gen_machine_assign_op` | ~200 | Old ASSIGN handler | -| `tcc_gen_machine_load_op` | ~400 | Old LOAD handler | -| `tcc_gen_machine_fp_op` | ~300 | Old FP handler | -| `tcc_gen_machine_func_call_op` | ~500 | Old FUNCCALL handler | -| `tcc_gen_machine_return_value_op` | ~150 | Old RETURNVALUE handler | -| `fill_register_arg` | ~100 | Old fill helper | -| `tcc_gen_machine_func_start_op` | ~80 | Old func_start helper | -| `tcc_gen_machine_func_jump_op` | ~80 | Old func_jump helper | -| Various supporting helpers | ~240 | Old-path-only utilities | - -### Additional fixes - -- `machine_op_from_ir`: VREG/-1 with pre-assigned `pr0_reg` now correctly produces - `MACH_OP_REG` (previously fell through to `MACH_OP_NONE`) -- `tcc_gen_machine_fp_mop`: Added `#ifndef FPU_NONE` compile guard for builds - without FPU support - -### Results - -- `arm-thumb-gen.c`: reduced from ~11700 → ~9300 lines -- All `_op` function declarations removed from `tcc.h` -- All 3310 tests pass - -## Phase 5k: Callsite Arg-Handling MOP Conversion (COMPLETED) - -### What was done - -Converted the entire callsite argument placement pipeline from IROperand to -MachineOperand, eliminating the last bridge between the two 
representations. - -### Key changes - -1. **`fill_arg_from_machine_op` bridge deleted** (~90 lines): This function - reverse-engineered IROperand fields from MachineOperand to pass to the old - arg-handling functions. With native MOP support, it's no longer needed. - -2. **`thumb_build_call_layout_from_ir` updated**: New 7th parameter - `MachineOperand **out_mops` — returns the MOP array alongside the existing - IROperand pool for struct and complex args still on the old path. - -3. **Arg placement functions rewritten**: - - `build_reg_move_64bit(ThumbArgMove*, int, MachineOperand*, IROperand*, int, ...)` - - `build_reg_move_32bit(ThumbArgMove*, int, MachineOperand*, IROperand*, int, ...)` - - `place_stack_arg_64bit(MachineOperand*, int, TCCIRState*)` - - `place_stack_arg_32bit(MachineOperand*, int, CallGenContext*)` - -4. **`THUMB_ARG_MOVE_LVAL` removed**: Was a special enum variant for lval args. - `THUMB_ARG_MOVE_MOP` with `needs_deref=true` handles all dereference cases. - -5. **`tcc_gen_machine_fp_mop` signature extended**: Added `int is_complex` param - so the FP handler can dispatch to complex float operations (add/sub/mul/div) - directly. - -6. **`is_complex` guards removed from ir/codegen.c**: FP and FUNCCALL dispatch - in both dry-run and real-run loops now unconditionally use the MOP path. - Complex type handling is inside the MOP handlers themselves. - -7. **`fill_registers_ir` / `ir_fill_op` wrapped in `#ifdef TCC_REGALLOC_DEBUG`**: - No longer called in production builds. Only used for debug trace output. - -### Bug fixes - -**ARM_R12 base clobber in `place_stack_arg_64bit`:** When placing a 64-bit -`needs_deref` operand on the stack, `mach_ensure_in_reg` could return ARM_R12 -as the base register. 
The code then did: -``` -ldr ip, [base] ; ip = lo half VALUE (base clobbered if base==ip) -str ip, [sp, #0] -ldr ip, [base, #4] ; BUG: base was clobbered → HardFault -str ip, [sp, #4] -``` -Fixed by excluding `(1u << ARM_R12)` from the base register allocation mask. - -**PARAM_STACK double-indirection:** `needs_deref=true` on PARAM_STACK operands -(from `interval->is_lvalue`) was incorrectly interpreted as "dereference this -pointer". For PARAM_STACK, the 64-bit value IS directly in the caller's argument -area — `needs_deref` just means the param is addressable, not that it's a pointer. -The `needs_deref` path did double indirection: load value from stack, then use -that value as a pointer → HardFault or garbage data. Fixed by excluding -`MACH_OP_PARAM_STACK` from the `needs_deref` path in both `place_stack_arg_64bit` -and the `THUMB_ARG_MOVE_MOP` handler. - -### Results - -- `arm-thumb-callsite.c`: 322 lines (−29 from bridge deletion) -- `ir/codegen.c`: 2630 lines (−100 from guard removal) -- `arm-thumb-gen.c`: 9332 lines (net change from rewrite) -- `fill_registers_ir` no longer called in production code -- All 3310 tests pass, 79 skipped, 582 xfailed, 0 failures -## Phase 5l: Remove `pr0_spilled` / `pr1_spilled` from `IROperand` (COMPLETED) - -### What was done - -Replaced `pr0_spilled : 1` and `pr1_spilled : 1` with `_reserved0 : 1` and -`_reserved1 : 1` in `IROperand` struct (`tccir_operand.h`) to maintain 10-byte -packed layout. Removed all `.pr0_spilled` / `.pr1_spilled` writes/reads. 
- -### Files modified - -- `tccir_operand.h`: struct fields, `IROP_NONE` macro, `irop_init_phys_regs` -- `tccir_operand.c`: `irop_copy_svalue_info` (removed copy), `irop_to_svalue` - (set SValue fields to 0), removed spill comparisons from validation function -- `arm-thumb-gen.c`: `load_to_dest_ir`, `load_to_reg_ir` — simplified conditional - logic that checked spill flags (all live callers already passed 0) -- `ir/codegen.c`: removed writes in `fill_registers_ir` (debug-only), removed - `spill=%d` from debug trace format -- `arm-thumb-asm.c`: removed 6 spill-flag assignments in `asm_gen_code` - -### Results - -- 2 bits freed in packed struct (currently `_reserved0`/`_reserved1`) -- All 3310 tests pass, 79 skipped, 582 xfailed — no regressions - -## Phase 5m: Delete `fill_registers_ir` Entirely (COMPLETED) - -### What was deleted (~256 lines) - -- `tcc_ir_fill_registers_ir()` body (~157 lines) + header comment -- `ir_fill_op()` wrapper (~8 lines) -- `_dbg_trace_all` variable + function name matching block (~25 lines) -- Main debug trace block calling `ir_fill_op` for `trc_s1/s2/d` (~60 lines) -- Declaration + comment (6 lines) from `tccir.h` -- Stale comments referencing `fill_registers_ir` / `ir_fill_op` - -### Files modified - -- `ir/codegen.c`, `tccir.h` - -**Note:** The `#ifdef TCC_REGALLOC_DEBUG` vreg statistics block and `[RA-PEEPHOLE]` -trace were kept — they don't depend on `fill_registers_ir`. 
- -### Results - -- All 3310 tests pass, 79 skipped, 582 xfailed — no regressions -- Clean build with `CFLAGS+='-DTCC_REGALLOC_DEBUG'` - -## Phase 5n: Delete Dead `_op` Declarations and Bodies (COMPLETED) - -### What was deleted (~700 lines) - -10 dead `_op` function bodies from `arm-thumb-gen.c` + 10 declarations from `tcc.h`: - -| Function | File | -|----------|------| -| `tcc_gen_machine_load_indexed_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_store_indexed_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_load_postinc_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_store_postinc_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_indirect_jump_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_switch_table_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_setif_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_bool_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_func_parameter_op` | tcc.h + arm-thumb-gen.c | -| `tcc_gen_machine_vla_op` | tcc.h + arm-thumb-gen.c | - -Also deleted 2 now-unused static helpers: `thumb_irop_has_immediate_value`, -`thumb_irop_needs_value_load`. 
- -### Results - -- `arm-thumb-gen.c`: −700 lines -- All 3310 tests pass — no regressions - -## Phase 5o: Convert Control-Flow `_op` Handlers to `_mop` (COMPLETED) - -### What was done - -Converted the last 3 `_op` handlers to `_mop` so the dispatch loop is 100% MOP: - -| Old | New | Change | -|---|---|---| -| `tcc_gen_machine_jump_op(TccIrOp, IROperand, int)` | `tcc_gen_machine_jump_mop(TccIrOp, int32_t, int)` | Extract `irop_get_imm32(dest)` at call site | -| `tcc_gen_machine_conditional_jump_op(IROperand, TccIrOp, IROperand, int)` | `tcc_gen_machine_conditional_jump_mop(int32_t, TccIrOp, int32_t, int)` | Extract raw scalars at call site | -| `tcc_gen_machine_trap_op(void)` | `tcc_gen_machine_trap_mop(void)` | Rename only | - -### Files changed - -- `tcc.h` (declarations), `arm-thumb-gen.c` (bodies), `ir/codegen.c` (5 call sites) - -### Results - -- All backend dispatch now uses `_mop` variants or extracted scalars -- No `IROperand` passed to any backend handler -- All 3310 tests pass — no regressions - -## Phase 5p: Decouple `machine_op_from_ir` from `pr0_reg` (COMPLETED) - -### What was done - -The `machine_op_from_ir()` dispatch path for vreg=-1 operands was reading -`op->pr0_reg` to determine which physical register to use. This was decoupled -via an encoding in `u.imm32`: - -1. Defined `IROP_VREG_PHYS_VALID` (0x100) and `IROP_VREG_PHYS_MASK` (0x1F) - in `tccir_operand.h` - -2. `svalue_to_iroperand()` Case 1b (vreg=-1): now sets - `result.u.imm32 = IROP_VREG_PHYS_VALID | (val_kind & IROP_VREG_PHYS_MASK)` - -3. `machine_op_from_ir()` vreg=-1 path: reads `op->u.imm32` instead of `op->pr0_reg` - -### Important constraint - -Case 1 (vr >= 0) must **NOT** set `u.imm32` — the legacy `load_to_dest_ir()` (now deleted in Phase 5q) -used `u.imm32 != 0` on VREG operands for sub-component access (complex imaginary part). -This constraint was validated during Phase 5p: setting it caused GCC torture test 20030222-1 to fail. 
- -### What remains - -**✅ All resolved (Phase 5q).** The following functions that read `pr0_reg`/`pr1_reg` have all been deleted: - -| Function | File | Status | -|---|---|---| -| `load_to_dest_ir` | `arm-thumb-gen.c` | ✅ Deleted (Phase 5q) | -| `store_ex_ir` | `arm-thumb-gen.c` | ✅ Deleted (Phase 5q) | -| `th_store_resolve_base_ir` | `arm-thumb-gen.c` | ✅ Deleted (Phase 5q) | -| `load_to_reg_ir` | `arm-thumb-gen.c` | ✅ Deleted (Phase 5q) | -| `irop_phys_r0` / `irop_phys_r1` | `arm-thumb-gen.c` | ✅ Deleted (Phase 5q) | -| `asm_gen_code` | `arm-thumb-asm.c` | ✅ Converted to `tcc_gen_mach_load_to_reg`/`tcc_gen_mach_store_from_reg` (Phase 5q) | -| `svalue_to_iroperand` | `tccir_operand.c` | ✅ Updated (Phase 5p — no pr0/pr1) | -| `iroperand_to_svalue` | `tccir_operand.c` | ✅ Updated (Phase 5p) | -| `irop_copy_svalue_info` | `tccir_operand.c` | ✅ Updated (Phase 5p) | -| `tcc_ir_fill_registers` (SValue) | `ir/codegen.c` | ✅ Updated (Phase 5p) | -| Validation function | `tccir_operand.c` | ✅ Updated (Phase 5p) | - -The inline asm path now uses `tcc_gen_mach_load_to_reg` (rewritten in Phase 5q to load directly into dest register without scratch intermediary) and `tcc_gen_mach_store_from_reg` (delegates to `mach_writeback_dest`). No `pr0_reg`/`pr1_reg` references remain in the codebase. - -### Results - -- `machine_op_from_ir` fully decoupled from `pr0_reg` -- 3 GCC torture tests confirmed working (pr41239, pr46309, pr58831) -- All 3310 tests pass — no regressions \ No newline at end of file diff --git a/docs/materialization/07_phase6_consolidate_dispatch.md b/docs/materialization/07_phase6_consolidate_dispatch.md deleted file mode 100644 index 4083bab5..00000000 --- a/docs/materialization/07_phase6_consolidate_dispatch.md +++ /dev/null @@ -1,84 +0,0 @@ -# Phase 6: Consolidate Dispatch Loops - -> **Status: ✅ Done** — All sub-steps (6a–6d) completed. `ir/codegen.c` reduced from 2106→1767 lines. All 3310 tests passing. 
- -## Goal - -Merge the dry-run and real-run dispatch loops in `ir/codegen.c` into a single parameterised loop, eliminating structural duplication. - -## Result (2026-03-06) - -`ir/codegen.c` is 1767 lines with a single unified two-pass dispatch loop: - -| Section | Lines | Content | -|---------|-------|---------| -| Helper functions | 1–1080 | `tcc_ir_fill_registers` (SValue), `tcc_ir_register_allocation_params`, branch opt, stack layout, inline asm helper, scratch fixup | -| Extracted helpers | 1081–1146 | `ir_codegen_before_ret_peephole()`, `ir_codegen_record_scratch()`, `ir_codegen_check_scratch()`, `ir_codegen_track_scratch()` | -| `tcc_ir_codegen_generate()` | 1148–1275 | Entry, stack_size, arrays, has_incoming_jump | -| **Unified two-pass loop** | 1286–1690 | `for (pass=0; pass<2)` with single `switch (cq->op)`, `is_dry_run` guards for pass-specific logic | -| Cleanup | 1690–1767 | Gap-fill, backpatch jumps, epilogue, free arrays | - -Both passes call the same `_mop` backend handlers via `machine_op_from_ir()`. No `_op` functions remain. 
- -## Completed Implementation - -### Extracted Helper Functions (lines 1081–1146) - -| Helper | Lines | Purpose | -|--------|-------|---------| -| `ir_codegen_before_ret_peephole()` | ~35 | Checks LOAD/LOAD_INDEXED/ASSIGN before RETURNVALUE, patches allocation to R0 | -| `ir_codegen_record_scratch()` | ~4 | Records per-instruction scratch counts during dry-run | -| `ir_codegen_check_scratch()` | ~11 | Verifies real-run scratch counts match dry-run (under `TCC_LS_DEBUG`) | -| `ir_codegen_track_scratch()` | ~7 | Unified wrapper: dispatches to record (dry) or check (real) | - -### Pass-Specific Guards (`is_dry_run` / `!is_dry_run`) - -| Op/Section | Dry-run (`pass == 0`) | Real-run (`pass == 1`) | -|---|---|---| -| Loop preamble | `ir_to_code_mapping[i] = ind`, scratch flags reset, debug op tracking | Same + `orig_ir_to_code_mapping` update + `tcc_debug_line_num()` | -| Scratch tracking | `ir_codegen_record_scratch()` via `ir_codegen_track_scratch()` | `ir_codegen_check_scratch()` via `ir_codegen_track_scratch()` | -| SWITCH_TABLE | Arithmetic: `ind += 14 + num_entries*4` | `tcc_gen_machine_switch_table_mop()` handler | -| RETURNVOID | No-op (no epilogue jump) | `return_jump_addrs[n++] = ind; tcc_gen_machine_jump_mop(...)` | -| JUMP/JUMPIF | Handler call only | Handler + `ir_to_code_mapping[i]` encoding correction | -| INLINE_ASM | Skipped (assembler has side effects beyond `ot()`) | `tcc_ir_codegen_inline_asm_ir()` + `spill_cache_clear` | -| default | Silent break | Fatal error with cleanup | -| Pass init | `dry_run_init`, `branch_opt_init`, save state | Prologue emission, `tcc_debug_prolog_epilog` | -| Pass end | `dry_run_end`, branch analyze, LR check, scratch fixup, state restore | (loop simply ends) | - -### Shared Logic (executed in both passes) - -- Operand extraction: `tcc_ir_op_get_src1/src2/dest(ir, cq)` -- MachineOperand conversion: `machine_op_from_ir(ir, &src_ir)` -- `before_ret` peephole for LOAD/LOAD_INDEXED/ASSIGN -- `mop_fixup_subcomponent()` for 
LOAD/STORE -- All `_mop` handler calls (DP, MUL, LOAD, STORE, ASSIGN, FP, FUNCCALL, etc.) -- `tcc_gen_machine_end_instruction()` cleanup -- `tcc_ir_spill_cache_clear()` after branches, calls, switch tables - -## Results - -| Metric | Before | After | -|--------|--------|-------| -| `ir/codegen.c` lines | 2106 | 1767 | -| Dispatch switch statements | 2 | 1 | -| `before_ret` peephole copies | 6 | 1 (helper function) | -| Scratch tracking inline code | ~240 lines | ~25 lines (4 helpers) | -| Lines to add for new IR op | 2 cases | 1 case | -| Line reduction | — | −339 lines (~16%) | - -## Implementation Notes - -The actual implementation took a slightly different approach from the original plan: - -- **Steps 6a–6c were done first** (helper extraction, preamble normalization) as preparatory refactors. -- **Step 6d merged the loops directly** rather than first extracting into a separate `ir_codegen_dispatch_one()` function. The switch body stays inline in the main function — the dispatch context struct was unnecessary since all state is already in local variables. This kept the code simpler and avoided function pointer / struct indirection overhead. -- **RETURNVALUE→RETURNVOID fallthrough was preserved** in the merged version with an `if (!is_dry_run)` guard in RETURNVOID, rather than using an explicit flag. -- **`tcc_ir_spill_cache_clear()`** calls were normalized to run in both passes (safe no-op during dry-run since cache is cleared at start). 
- -## Test Verification - -All tests passing after each sub-step and after the final merge: -``` -3310 passed, 79 skipped, 582 xfailed, 0 failed -``` - diff --git a/docs/materialization/plan.md b/docs/materialization/plan.md deleted file mode 100644 index 200fb65e..00000000 --- a/docs/materialization/plan.md +++ /dev/null @@ -1,706 +0,0 @@ -# Materialization Refactor: Move from IR to Machine Backend - -## Current Status (as of 2026-03-06) - -| Phase | Status | Commit | -|-------|--------|--------| -| 0: SValue Elimination | ✅ Done | `e19755e6` | -| 1: MachineOperand type | ✅ Done — type + `machine_op_from_ir()` reads interval table directly; no `fill_registers_ir` dependency | unstaged (`ir/machine_op.c`) | -| 2: Backend materialization | ✅ Done — all ops on MOP path; `!irop_needs_pair` guards removed; 64-bit pair sources handled via `mach_resolve_deref_64`; RETURNVALUE supports 64-bit; 3 backend bugs fixed | unstaged | -| 3: Dry-run integration | ✅ Done — scratch conflict fixup + R_FP exclusion | `c2569883` | -| 4: Eliminate `ir/mat.c` | ✅ Done — `ir/mat.c`, `ir/operand.c`, `ir/operand.h` deleted | `bc43b639` | -| 5: Simplify Stack/Spill | ✅ Done — Phases 5b–5q ✅; all ops fully on MOP path; `fill_registers_ir` deleted; ~3100 lines dead `_op` functions+helpers deleted; callsite arg-handling on MOP; `is_complex` guards removed from FP/FUNCCALL dispatch; `pr0_spilled`/`pr1_spilled` removed from `IROperand`; 10 dead `_op` bodies removed; jump/cond_jump/trap converted to `_mop`; `pr0_reg`/`pr1_reg` fields removed from `IROperand` (10→9 bytes); all legacy `_ir` wrappers deleted (~560 lines); `tcc_gen_mach_load_to_reg` rewritten for direct-to-dest loading; inline asm path fully on MOP | unstaged | -| 6: Consolidate dispatch | ✅ Done — merged dry-run and real-run loops into single `for (pass = 0; pass < 2; pass++)` loop; extracted `ir_codegen_before_ret_peephole()`, `ir_codegen_record_scratch()`, `ir_codegen_check_scratch()`, `ir_codegen_track_scratch()` helpers;
`ir/codegen.c` reduced from 2106→1767 lines (−339 lines, ~16%) | unstaged | - -**Next:** All phases complete. Legacy `_ir` wrapper functions deleted (Phase 5q). All codegen paths use MachineOperand exclusively. Ready for new feature work. - -## Problem Statement - -The current materialization layer (`ir/mat.c`, `ir/codegen.c`) sits between the IR and the backend (`arm-thumb-gen.c`), creating a tangled intermediate abstraction: - -1. **Materialization duplicates backend logic.** `ir/mat.c` decides when to load spills, how to handle constants, when addresses are encodable, etc. But the backend *also* makes these decisions (via `load_to_reg_ir`, `get_scratch_reg_with_save`, `tcc_machine_can_encode_stack_offset`). The two layers constantly second-guess each other. - -2. **Register fill is fragile.** `ir/codegen.c:tcc_ir_fill_registers()` translates allocation results back into `SValue`/`IROperand` flags (`VT_LOCAL`, `VT_LLOCAL`, `VT_LVAL`, `VT_PARAM`, `pr0_spilled`). This encoding is the source of most materialization bugs — a misset flag causes double-dereferences, missing loads, or wrong offsets. - -3. **Scratch register allocation happens too late.** Materialization acquires scratch registers *during* code emission. This means the backend can't plan register usage across an instruction — it discovers conflicts as it emits. - -4. **Two operand representations.** `SValue` (legacy) and `IROperand` (compact IR) both need parallel materialization paths. Every fix must be applied twice. - -5. **VT_LLOCAL (double indirection) is a symptom.** The entire VT_LLOCAL mechanism exists because materialization can't express "this value is a spilled pointer that needs dereferencing" cleanly. With backend-driven materialization, the backend simply loads what it needs. - -## Proposed Architecture - -### Core Idea - -**Operate on virtual registers throughout IR and codegen. 
Let the backend decide how and when to materialize physical values.** - -``` -Current: - IR → fill_registers() → materialize_*() → emit instructions - [ir/codegen.c] [ir/mat.c] [arm-thumb-gen.c] - -Proposed: - IR → backend dry run → backend real run - [arm-thumb-gen.c] [arm-thumb-gen.c] - (plan allocations) (emit with known allocations) -``` - -### Key Principles - -1. **IR operands stay virtual.** No `fill_registers()` pass. Operands carry vreg IDs and allocation metadata (physical reg or spill offset) but no VT_LOCAL/VT_LVAL rewriting. - -2. **Backend owns materialization.** Each instruction handler in `arm-thumb-gen.c` knows exactly what it needs: "src1 in register", "src2 as immediate or register", "dest in register, store back if spilled". No generic IR-level guessing. - -3. **Dry run determines scratch needs.** A first pass over instructions (without emitting) records what physical registers and scratch regs each instruction needs. This feeds register allocation constraints back to the allocator. - -4. **Single operand format.** Eliminate the `SValue` path entirely from codegen. All codegen works with `IROperand` + allocation metadata. - -## Detailed Design - -### Phase 0: Prerequisite — Eliminate SValue from Codegen Path - -**Goal:** Remove the `SValue`-based materialization and register fill paths. All backend codegen uses `IROperand` exclusively. - -**Files affected:** `ir/codegen.c`, `ir/mat.c`, `arm-thumb-gen.c` - -**Steps:** -- Audit all `arm-thumb-gen.c` instruction handlers that still consume `SValue` -- Convert remaining SValue consumers to IROperand -- Remove `tcc_ir_fill_registers()` (SValue version) from `ir/codegen.c` -- Remove `tcc_ir_materialize_value()`, `_const_to_reg()`, `_addr()`, `_dest()` (SValue versions) from `ir/mat.c` - -**Risk:** Medium. SValue is deeply embedded in the parser (`tccgen.c`). The boundary is at IR emission — the parser produces SValues, `ir/core.c` converts them to IR instructions with IROperands. 
We only need to eliminate SValue *after* IR construction. - -**Test:** All existing IR tests must pass. This is a pure refactor with no behavior change. - -### Phase 1: New Operand Representation — `MachineOperand` - -**Goal:** Replace the overloaded `IROperand` flags with a clear machine-level operand type that the backend can interpret without ambiguity. - -```c -typedef enum { - MACH_OP_REG, /* Value in physical register(s) */ - MACH_OP_SPILL, /* Value in spill slot, needs load */ - MACH_OP_IMM, /* Immediate constant */ - MACH_OP_FRAME_ADDR, /* Address = FP + offset (address-of local) */ - MACH_OP_SYMBOL, /* Symbol reference (global/extern) */ - MACH_OP_PARAM_STACK, /* Stack-passed parameter in caller frame */ -} MachineOperandKind; - -typedef struct { - MachineOperandKind kind; - CType type; - union { - struct { int r0, r1; } reg; /* MACH_OP_REG */ - struct { int offset; int size; } spill; /* MACH_OP_SPILL */ - struct { int64_t val; } imm; /* MACH_OP_IMM */ - struct { int offset; } frame; /* MACH_OP_FRAME_ADDR */ - struct { Sym *sym; int addend; } sym; /* MACH_OP_SYMBOL */ - struct { int offset; int size; } param; /* MACH_OP_PARAM_STACK */ - } u; - int vreg; /* Original vreg (for debug/liveness queries) */ - bool needs_deref; /* Load through this address (replaces VT_LVAL) */ - bool is_64bit; -} MachineOperand; -``` - -**Why:** This eliminates the VT_LOCAL/VT_LLOCAL/VT_LVAL/VT_PARAM/pr0_spilled encoding nightmare. Each case is a distinct enum variant. The backend switches on `kind` rather than testing combinations of bit flags. - -**Steps:** -- Define `MachineOperand` in a new header (e.g., `ir/machine_op.h`) -- Write `machine_op_from_ir(IROperand *op, IRLiveInterval *interval)` conversion -- This replaces `tcc_ir_fill_registers_ir()` — instead of rewriting IROperand in place, produce a clean MachineOperand - -**Test:** Add unit tests that verify MachineOperand construction matches the old fill_registers behavior for all operand categories. 
- -### Phase 2: Backend-Driven Materialization - -**Goal:** Move all materialization decisions into `arm-thumb-gen.c` instruction handlers. - -**Current pattern in backend (pseudo):** -```c -case TCCIR_OP_ADD: { - IROperand src1 = inst->src1; - IROperand src2 = inst->src2; - IROperand dest = inst->dest; - tcc_ir_fill_registers_ir(ir, &src1); // rewrite flags - tcc_ir_fill_registers_ir(ir, &src2); - tcc_ir_fill_registers_ir(ir, &dest); - tcc_ir_materialize_value_ir(ir, &src1, &mat1); // load if spilled - tcc_ir_materialize_value_ir(ir, &src2, &mat2); - tcc_ir_materialize_dest_ir(ir, &dest, &matd); // get dest reg - emit_add(dest_reg, src1_reg, src2_reg); - tcc_ir_storeback_materialized_dest_ir(&dest, &matd); - tcc_ir_release_materialized_value_ir(&mat1); - tcc_ir_release_materialized_value_ir(&mat2); -} -``` - -**Proposed pattern:** -```c -case TCCIR_OP_ADD: { - MachineOperand src1 = machine_op_from_ir(&inst->src1, ...); - MachineOperand src2 = machine_op_from_ir(&inst->src2, ...); - MachineOperand dest = machine_op_from_ir(&inst->dest, ...); - - int r_src1 = mach_ensure_in_reg(ctx, &src1); // backend loads if needed - int r_src2 = mach_ensure_in_reg(ctx, &src2); - int r_dest = mach_get_dest_reg(ctx, &dest); - - emit_add(r_dest, r_src1, r_src2); - - mach_writeback_dest(ctx, &dest, r_dest); // store if spilled - mach_release_scratch(ctx); -} -``` - -**Key `mach_*` helper functions (in arm-thumb-gen.c):** - -| Function | Role | -|---|---| -| `mach_ensure_in_reg(ctx, op)` | If `op` is REG: return reg. If SPILL: load to scratch, return scratch. If IMM: mov to scratch. If FRAME_ADDR: compute address. | -| `mach_ensure_in_reg_or_imm(ctx, op)` | For instructions with flexible operand 2 (ADD, SUB, CMP): return reg or encodable immediate | -| `mach_get_dest_reg(ctx, op)` | If dest is REG: return reg. If SPILL: allocate scratch for output. | -| `mach_writeback_dest(ctx, op, reg)` | If dest was SPILL: STR reg to spill slot. 
| -| `mach_ensure_addr(ctx, op)` | For LOAD/STORE: compute base register + offset. Handles FRAME_ADDR, SPILL (of pointer), PARAM_STACK. | -| `mach_release_scratch(ctx)` | Free scratch registers used in this instruction. | - -**Why this is better:** -- Each instruction knows its own addressing modes. ADD can accept an immediate operand2; LOAD needs a base+offset; MUL needs both in registers. The backend expresses this directly. -- No generic "materialize everything to registers before emitting" — only materialize what's needed. -- Scratch register lifetime is explicit and scoped to one instruction. - -**Steps:** -1. Implement `MachineCodegenContext` struct holding current instruction index, scratch pool, etc. -2. Implement `mach_ensure_in_reg()` and friends in `arm-thumb-gen.c` (initially wrapping existing `load_to_reg_ir` / `get_scratch_reg_with_save`) -3. Convert instruction handlers one-by-one from old materialize pattern to new pattern -4. After all handlers converted, remove `ir/mat.c` IROperand functions - -**Test:** Convert one instruction at a time, run full test suite after each. - -### Phase 3: Dry-Run Register Allocation - -**Goal:** Run the backend twice — first to discover register/scratch needs, then to emit code with perfect information. - -**Why:** Currently, scratch registers are allocated on-the-fly during emission. This can cause conflicts (scratch stomps a live value) that are hard to debug. A dry run lets us: -1. Know exactly which scratch registers each instruction needs -2. Feed scratch constraints back to the linear scan allocator (avoid allocating a vreg to a register that will be needed as scratch) -3. 
Detect register pressure issues *before* emission - -**Design:** - -```c -typedef struct { - int instruction_index; - int scratch_regs_needed; /* how many scratch regs this instruction needs */ - int scratch_reg_hints[4]; /* preferred scratch registers (if any) */ - bool needs_pair; /* needs an even-aligned register pair */ - bool clobbers[16]; /* which physical registers this instruction clobbers */ -} InstructionConstraints; -``` - -**Dry run pass:** -```c -for each IR instruction: - MachineOperand src1 = machine_op_from_ir(...) - MachineOperand src2 = machine_op_from_ir(...) - MachineOperand dest = machine_op_from_ir(...) - - // Instruction handler in "plan" mode: - constraints[i] = plan_instruction(opcode, src1, src2, dest) - // e.g., ADD with spilled src1: needs 1 scratch - // e.g., 64-bit MUL with both spilled: needs 4 scratches -``` - -**Integration with allocator:** - -The dry run produces per-instruction constraints. These are fed to the allocator as "clobber" intervals — the allocator avoids assigning live vregs to registers that will be clobbered at that instruction. - -``` -Current flow: - liveness → allocator → fill_registers → materialize → emit - -Proposed flow: - liveness → allocator (initial) → dry run → allocator (refined) → emit -``` - -The second allocator pass uses clobber information from the dry run to avoid conflicts. In most cases, the initial allocation is fine and the second pass is a no-op. - -**Steps:** -1. Add `plan_mode` flag to `MachineCodegenContext` -2. In plan mode, `mach_ensure_in_reg()` records what it *would* do instead of emitting -3. Collect `InstructionConstraints` array -4. Feed constraints to `tcc_ls_allocate_registers()` as additional pressure -5. Run real emission pass with final allocations - -**Test:** Verify that dry run + real run produces identical code to current single-pass approach. Then progressively add constraint-aware allocation. 
- -### Phase 4: Eliminate `ir/mat.c` - -**Goal:** With all materialization in the backend, remove the IR-level materialization module entirely. - -**What moves where:** -- `tcc_ir_materialize_value_ir()` → replaced by `mach_ensure_in_reg()` -- `tcc_ir_materialize_const_to_reg_ir()` → replaced by `mach_ensure_in_reg()` (IMM case) -- `tcc_ir_materialize_addr_ir()` → replaced by `mach_ensure_addr()` -- `tcc_ir_materialize_dest_ir()` → replaced by `mach_get_dest_reg()` -- `tcc_ir_storeback_materialized_dest_ir()` → replaced by `mach_writeback_dest()` -- `tcc_ir_release_materialized_*_ir()` → replaced by `mach_release_scratch()` - -**What stays in IR:** -- `ir/live.c` — liveness analysis (unchanged) -- `ir/vreg.c` — virtual register tracking (unchanged) -- `ir/stack.c` — stack layout (simplified, only real locals + spill slots) -- `ir/codegen.c` — reduced to just `machine_op_from_ir()` conversion - -**Files deleted:** `ir/mat.c` (entirely) - -**Files reduced:** `ir/codegen.c` (from 2331 lines to ~200-300) - -### Phase 5: Simplify Stack and Spill Management - -**Goal:** With backend-driven materialization, simplify the stack/spill data structures. - -**Changes:** -- Remove `TCCMaterializedValue`, `TCCMaterializedAddr`, `TCCMaterializedDest` structs — no longer needed -- Simplify `IROperand` — remove `pr0_spilled`, `pr1_spilled`, `is_local`, `is_llocal` flags (replaced by `MachineOperand::kind`) -- Remove `VT_LLOCAL` handling from backend — `MachineOperand::MACH_OP_SPILL` with `needs_deref=true` handles this case cleanly -- Simplify `TCCStackSlot` — remove `addressable`, `live_across_calls` fields that were only needed for materialization decisions - -## Implementation Order and Milestones - -### Milestone 1: SValue Elimination (Phase 0) -- **Scope:** ~500 lines removed/refactored in `ir/codegen.c` and `ir/mat.c` -- **Duration estimate:** Smallest, most mechanical change -- **Deliverable:** All codegen uses IROperand. SValue materialization functions deleted. 
-- **Test gate:** `make test -j16` all pass - -### Milestone 2: MachineOperand + Backend Materialization (Phase 1 + Phase 2) -- **Scope:** New `MachineOperand` type, new `mach_*` helpers, convert all instruction handlers -- **Deliverable:** Backend owns all materialization. `ir/mat.c` IROperand functions unused. -- **Test gate:** `make test -j16` + `make test-gcc-torture-compile` all pass - -### Milestone 3: Dry Run Pass (Phase 3) -- **Scope:** Dual-pass codegen with constraint collection -- **Deliverable:** Register allocation uses instruction-level scratch constraints -- **Test gate:** Full test suite + manual verification that scratch conflicts are eliminated - -### Milestone 4: Cleanup (Phase 4 + Phase 5) -- **Scope:** Delete `ir/mat.c`, simplify data structures, remove dead code -- **Deliverable:** Cleaner, smaller codebase with single materialization path -- **Test gate:** Full test suite + code size comparison - -## Risk Analysis - -| Risk | Mitigation | -|---|---| -| **Breaking existing tests during migration** | Convert one instruction handler at a time; run tests after each | -| **SValue still used in parser** | SValue stays in `tccgen.c`/`tccpp.c` — we only remove it from codegen path | -| **Dry run diverges from real run** | Assert-check that dry run predictions match real emission | -| **Performance regression from two passes** | Dry run is cheap (no I/O, no encoding); total overhead is small | -| **64-bit / float edge cases** | These are already the buggiest paths; explicit MachineOperand::kind makes them clearer | - -## Appendix: Current Bug Categories That This Fixes - -1. **Double-dereference bugs:** VT_LVAL set when it shouldn't be (or vice versa). Root cause: `fill_registers()` guessing wrong. Fix: explicit `needs_deref` flag in `MachineOperand`. - -2. **Scratch register stomping live value:** Scratch allocated at emit time conflicts with value that's about to be used. Fix: dry run knows all scratch needs upfront. - -3. 
**Stack offset encoding bugs:** Materialization skips load when offset "should be" encodable, but backend disagrees. Fix: backend decides directly — no IR-level guessing about encoding capabilities. - -4. **Parameter passing bugs:** VT_PARAM + VT_LOCAL + VT_LVAL combinations are ambiguous. Fix: `MACH_OP_PARAM_STACK` is unambiguous. - -5. **64-bit materialization bugs:** Two-register values need coordinated scratch allocation. Fix: `mach_ensure_in_reg()` for 64-bit returns a register pair explicitly. - ---- - -## Phase 5l–5p + Phase 6: Remaining Cleanup - -### Current State (post-Phase 5k) - -All instruction dispatch in `ir/codegen.c` (both dry-run and real-run) uses the MOP path unconditionally. The only remaining `_op` calls in production code are three control-flow handlers that read raw immediates (no regalloc fields): - -| Handler | Call sites | Reads regalloc fields? | -|---|---|---| -| `tcc_gen_machine_jump_op` | 3 (dry×1, real×2) | No — `irop_get_imm32(dest)` only | -| `tcc_gen_machine_conditional_jump_op` | 2 (dry×1, real×1) | No — `src1.u.imm32` + `irop_get_imm32(dest)` | -| `tcc_gen_machine_trap_op` | 2 (dry×1, real×1) | No — takes no arguments | - -`fill_registers_ir` and `ir_fill_op` are behind `#ifdef TCC_REGALLOC_DEBUG` — never called in production. - -**10 dead `_op` declarations** remain in `tcc.h` (lines 2131–2195) with corresponding dead bodies in `arm-thumb-gen.c`: `load_indexed_op`, `store_indexed_op`, `load_postinc_op`, `store_postinc_op`, `indirect_jump_op`, `switch_table_op`, `setif_op`, `bool_op`, `func_parameter_op`, `vla_op`. 
- -### Phase 5l: Remove `pr0_spilled` / `pr1_spilled` from `IROperand` — ✅ DONE - -**Completed:** 2026-03-05 - -**What was done:** -- Replaced `pr0_spilled : 1` and `pr1_spilled : 1` with `_reserved0 : 1` and `_reserved1 : 1` in `IROperand` struct (`tccir_operand.h`) to maintain 10-byte packed layout -- Removed all `.pr0_spilled` / `.pr1_spilled` writes/reads from `IROperand` usage sites: - - `arm-thumb-gen.c`: `load_to_dest_ir`, `load_to_reg_ir`, and dead `_op` functions — simplified conditional logic that checked spill flags (all live callers already passed 0) - - `ir/codegen.c`: removed writes in `fill_registers_ir` (debug-only), removed `spill=%d` from debug trace format - - `tccir_operand.c`: removed copies in `irop_copy_svalue_info`, set SValue fields to 0 in `irop_to_svalue` (SValue retains its own `pr0_spilled`/`pr1_spilled`), removed spill comparisons from validation function - - `arm-thumb-asm.c`: removed 6 spill-flag assignments in inline asm codegen (`asm_gen_code`) - - `tccir_operand.h`: updated `IROP_NONE` macro and `irop_init_phys_regs` - -**Files modified:** `tccir_operand.h`, `tccir_operand.c`, `arm-thumb-gen.c`, `ir/codegen.c`, `arm-thumb-asm.c` - -**Test result:** 3310 passed, 79 skipped, 582 xfailed — no regressions. - -**Reclaimed bits:** 2 bits freed in the packed struct (currently `_reserved0`/`_reserved1`). 
- -### Phase 5m: Delete `fill_registers_ir` Entirely — ✅ DONE - -**Completed:** 2026-03-05 - -**What was deleted (~256 lines):** -- `tcc_ir_fill_registers_ir()` body (~157 lines) + header comment from `ir/codegen.c` -- `ir_fill_op()` wrapper (~8 lines) from `ir/codegen.c` -- `_dbg_trace_all` variable + function name matching block (~25 lines) from `ir/codegen.c` -- Main debug trace block calling `ir_fill_op` for `trc_s1/s2/d` (~60 lines, including LOAD/AND/OR/ASSIGN diagnostics) from `ir/codegen.c` -- Declaration + comment (6 lines) from `tccir.h` -- Stale comments referencing `fill_registers_ir` / `ir_fill_op` in both dry-run and real-run dispatch loops - -**Files modified:** `ir/codegen.c`, `tccir.h` - -**Note:** The `#ifdef TCC_REGALLOC_DEBUG` vreg statistics block and `[RA-PEEPHOLE]` trace were kept — they don't depend on `fill_registers_ir`. - -**Test result:** 3310 passed, 79 skipped, 582 xfailed — no regressions. Also verified clean build with `CFLAGS+='-DTCC_REGALLOC_DEBUG'`. - -### Phase 5n: Delete Dead `_op` Declarations and Bodies ✅ DONE - -**Goal:** Remove the 10 dead `_op` function declarations from `tcc.h` and their corresponding bodies from `arm-thumb-gen.c`. 
- -**Deleted functions:** - -| Function | Location | -|---|---| -| `tcc_gen_machine_load_indexed_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_store_indexed_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_load_postinc_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_store_postinc_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_indirect_jump_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_switch_table_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_setif_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_bool_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_func_parameter_op` | tcc.h decl + arm-thumb-gen.c body | -| `tcc_gen_machine_vla_op` | tcc.h decl + arm-thumb-gen.c body | - -Also deleted 2 now-unused static helpers: `thumb_irop_has_immediate_value`, `thumb_irop_needs_value_load`. - -**Net reduction:** ~700 lines from `arm-thumb-gen.c`, 10 declarations from `tcc.h`. - -**Test result:** 3310 passed, 79 skipped, 582 xfailed — no regressions. - -### Phase 5o: Convert Control-Flow `_op` Handlers to `_mop` ✅ DONE - -**Goal:** Convert the last 3 `_op` handlers to `_mop` so the dispatch loop is 100% MOP. - -**Converted:** - -| Old | New | Change | -|---|---|---| -| `tcc_gen_machine_jump_op(TccIrOp, IROperand, int)` | `tcc_gen_machine_jump_mop(TccIrOp, int32_t target_ir, int)` | Extract `irop_get_imm32(dest)` at call site | -| `tcc_gen_machine_conditional_jump_op(IROperand, TccIrOp, IROperand, int)` | `tcc_gen_machine_conditional_jump_mop(int32_t cond, TccIrOp, int32_t target_ir, int)` | Extract `src.u.imm32` and `irop_get_imm32(dest)` at call site | -| `tcc_gen_machine_trap_op(void)` | `tcc_gen_machine_trap_mop(void)` | Rename only (no IROperand args) | - -**Files changed:** `tcc.h` (declarations), `arm-thumb-gen.c` (bodies), `ir/codegen.c` (5 call sites in dry-run + real-run loops). 
- -**Result:** All backend dispatch call sites now use `_mop` variants or pass extracted scalars. No `IROperand` is passed to any backend handler. - -**Test result:** 3310 passed, 79 skipped, 582 xfailed — no regressions. - -### Phase 5p: Remove `pr0_reg` / `pr1_reg` from `IROperand` - -**Goal:** Eliminate the physical register fields from `IROperand`. These were filled by `fill_registers_ir` and read by the old `_op` backend path. With both gone, the dispatch path no longer needs them. - -**Investigation findings (2026-03-06):** - -A comprehensive audit revealed **50+ live references** to `pr0_reg`/`pr1_reg` across the codebase, far more than the original estimate of 3 readers: - -| Reader/Writer | File | Nature | -|---|---|---| -| `machine_op_from_ir` vreg=-1 path | `ir/machine_op.c` L167–177 | **Critical:** pinned physical register for vreg=-1 operands | -| `load_to_dest_ir` | `arm-thumb-gen.c` L3416+ | ~38 reads, 3 writes — live for inline asm + VLA | -| `store_ex_ir` | `arm-thumb-gen.c` L2622+ | ~10 reads — live for inline asm | -| `th_store_resolve_base_ir` | `arm-thumb-gen.c` L2508+ | 2 reads — live for inline asm | -| `load_to_reg_ir` | `arm-thumb-gen.c` L3745+ | 2 writes — live for inline asm | -| `asm_gen_code` | `arm-thumb-asm.c` L254+ | 6 writes — constructs IROperands with `pr0_reg` | -| `svalue_to_iroperand` Case 1/1b | `tccir_operand.c` L343/359 | Writes `pr0_reg = val_kind` from `sv->r & VT_VALMASK` | -| `iroperand_to_svalue` | `tccir_operand.c` L655 | Reads `op.pr0_reg` back to SValue | -| `irop_copy_svalue_info` | `tccir_operand.c` L298 | Copies `sv->pr0_reg` → `op->pr0_reg` | -| `tcc_ir_fill_registers` | `ir/codegen.c` L21+ | Writes `sv->pr0_reg` from interval (inline asm only) | - -**Root cause discovery:** `tcc_ir_put()` clears `sv->pr0_reg = PREG_REG_NONE` before calling `svalue_to_iroperand()`, but `svalue_to_iroperand()` Case 1b **re-derives** `result.pr0_reg = val_kind` from `sv->r & VT_VALMASK`. 
So the clearing is ineffective for vreg=-1 operands with a physical register. Three GCC torture tests (pr41239, pr46309, pr58831) confirmed the vreg=-1 path with `pr0_reg≠PREG_REG_NONE` is live. - -**Approach taken (Option 3: encode in `u.imm32`):** - -Rather than plumbing interval entries for all vreg=-1 creation sites, we encode the pinned physical register in `u.imm32` for IROP_TAG_VREG operands: - -- Defines: `IROP_VREG_PHYS_VALID` (0x100, validity flag) and `IROP_VREG_PHYS_MASK` (0x1F, register number) in `tccir_operand.h` -- `svalue_to_iroperand()` Case 1b (vreg=-1): sets `result.u.imm32 = IROP_VREG_PHYS_VALID | (val_kind & IROP_VREG_PHYS_MASK)` -- `machine_op_from_ir()` vreg=-1 path: reads `op->u.imm32` instead of `op->pr0_reg` - -**Important:** Case 1 (vr >= 0) must **NOT** set `u.imm32` — `load_to_dest_ir()` uses `u.imm32 != 0` on VREG operands for sub-component access (complex imaginary part). Setting it caused GCC torture test 20030222-1 to fail: inline asm `"=r" (int_out) : "0" (long_long_in)` loaded the high word instead of the low word. - -**Status:** ✅ Complete. The `pr0_reg`/`pr1_reg` fields have been removed from `IROperand`. The struct is now 9 bytes (down from 10). All legacy `_ir` functions use `irop_phys_r0()`/`irop_phys_r1()` helpers that read physical registers from the interval table. The `load_to_dest_ir` signature was changed to `(int dest_r0, int dest_r1, IROperand src)`. The `arm-thumb-asm.c::asm_gen_code` was updated to pass explicit register args. `tccir_operand.c` conversion functions no longer copy pr0/pr1. `irop_init_phys_regs()` was deleted. Remaining IROperand flags repacked into a single byte: `is_unsigned:1, is_static:1, is_sym:1, is_param:1, _pad:4`. - -**Completed steps:** -1. ✅ Added `irop_phys_r0()`/`irop_phys_r1()` helpers in `arm-thumb-gen.c` — read interval table or IROP_VREG_PHYS encoding -2. ✅ Converted `load_to_dest_ir` signature to `(int dest_r0, int dest_r1, IROperand src)` — removed dead spilled-dest path -3. 
✅ Converted `store_ex_ir`/`th_store_resolve_base_ir` to use `irop_phys_r0()`/`irop_phys_r1()` -4. ✅ Updated `arm-thumb-asm.c::asm_gen_code` to pass explicit register args -5. ✅ Updated `tccir_operand.c` — removed pr0/pr1 from `irop_copy_svalue_info`, `svalue_to_iroperand`, `iroperand_to_svalue`, `irop_compare_svalue` -6. ✅ Removed `pr0_reg:5`, `pr1_reg:5`, `_reserved0:1`, `_reserved1:1` from `IROperand` — struct shrunk to 9 bytes -7. ✅ Removed dead pr0_reg/pr1_reg init writes from `ir/core.c` -8. ✅ Updated test `bug_packed10_array` for 9-byte layout - -**Dependency:** Phase 5m (delete `fill_registers_ir`) and Phase 5n (delete dead `_op` functions) — both done. - -### Phase 5q: Delete Legacy `_ir` Wrappers + Rewrite `tcc_gen_mach_load_to_reg` (COMPLETED) - -**What was done:** - -Deleted all remaining legacy `_ir` wrapper functions from `arm-thumb-gen.c` (~560 lines) and rewrote `tcc_gen_mach_load_to_reg` for correctness. - -**Functions deleted:** - -| Function | ~Lines | Role | -|----------|--------|------| -| `load_to_dest_ir` | 268 | Legacy IROperand-based load (read pr0_reg/pr1_reg from interval) | -| `store_ex_ir` | 170 | Legacy IROperand-based store | -| `store_ir` | 3 | Thin wrapper around `store_ex_ir` | -| `th_store_resolve_base_ir` | 114 | Legacy base-resolution for stores | -| `irop_phys_r0` / `irop_phys_r1` | 47 | Interval-table helpers (only used by `_ir` functions) | -| `th_store32_imm_or_reg` | 5 | Became unused after `store_ex_ir` deletion | -| Forward declarations | 3 | Stale declarations for deleted functions | - -Also deleted: `irop_phys_r0`/`irop_phys_r1` helper forward declarations. - -**`tcc_gen_mach_load_to_reg` rewrite:** - -The original 6-line implementation used `mach_ensure_in_reg` which allocates a scratch register. When inline asm loads multiple operands sequentially, the scratch for operand N could clobber operand N-1's already-loaded register (pr49390 regression). 
- -Rewritten as a ~105-line switch covering all `MachineOperandKind` values, loading directly into `dest_reg`: - -| Kind | Strategy | -|------|----------| -| `MACH_OP_REG` | `mov dest, src` (or deref via `load_from_base`) | -| `MACH_OP_SPILL` | `load_spill_slot` (with LLOCAL double-deref) | -| `MACH_OP_IMM` | `load_constant` directly into dest | -| `MACH_OP_FRAME_ADDR` | `addr_of_stack_slot` directly into dest | -| `MACH_OP_SYMBOL` | Direct load/deref; scratch via `get_scratch_reg_with_save` excluding dest | -| `MACH_OP_PARAM_STACK` | `load_from_base` from SP | -| `MACH_OP_CHAIN_REL` | `resolve_chain_base` + `load_from_base` | - -Key property: **no scratch register can clobber `dest_reg`** — scratch allocation explicitly excludes `dest_reg` when needed. - -**Results:** -- `arm-thumb-gen.c`: 8578 → 8055 lines (−523) -- All 3310 tests pass, 0 failed -- Inline asm operand sequential loading works correctly (pr49390 fixed) - -### Phase 6: Consolidate `ir/codegen.c` - -**Goal:** Reduce `ir/codegen.c` from 2362 lines to ~1400–1600 by removing structural duplication between the dry-run and real-run dispatch loops. - -**Current structure (as of 2026-03-06):** - -``` -Lines 1–16: Header, includes -Lines 17–190: tcc_ir_fill_registers (SValue, used by inline asm only) -Lines 188–382: tcc_ir_register_allocation_params -Lines 382–723: Helper functions (branch optimization, stack layout) -Lines 723–860: Inline asm codegen helper (tcc_ir_codegen_inline_asm_ir) -Lines 860–1059: try_reassign_scratch_conflict, has_incoming_jump analysis -Lines 1059–1160: tcc_ir_codegen_generate() entry, stack_size computation -Lines 1160–1693: DRY-RUN PASS (dispatch loop L1210–L1628, ~420 lines of switch cases) -Lines 1693–1710: Inter-pass: prologue gen, debug prolog -Lines 1710–2350: REAL-RUN PASS (dispatch loop L1730–2320, ~590 lines of switch cases) -Lines 2350–2363: Cleanup, backpatch, epilogue -``` - -The dry-run loop is ~420 lines and the real-run loop is ~590 lines. 
The real-run is larger because it includes: -1. `#ifdef TCC_LS_DEBUG` scratch consistency checks (~120 lines across all ops) -2. `ir_to_code_mapping[i]` updates for JUMP/JUMPIF -3. `tcc_ir_spill_cache_clear()` calls after branches, calls, and inline asm -4. SWITCH_TABLE: dry-run computes `ind += size`, real-run calls `tcc_gen_machine_switch_table_mop` -5. RETURNVOID: dry-run does nothing, real-run emits jump-to-epilogue -6. FUNCCALLVOID: real-run sets `drop_return_value = 1` via fallthrough -7. INLINE_ASM: dry-run skips via `continue`, real-run calls `tcc_ir_codegen_inline_asm_ir` -8. `before_ret` peephole: identical in both loops but duplicated (LOAD/LOAD_INDEXED/ASSIGN) - -**Strategy: Unified dispatch with mode flag** - -```c -for (int pass = 0; pass < 2; pass++) { - bool is_dry_run = (pass == 0); - if (pass == 1) { - /* inter-pass: prologue, debug, branch optimization */ - } - - for (int i = 0; i < ir->next_instruction_index; i++) { - IROperand src1_ir = tcc_ir_op_get_src1(ir, cq); - // ... operand extraction ... - // ... before_ret peephole (shared) ... - - switch (cq->op) { - case TCCIR_OP_ADD: ... { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - // ... same handler call ... - if (is_dry_run) { - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - break; - } - case TCCIR_OP_JUMP: - tcc_gen_machine_jump_mop(cq->op, irop_get_imm32(dest_ir), i); - if (!is_dry_run) { - ir_to_code_mapping[i] = ind - (...); - tcc_ir_spill_cache_clear(&ir->spill_cache); - } - break; - // ... - } - tcc_gen_machine_end_instruction(); - } -} -``` - -**Detailed differences between loops (audit):** - -| Op | Dry-run | Real-run | Merge strategy | -|---|---|---|---| -| Most MOP ops (DP, LOAD, STORE, ...) 
| call handler + record scratch | call handler + `#ifdef TCC_LS_DEBUG` check | Shared; `if (is_dry_run)` for scratch recording | -| SWITCH_TABLE | `ind += 14 + table_data_size` | `tcc_gen_machine_switch_table_mop()` | `if (is_dry_run) ind += ...; else switch_table_mop()` | -| RETURNVOID | `break` (no-op) | emit jump to epilogue | `if (!is_dry_run) { ... }` | -| FUNCCALLVOID | no fallthrough to FUNCCALLVAL | `drop_return_value = 1` + fallthrough | Use explicit flag instead of fallthrough | -| JUMP/JUMPIF | `tcc_gen_machine_jump_mop()` | same + `ir_to_code_mapping` update + `spill_cache_clear` | `if (!is_dry_run) { mapping; cache_clear; }` | -| INLINE_ASM | `continue` (skipped) | `tcc_ir_codegen_inline_asm_ir()` + `spill_cache_clear` | `if (!is_dry_run) { ... }` | -| ASM_INPUT/OUTPUT/NOP | `continue` | `break` | Normalize to `continue` or `break` | -| Loop preamble | no `ir_to_code_mapping`, no `tcc_debug_line_num`, no `codegen_materialize_scratch_flags` | all of these | `if (!is_dry_run) { ... }` | -| `before_ret` peephole | Identical to real-run | Identical to dry-run | Shared | - -**Sub-steps:** - -#### 6a: Normalize loop preambles - -The real-run loop has extra per-iteration setup: -- `ir_to_code_mapping[i] = ind` -- `orig_ir_to_code_mapping[cq->orig_index] = ind` -- `tcc_debug_line_num(tcc_state, cq->line_num)` -- `ir->codegen_materialize_scratch_flags = 0` - -Wrap these in `if (!is_dry_run)`. The dry-run loop doesn't do debug line emission or mapping updates — it only needs `ir_to_code_mapping[i] = ind` for branch offset analysis (already present). - -#### 6b: Extract `before_ret` peephole into helper - -The LOAD/LOAD_INDEXED/ASSIGN `before_ret` peephole is ~30 lines duplicated 3× in each loop (6× total). Extract: - -```c -static bool ir_codegen_check_before_ret(TCCIRState *ir, int i, IROperand *dest_ir, - const uint8_t *has_incoming_jump) -``` - -Returns bool and patches interval + constructs synthetic MOP dest. 
- -#### 6c: Extract shared dispatch into function - -Create `ir_codegen_dispatch_one(TCCIRState *ir, int i, bool is_dry_run, ...)` containing the switch. Both loops call it. - -#### 6d: Merge into single outer loop - -Replace `#if 1 /* DRY_RUN_ENABLED */ ... #endif ... /* REAL RUN */` with: - -```c -for (int pass = 0; pass < 2; pass++) { - bool is_dry_run = (pass == 0); - if (pass == 0) { /* dry-run init */ } - if (pass == 1) { /* inter-pass: fixup, prologue, restore */ } - for (int i = 0; ...) { - ir_codegen_dispatch_one(ir, i, is_dry_run, ...); - } - if (pass == 0) { /* dry-run end, branch analysis, scratch fixup */ } -} -``` - -#### 6e: Clean up `#ifdef TCC_LS_DEBUG` scratch checks - -The ~120 lines of `#ifdef TCC_LS_DEBUG` scratch consistency checks only run in the real-run pass. Factor into a single helper: - -```c -static inline void ir_codegen_check_scratch(int i, TccIrOp op, int *dry_scratch, uint16_t *dry_saves) -{ -#ifdef TCC_LS_DEBUG - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_scratch[i] && dry_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)op, dry_scratch[i], real_scratch); -#endif -} -``` - -Call at the end of each op's case in the unified dispatch. - -**Actual result (Phase 6 ✅ Done):** -- `ir/codegen.c`: 2106 → 1767 lines (−339 lines, ~16%) -- Single source of truth for dispatch logic -- Adding a new IR op means adding one `case`, not two -- `before_ret` peephole logic in one place instead of six -- Four extracted helpers: `ir_codegen_before_ret_peephole()`, `ir_codegen_record_scratch()`, `ir_codegen_check_scratch()`, `ir_codegen_track_scratch()` - -**Risks (all resolved):** - -1. **SWITCH_TABLE** — dry-run computes size arithmetically; real-run emits via handler. The handler must still produce the same `ind` advance. Can be verified with an assert. -2. **RETURNVOID jump-to-epilogue** — only needed in real-run. Simple `if (!is_dry_run)` guard. -3. 
**`ir_to_code_mapping` / `orig_ir_to_code_mapping`** — only meaningful in real-run. Must not be written to in dry-run (would corrupt saved state). -4. **`spill_cache_clear` after branches/calls** — no-op semantics in dry-run (cache was cleared at start). Can safely call in both passes or guard. - -**Mitigation:** Do this incrementally: -1. First, extract `before_ret` peephole helper (6b) — low risk, high dedup value -2. Extract `ir_codegen_check_scratch` helper (6e) — mechanical, reduces noise -3. Extract shared dispatch function (6c) — verifiable by running both paths -4. Merge loops (6d) — final step, requires full test suite validation - -**Test:** After each sub-step: `make clean && make cross && make test -j16 && make test-all` - -## Updated Implementation Order - -| Step | Phase | Status | Scope | Est. lines changed | Dependency | -|---|---|---|---|---|---| -| 1 | **5l** | ✅ Done | Remove `pr0_spilled`/`pr1_spilled` | ~20 lines | None | -| 2 | **5m** | ✅ Done | Delete `fill_registers_ir` (production) | ~256 lines deleted | 5l | -| 3 | **5n** | ✅ Done | Delete 10 dead `_op` declarations + bodies | ~700 lines deleted | None | -| 4 | **5o** | ✅ Done | Convert jump/conditional_jump/trap to `_mop` | ~60 lines changed | 5n | -| 5 | **5p** | ✅ Done | Decouple `machine_op_from_ir` from `pr0_reg`; add `irop_phys_r0/r1` helpers; remove fields from `IROperand` (10→9 bytes); update all callers | ~200 lines changed | 5m + 5o | -| 5 | **5q** | ✅ Done | Delete all legacy `_ir` wrappers (~560 lines); rewrite `tcc_gen_mach_load_to_reg` for direct-dest loading; fix inline asm operand clobber (pr49390) | ~560 lines deleted, ~105 lines added | 5p | -| 6 | **6a** | ✅ Done | Normalize loop preambles | ~30 lines | None | -| 7 | **6b** | ✅ Done | Extract `before_ret` peephole helper | ~120 lines deduped | None | -| 8 | **6c** | ✅ Done | Extract scratch record/check helpers | ~120 lines deduped | None | -| 9 | **6d** | ✅ Done | Merge into single `for (pass=0; pass<2)` loop | ~339 
lines saved | 6a+6b+6c | - -**Total expected line reduction from remaining work:** ~1000–1200 lines across all files. - -### Current file sizes (2026-03-06) - -| File | Lines | Notes | -|---|---|---| -| `ir/codegen.c` | 1767 | Single unified two-pass dispatch loop (`for (pass=0; pass<2)`) | -| `arm-thumb-gen.c` | 8055 | All legacy `_ir` functions deleted; `tcc_gen_mach_load_to_reg` rewritten for direct-dest loading | -| `arm-thumb-asm.c` | 3539 | Inline asm path fully on MOP via `tcc_gen_mach_load_to_reg`/`tcc_gen_mach_store_from_reg` | -| `ir/machine_op.c` | 328 | `machine_op_from_ir()` — reads interval table directly | -| `tccir_operand.h` | 560 | `IROperand` = 9 bytes; `pr0_reg`/`pr1_reg` removed | -| `tccir_operand.c` | 844 | SValue↔IROperand conversions updated (no pr0/pr1 copy) | -| `arm-thumb-callsite.c` | 322 | Callsite arg-handling fully on MOP | -| `ir/core.c` | 1951 | Removed dead `pr0_reg`/`pr1_reg` init writes | - -## Updated Risk Analysis - -| Risk | Mitigation | -|---|---| -| **~~`IROperand` struct size change breaks packed layout~~** | ✅ Resolved — `sizeof(IROperand)` = 9 bytes; `_Static_assert` updated; test `bug_packed10_array` updated to 9-byte layout | -| **~~vreg=-1 interval plumbing incomplete (Phase 5p)~~** | ✅ Resolved — `IROP_VREG_PHYS` encoding used by both `machine_op_from_ir` and `irop_phys_r0()` | -| **~~Dispatch loop merge (Phase 6) introduces subtle ordering bugs~~** | ✅ Resolved — merge completed successfully; all 3310 tests pass | -| **`is_local`/`is_llocal`/`is_param` still needed by IR optimizations** | These fields stay — they are IR-semantic. Only codegen-time _mutation_ is gone (`fill_registers_ir` deleted). The fields remain read-only during codegen via `machine_op_from_ir`. 
| -| **~~SWITCH_TABLE dry-run vs real-run divergence~~** | ✅ Resolved — unified loop handles both passes correctly | -| **Debug builds (`TCC_REGALLOC_DEBUG`) broken** | Replace deleted debug trace with MachineOperand dump; test with `make cross CFLAGS+='-DTCC_REGALLOC_DEBUG'` | diff --git a/docs/materialization/review.md b/docs/materialization/review.md deleted file mode 100644 index ccf37291..00000000 --- a/docs/materialization/review.md +++ /dev/null @@ -1,105 +0,0 @@ -# Plan Review: Materialization Refactor - -> **Note (2026-03-06):** Much of this review describes findings made *before* implementation started. Several items are now moot: -> - `ir/mat.c` (1096 lines) — **deleted** (Phase 4 ✅) -> - `ir/operand.h` + `ir/operand.c` — **deleted** (Phase 4 ✅) -> - SValue materialization path — **deleted** (Phase 0 ✅) -> - `tcc_ir_codegen_generate()` at 2331 lines — now **1767 lines** after Phase 6 consolidated dispatch loops -> - Dry-run constraint collection — **implemented** as `dry_insn_scratch[]`/`dry_insn_saves[]` arrays (Phase 3 ✅) -> - Dispatch loop consolidation — **done** (Phase 6 ✅): single `for (pass=0; pass<2)` loop; −339 lines (~16%) -> - All backend handlers now use `_mop` variants exclusively (Phase 5o ✅) -> - `pr0_reg`/`pr1_reg` fields removed from `IROperand` (Phase 5p ✅): struct shrunk from 10→9 bytes; `irop_phys_r0()`/`irop_phys_r1()` helpers read interval table -> - All legacy `_ir` wrapper functions deleted (Phase 5q ✅): `load_to_dest_ir`, `store_ex_ir`, `store_ir`, `th_store_resolve_base_ir`, `irop_phys_r0`/`irop_phys_r1`; `tcc_gen_mach_load_to_reg` rewritten for direct-dest loading - -Review of `plan.md` against the actual codebase state (original analysis). Based on reading `ir/codegen.c` (1767 lines), `arm-thumb-gen.c` (8055 lines), `tccir_operand.h` (560 lines), `tccir_operand.c` (844 lines), `ir/machine_op.c` (328 lines), `svalue.h`, and `ir/stack.h`. 
*(Note: `ir/mat.c`, `ir/operand.h` deleted in Phase 4.)* - ---- - -## Key Finding 1: The Plan's "Current Pattern" Pseudocode Is Inaccurate - -**Plan says** the backend (`arm-thumb-gen.c`) calls `tcc_ir_materialize_value_ir()` etc. directly. - -**Reality:** `arm-thumb-gen.c` does **NOT** call any `tcc_ir_materialize_*` or `tcc_ir_mat_*` APIs. Zero calls. The materialization happens in `ir/codegen.c`'s dispatch loop *before* calling into the backend. The backend receives already-filled `IROperand` values and then does its **own** scratch+load pattern via `get_scratch_reg_with_save()` (66 calls) and `load_to_reg_ir()` (63 calls). - -**Impact on plan:** The architecture is worse than described — there are **two independent materialization layers** running in series, not one. The plan's proposed change is still the right fix, but the migration path is different: -- We're not replacing materialize calls *in the backend* — we're removing the `ir/codegen.c` materialize layer and making the backend's existing load pattern the sole path. -- The `mach_*` helpers are essentially a clean API over what `arm-thumb-gen.c` already does informally. - -**Action taken:** Phase 2 step file corrected to reflect actual architecture. - ---- - -## Key Finding 2: Dry Run Already Exists - -**Plan says** Phase 3 introduces a dry-run pass — "Run the backend twice." - -**Reality:** `ir/codegen.c::tcc_ir_codegen_generate()` already runs a dry run followed by a real run. It calls `tcc_gen_machine_dry_run_begin()`, runs the full dispatch loop, calls `tcc_gen_machine_dry_run_end()`, analyzes branch offsets, then re-runs for real emission. - -**Impact on plan:** Phase 3 is not "add a dry run" — it's "extend the existing dry run with constraint collection." This is a smaller, less risky change than described. - -**Action taken:** Phase 3 step file corrected to frame this as an extension, not a new feature. 
- ---- - -## Key Finding 3: Three Parallel APIs in `ir/mat.c` - -**Plan mentions** two parallel paths (SValue and IROperand). - -**Reality:** There are **three** layers: -1. Legacy SValue API: `tcc_ir_materialize_value()`, `_const_to_reg()`, `_addr()`, `_dest()` -2. IROperand API: `tcc_ir_materialize_value_ir()`, `_const_to_reg_ir()`, `_addr_ir()`, `_dest_ir()` -3. New wrapper API: `tcc_ir_mat_value()`, `_const()`, `_addr()`, `_dest()` (with `TCCMatValue`/`TCCMatAddr`/`TCCMatDest` types) - -Layer 3 wraps layer 1. The active codegen path uses layer 2. - -**Impact on plan:** Phase 0 (SValue elimination) should delete layers 1 and 3 (both SValue-based). Layer 2 is the one that stays until Phase 4. - ---- - -## Key Finding 4: Duplicate Operand Headers - -**Not mentioned in the original plan.** - -`tccir_operand.h` (567 lines) and `ir/operand.h` (539 lines) are near-duplicate headers with divergent position field widths (17-bit vs 18-bit). This is a maintenance hazard — a fix applied to one may not be applied to the other. - -**Impact on plan:** Added to Phase 5 as a cleanup step. Should arguably be fixed earlier to prevent bugs during the refactor. - ---- - -## Key Finding 5: `ir/codegen.c` Has Multiple Dispatch Paths - -The file contains **4 occurrences** of `case TCCIR_OP_ADD:`, suggesting multiple switch statements. Investigation shows: - -1. **Lines ~1335–1435:** Operand need classification (sets `need_src1_value`, etc.) -2. **Lines ~1530–1610:** Main dispatch to backend `tcc_gen_machine_*_op()` functions -3. **Lines ~1820+:** Possibly a 64-bit or alternative dispatch path -4. **Lines ~1960+:** Possibly a legacy SValue dispatch path - -This complexity is exactly what the refactor aims to eliminate. However, migrating requires understanding all 4 paths and ensuring none are silently active. - -**Recommendation:** Before Phase 2, audit which paths execute under which conditions. Mark dead paths for removal. This could be a sub-step of Phase 0. 
- ---- - -## Overall Assessment - -| Aspect | Rating | Notes | -|---|---|---| -| **Problem diagnosis** | Accurate | The dual-materialization problem is real and well-identified | -| **Proposed solution** | Sound | MachineOperand + backend-driven materialization is the right approach | -| **Architecture understanding** | Partially inaccurate | Backend doesn't call mat APIs; dry run already exists | -| **Phase ordering** | Good | Dependencies are correct: 0→1→2→3→4→5 | -| **Risk assessment** | Understated | Duplicate operand headers and multiple dispatch paths add risk | -| **Estimated effort** | Reasonable | Phase 2 (convert ~14 instruction handlers) is the largest effort | - -### Recommendations - -1. **Phase 0 should include an audit of all 4 dispatch paths** in `ir/codegen.c` to determine which are active and which are dead. - -2. **Consolidate operand headers early** (could be Phase 0.5) to prevent bugs during refactor where the wrong header is edited. - -3. **Phase 2 conversion order should match instruction frequency** in the test suite. Convert the most-exercised handlers first to get maximum test coverage early. - -4. **Add a "parallel validation" step** in Phase 1 where both old and new paths run and results are compared with assertions. This was added to the Phase 1 step file. - -5. **Consider whether `machine_op_from_ir()` should read directly from the allocator** rather than from the filled `IROperand` flags. This would bypass `tcc_ir_fill_registers_ir()` entirely, making Phase 1 independent of the fill logic and reducing the risk of flag-encoding bugs. diff --git a/docs/metrics_dashboard.md b/docs/metrics_dashboard.md new file mode 100644 index 00000000..42b854c2 --- /dev/null +++ b/docs/metrics_dashboard.md @@ -0,0 +1,225 @@ +# Per-revision optimizer metrics dashboard + +Tracks code size, compile time, and RP2350 cycle counts per commit in a +Grafana dashboard backed by SQLite, so an SSA-migration commit's effect is a +graph, not a guess. 
The fuzz correctness sweep (O1/O2 divergence) is +deliberately **not** run automatically — it's expensive; run it by hand (see +below) and let `metrics/gate.py` judge the result. + +## Layout + +``` +metrics/ + schema.sql -- SQLite DDL (runs, correctness, codesize, compile_time, perf, accepted_divergence) + record.py -- collects one commit's metrics, upserts into metrics.db + gate.py -- compares a run against its parent; --strict to fail the build + grafana/ + docker-compose.yml + tcc-metrics-grafana.service -- systemd unit, wraps podman-compose up/down + provisioning/datasources/sqlite.yml + provisioning/dashboards/dashboards.yml + dashboards/optimizer_regressions.json +.github/workflows/ci.yml -- build, build-and-test, build-and-measure, rp2350-perf +``` + +`record.py` reuses existing tooling rather than reimplementing it: +[scripts/regression_disasm.py](../scripts/regression_disasm.py) `run_csv_mode` +for code size, [tests/benchmarks/run_benchmark.py](../tests/benchmarks/run_benchmark.py) +for RP2350 perf, and [tests/fuzz/sweep_all.py](../tests/fuzz/sweep_all.py) for +the (manual) correctness sweep. + +## One-time Pi setup + +```bash +sudo mkdir -p /var/lib/tcc-metrics +sudo chown "$(whoami)" /var/lib/tcc-metrics +sqlite3 /var/lib/tcc-metrics/metrics.db < metrics/schema.sql +``` + +The DB lives outside the Actions workspace so `actions/checkout` never +touches it. + +### Runner + +Only the `rp2350-perf` job needs the Pi — it reuses the org-scoped +self-hosted runner already registered for other projects, no new runner to +register. Two things to check: + +1. The tinycc repo has access to that runner's runner group (org Settings -> + Actions -> Runner groups). +2. The runner carries the `rpi5`/`pimoroni_pico_plus2` labels + (`.github/workflows/ci.yml`'s `rp2350-perf` job targets + `runs-on: [self-hosted, rpi5, pimoroni_pico_plus2]`). 
Add them via the + runner's `config.sh --labels rpi5,pimoroni_pico_plus2` (or editing labels + via the GitHub UI) and restarting the runner service. + +`ci.yml` builds the cross compiler exactly once, in a dedicated `build` job +on a regular GitHub-hosted runner (`runs-on: ubuntu-latest`, same container +image `build-and-test` uses) — compiling on the Pi is much slower than a +cloud runner. `build` uploads `armv8m-tcc`/`armv8m-libtcc1.a` as a GitHub +Actions artifact; `build-and-measure` (`needs: build`) downloads it to +measure code size/compile time (no board needed) and uploads a scratch +metrics db of its own; `rp2350-perf` (`needs: build-and-measure`) downloads +both artifacts, so it never rebuilds tcc and never re-measures code size — +it only does what actually needs the board (running benchmarks over SSH), +then imports the earlier job's numbers into the persistent db via +`record.py --import-codesize-from` (see "What CI does" below). +`build-and-test` (the actual test suite) does **not** consume the `build` +artifact — `make test` depends on `cross`, which reaches through object +files and checksum/fp-libs/PCH stamp files, not just the final binary, so a +pre-built `armv8m-tcc` wouldn't save it a recompile; it stays fully +self-contained and runs in parallel with `build`. + +A self-hosted runner executes one job at a time, so `rp2350-perf` still +queues behind (or blocks) other repos' jobs on the same box while it runs, +and vice versa — that's why its `concurrency: group: metrics-rpi5` is scoped +to just that job; the cloud `build`/`build-and-measure` jobs don't need to +queue behind Pi-bound work. + +Runner dependencies (installed once on the Pi, not per-run): +- Python 3 + `pip install paramiko` — required, for the RP2350 perf step. +- The RP2350 board wired to the Pi over USB, reachable via `127.0.0.1` SSH + (`PERF_HOST`/`PERF_IDENTITY` in the workflow). If it's ever unplugged, + `record.py` skips perf for that commit rather than failing. 
+- `arm-none-eabi-gcc`/`objdump`/`nm` and `qemu-system-arm` (mps2-an505) + + the built newlib under `tests/ir_tests/qemu/mps2-an505` — **only** needed + if you run a manual full sweep (below) directly on the Pi; the automatic + CI path no longer measures code size there, so these aren't required for + `rp2350-perf` itself. + +### Security note + +`ci.yml`'s `rp2350-perf` job triggers on `pull_request`. Combined with a +self-hosted runner, that means PR code executes with access to this machine +and the attached hardware. This is only safe as long as untrusted forks can't open +PRs against this repo. If that ever changes, either drop the `pull_request` +trigger or require maintainer approval for external-contributor workflow runs +(repo Settings -> Actions -> "Fork pull request workflows"). + +## What CI does + +On every push and PR to `mob`, `ci.yml` runs four jobs (no schedule/cron, no +fuzz sweep in any of them): + +1. `build` (cloud runner) builds `armv8m-tcc`/`armv8m-libtcc1.a` once and + uploads them as an artifact. +2. `build-and-test` (cloud runner, runs in parallel with `build` -- does + its own independent build, see the "Runner" section above for why it + can't reuse `build`'s artifact) runs the full test suite. +3. `build-and-measure` (cloud runner, `needs: build`) downloads the tcc + build, then runs `metrics/record.py --no-correctness` against a + throwaway scratch db to measure code size (via `regression_disasm.py`) + and compile time (the code-size corpus's wall time). It uploads the + scratch db as an artifact. +4. `rp2350-perf` (self-hosted Pi, `needs: build-and-measure`) downloads the + tcc build and the scratch db, imports the scratch db's + codesize/compile-time rows into the persistent + `/var/lib/tcc-metrics/metrics.db` via + `record.py --import-codesize-from <scratch-db>`, and measures RP2350 + perf if the board answers. 
+ +`build-and-measure` and `rp2350-perf` record under the same synthetic host +key (`METRICS_HOST: armv8m-metrics`, set at the workflow level) so they land +on **one** run row per commit instead of two — the db keys `runs` by +`(commit_sha, host)`, and both `gate.py` and the Grafana dashboard assume +one host owns every metric for a commit. `--import-codesize-from` is what +makes that work: it copies `codesize_rollup`/`codesize_func`/`compile_time` +rows for the matching commit from another metrics db instead of +recomputing them, so the `rp2350-perf` job's `upsert_run` (which always +clears a run's child tables before re-populating them) doesn't need to redo +`build-and-measure`'s measurement to fill them back in. + +The gate step is present but a no-op until the `METRICS_GATE_ENABLED` repo +variable is set to `true` (Settings -> Actions -> Variables) — see "Gate +policy" below. + +## Manual correctness sweeps + +Run these by hand whenever you want a divergence data point (e.g. before/after +a legacy-pass retirement commit): + +```bash +python3 metrics/record.py --db /var/lib/tcc-metrics/metrics.db --rev HEAD \ + --seed-lo 0 --seed-hi 1000 --mode prescan --jobs "$(nproc)" +``` + +Bump `--seed-hi` or use `--mode triage` (full-recall, slower, also +culprit-bisects) for a more thorough pass. Recording is idempotent — re-run +against the same commit any time to widen the band. + +## Gate policy: track first, then block + +`metrics/gate.py` compares a run against its parent commit's run: + +```bash +python3 metrics/gate.py --db /var/lib/tcc-metrics/metrics.db --rev HEAD +``` + +Without `--strict` it only reports (exit 0 always) — safe to run before the +baseline is provably green. Add `--strict` to fail the build on a correctness +regression (a new divergent seed not seen in the parent) or a code-size +regression beyond `--codesize-tolerance-pct` (default 1%). Compile time and +perf are reported but never gate — judge those by eye on the dashboard. 
+ +A pre-existing divergence (found once you finally run a wide correctness +sweep) is not a build failure — allowlist it: + +```bash +python3 metrics/gate.py --db /var/lib/tcc-metrics/metrics.db \ + --accept ptr:olevels:12345 --reason "pre-existing, see docs/bugs.md" +``` + +Once a `--strict` run comes back clean, flip the CI gate on by setting the +`METRICS_GATE_ENABLED` repo variable to `true`. + +## Grafana + +Grafana runs as a systemd-managed `podman-compose` stack, so it comes back on +its own after a reboot or crash instead of needing someone to SSH in and +re-run `podman-compose up -d`. Rootless Podman has no persistent daemon +equivalent to `dockerd` — `podman-compose` just shells out to `podman` — so +the unit only waits on the network, not a container-runtime service. + +Grafana's compose file (`metrics/grafana/docker-compose.yml`) reads +`/var/lib/tcc-metrics/metrics.db` and needs to live somewhere stable — clone +the repo to a persistent path on the Pi (e.g. `/opt/tcc-metrics/tinycc`), not +the ephemeral `actions/checkout` workspace the CI job uses. + +```bash +sudo git clone <repo-url> /opt/tcc-metrics/tinycc # one-time, or pull to update +sudo cp /opt/tcc-metrics/tinycc/metrics/grafana/tcc-metrics-grafana.service \ + /etc/systemd/system/ +sudo systemctl daemon-reload +sudo systemctl enable --now tcc-metrics-grafana.service +``` + +Edit the unit's `WorkingDirectory` first if the clone isn't at +`/opt/tcc-metrics/tinycc`. Manage it like any other service: + +```bash +systemctl status tcc-metrics-grafana # is it up? +journalctl -u tcc-metrics-grafana # compose up/down output +sudo systemctl restart tcc-metrics-grafana # e.g. after editing docker-compose.yml +``` + +Opens on `http://<pi-host>:3000`. The SQLite datasource and the +"TinyCC Optimizer Regressions" dashboard are provisioned automatically from +`provisioning/` and `dashboards/`. 
Panels: per-profile divergence, total +divergence, code-size ratio vs GCC, compile-time trend, RP2350 cycles, and a +"regressed since parent" table — the last one is the accept/reject signal for +each migration commit (see +[docs/plan_opt_predicate_framework.md](plan_opt_predicate_framework.md) and +the optimizer migration plan for how it's used). + +## Backfilling history + +Code size and compile time can be backfilled across past commits (correctness +and perf cannot — see `record.py`'s docstring for why): + +```bash +python3 metrics/record.py --db /var/lib/tcc-metrics/metrics.db --backfill 100 +``` + +This builds each of the last 100 first-parent commits into a throwaway tmpdir +(`regression_disasm.build_tcc_at_rev`) and measures against that binary. Slow +(a full `configure && make cross` per commit) — run it once, manually. diff --git a/docs/nested_functions/README.md b/docs/nested_functions/README.md deleted file mode 100644 index f5be6d64..00000000 --- a/docs/nested_functions/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# GCC Nested Functions Support — Implementation Plan - -## Problem Statement - -``` -❯ python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20000822-1.c --cflags="-O0" -Using CFLAGS: -O0 -Compilation failed: - 20000822-1.c:15: error: cannot use local functions -``` - -TinyCC rejects GCC nested functions with a hard error at `tccgen.c:11393`. This plan adds full support including captured variables and trampolines for ARMv8-M (Cortex-M33). - -## Architecture Decision: Save-Tokens + Reparse - -We reuse TCC's inline function model (`skip_or_save_block` + `begin_macro` replay) rather than trying to suspend/resume `gen_function()` mid-compilation. See [Phase 1](phase1_parser.md) for rationale. 
- -## Phases - -| Phase | File | Summary | Effort | -|-------|------|---------|--------| -| 1 | [phase1_parser.md](phase1_parser.md) | Save nested func bodies as tokens, reparse after parent `block(0)` | 2-3 days | -| 2 | [phase2_static_chain.md](phase2_static_chain.md) | R10 static chain, captured variable access, pre-scan marking | 3-5 days | -| 3 | [phase3_trampolines.md](phase3_trampolines.md) | Static `.text` trampoline + `.data` chain slot for address-of | 5-7 days | -| 4 | [phase4_ir.md](phase4_ir.md) | IR integration: chain vreg, optimization safety, SET_CHAIN | 3-4 days | -| 5 | [phase5_arm_codegen.md](phase5_arm_codegen.md) | Thumb-2 codegen: prologue, chain load/store, trampoline emit | 3-5 days | -| 6 | [phase6_linker.md](phase6_linker.md) | Linker: R_ARM_ABS32 relocs, STB_LOCAL symbols | 1-2 days | -| 7 | [phase7_testing.md](phase7_testing.md) | Incremental test plan + GCC torture test integration | 3-5 days | - -## Recommended Implementation Order - -Phases are interleaved in practice: - -1. **Phase 1 + Phase 4 (core) + Phase 5 (stub)** → `nested_basic.c` works (no capture) -2. **Phase 2 + Phase 4 (capture) + Phase 5 (chain codegen)** → `nested_capture_*.c` works -3. **Phase 3 + Phase 5 (trampoline) + Phase 6** → `20000822-1.c` works -4. **Phase 7** → Full GCC torture suite validation - -## Milestones - -| Milestone | Target | Tests Passing | -|-----------|--------|---------------| -| M1 (~1 week) | Direct nested function calls, no capture | `nested_basic.c` | -| M2 (~2 weeks) | Captured variable read/write | `nested_capture_read.c`, `nested_capture_write.c` | -| M3 (~3.5 weeks) | Trampoline support | `20000822-1.c`, `nested_funcptr.c` | -| M4 (~4.5 weeks) | All applicable GCC torture tests | 10-14 of 14 tests | - -## Test Cases - -Test source files are in [tests/](tests/). 
Each test targets specific phases: - -| Test File | Phases | Description | -|-----------|--------|-------------| -| [nested_basic.c](tests/nested_basic.c) | 1 | No capture, direct call | -| [nested_basic_args.c](tests/nested_basic_args.c) | 1 | Nested function with arguments | -| [nested_multiple.c](tests/nested_multiple.c) | 1 | Multiple nested functions in one parent | -| [nested_capture_read.c](tests/nested_capture_read.c) | 1+2 | Read parent variable | -| [nested_capture_write.c](tests/nested_capture_write.c) | 1+2 | Write parent variable | -| [nested_capture_multiple.c](tests/nested_capture_multiple.c) | 1+2 | Capture multiple variables | -| [nested_capture_array.c](tests/nested_capture_array.c) | 1+2 | Capture array/pointer | -| [nested_direct_call_args.c](tests/nested_direct_call_args.c) | 1+2 | Arguments + captures combined | -| [nested_funcptr.c](tests/nested_funcptr.c) | 1+2+3 | Address-of + trampoline | -| [nested_funcptr_indirect.c](tests/nested_funcptr_indirect.c) | 1+2+3 | Nested func passed through another function | -| [nested_funcptr_call_twice.c](tests/nested_funcptr_call_twice.c) | 1+2+3 | Call via function pointer multiple times | -| [nested_multi_level.c](tests/nested_multi_level.c) | 1+2 | f → g → h chain | -| [nested_recursive_parent.c](tests/nested_recursive_parent.c) | 1+2+3 | Recursive parent with nested func | -| [nested_shadowing.c](tests/nested_shadowing.c) | 1+2 | Local shadows parent variable | -| [nested_struct_return.c](tests/nested_struct_return.c) | 1+2 | Nested function returns struct | - -## Affected GCC Torture Tests (14 total) - -| Test | Features | Status | -|------|----------|--------| -| `20000822-1.c` | Capture + address-of + indirect call | Target for M3 | -| `920428-2.c` | Capture | Target for M2 | -| `920501-7.c` | Capture | Target for M2 | -| `920612-2.c` | Capture | Target for M2 | -| `921017-1.c` | Capture | Target for M2 | -| `921215-1.c` | Capture | Target for M2 | -| `931002-1.c` | Capture | Target for M2 | -| 
`nestfunc-1.c` | Basics | Target for M1 | -| `nestfunc-2.c` | Arguments | Target for M1 | -| `nestfunc-3.c` | Struct returns | Target for M2 | -| `comp-goto-2.c` | Computed goto | Deferred (needs computed goto) | -| `nestfunc-5.c` | `__label__` | Deferred (needs nonlocal goto) | -| `nestfunc-6.c` | Nonlocal goto | Deferred (needs nonlocal goto) | -| `pr24135.c` | `__label__` + nonlocal goto | Deferred (needs nonlocal goto) | - -## Key Codebase Context - -### Current error location -```c -// tccgen.c:11391-11393 -if (tok == '{') { - if (l != VT_CONST) - tcc_error("cannot use local functions"); -``` - -### Global state to save/restore - -| Global | Type | Purpose | -|--------|------|---------| -| `tcc_state->ir` | `TCCIRState*` | Current IR state | -| `loc` | `int` | Local stack offset | -| `ind` | `int` | Code output index | -| `rsym` | `int` | Return symbol chain | -| `func_ind` | `int` | Function start index | -| `funcname` | `const char*` | Function name | -| `func_vt` | `CType` | Return type | -| `func_var` | `int` | Variadic flag | -| `cur_scope`, `root_scope`, `loop_scope` | `struct scope*` | Scope chain | -| `local_stack` | `Sym*` | Local symbol stack | -| `local_label_stack` | `Sym*` | Local labels | -| `global_label_stack` | `Sym*` | Global labels | -| `nocode_wanted` | `int` | Code suppression | -| `local_scope` | `int` | Scope depth | -| `nb_temp_local_vars` | `int` | Temp local count | -| `arr_temp_local_vars` | `struct[8]` | Temp local info | -| `cur_text_section` | `Section*` | Output section | -| `cur_switch` | `struct switch_t*` | Switch state | - -## Risks & Open Questions - -1. **Re-entrancy** — Static `.data` chain slots are not re-entrant for recursive parents. Acceptable for now. -2. **Token stream end** — `gen_function()` calls `next()` at end; verify `begin_macro`/`end_macro` handles this. -3. **Symbol mangling** — Use `f1__nested__f2` or internal token IDs to avoid collisions. -4. 
**Multi-level nesting** — Requires chain-of-chains (each level one pointer indirection). -5. **Inline functions** — Token-save works naturally; trampoline names need uniqueness per instantiation. -6. **Nonlocal goto** — 4 tests deferred; needs stack unwinding support. -7. **Optimization safety** — Chain loads/stores use non-FP base; existing conservative rules should suffice. -8. **Thread safety** — `.data` chain slots not thread-safe; OK for Cortex-M33. -9. **Pre-scan accuracy** — `prescan_captured_vars` over-marks (safe but suboptimal); can refine later. diff --git a/docs/nested_functions/fixes/fix1_capture_array.md b/docs/nested_functions/fixes/fix1_capture_array.md deleted file mode 100644 index c0b9ea82..00000000 --- a/docs/nested_functions/fixes/fix1_capture_array.md +++ /dev/null @@ -1,79 +0,0 @@ -# Fix 1: `nested_capture_array.c` — Array Capture Type Propagation - -**Test**: `tests/ir_tests/nested_capture_array.c` -**Error**: "pointer expected" — `arr[i]` fails because captured `arr` has type `VT_INT` instead of `int[5]` -**Root Cause**: Captured variable type hardcoded to `VT_INT` at `tccgen.c:7376` -**Complexity**: Low - -## Problem - -When a nested function references a parent variable, the captured-var resolver at `tccgen.c:7376` creates a fake symbol with: - -```c -s->type.t = VT_INT; /* Default to int - type will be cast later if needed */ -``` - -For arrays, this means `arr` is treated as a plain `int`, so applying `[]` to it triggers "pointer expected". The real type (`int[5]`) is never propagated. - -## Changes - -### 1. Add `captured_types[]` to `NestedFunc` (`tcc.h:~722`) - -Add a `CType` array to store the full type of each captured variable: - -```c -typedef struct NestedFunc -{ - // ... existing fields ... - int captured_offsets[MAX_CAPTURED_VARS]; - int captured_tokens[MAX_CAPTURED_VARS]; - int captured_vregs[MAX_CAPTURED_VARS]; - CType captured_types[MAX_CAPTURED_VARS]; // <-- NEW: full type of captured vars - int nb_captured; - // ... 
-} NestedFunc; -``` - -### 2. Record parent symbol's `CType` in `prescan_captured_vars()` (`tccgen.c:~11198`) - -When a captured variable is recorded, also store its type: - -```c -if (!already_captured && nf->nb_captured < MAX_CAPTURED_VARS) -{ - nf->captured_vregs[nf->nb_captured] = s->vreg; - nf->captured_offsets[nf->nb_captured] = s->c; - nf->captured_tokens[nf->nb_captured] = t; - nf->captured_types[nf->nb_captured] = s->type; // <-- NEW - nf->nb_captured++; -} -``` - -### 3. Use real type in captured-var resolver (`tccgen.c:~7376`) - -Replace the hardcoded `VT_INT` with the actual captured type: - -```c -// BEFORE: -s->type.t = VT_INT; - -// AFTER: -s->type = nf->captured_types[i]; -``` - -### 4. Remove xfail (`tests/ir_tests/test_qemu.py:~289`) - -Remove `("nested_capture_array.c", 0)` from `NESTED_XFAIL_TEST_FILES`. - -## Why This Works - -- Arrays accessed via the static chain: the chain-relative offset (R10 + parent FP offset) points to the start of the array in the parent's stack frame -- With the correct `VT_ARRAY` type, the `[]` operator triggers normal array-to-pointer decay (`gaddrof()`) + index arithmetic -- ARM codegen at `arm-thumb-gen.c:2282-2294` already handles arbitrary offsets from R10 — no backend changes needed - -## Verification - -```bash -cd tests/ir_tests && python run.py -c nested_capture_array.c --dump-ir -make test -j16 # no regressions -``` diff --git a/docs/nested_functions/fixes/fix2_struct_return.md b/docs/nested_functions/fixes/fix2_struct_return.md deleted file mode 100644 index f62ac270..00000000 --- a/docs/nested_functions/fixes/fix2_struct_return.md +++ /dev/null @@ -1,79 +0,0 @@ -# Fix 2: `nested_struct_return.c` — Struct Return from Nested Functions - -**Test**: `tests/ir_tests/nested_struct_return.c` -**Error**: Type mismatch / incorrect codegen for struct return via sret -**Root Cause**: sret (struct return) ABI interaction with nested function static chain -**Complexity**: Medium -**Depends on**: Fix 1 (captured_types 
propagation) - -## Problem - -The nested function `Point offset(Point p)` returns a `Point` (8 bytes). On ARM, `gfunc_sret()` (`arm-thumb-gen.c:2165`) returns 0 for structs > 4 bytes, meaning the sret convention is used: a hidden first parameter (pointer to caller-allocated return buffer) is passed in R0. - -The interaction between `SET_CHAIN` (R10 = parent FP) and the sret hidden pointer needs verification. Possible failure modes: - -1. Parameter numbering is off — the sret pointer is param #0, but call_id encoding may not account for it correctly alongside SET_CHAIN -2. The nested function's `gen_function()` doesn't correctly set up the implicit sret parameter when `has_static_chain` is also active -3. Type propagation issues (resolved by Fix 1's `captured_types` change—`dx` and `dy` are `int` which was already correct, but other captured types may be wrong) - -## Diagnostic Steps - -### 1. Compile with IR dump - -```bash -cd tests/ir_tests -python run.py -c nested_struct_return.c --dump-ir -``` - -Examine the IR around the `offset(p)` call. Check: -- `SET_CHAIN` emission relative to `FUNCPARAMVAL` for sret pointer -- `FUNCPARAMVAL` numbering: sret = param #0, `p` = param #1 -- The nested `offset` function's prologue: sret hidden param + static chain - -### 2. Disassemble - -```bash -arm-none-eabi-objdump -d tests/ir_tests/build/nested_struct_return.elf | grep -A 30 'offset\.' -``` - -Check register usage: R0 = sret pointer (hidden), R1-R2 = Point p (8 bytes), R10 = chain (parent FP). - -## Changes - -### 1. Verify SET_CHAIN / sret ordering (`tccgen.c:~7520-7600`) - -The `SET_CHAIN` IR op is emitted at `tccgen.c:7531` **before** any `FUNCPARAMVAL` instructions. The sret hidden pointer is emitted as `FUNCPARAMVAL` at `tccgen.c:7575-7584`. 
This ordering should be correct: - -- `SET_CHAIN` → sets R10 (not a register parameter, no conflict) -- `FUNCPARAMVAL` param #0 → sret pointer in R0 -- `FUNCPARAMVAL` param #1 → Point p in R1-R2 - -Verify this is the actual ordering in the IR dump. If not, fix the emission sequence. - -### 2. Check nested function prologue (`ir/core.c:~599`) - -When the nested `offset` function is compiled: -- `gfunc_sret()` detects struct return → sret convention -- `gen_function()` creates the implicit sret parameter (func_vc) -- The static chain (R10) is set up as a separate vreg, NOT as a parameter - -Ensure the parameter list setup in `ir/core.c` correctly handles sret + static chain together. The sret pointer should be parameter #0 (in R0), and `Point p` should be parameter #1 (in R1-R2). R10 is independent. - -### 3. Fix any parameter count mismatch - -If the sret hidden parameter is counted differently when `has_static_chain` is set, fix the count. The chain is NOT a parameter in the AAPCS sense—it uses R10, not R0-R3. - -### 4. Apply Fix 1 first - -The `captured_types` fix ensures `dx` and `dy` have correct types. While they happen to be `int` (matching the hardcoded `VT_INT`), having real types prevents fragile assumptions. - -### 5. Remove xfail (`tests/ir_tests/test_qemu.py:~288`) - -Remove `("nested_struct_return.c", 0)` from `NESTED_XFAIL_TEST_FILES`. 
- -## Verification - -```bash -cd tests/ir_tests && python run.py -c nested_struct_return.c --dump-ir -make test -j16 # no regressions -``` diff --git a/docs/nested_functions/fixes/fix3_recursive_parent.md b/docs/nested_functions/fixes/fix3_recursive_parent.md deleted file mode 100644 index 814c0b54..00000000 --- a/docs/nested_functions/fixes/fix3_recursive_parent.md +++ /dev/null @@ -1,90 +0,0 @@ -# Fix 3: `nested_recursive_parent.c` — Scope Resolution for Parameters - -**Test**: `tests/ir_tests/nested_recursive_parent.c` -**Error**: "undeclared" — captured variable `n` (parameter) or `result` (local) not found -**Root Cause**: `prescan_captured_vars()` filter condition may reject parameter symbols -**Complexity**: Low - -## Problem - -`factorial_with_nested(int n)` is a file-scope function containing nested function `accumulate()` which captures both: -- `result` — local variable -- `n` — function parameter - -The phase2 doc states this fails with "'n' undeclared" or similar. The prescan at `tccgen.c:11178` uses: - -```c -Sym *s = sym_find2(parent_local_stack, t); -if (s && (s->r & VT_VALMASK) == VT_LOCAL) -``` - -Function parameters are pushed onto `local_stack` during `gen_function()` and should have `VT_LOCAL` in their `r` field. However, they may also carry `VT_PARAM` or other flags that cause the `VT_VALMASK` check to reject them. - -The **alternative theory**: since `factorial_with_nested` is a file-scope function (not itself nested), `decl(VT_LOCAL)` handles the nested definition inside its body. The `local_stack` at prescan time should include both `n` (parameter, pushed by `gen_function`) and `result` (local, pushed by `decl_initializer_alloc`). If parameters are pushed AFTER `block(0)` starts but the nested function definition comes before `result` is declared, then the ordering matters. - -## Diagnostic Steps - -### 1. 
Add debug output to prescan - -Temporarily add to `prescan_captured_vars()`: -```c -fprintf(stderr, "PRESCAN: token=%s sym=%p r=0x%x valmask=0x%x\n", - get_tok_str(t, NULL), s, s ? s->r : 0, s ? (s->r & VT_VALMASK) : 0); -``` - -### 2. Compile and check - -```bash -./armv8m-tcc -c tests/ir_tests/nested_recursive_parent.c 2>&1 | head -20 -``` - -Check which tokens are scanned, whether `result` and `n` are found on `parent_local_stack`, and what their `s->r` values are. - -## Changes - -### 1. Fix prescan filter condition (`tccgen.c:~11180`) - -If the diagnostic shows parameters have flags beyond `VT_LOCAL`, broaden the check: - -```c -// BEFORE: -if (s && (s->r & VT_VALMASK) == VT_LOCAL) - -// AFTER (option A — also accept parameters explicitly): -if (s && ((s->r & VT_VALMASK) == VT_LOCAL || (s->r & VT_PARAM))) - -// AFTER (option B — accept any stack-resident symbol): -if (s && ((s->r & VT_VALMASK) == VT_LOCAL)) -// (if VT_PARAM symbols already have VT_LOCAL in VT_VALMASK, this is already correct -// and the issue is elsewhere) -``` - -The exact fix depends on the diagnostic output. If parameters already have `(s->r & VT_VALMASK) == VT_LOCAL`, the prescan filter is fine and the issue is in the captured-var resolver at `tccgen.c:7370`—possibly the resolver can't match because the token ID differs for parameters vs locals. - -### 2. Verify parameter offset stability - -Parameters' FP offsets are deterministic (assigned during `gen_function()` before `block(0)`). Since `prescan_captured_vars` runs during `block(0) → decl(VT_LOCAL)`, the parameter's `s->c` should be correct. Verify that `captured_offsets[]` gets the right value for `n`. - -### 3. Verify recursion correctness (no code changes expected) - -Each recursive call to `factorial_with_nested` creates a new stack frame. 
At each call to `accumulate()`: -- `SET_CHAIN` copies the current FP to R10 -- `accumulate()` accesses `result` and `n` via R10 + offset -- This correctly accesses the current invocation's variables - -No codegen changes needed for recursion support. - -### 4. Apply Fix 1 (`captured_types`) - -With the `captured_types` change from Fix 1, `result` and `n` will have correct `int` type (already `VT_INT` by coincidence, but proper propagation is better). - -### 5. Remove xfail (`tests/ir_tests/test_qemu.py:~287`) - -Remove `("nested_recursive_parent.c", 0)` from `NESTED_XFAIL_TEST_FILES`. - -## Verification - -```bash -cd tests/ir_tests && python run.py -c nested_recursive_parent.c --dump-ir -make test -j16 # no regressions -``` diff --git a/docs/nested_functions/fixes/fix4_multi_level.md b/docs/nested_functions/fixes/fix4_multi_level.md deleted file mode 100644 index d58c29bc..00000000 --- a/docs/nested_functions/fixes/fix4_multi_level.md +++ /dev/null @@ -1,348 +0,0 @@ -# Fix 4: `nested_multi_level.c` — Multi-Level Nesting (Chain-of-Chains) - -**Test**: `tests/ir_tests/nested_multi_level.c` -**Error**: `'a' undeclared` — `level2` can't access grandparent variable `a` from `main` -**Root Cause**: Two independent problems: - 1. `prescan_captured_vars()` only searches immediate parent's `local_stack` - 2. 
ARM codegen only does single-hop chain dereference (R10 as direct base) -**Complexity**: High — touches parser prescan, IR metadata, and 4+ codegen paths - ---- - -## Problem - -```c -int main(void) { // "grandparent" - int a = 1; - int level1(int x) { // "parent" — captures a (prescan sees it in token stream) - int b = 20; - int level2(int y) { // "child" — needs a, b, x - return a + b + x + y; // ERROR: 'a' undeclared - } - return level2(300); - } - printf("%d\n", level1(10)); // expected: 1+20+10+300 = 331 - a = 100; - printf("%d\n", level1(10)); // expected: 100+20+10+300 = 430 -} -``` - -`level2` accesses: -| Var | Origin | Chain depth | Access pattern | -|-----|-------------|-------------|-----------------------------------------| -| `b` | level1 | 1 | `[R10 + offset_b]` (direct) | -| `x` | level1 | 1 | `[R10 + offset_x]` (direct) | -| `a` | main | 2 | `[[R10 + CHAIN_SLOT] + offset_a]` | - -### Why level1 already captures `a` - -`prescan_captured_vars(nf_for_level1, main_local_stack)` runs during main's -parsing (`tccgen.c:11978`). It does a **flat token scan** of level1's entire -body — including the tokens inside level2's definition. The token `a` appears -in level2's `return a + b + x + y;`, and `a` IS in main's `local_stack`. -So level1 already captures `a` with depth 1. **This is correct and works today.** - -### Why level2 fails to capture `a` - -When `compile_nested_functions()` compiles level1 (`tccgen.c:11111`), level1's -`block(0)` discovers level2 and calls -`prescan_captured_vars(nf_for_level2, level1_local_stack)` (`tccgen.c:11978`). - -- `b` found in level1's local_stack → captured ✓ -- `x` found in level1's params → captured ✓ -- `a` **NOT** in level1's local_stack → **not captured** ✗ - -The prescan never checks `tcc_state->current_nested_func` (level1's captured -vars). 
Later, when level2's parser hits `a` at `tok_identifier` (`tccgen.c:7374`), -it searches `nf_for_level2->captured_tokens` — empty for `a` — and falls -through to `tcc_error("'a' undeclared")`. - ---- - -## Design: Fixed Chain Slot Convention - -R10 is already pushed as a callee-saved register in the function prologue, but -its position in the PUSH frame varies depending on which other registers are -pushed. Computing the push-frame offset is possible but fragile and couples -codegen tightly to the register allocator. - -**Chosen approach**: every function with `has_static_chain` explicitly stores -R10 at a **fixed, known offset** from FP immediately after the frame pointer -setup. This is the **chain slot**. - -``` -CHAIN_SLOT_OFFSET = -4 (first slot below FP, i.e. FP - 4) -``` - -Multi-hop access is then uniform — each hop loads `[current_fp + CHAIN_SLOT_OFFSET]`: - -```asm -; depth 1 (parent var): direct -LDR Rd, [R10, #var_offset] - -; depth 2 (grandparent var): -LDR temp, [R10, #-4] ; temp = saved chain = grandparent's FP -LDR Rd, [temp, #var_offset] - -; depth 3 (great-grandparent var): -LDR temp, [R10, #-4] ; temp → grandparent's FP -LDR temp, [temp, #-4] ; temp → great-grandparent's FP -LDR Rd, [temp, #var_offset] -``` - -**Cost**: 4 bytes of stack + 1 STR instruction per nested function that -receives a static chain. Acceptable for correctness. - ---- - -## Changes (7 steps) - -### Step 1 — Add `captured_chain_depth[]` to `NestedFunc` (`tcc.h:~733`) - -```c -typedef struct NestedFunc -{ - /* ... existing fields ... */ - int captured_offsets[MAX_CAPTURED_VARS]; - int captured_tokens[MAX_CAPTURED_VARS]; - int captured_vregs[MAX_CAPTURED_VARS]; - CType captured_types[MAX_CAPTURED_VARS]; -+ int captured_chain_depth[MAX_CAPTURED_VARS]; /* 1 = parent, 2 = grandparent, ... */ - int nb_captured; - /* ... */ -} NestedFunc; -``` - -All existing captures get depth 1 (set in prescan, Step 3). 
- -### Step 2 — Add `captured_chain_depths[]` to `TCCIRState` (`tccir.h:~379`) - -Parallel array to `captured_offsets_list[]`: - -```c - int32_t captured_offsets_list[32]; -+ int32_t captured_chain_depths[32]; /* 1 = direct R10, 2+ = multi-hop */ - int32_t captured_count; -``` - -Initialize to 0 in `tcc_ir_alloc()` (already zeroed by `tcc_mallocz`). - -### Step 3 — Extend `prescan_captured_vars()` to walk ancestor captures (`tccgen.c:11196`) - -Current code (simplified): -```c -Sym *s = sym_find2(parent_local_stack, t); -if (s && ((s->r & VT_VALMASK) == VT_LOCAL || (s->r & VT_PARAM))) -{ - /* ... existing capture logic — mark addrtaken, record offset, etc. ... */ - nf->nb_captured++; -} -``` - -Extend with an `else` branch after the existing capture block: -```c - /* ... existing capture block (now also sets chain_depth = 1) ... */ - nf->captured_chain_depth[nf->nb_captured] = 1; - nf->nb_captured++; - } -+ /* Not found in parent locals — search parent's own captured vars. -+ * When compiling level1, current_nested_func == nf_for_level1. -+ * level1 captured 'a' from main with depth 1, so level2 inherits -+ * it with depth 2. */ -+ else if (tcc_state->current_nested_func) -+ { -+ NestedFunc *parent_nf = tcc_state->current_nested_func; -+ for (int j = 0; j < parent_nf->nb_captured; j++) -+ { -+ if (parent_nf->captured_tokens[j] == t) -+ { -+ /* Guard: check not already captured (e.g. 
token appears twice) */ -+ int dup = 0; -+ for (int k = 0; k < nf->nb_captured; k++) -+ if (nf->captured_tokens[k] == t) { dup = 1; break; } -+ if (dup) break; -+ -+ nf->captured_offsets[nf->nb_captured] = parent_nf->captured_offsets[j]; -+ nf->captured_tokens[nf->nb_captured] = t; -+ nf->captured_types[nf->nb_captured] = parent_nf->captured_types[j]; -+ nf->captured_chain_depth[nf->nb_captured] = parent_nf->captured_chain_depth[j] + 1; -+ nf->nb_captured++; -+ break; -+ } -+ } -+ } -``` - -**Why this works**: at prescan time for level2, `tcc_state->current_nested_func` -points to level1's `NestedFunc`. level1's prescan (run during main's parsing) -already captured `a` with depth 1. So the lookup finds `a` there and captures -it for level2 with depth 2. This generalizes transitively to arbitrary depth. - -### Step 4 — Propagate chain depths to IR (`tccgen.c:~11293`) - -In `gen_function()`, where `captured_offsets_list` is populated: - -```c - ir->captured_count = nf->nb_captured; - for (int j = 0; j < nf->nb_captured && j < 32; j++) -+ { - ir->captured_offsets_list[j] = nf->captured_offsets[j]; -+ ir->captured_chain_depths[j] = nf->captured_chain_depth[j]; -+ } -``` - -### Step 5 — Emit chain save in prologue (`arm-thumb-gen.c`, prologue) - -In `tcc_gen_machine_prologue()`, after the frame pointer setup (`MOV FP, SP`) -and stack allocation (`SUB SP, #stack_size`): - -```c -+ /* Save incoming static chain (R10) at fixed chain slot [FP - 4]. -+ * This allows child nested functions to follow the chain to -+ * grandparent frames via multi-hop LDR sequences. */ -+ if (ir && ir->has_static_chain) -+ { -+ ot_check(th_str_imm(architecture_config.static_chain_reg, R_FP, -+ 4, /* abs offset for FP-4 encoding */ -+ 6, ENFORCE_ENCODING_NONE)); -+ /* Note: the stack allocator must reserve this slot — see Step 5b. */ -+ } -``` - -**Step 5b — Reserve chain slot in stack layout**. 
In `tccgen.c` (or `ir/core.c`), -when `has_static_chain` is set, bias `loc` by -4 before local variable -allocation begins, so that FP-4 is never assigned to a local var: - -```c - /* Reserve chain save slot at FP-4 */ - if (ir->has_static_chain) - ir->loc -= 4; /* or equivalent mechanism in the stack allocator */ -``` - -If `loc` is not used directly (IR manages its own stack layout), add an -explicit 4-byte reserved region at the top of the local area in `ir/stack.c`. -The key invariant is: **no variable or spill slot may be placed at FP-4 when -`has_static_chain` is set**. - -### Step 6 — ARM codegen: multi-hop chain dereference (4 sites) - -The pattern is the same at all 4 sites. Extract a helper function: - -```c -/* Resolve the base register for a captured variable access. - * For depth 1, returns R10 directly. - * For depth > 1, emits LDR chain to follow ancestor frame pointers - * and returns a scratch register holding the target ancestor's FP. - * Caller must restore scratch via *out_scratch when done. */ -static int resolve_chain_base(TCCIRState *ir, int ci, - uint32_t exclude_regs, - ScratchRegAlloc *out_scratch, - int *used_scratch) -{ - int depth = ir->captured_chain_depths[ci]; - if (depth <= 1) - { - *used_scratch = 0; - return architecture_config.static_chain_reg; /* R10 */ - } - - /* Multi-hop: follow chain through (depth - 1) intermediate frames. - * Each frame saves its incoming R10 at [FP - 4] (CHAIN_SLOT_OFFSET). 
*/ - *out_scratch = get_scratch_reg_with_save(exclude_regs); - *used_scratch = 1; - - /* Start from R10 (points to immediate parent's FP) */ - thumb_shift no_shift = {THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; - ot_check(th_mov_reg(out_scratch->reg, - architecture_config.static_chain_reg, - FLAGS_BEHAVIOUR_NOT_IMPORTANT, - no_shift, ENFORCE_ENCODING_NONE, false)); - - for (int hop = 1; hop < depth; hop++) - { - /* LDR temp, [temp, #-4] — follow chain link */ - load_from_base_ir(out_scratch->reg, PREG_REG_NONE, - IROP_BTYPE_INT32, 0, - 4 /* abs */, 1 /* sign: negative */, - out_scratch->reg); - } - return out_scratch->reg; -} -``` - -Then update each of the 4 chain-access sites: - -| # | File | Line | Context | -|---|------|------|---------| -| 1 | `arm-thumb-gen.c` | 2287 | LOAD path (`resolve_base_ir`) | -| 2 | `arm-thumb-gen.c` | 3215 | STORE path (`store_ex_ir`) | -| 3 | `arm-thumb-gen.c` | 4816 | LEA / ADD accumulator path | -| 4 | `arm-thumb-gen.c` | 6375 | Additional chain-relative access | - -At each site, replace: -```c -base_reg = architecture_config.static_chain_reg; -``` -with: -```c -ScratchRegAlloc chain_scratch; -int chain_used = 0; -base_reg = resolve_chain_base(ir, ci, exclude_regs, &chain_scratch, &chain_used); -/* ... existing access using base_reg ... */ -if (chain_used) restore_scratch_reg(&chain_scratch); -``` - -### Step 7 — Remove xfail (`tests/ir_tests/test_qemu.py:290`) - -```python -NESTED_XFAIL_TEST_FILES = [ -- ("nested_multi_level.c", 0), -] -``` - -Move the test to the passing `NESTED_TEST_FILES` list. - ---- - -## Compilation & Verification - -```bash -# 1. Build -make cross -j16 - -# 2. Quick manual test -cd tests/ir_tests -python run.py -c nested_multi_level.c -# Expected output: -# 331 -# 430 - -# 3. Dump IR to verify chain_depth metadata -python run.py -c nested_multi_level.c --dump-ir -# Look for captured var 'a' with chain_depth=2 - -# 4. 
Disassemble level2 to verify double-dereference -arm-none-eabi-objdump -d build/nested_multi_level.elf | grep -A 30 '<level2' -``` - -3. **Depth > 2**: The multi-hop loop generalizes, but add a test with 3 levels - (f → g → h → i accessing f's var) to confirm. -4. **Mixed depths**: A single nested function may capture vars at different - depths (depth 1 for parent vars, depth 2 for grandparent vars). Each - captured var uses its own `chain_depths[ci]` — no conflict. -5. **Address-of captured var**: `LEA` on a depth-2 variable must produce the - correct address. The chain hop gives the ancestor FP, and adding the offset - gives the variable's address — same pattern, just no final LDR. -6. **Store to grandparent var**: `a = 100` in the test mutates `a` in main's - frame via the chain. The STORE path (site #2) must use the resolved base - register. diff --git a/docs/nested_functions/fixes/fix5_test_all_docs.md b/docs/nested_functions/fixes/fix5_test_all_docs.md deleted file mode 100644 index fac8fff1..00000000 --- a/docs/nested_functions/fixes/fix5_test_all_docs.md +++ /dev/null @@ -1,65 +0,0 @@ -# Task 5: Run `make test-all` and Document Final Results - -**Depends on**: Fixes 1-4 applied -**Complexity**: Low (documentation only) - -## Steps - -### 1. Run full test suite - -```bash -# Initialize GCC testsuite submodule if not already done -git submodule update --init --depth 1 tests/gcctestsuite/gcc-testsuite - -# Run all tests -make test-all -``` - -### 2. Capture results - -Record the final counts: -- Total compile tests passed/failed/skipped -- Total execute tests passed/failed/skipped -- Any new GCC torture tests that now pass (compared to current xfail list) - -### 3. Update GCC xfail list if needed - -In `tests/gcctestsuite/conftest.py`: -- If any tests in `GCC_XFAIL_TESTS` now pass, remove them from the xfail list -- If any new tests fail, investigate and either fix or add to xfail with reason - -### 4.
Update `docs/nested_functions/phase7_testing.md` - -Move all 4 items from "Remaining (Known Limitations) 🚧" to "Completed ✅": - -```markdown -### Completed ✅ -// ... existing items ... -- [x] `nested_capture_array.c` — Array capture from parent (Fix 1: type propagation) -- [x] `nested_multi_level.c` — Multi-level nesting (Fix 4: chain-of-chains) -- [x] `nested_recursive_parent.c` — Recursive parent function (Fix 3: prescan filter) -- [x] `nested_struct_return.c` — Nested function returning struct (Fix 2: sret + types) -- [x] Run `make test-all` and document final GCC torture suite results -``` - -Update the test summary table: - -```markdown -| Category | Passing | Failing | Status | -|----------|---------|---------|--------| -| Milestone 1 (Basic) | 3 | 0 | ✅ Complete | -| Milestone 2 (Capture) | 5 | 0 | ✅ Complete | -| Milestone 3 (Funcptr/Advanced) | 8 | 0 | ✅ Complete | -| GCC Torture (enabled) | 8+ | 0 | ✅ Complete | -| GCC Torture (skipped) | - | 6 | ⚪ Expected | -``` - -Add a "GCC Torture Suite Final Results" section with the `make test-all` output summary. - -### 5. Verify clean test run - -```bash -make test -j16 # IR tests — all pass, zero xfail -make test-all # GCC torture — document results -make test-asm -j16 # Assembly tests — unaffected -``` diff --git a/docs/nested_functions/phase1_parser.md b/docs/nested_functions/phase1_parser.md deleted file mode 100644 index 4d90c030..00000000 --- a/docs/nested_functions/phase1_parser.md +++ /dev/null @@ -1,192 +0,0 @@ -# Phase 1: Parser — Save Nested Function Bodies as Tokens - -**Effort**: 2-3 days -**Files**: `tccgen.c`, `tcc.h`, `tccir.h` - -## Overview - -When `decl(VT_LOCAL)` encounters a function body `{`, instead of erroring, save the token stream via `skip_or_save_block()` and compile the nested function after the parent's `block(0)` completes. This reuses TCC's proven inline function model. 
- -## TODO - -- [x] Define `NestedFunc` struct in `tcc.h` -- [x] Add `nested_funcs` array + capacity fields to `TCCIRState` in `tccir.h` -- [x] Modify `decl()` in `tccgen.c`: replace error gate at line ~11393 with nested function save logic -- [x] Validate nested func parameters (same checks as file-scope path) -- [ ] Create mangled symbol name (e.g., `parent__nested__child`) -- [x] Push nested func symbol into `local_stack` so parent body can reference it -- [x] Call `skip_or_save_block(&nf->func_str)` to save body tokens -- [x] Implement `compile_nested_functions()` in `tccgen.c` -- [x] Define `ParentSavedState` struct for all globals that must be saved/restored -- [x] Save all ~20 globals before nested func compilation -- [x] For each `NestedFunc`: replay tokens via `begin_macro`/`end_macro`, call `gen_function()` -- [x] Restore all globals after nested func compilation -- [x] Insert `compile_nested_functions()` call in `gen_function()` after `block(0)`, before optimizations -- [x] Handle `ind` correctly — nested func code goes to `.text` at current `ind`, then parent's `ind` restored -- [x] Free `NestedFunc` token strings in `tcc_ir_free()` -- [ ] Test with `nested_basic.c` (no capture, direct call only) - -## Data Structures - -```c -// tcc.h — new struct -typedef struct NestedFunc { - TokenString *func_str; // saved token stream of function body - Sym *sym; // function symbol in parent's local scope - CType type; // full function type - AttributeDef ad; // function attributes - int v; // token id (function name) - char filename[256]; // source filename for error messages -} NestedFunc; - -// tccir.h — additions to TCCIRState -// NestedFunc *nested_funcs; -// int nb_nested_funcs; -// int nested_funcs_capacity; -``` - -## Pseudocode: Modify `decl(VT_LOCAL)` - -``` -function decl(l): - ...existing type parsing... 
- - if tok == '{': - if l == VT_LOCAL: - // ── nested function definition ── - assert (type.t & VT_BTYPE) == VT_FUNC - - // Validate parameters (same as file-scope path) - foreach param in type.ref->next: - if param has no identifier: error("expected identifier") - if param is void: param.type = int_type - - merge_funcattr(&type.ref->f, &ad.f) - - // Create mangled symbol: "parent__nested__child" - mangled_name = concat(funcname, "__nested__", get_tok_str(v)) - - // Push symbol into LOCAL scope so parent body can reference it - type.t &= ~VT_EXTERN - sym = sym_push(v, &type, VT_CONST, 0) // VT_CONST: it's a function - put_extern_sym(sym, cur_text_section, 0, 0) // placeholder address - - // Save the token stream - ir = tcc_state->ir - grow_nested_funcs_if_needed(ir) - nf = &ir->nested_funcs[ir->nb_nested_funcs++] - nf->sym = sym - nf->type = type - nf->ad = ad - nf->v = v - strcpy(nf->filename, file->filename) - skip_or_save_block(&nf->func_str) // saves '{' ... '}' - - break // continue parsing parent body - else: - // existing file-scope path (unchanged) - ... 
-``` - -## Pseudocode: `compile_nested_functions()` - -``` -function compile_nested_functions(parent_ir, parent_sym): - // Save ALL parent global state - saved = ParentSavedState { - .ir = tcc_state->ir, - .loc = loc, - .ind = ind, - .rsym = rsym, - .func_ind = func_ind, - .funcname = funcname, - .func_vt = func_vt, - .func_var = func_var, - .cur_scope = cur_scope, - .root_scope = root_scope, - .loop_scope = loop_scope, - .local_stack = local_stack, - .local_label_stack = local_label_stack, - .global_label_stack = global_label_stack, - .nocode_wanted = nocode_wanted, - .local_scope = local_scope, - .nb_temp_local_vars = nb_temp_local_vars, - .cur_text_section = cur_text_section, - .cur_switch = cur_switch, - } - memcpy(saved.arr_temp_local_vars, arr_temp_local_vars, sizeof arr_temp_local_vars) - - for each nf in parent_ir->nested_funcs: - // Replay saved token stream (same as inline function expansion) - tccpp_putfile(nf->filename) - begin_macro(nf->func_str, 1) - next() // prime the first token - - cur_text_section = saved.cur_text_section - gen_function(nf->sym) - end_macro() - - // Restore ALL parent state - tcc_state->ir = saved.ir - loc = saved.loc - // NOTE: do NOT restore ind. gen_function() for the nested func writes its - // machine code at the current ind, while the parent has not emitted any - // code yet (we are still before the parent's optimization/codegen). - // The parent's tcc_ir_codegen_generate() therefore emits at the NEW ind, - // after all nested funcs. (Restoring ind was considered and rejected.) - // DECISION: Do NOT restore ind. Let nested funcs claim their .text space.
- rsym = saved.rsym - func_ind = saved.func_ind - funcname = saved.funcname - func_vt = saved.func_vt - func_var = saved.func_var - cur_scope = saved.cur_scope - root_scope = saved.root_scope - loop_scope = saved.loop_scope - local_stack = saved.local_stack - local_label_stack = saved.local_label_stack - global_label_stack = saved.global_label_stack - nocode_wanted = saved.nocode_wanted - local_scope = saved.local_scope - nb_temp_local_vars = saved.nb_temp_local_vars - cur_text_section = saved.cur_text_section - cur_switch = saved.cur_switch - memcpy(arr_temp_local_vars, saved.arr_temp_local_vars, sizeof arr_temp_local_vars) -``` - -### Key detail: `ind` handling - -`gen_function()` writes machine code at `ind` via `tcc_ir_codegen_generate()`. The nested function's code is written first (it runs `gen_function` end-to-end, including codegen). Then the parent resumes its own IR pipeline. The parent's `tcc_ir_codegen_generate()` will write code at the new `ind` (after nested funcs). So we do NOT restore `ind`. - -But we DO need to restore `func_ind` — this tracks the START of the parent function in `.text` (used for symbol size calculation: `elfsym(sym)->st_size = ind - func_ind`). - -## Pseudocode: Integration point in `gen_function()` - -``` -function gen_function(sym): - ...existing setup (ir = tcc_ir_alloc(), params, etc.)... - - block(0) - tcc_ir_backpatch_to_here(ir, rsym) - - // ── NEW: compile nested functions ── - if ir->nb_nested_funcs > 0: - compile_nested_functions(ir, sym) - - // ...existing optimization passes (operate on parent's ir)... - // ...register allocation... - // ...tcc_ir_codegen_generate(ir) — parent's code emitted AFTER nested funcs... - // ...tcc_ir_free(ir)... -``` - -## Symbol Visibility - -After `skip_or_save_block`, the nested function's `Sym` is on `local_stack`. When the parent body references `f2`, `sym_find()` resolves it to a function symbol just like any external function. Direct calls work with no special handling. 
- -## Test Cases (Phase 1) - -See [tests/nested_basic.c](tests/nested_basic.c), [tests/nested_basic_args.c](tests/nested_basic_args.c), [tests/nested_multiple.c](tests/nested_multiple.c). diff --git a/docs/nested_functions/phase2_static_chain.md b/docs/nested_functions/phase2_static_chain.md deleted file mode 100644 index ba1d3379..00000000 --- a/docs/nested_functions/phase2_static_chain.md +++ /dev/null @@ -1,156 +0,0 @@ -# Phase 2: Static Chain — Captured Variable Access - -**Effort**: 3-5 days -**Files**: `tccgen.c`, `tcc.h`, `tccir.h`, `ir/core.c`, `ir/core.h`, `tccls.c`, `arch/armv8m.c`, `arm-thumb-defs.h` - -## Overview - -Enable nested functions to read/write variables from the parent's stack frame via a static chain pointer passed in R10 (following GCC's ARM convention). Includes a token pre-scan to mark captured variables as address-taken before the parent's IR is generated. - -## TODO - -- [x] Define `REG_STATIC_CHAIN 10` in `arm-thumb-defs.h` -- [x] Add `static_chain_reg` field to `ArchitectureConfig` in `tcc.h` -- [x] Set `.static_chain_reg = 10` in `arch/armv8m.c` -- [x] Add `has_static_chain`, `static_chain_vreg` fields to `TCCIRState` -- [x] Add `captured_offsets[]`, `captured_vregs[]`, `captured_tokens[]`, `nb_captured` fields to `NestedFunc` struct -- [x] Implement `prescan_captured_vars()` — token scan for parent variable references -- [x] Call `prescan_captured_vars()` in `decl(VT_LOCAL)` right after `skip_or_save_block()` -- [x] Mark captured parent symbols with `addrtaken` + `tcc_ir_set_addrtaken()` to force stack spill -- [x] Store captured variable FP offsets in `NestedFunc.captured_offsets[]` -- [x] Resolve captured variable offsets post-register-allocation (lookup vreg → `allocation.offset`) -- [x] In nested `gen_function()`: detect `has_static_chain`, allocate chain vreg -- [x] Emit chain vreg initialization: `chain_vreg = R10` at function entry -- [x] Modify variable resolution in nested function: detect parent-scope variables 
(`tok_identifier`) -- [x] Generate chain-relative LOAD/STORE IR for captured variable access (base=R10, offset=parent FP offset) -- [x] In register allocator (`tccls.c`): exclude R10 from allocatable set when `has_static_chain` -- [x] Pre-assign chain vreg interval to R10 (like parameter incoming_reg) -- [x] In parent's call to nested function: emit `SET_CHAIN` (MOV R10, R7) before call -- [x] Detect nested function at call site via `vtop->sym->a.nested_func` (not `vtop->type.ref`) -- [x] Add `SET_CHAIN` to real codegen pass in `ir/codegen.c` (not just dry-run) -- [x] Add `SET_CHAIN` to `tcc_ir_get_op_name()` in `ir/dump.c` -- [x] Name mangling: GCC convention `funcname.N` via `asm_label` + `tok_alloc` -- [x] `VT_STATIC` for nested function symbols (STB_LOCAL binding) -- [x] Save/restore `cur_text_section` + `ind` after each nested `gen_function()` (safety resets) -- [x] Save/restore debug state (`debug_info`, `debug_info_root`) via `tcc_debug_save_state()`/`tcc_debug_restore_state()` -- [x] Nested function code emitted BEFORE parent code in `.text` (layout: nested funcs → parent) -- [x] Parent ELF symbol updated post-nested-compilation (`func_ind = ind; put_extern_sym(...)`) -- [x] Test with `nested_capture_read.c` — **PASS** ✓ -- [x] Test with `nested_capture_write.c` — **PASS** ✓ -- [x] Test with `nested_capture_multiple.c` — **PASS** ✓ -- [x] Test with `nested_multiple.c` — **PASS** ✓ -- [x] Test with `nested_basic.c`, `nested_basic_args.c`, `nested_basic_simple.c` — **PASS** ✓ -- [x] Test with `nested_direct_call_args.c` — **PASS** ✓ -- [x] Test with `nested_shadowing.c` — **PASS** ✓ - -### Known Limitations (out of scope for Phase 2) - -- [ ] `nested_capture_array.c` — array capture fails ("pointer expected") -- [ ] `nested_multi_level.c` — multi-level nesting fails ("undeclared" — prescan only sees immediate parent) -- [ ] `nested_recursive_parent.c` — captured var in recursive parent fails ("undeclared") -- [ ] `nested_struct_return.c` — struct return from 
nested function fails (type mismatch) -- [ ] `nested_funcptr.c`, `nested_funcptr_call_twice.c`, `nested_funcptr_indirect.c` — function pointer / trampoline support (Phase 3) - -## Key Design: Token Pre-scan - -The pre-scan runs at parse time (during `decl(VT_LOCAL)` right after `skip_or_save_block`) — before the parent's `block(0)` generates IR for variables that might be captured. This ensures captured variables are marked `addrtaken` early enough. - -``` -function prescan_captured_vars(nf, parent_local_stack): - // Walk the saved TokenString looking for identifiers - // that match parent local variable names. - - tokens = tok_str_buf(nf->func_str) - pos = 0 - while tokens[pos] != TOK_EOF: - t = tokens[pos] - if t >= TOK_IDENT: - sym = lookup in parent_local_stack for token t - if sym != NULL && sym->r & VT_LOCAL: - sym->type.t |= VT_ADDRTAKEN // force to stack - nf->captured_offsets[nf->nb_captured++] = sym->c - pos = advance past token + associated data - - // NOTE: This is a shallow scan. If the nested function declares - // a local with the same name as a parent variable, we over-mark. - // Conservative over-marking is safe (extra stack spills) but suboptimal. 
-``` - -## Key Design: Captured Variable Resolution - -During nested function compilation, variable lookups that find parent-scope symbols must produce chain-relative addressing instead of FP-relative: - -``` -// Before compiling nested function: -parent_local_stack_top = local_stack - -// Inside nested gen_function, in variable resolution: -function resolve_variable_access(tok_id): - sym = sym_find(tok_id) - if sym == NULL: return NULL - - if sym->r & VT_LOCAL: - if sym was pushed before parent_local_stack_top: - // Captured variable — access via chain register - return svalue_chain_relative(sym->c) // offset from parent FP - else: - // Nested function's own local — normal FP access - return svalue_fp_relative(sym->c) - - return sym // global/external — unchanged - -function svalue_chain_relative(parent_offset): - // Use existing LOAD/STORE with chain_vreg as base (no new SValue kind) - // Option B from plan: check ir->has_static_chain + sym_scope - sv.r = VT_LOCAL | VT_LVAL - sv.c.i = parent_offset - // Tag this SValue so IR emitter uses chain_vreg instead of FP - // Implementation: check if sym_scope < nested function scope - return sv -``` - -## Key Design: Chain Vreg Setup - -``` -function gen_function_nested_setup(ir): - if not ir->has_static_chain: return - - // Allocate a vreg for the chain — behaves like a parameter in R10 - chain_vreg = tcc_ir_alloc_local_vreg(ir) - ir->static_chain_vreg = chain_vreg - - // The register allocator will: - // 1. Exclude R10 from general allocation - // 2. Pre-assign chain_vreg to R10 - // 3. Mark its live range as the entire function (conservative) -``` - -## Key Design: Register Allocation - -``` -function tcc_ls_allocate_registers(ls, params, float_params, spill_base): - ...existing setup... 
- - if current function has_static_chain: - // Remove R10 from allocatable set - ls->registers_map &= ~(1ULL << 10) - - // Pre-assign chain vreg to R10 - chain_interval = find_interval(ls, ir->static_chain_vreg) - chain_interval->r0 = 10 -``` - -## Key Design: Direct Call Chain Setup - -``` -// In parent's gfunc_call path, when calling nested function: -function gen_call(func_sym, args): - if func_sym is a nested function: - // Emit: MOV R10, R7 (pass parent FP as chain) - emit TCCIR_OP_SET_CHAIN // implicit: R10 <- FP - emit TCCIR_OP_FUNCCALLVAL func_sym, args... -``` - -## Test Cases (Phase 2) - -See [tests/nested_capture_read.c](tests/nested_capture_read.c), [tests/nested_capture_write.c](tests/nested_capture_write.c), [tests/nested_capture_multiple.c](tests/nested_capture_multiple.c), [tests/nested_capture_array.c](tests/nested_capture_array.c), [tests/nested_direct_call_args.c](tests/nested_direct_call_args.c), [tests/nested_shadowing.c](tests/nested_shadowing.c). diff --git a/docs/nested_functions/phase3_trampolines.md b/docs/nested_functions/phase3_trampolines.md deleted file mode 100644 index ac8a2a23..00000000 --- a/docs/nested_functions/phase3_trampolines.md +++ /dev/null @@ -1,171 +0,0 @@ -# Phase 3: Trampoline Generation (Address-of Nested Function) - -**Effort**: 5-7 days -**Files**: `tccgen.c`, `arm-thumb-gen.c`, `arm-thumb-opcodes.c`, `tccelf.c` - -## Overview - -When a nested function's address is taken (e.g., passed as a function pointer), generate a static trampoline in `.text` that sets up the static chain (R10) before jumping to the actual function. A writable chain slot in `.data` holds the parent's FP value. 
- -## TODO - -- [x] Add `trampoline_needed` flag to `NestedFunc` struct -- [x] Add `trampoline_sym` and `chain_slot_sym` fields to `NestedFunc` or nested `Sym` -- [x] Detect address-of-nested-function in expression evaluation (`tccgen.c`) -- [x] Differentiate direct call vs address-taken contexts for nested function symbols -- [x] Implement `create_chain_slot()` — allocate 4 bytes in `.data` section -- [x] Implement `emit_trampoline_code()` — emit Thumb-2 trampoline in `.text` -- [x] Trampoline instruction sequence: LDR R10 chain_ptr → LDR R10 [R10] → LDR PC func_addr -- [x] Add `R_ARM_ABS32` relocations for function address and chain slot address data words -- [x] At address-of site: emit IR to write current FP into chain slot (`STR R7, [chain_slot_addr]`) -- [x] At address-of site: push trampoline address as the "function pointer" value -- [x] Call `emit_trampoline_code()` during/after nested function's `gen_function()` -- [x] Create `STB_LOCAL` ELF symbols for trampoline and chain slot -- [x] Handle Thumb bit (+1) on trampoline symbol address -- [x] Document re-entrancy limitation (recursive parent corrupts chain slot) -- [x] Test with `nested_funcptr.c`, `nested_funcptr_indirect.c` -- [x] Test with `20000822-1.c` (the original GCC torture test) - -## Implementation Status - -**Completed:** -- Core trampoline mechanism in `tccgen.c`: - - Detection of address-of-nested-function in `unary()` at `&` operator - - Implicit function-to-pointer decay for nested functions (when not directly called) - - Chain slot allocation in `.data` section via `setup_nested_func_trampoline()` - - Trampoline code emission (20 bytes: 3×LDR + literal pool) in `emit_trampoline_for_nested_func()` - - Relocations for function and chain slot addresses (`R_ARM_ABS32`) -- New `TCCIR_OP_INIT_CHAIN_SLOT` IR opcode to store parent FP into chain slot at address-of site -- `tcc_gen_machine_init_chain_slot()` in `arm-thumb-gen.c`: emits LDR chain_addr + STR R7 sequence -- Proper `Sym *` tracking: 
`trampoline_tcc_sym` and `chain_slot_tcc_sym` in `NestedFunc` -- Trampoline emission inside `compile_nested_functions()` (before clearing nested func list) -- Section buffer management via `section_prealloc()` for trampoline bytes -- All tests passing: - - `nested_funcptr.c` → 50, 15 ✓ - - `nested_funcptr_indirect.c` → 105, 205 ✓ - - `nested_funcptr_call_twice.c` → 20, 102 ✓ - - GCC torture `20000822-1.c` → exit 0 ✓ - - Full IR test suite: 3106 passed, 0 failures ✓ - -## Why Not Executable Stack Trampolines? - -GCC generates small code snippets on the stack. This is **ruled out for ARMv8-M**: the stack is non-executable when MPU is enabled. We must keep trampoline code in `.text`. - -## Chosen Approach: Static Trampoline in `.text` + Chain Slot in `.data` - -### Trampoline Layout (20 bytes total) - -```asm -; In .text — trampoline for f1.f2: -__tramp_f1__f2: - LDR r10, [pc, #8] ; +0: r10 = chain slot address (from +12) - LDR r10, [r10] ; +4: r10 = *chain_slot = parent FP value - LDR pc, [pc, #4] ; +8: pc = function address (from +16), tail call -.Ldata_chain_ptr: - .word __chain_slot_f1__f2 ; +12: R_ARM_ABS32 → writable slot in .data -.Ldata_func: - .word f1__f2 ; +16: R_ARM_ABS32 → nested function - -; In .data: -__chain_slot_f1__f2: - .word 0 ; parent writes FP here at runtime -``` - -PC-relative offset calculation (Thumb: PC reads as current + 4): -- LDR at +0: PC=+4, offset=8 → loads from +12 (chain_slot address) -- LDR at +8: PC=+12, offset=4 → loads from +16 (function address) - -### Execution Flow - -1. Parent takes `&f2` → writes parent FP to chain slot, gets trampoline address -2. Caller invokes the "function pointer" (trampoline address) -3. Trampoline loads chain slot address, dereferences to get parent FP into R10 -4. Trampoline jumps to actual nested function -5. 
Nested function uses R10 to access captured variables - -## Pseudocode: Trampoline Emission - -``` -function emit_trampoline_code(nested_sym, chain_slot_sym): - tramp_start = ind - - // LDR R10, [PC, #8] — load address of chain slot from literal pool - arm_thumb_ldr_literal_w(R10, 8) // Thumb-2: F8DF A008 - - // LDR R10, [R10, #0] — dereference: r10 = *chain_slot = parent FP - arm_thumb_ldr_imm_w(R10, R10, 0) // Thumb-2: F8DA A000 - - // LDR PC, [PC, #4] — tail jump to nested function - arm_thumb_ldr_literal_w(PC, 4) // Thumb-2: F8DF F004 - - // NOP (alignment) — only when the literal pool would otherwise not be 4-byte aligned; the LDR offsets above must then be adjusted - arm_thumb_nop() // Thumb-2: BF00 - - // Literal pool (order must match the LDR offsets above): - emit_word(0) // chain slot address placeholder (read by LDR R10) - add_relocation(R_ARM_ABS32, chain_slot_sym, ind - 4) - - emit_word(0) // function address placeholder (read by LDR PC) - add_relocation(R_ARM_ABS32, nested_sym, ind - 4) - - // Register trampoline symbol - put_extern_sym_2(tramp_sym, cur_text_section, tramp_start + 1, ind - tramp_start, 0) - // +1 for Thumb bit -``` - -## Pseudocode: Chain Slot Creation - -``` -function create_chain_slot(nested_sym): - data_sec = tcc_state->data_section - offset = section_add(data_sec, 4, 4) // 4 bytes, 4-byte aligned - - chain_slot_name = concat("__chain_", nested_sym->name) - chain_slot_sym = put_elf_sym(...) // STB_LOCAL - - // Initialize to 0 - write32le(data_sec->data + offset, 0) - - return chain_slot_sym -``` - -## Pseudocode: Address-of Detection & IR Generation - -``` -// In expression evaluation (tccgen.c): -function handle_symbol_reference(sym): - if sym is a nested function: - if context is direct function call (immediately followed by '('): - // Direct call — use SET_CHAIN (Phase 2) + BL - gen_call_nested_direct(sym, args) - else: - // Address taken — need trampoline - sym->nested_addr_taken = 1 - gen_addr_of_nested_func(sym) - -function gen_addr_of_nested_func(nested_sym): - // 1. Write current FP to chain slot - emit IR: chain_addr <- SYMBOL(__chain_slot_f1__f2) - emit IR: STORE [chain_addr], FP - - // 2. 
Push trampoline address as function pointer value - emit IR: result <- SYMBOL(__tramp_f1__f2 + 1) // +1 Thumb bit - vpush(result) -``` - -## Re-entrancy Limitation - -This approach is **NOT re-entrant**: if the parent function recurses, each invocation writes the same `.data` chain slot. The last writer wins, corrupting earlier invocations' nested function pointers. - -**Acceptable for now**: most GCC torture tests don't combine recursion + nested function pointers. - -**Future fix (deferred)**: Stack-allocated trampoline descriptors: -- Allocate `{func_addr, chain_value}` pair on parent stack -- Trampoline reads from descriptor address passed via R12 (IP) -- Requires `alloca`-like mechanism or static stack reservation - -## Test Cases (Phase 3) - -See [tests/nested_funcptr.c](tests/nested_funcptr.c), [tests/nested_funcptr_indirect.c](tests/nested_funcptr_indirect.c), [tests/nested_funcptr_call_twice.c](tests/nested_funcptr_call_twice.c), [tests/nested_recursive_parent.c](tests/nested_recursive_parent.c). - -Final validation: `20000822-1.c` from GCC torture suite. diff --git a/docs/nested_functions/phase4_ir.md b/docs/nested_functions/phase4_ir.md deleted file mode 100644 index 511ab6d9..00000000 --- a/docs/nested_functions/phase4_ir.md +++ /dev/null @@ -1,121 +0,0 @@ -# Phase 4: IR Integration & Optimization Safety - -**Effort**: 3-4 days -**Files**: `ir/core.c`, `ir/core.h`, `ir/codegen.c`, `ir/live.c`, `tccir.h`, `tccls.c` - -## Overview - -Add nested function metadata to `TCCIRState`, model the static chain register (R10) as a parameter-like vreg, ensure IR optimizations don't eliminate captured variable accesses, and add the `SET_CHAIN` IR instruction for parent→nested calls. 
- -## TODO - -- [x] Add `NestedFunc *nested_funcs`, `nb_nested_funcs`, `nested_funcs_capacity` to `TCCIRState` -- [x] Add `has_static_chain` (uint8_t), `static_chain_vreg` (int), `parent_loc` (int) to `TCCIRState` -- [x] Initialize new fields in `tcc_ir_alloc()` -- [x] Free `nested_funcs` array in `tcc_ir_free()` -- [x] Allocate chain vreg via `tcc_ir_alloc_var()` when `has_static_chain` (using VAR not PARAM to avoid shifting parameter indices) -- [x] Mark chain vreg live-in at instruction 0 with full-function live range -- [x] Set chain vreg `incoming_reg = REG_STATIC_CHAIN` (R10) — like param incoming regs -- [x] Add chain vreg to liveness analysis: mark live-in, extend to all chain load/store uses, precolor to R10 -- [x] Add `TCCIR_OP_SET_CHAIN` to `TccIrOp` enum in `tccir.h` -- [x] Define `SET_CHAIN` semantics: "write FP to R10 before next call" -- [x] Add SET_CHAIN to IR dump output -- [x] Fix store path for captured variables in `th_store_resolve_base_ir()` -- [ ] Verify store-load forwarding does NOT apply to chain-relative loads (non-FP base) -- [ ] Verify dead store elimination does NOT remove chain-relative stores (external side effect) -- [ ] Verify constant propagation stops at chain-relative loads -- [ ] Verify CSE CAN optimize chain loads from same offset within a basic block -- [x] Test IR dump output with `--dump-ir` for nested function compilation - -## New IR Instruction: `SET_CHAIN` - -``` -TCCIR_OP_SET_CHAIN // no operands — implicit: R10 <- FP -``` - -This is emitted in the **parent** before calling a nested function directly. The codegen lowers it to `MOV R10, R7`. - -Alternative: make it explicit with operands: `SET_CHAIN dest=R10, src=FP`. But the implicit form is simpler since the source (FP) and destination (R10) are always the same on ARM. - -## Chain Vreg as Parameter-like Entity - -The static chain vreg models the R10 register (static chain pointer) as a live-in value at function entry. 
It is allocated as a **VAR** type vreg (not PARAM) to avoid shifting the actual function parameter indices. - -``` -// During nested gen_function setup: -function gen_function_nested_setup(ir): - if not ir->has_static_chain: return - - // Allocate as VAR (not PARAM) to avoid shifting parameter indices - chain_vreg = tcc_ir_vreg_alloc_var(ir) - ir->static_chain_vreg = chain_vreg - - // Create a live interval for chain_vreg: - // - start = 0 (live at entry) - // - end = last instruction (conservative; could compute tighter range) - // - incoming_reg = 10 (R10) - // - addrtaken = 0 - interval = find_or_create_interval(chain_vreg) - interval->start = 0 - interval->end = ir->next_instruction_index - interval->incoming_reg0 = 10 // R10 -``` - -## Optimization Safety - -Chain-relative loads/stores use a non-FP base register (chain vreg → R10). The existing optimizer conservative rules should apply: - -| Optimization | Safe? | Reason | -|-------------|-------|--------| -| Store-load forwarding | YES | Only applies to same-base, same-offset; chain base ≠ FP base | -| Dead store elimination | YES | Only applies to stack locals (FP-relative); chain stores use different base | -| Constant propagation | YES | Cannot propagate through memory loads; chain loads are memory ops | -| CSE (intra-block) | YES | Chain loads from same offset can be CSE'd within a basic block | -| CSE (inter-block) | CAUTION | Safe IF no calls between load and reuse (parent frame unchanged) | -| Copy propagation | YES | Standard rules apply | -| DCE | YES | If chain load result unused, can be eliminated | - -**Key insight**: Since captured variable access goes through a vreg (chain_vreg) as base rather than FP, the optimizer already treats these as generic memory operations, not stack locals. No special marking needed for most passes. 
- -**Exception**: Store-load forwarding and dead store elimination are currently conservative — they only optimize stack locals whose address is NOT taken (FP-relative, addrtaken=0). Chain-relative ops use a different base, so they're automatically excluded. - -## Pseudocode: Chain-relative IR Generation - -``` -// No new opcodes — use existing LOAD/STORE with chain_vreg as base: - -function emit_chain_load(ir, dest_vreg, parent_offset): - src = make_operand_vreg_plus_offset(ir->static_chain_vreg, parent_offset) - dest = make_operand_vreg(dest_vreg) - tcc_ir_put_op(ir, TCCIR_OP_LOAD, src, NONE, dest) - -function emit_chain_store(ir, parent_offset, src_vreg): - dest = make_operand_vreg_plus_offset(ir->static_chain_vreg, parent_offset) - src = make_operand_vreg(src_vreg) - tcc_ir_put_op(ir, TCCIR_OP_STORE, src, NONE, dest) -``` - -## Pseudocode: Parent Call Chain Setup (IR) - -``` -// In parent's gfunc_call path: -function gen_call_to_nested(ir, nested_sym, args): - // Option A: dedicated SET_CHAIN instruction - emit TCCIR_OP_SET_CHAIN - emit TCCIR_OP_FUNCCALLVAL nested_sym, args - - // Option B: explicit MOV via vreg - tmp = alloc_temp_vreg() - emit TCCIR_OP_ASSIGN tmp <- FP_OPERAND - // annotate call: R10 must hold `tmp` - emit TCCIR_OP_FUNCCALLVAL nested_sym, args, extra_reg={R10, tmp} - - // DECISION: Option A (simpler) -``` - -## Test Cases - -- Dump IR with `--dump-ir` for each Phase 2 test and verify chain load/store instructions appear -- Verify chain stores are NOT eliminated by dead store elimination -- Verify chain loads from same offset in same block ARE CSE'd -- Verify SET_CHAIN appears before direct calls to nested functions in parent IR diff --git a/docs/nested_functions/phase5_arm_codegen.md b/docs/nested_functions/phase5_arm_codegen.md deleted file mode 100644 index 8699fb77..00000000 --- a/docs/nested_functions/phase5_arm_codegen.md +++ /dev/null @@ -1,198 +0,0 @@ -# Phase 5: ARM Thumb-2 Code Generation - -**Effort**: 3-5 days -**Files**: 
`arm-thumb-gen.c`, `arm-thumb-opcodes.c`, `arm-thumb-opcodes.h`, `ir/codegen.c` - -## Overview - -Lower chain-relative IR operations to Thumb-2 instructions. Modify prologue/epilogue to save/restore R10. Emit trampoline machine code and chain slots. Lower `SET_CHAIN` to `MOV R10, R7`. - -## TODO - -- [x] Modify `gen_func_prologue()` to push R10 when `ir->has_static_chain` -- [x] Verify R10 is already in the callee-saved register set in `arch/armv8m.c` (`static_chain_reg = 10`) -- [x] Modify `gen_func_epilogue()` to pop R10 (via existing push_mask — R10 included in `pushed_registers`) -- [x] Implement chain-relative `LDR.W Rd, [R10, #offset]` codegen path (via `base_reg = architecture_config.static_chain_reg`) -- [x] Implement chain-relative `STR.W Rd, [R10, #offset]` codegen path (via `base_reg = architecture_config.static_chain_reg`) -- [x] Handle large offsets (>4095) via scratch register + register-offset addressing (fallback in `load_word_from_base`/`store_word_to_base`) -- [x] Implement `tcc_gen_machine_set_chain()` — emit `MOV R10, R7` (Thumb-2) -- [x] Add `TCCIR_OP_SET_CHAIN` case in `ir/codegen.c` dispatch -- [x] Implement `emit_trampoline_for_nested_func()` in `tccgen.c`: - - [x] `LDR.W R10, [PC, #offset]` — load chain slot address - - [x] `LDR.W R10, [R10, #0]` — dereference chain slot - - [x] `LDR.W PC, [PC, #offset]` — branch to nested function - - [x] NOP for alignment if needed - - [x] Emit data words (function addr, chain slot addr) with R_ARM_ABS32 relocations -- [x] Implement chain slot allocation — allocate 4 bytes in `.data` section (`setup_nested_func_trampoline()`) -- [x] Create chain slot ELF symbol (`__chain_`, STB_LOCAL) -- [x] Create trampoline ELF symbol (`__tramp_`, STB_LOCAL, +1 Thumb bit) -- [x] Wire trampoline emission into `compile_nested_functions()` flow (emit only if `trampoline_needed`) -- [x] Test trampoline disassembly matches expected Thumb-2 encoding (all tests pass) - -## Register Conventions - -| Register | Role | Notes | 
-|----------|------|-------| -| R0-R3 | Arguments / return | Caller-saved | -| R7 | Frame pointer | Thumb convention | -| R10 | Static chain | Callee-saved, loaded before nested call | -| R12 | IP (scratch) | Used by trampoline if needed | -| LR / R14 | Link register | Saved in prologue | -| PC / R15 | Program counter | Trampoline branch target | - -## Prologue/Epilogue Pseudocode - -``` -function gen_func_prologue(ir): - push_mask = compute_callee_saved_registers(ir) - - if ir->has_static_chain: - push_mask |= (1 << 10) // R10 callee-saved - // R10 arrives with chain value — no extra setup needed - - emit PUSH {push_mask} - if need_frame_pointer: - emit MOV R7, SP - emit SUB SP, SP, #frame_size - -function gen_func_epilogue(ir): - emit ADD SP, SP, #frame_size - emit POP {push_mask | (1 << PC)} // restores R10 and returns -``` - -## Chain-relative Load/Store Codegen - -``` -function codegen_load_via_chain(instruction): - base_reg = get_physical_reg(instruction.src1) // R10 - offset = instruction.offset - dest_reg = get_physical_reg(instruction.dest) - - if 0 <= offset <= 4095: - // Thumb-2 LDR.W Rd, [Rn, #imm12] - emit_thumb32_ldr_imm12(dest_reg, base_reg, offset) - else: - // Negative or large offset needs scratch register - scratch = get_scratch_register() - emit_thumb32_movw(scratch, offset & 0xFFFF) - if offset < 0 or offset > 0xFFFF: - emit_thumb32_movt(scratch, (offset >> 16) & 0xFFFF) - emit_thumb32_ldr_reg(dest_reg, base_reg, scratch) - -function codegen_store_via_chain(instruction): - base_reg = get_physical_reg(instruction.dest_addr) // R10 - offset = instruction.offset - src_reg = get_physical_reg(instruction.src1) - - if 0 <= offset <= 4095: - emit_thumb32_str_imm12(src_reg, base_reg, offset) - else: - scratch = get_scratch_register() - emit_thumb32_movw(scratch, offset & 0xFFFF) - if offset < 0 or offset > 0xFFFF: - emit_thumb32_movt(scratch, (offset >> 16) & 0xFFFF) - emit_thumb32_str_reg(src_reg, base_reg, scratch) -``` - -## SET_CHAIN Lowering - -``` -function 
codegen_set_chain(instruction): - // Parent is about to call a nested function. - // Copy FP to static chain register: MOV R10, R7 - // Thumb-2: 0x4637 would be MOV R7, R6 — wrong - // High register MOV: 0x46BA = MOV R10, R7 (01000110 10 111 010) - emit_thumb16(0x46BA) // MOV R10, R7 -``` - -## Trampoline Machine Code Layout (24 bytes) - -``` -Offset Encoding Instruction Comment ------- -------- ----------- ------- -+0 F8DF A00C LDR.W R10, [PC, #12] R10 = &chain_slot (from +16) -+4 F8DA A000 LDR.W R10, [R10, #0] R10 = *chain_slot (FP value) -+8 F8DF F008 LDR.W PC, [PC, #8] PC = func_addr (from +20) -+12 BF00 NOP alignment padding -+14 BF00 NOP alignment padding -+16 [4 bytes] .word chain_slot_addr R_ARM_ABS32 relocation -+20 [4 bytes] .word func_addr | 1 R_ARM_ABS32 relocation (+1 Thumb) -``` - -Total: 24 bytes per trampoline. - -### Trampoline Emission Pseudocode - -``` -function emit_trampoline_code(nested_sym, chain_slot_sym): - tramp_name = mangle("__tramp_", nested_sym->name) - tramp_start = ind - - // LDR.W Rt, [PC, #imm12] reads PC as Align(instruction_address + 4, 4) - // At +0: PC = tramp_start+4, chain slot word at +16, offset = 16-4 = 12 - // At +8: PC = tramp_start+12, function word at +20, offset = 20-12 = 8 - // These values assume tramp_start is 4-byte aligned - // If it is only 2-byte aligned, Align(PC,4) moves the base down by 2 - // Must compute exact offsets at emission time - - arm_thumb_ldr_pc_literal_w(REG_R10, chain_slot_ptr_offset) // +0 - arm_thumb_ldr_imm_w(REG_R10, REG_R10, 0) // +4 - arm_thumb_ldr_pc_literal_w(REG_PC, func_ptr_offset) // +8 - arm_thumb_nop16() // +12 - arm_thumb_nop16() // +14 - - // Data words at +16 and +20 - chain_slot_data_offset = ind - emit_word(0) - add_reloc(cur_text_section, chain_slot_sym, chain_slot_data_offset, R_ARM_ABS32) - - func_addr_data_offset = ind - emit_word(0) - add_reloc(cur_text_section, nested_sym, func_addr_data_offset, R_ARM_ABS32) - - // Register trampoline symbol (address +1 for Thumb bit) - put_extern_sym_2(tramp_sym, cur_text_section, 
- tramp_start | 1, ind - tramp_start, 0) -``` - -### Chain Slot Creation Pseudocode - -``` -function create_chain_slot(nested_sym): - slot_name = mangle("__chain_", nested_sym->name) - - // Allocate in .data (not .bss — explicit zero init) - data_sec = s1->data_section - offset = section_add(data_sec, 4, 4) // 4 bytes, 4-byte align - write32le(data_sec->data + offset, 0) // init to 0 - - // Create local ELF symbol - slot_sym = put_elf_sym(s1->symtab_section, offset, 4, - ELF32_ST_INFO(STB_LOCAL, STT_OBJECT), - 0, data_sec->sh_num, slot_name) - return slot_sym -``` - -## Parent Chain Slot Write - -Before calling a nested function through a pointer, the parent must write its FP to the chain slot: - -``` -function gen_write_chain_slot(chain_slot_sym): - // STR R7, [addr_of_chain_slot] - // This is an absolute address store — needs full address materialization - scratch = get_scratch_register() - emit_movw_movt(scratch, chain_slot_sym) // with R_ARM_ABS32 or MOVW/MOVT reloc pair - emit_str(R7, scratch, 0) // STR R7, [scratch] -``` - -## Test Cases - -| Test File | Validates | -|-----------|-----------| -| `nested_basic.c` | Prologue/epilogue R10 save, direct call SET_CHAIN | -| `nested_capture_read.c` | LDR.W via chain (R10+offset) | -| `nested_capture_write.c` | STR.W via chain (R10+offset) | -| `nested_funcptr.c` | Trampoline emission, chain slot, indirect call | -| `nested_funcptr_indirect.c` | Trampoline passed to external function | -| `nested_struct_return.c` | LDR/STR via chain with struct size > 4 | diff --git a/docs/nested_functions/phase6_linker.md b/docs/nested_functions/phase6_linker.md deleted file mode 100644 index db117ccd..00000000 --- a/docs/nested_functions/phase6_linker.md +++ /dev/null @@ -1,136 +0,0 @@ -# Phase 6: Linker Support - -**Effort**: 1-2 days -**Files**: `arm-link.c`, `tccelf.c` - -## Overview - -Enable relocations and symbol visibility for nested function artifacts: nested function symbols, trampoline symbols, and chain slot symbols. 
Almost entirely covered by existing `R_ARM_ABS32` relocation handling — the main work is ensuring correct symbol binding. - -## TODO - -- [x] Verify `R_ARM_ABS32` relocs emitted by trampoline resolve correctly in `relocate_section()` (`arm-link.c`) -- [x] Ensure nested function symbol `.text` address includes +1 Thumb bit in relocation value -- [x] Set nested function symbols to `STB_LOCAL` binding (not exported) -- [x] Set trampoline symbols (`__tramp_*`) to `STB_LOCAL` binding -- [x] Set chain slot symbols (`__chain_*`) to `STB_LOCAL` binding -- [x] Verify no duplicate symbol names when parent is called recursively (unique mangling) -- [x] Test ELF output with `arm-none-eabi-objdump -t` to verify symbol table -- [x] Test ELF output with `arm-none-eabi-objdump -r` to verify relocations - -## Relocations - -The trampoline uses two `R_ARM_ABS32` entries in `.text` (data words embedded after instructions): - -| Data Word | Relocation Target | Value After Linking | -|-----------|--------------------|---------------------| -| `+12: .word 0` | `__chain_<funcname>.<N>` (`.data`) | Absolute address of chain slot | -| `+16: .word 0` | `<funcname>.<N>` (`.text`) | Absolute address of nested function \| 1 (Thumb) | - -The existing `arm-link.c` `relocate_section()` handles `R_ARM_ABS32`: - -```c -case R_ARM_ABS32: - *(uint32_t *)ptr += val; - break; -``` - -This should work without modification. The Thumb bit (+1) is part of the symbol value, set when the symbol is created with `put_extern_sym_2()`. 
- -## Symbol Visibility - -All nested function artifacts are file-local: - -``` -function create_nested_func_symbol(mangled_name, text_section, offset, size): - sym = put_elf_sym(s1->symtab_section, offset | 1, // +1 Thumb - size, - ELF32_ST_INFO(STB_LOCAL, STT_FUNC), - 0, text_section->sh_num, - mangled_name) - return sym - -function create_trampoline_symbol(tramp_name, text_section, offset, size): - sym = put_elf_sym(s1->symtab_section, offset | 1, // +1 Thumb - size, - ELF32_ST_INFO(STB_LOCAL, STT_FUNC), - 0, text_section->sh_num, - tramp_name) - return sym - -function create_chain_slot_symbol(slot_name, data_section, offset): - sym = put_elf_sym(s1->symtab_section, offset, 4, - ELF32_ST_INFO(STB_LOCAL, STT_OBJECT), - 0, data_section->sh_num, - slot_name) - return sym -``` - -## Name Mangling - -Nested function names use GCC convention to ensure uniqueness: - -| Artifact | Name Pattern | Example | -|----------|-------------|---------| -| Nested function | `<funcname>.<N>` | `multiply.0` | -| Trampoline | `__tramp_<funcname>.<N>` | `__tramp_multiply.0` | -| Chain slot | `__chain_<funcname>.<N>` | `__chain_multiply.0` | - -The `.N` suffix is the nested function index within the parent (0, 1, 2, ...). This ensures unique symbol names even when the parent function is called recursively. The mangled name is stored in `sym->asm_label` (see `tccgen.c:11942-11944`). - -## Potential Issues - -1. **Section ordering**: Trampoline code is emitted in `.text` after the nested function. The linker must not reorder or coalesce these sections. - -2. **Alignment**: Trampoline data words at `+12` and `+16` must be 4-byte aligned. The three 32-bit instructions occupy exactly 12 bytes, so this holds without NOP padding as long as the trampoline starts at a 4-byte aligned address in `.text` (as in the verification output below). - -3. **PIC/PIE**: Not applicable for ARMv8-M embedded targets (absolute addressing only). - -## Implementation Status - -**Status**: ✅ COMPLETE - -All linker support for nested functions has been implemented and verified. 
The existing `R_ARM_ABS32` relocation handling in `arm-link.c` works correctly for the trampoline data words. - -### Symbol Creation Locations - -| Symbol Type | Location | Binding | -|-------------|----------|---------| -| Nested function | `tccgen.c:11948` - `put_extern_sym()` | `STB_LOCAL` via `VT_STATIC` | -| Chain slot | `tccgen.c:10857` - `put_elf_sym()` | `STB_LOCAL` explicit | -| Trampoline | `tccgen.c:10881` - `put_elf_sym()` | `STB_LOCAL` explicit | - -### Verification - -Symbol table from `nested_funcptr.c`: - -``` -$ arm-none-eabi-readelf -s nested_funcptr.o - - Num: Value Size Type Bind Vis Ndx Name - 2: 00000001 20 FUNC LOCAL DEFAULT 1 multiply.0 - 3: 00000000 4 OBJECT LOCAL DEFAULT 2 __chain_multiply.0 - 4: 00000015 20 FUNC LOCAL DEFAULT 1 __tramp_multiply.0 - 11: 00000029 92 FUNC GLOBAL DEFAULT 1 main -``` - -Relocations from `nested_funcptr.o`: - -``` -$ arm-none-eabi-readelf -r nested_funcptr.o - -Relocation section '.rel.text': - Offset Type Sym.Value Sym. Name -00000020 R_ARM_ABS32 00000000 __chain_multiply.0 -00000024 R_ARM_ABS32 00000001 multiply.0 # +1 Thumb bit -00000078 R_ARM_ABS32 00000015 __tramp_multiply.0 -``` - -## Test Cases - -| Test | Validates | Status | -|------|-----------|--------| -| `nested_funcptr.c` | R_ARM_ABS32 relocs resolve, trampoline branches to correct address | ✅ PASS | -| `nested_funcptr_indirect.c` | Chain slot address resolves, trampoline works across call boundary | ✅ PASS | -| `objdump -t` on any nested func ELF | STB_LOCAL symbols present with correct names | ✅ VERIFIED | -| `objdump -r` on relocatable output | R_ARM_ABS32 entries for trampoline data words | ✅ VERIFIED | diff --git a/docs/nested_functions/phase7_testing.md b/docs/nested_functions/phase7_testing.md deleted file mode 100644 index 41b314bc..00000000 --- a/docs/nested_functions/phase7_testing.md +++ /dev/null @@ -1,235 +0,0 @@ -# Phase 7: Testing & Validation - -**Effort**: 3-5 days -**Files**: `tests/ir_tests/`, `tests/gcctestsuite/conftest.py` - 
-## Overview - -Incremental test plan aligned with milestones. Custom test cases validate each feature in isolation. GCC torture tests validate compatibility. Tests run via `pytest` in the existing IR test infrastructure. - -## TODO - -### Completed ✅ - -- [x] Create test `.c` files in `tests/ir_tests/` (with corresponding `.expect` files) -- [x] Milestone 1: get `nested_basic.c` and `nested_basic_args.c` passing -- [x] Milestone 2: get `nested_capture_read.c`, `nested_capture_write.c`, `nested_capture_multiple.c` passing -- [x] Milestone 2: get `nested_capture_array.c` passing (Fix 1: type propagation) -- [x] Milestone 2: get `nested_multiple.c`, `nested_direct_call_args.c` passing -- [x] Milestone 3: get `nested_funcptr*.c` tests passing -- [x] Milestone 3: get `nested_shadowing.c` passing -- [x] Milestone 3: get `nested_struct_return.c` passing (Fix 2: sret + types) -- [x] Milestone 3: get `nested_recursive_parent.c` passing (Fix 3: prescan filter) -- [x] Update `tests/gcctestsuite/conftest.py` — remove skip for applicable GCC torture tests -- [x] Milestone 4: verify 8 GCC torture tests pass (non-goto, non-label_values) -- [x] Verify 6 deferred GCC torture tests remain skipped (4 nonlocal goto + 2 label_values) -- [x] Run full `make test -j16` with no regressions -- [x] Add `--dump-ir` verification for at least 3 tests (basic, capture_read, funcptr) -- [x] Verify QEMU execution output matches `.expect` files -- [x] Run `make test-all` and document final GCC torture suite results - -### Remaining (Known Limitations) 🚧 - -- [ ] `nested_multi_level.c` — Multi-level nesting (f → g → h, chain-of-chains) — Fix 4 - -## Incremental Test Plan - -### Milestone 1: Direct Call, No Capture (~1 week) - -| Test File | Description | Phases Required | -|-----------|-------------|-----------------| -| `nested_basic.c` | Simple nested function, direct call, returns value | 1, 4(stub), 5(stub) | -| `nested_basic_args.c` | Nested function with parameters | 1, 4(stub), 5(stub) | - 
-### Milestone 2: Capture via Static Chain (~2 weeks) - -| Test File | Description | Phases Required | -|-----------|-------------|-----------------| -| `nested_capture_read.c` | Read parent local variable | 1, 2, 4, 5 | -| `nested_capture_write.c` | Write parent local variable | 1, 2, 4, 5 | -| `nested_capture_multiple.c` | Multiple captured variables | 1, 2, 4, 5 | -| `nested_capture_array.c` | Capture array from parent | 1, 2, 4, 5 | -| `nested_multiple.c` | Multiple nested funcs in one parent | 1, 2, 4, 5 | -| `nested_direct_call_args.c` | Args + captured vars combined | 1, 2, 4, 5 | - -### Milestone 3: Trampolines & Advanced (~3.5 weeks) - -| Test File | Description | Phases Required | -|-----------|-------------|-----------------| -| `nested_funcptr.c` | Address-of nested function, call via pointer | 1, 2, 3, 4, 5, 6 | -| `nested_funcptr_indirect.c` | Nested func ptr passed to another function | 1, 2, 3, 4, 5, 6 | -| `nested_funcptr_call_twice.c` | Call funcptr twice (chain slot stability) | 1, 2, 3, 4, 5, 6 | -| `nested_multi_level.c` | f → g → h, double nest, chain-of-chains | 1, 2, 4, 5 | -| `nested_recursive_parent.c` | Recursive parent + nested call at each depth | 1, 2, 3, 4, 5, 6 | -| `nested_shadowing.c` | Nested function shadows parent variable name | 1, 2, 4, 5 | -| `nested_struct_return.c` | Nested function returns struct by value | 1, 2, 4, 5 | - -### Milestone 4: GCC Torture Tests (~4.5 weeks) - -#### Enabled (now passing) — 8 tests: - -| GCC Test | Feature Tested | Status | -|----------|----------------|--------| -| `20000822-1.c` | Nested func via pointer, basic capture | ✅ PASS | -| `920612-2.c` | Nested function with capture | ✅ PASS | -| `921017-1.c` | Nested function scoping | ✅ PASS | -| `921215-1.c` | Nested function with pointers | ✅ PASS | -| `931002-1.c` | Nested function recursion | ✅ PASS | -| `nestfunc-1.c` | Basic nested function | ✅ PASS | -| `nestfunc-2.c` | Nested function with arrays | ✅ PASS | -| `nestfunc-3.c` | Nested 
function with structs | ✅ PASS | - -#### Skipped — label_values (computed goto) — 2 tests: - -| GCC Test | Reason | -|----------|--------| -| `920428-2.c` | Requires computed goto (`&&label`) - skipped via `label_values` check | -| `920501-7.c` | Requires computed goto (`&&label`) - skipped via `label_values` check | - -#### Defer (xfail) — nonlocal goto — 4 tests: - -| GCC Test | Reason | -|----------|--------| -| `comp-goto-2.c` | Requires computed goto (`&&label`) | -| `nestfunc-5.c` | Requires nonlocal goto from nested function | -| `nestfunc-6.c` | Requires nonlocal goto from nested function | -| `pr24135.c` | Requires nonlocal goto | - -## Test File Format - -Each test consists of a `.c` file and a `.expect` file: - -``` -tests/ir_tests/nested_basic.c # C source -tests/ir_tests/nested_basic.expect # Expected stdout output -``` - -The test runner (`conftest.py`) compiles with `armv8m-tcc`, links with newlib, runs via QEMU, and compares output. - -## Regression Testing - -After each milestone, run the full suite to verify no regressions: - -```bash -# Full IR test suite -make test -j16 - -# GCC torture tests (after Phase 7 conftest.py update) -make test-all - -# Assembly tests (should be unaffected) -make test-asm -j16 -``` - -## Implementation Status - -**Status**: ✅ MOSTLY COMPLETE - -### Test Summary - -| Category | Passing | Failing | Status | -|----------|---------|---------|--------| -| Milestone 1 (Basic) | 4 | 0 | ✅ Complete | -| Milestone 2 (Capture) | 5 | 0 | ✅ Complete | -| Milestone 3 (Funcptr/Advanced) | 8 | 1 | 🟡 Partial | -| GCC Torture (compile) | 224 | 452 xfail | ✅ Expected | -| GCC Torture (execute) | See IR tests | - | ⚪ Via IR framework | -| GCC Torture (skipped) | - | 70 | ⚪ Expected | - -### Milestone 1: Direct Call (Complete) ✅ - -All tests passing: -- `nested_basic.c` ✅ -- `nested_basic_simple.c` ✅ -- `nested_basic_args.c` ✅ -- `nested_direct_call_args.c` ✅ - -### Milestone 2: Capture via Static Chain (Complete) ✅ - -All tests passing 
(5/5): -- `nested_capture_array.c` ✅ (Fix 1: type propagation) -- `nested_capture_read.c` ✅ -- `nested_capture_write.c` ✅ -- `nested_capture_multiple.c` ✅ -- `nested_multiple.c` ✅ - -### Milestone 3: Trampolines & Advanced (Partial) 🟡 - -Passing (7/8): -- `nested_funcptr.c` ✅ -- `nested_funcptr_indirect.c` ✅ -- `nested_funcptr_call_twice.c` ✅ -- `nested_recursive_parent.c` ✅ (Fix 3: prescan filter) -- `nested_shadowing.c` ✅ -- `nested_struct_return.c` ✅ (Fix 2: sret + types) - -Known limitation (not linker-related): -- `nested_multi_level.c` ❌ (multi-level nesting - Fix 4 not implemented) - -### GCC Torture Tests - -#### Changes to `conftest.py`: - -1. **Removed trampoline skip** - Tests with `dg-require-effective-target trampolines` are no longer skipped -2. **Added label_values skip** - Tests with `dg-require-effective-target label_values` are now skipped (computed goto not supported) -3. **Removed xfail for 8 tests** - These now pass: - - `20000822-1`, `920612-2`, `921017-1`, `921215-1`, `931002-1` - - `nestfunc-1`, `nestfunc-2`, `nestfunc-3` - -#### Still xfail (nonlocal goto): -- `nestfunc-5`, `nestfunc-6`, `nestfunc-7` -- `comp-goto-2`, `pr24135` - -### GCC Torture Suite Final Results - -Latest `make test-all` run: - -``` -GCC Torture Compile Tests: -- 224 passed -- 452 failed (expected - these are in GCC_XFAIL_TESTS) -- 70 skipped (label_values, unsupported features) -- 3,248 xfailed (known failures) - -GCC Torture Execute Tests: -- Integrated with IR tests framework via test_gcc_torture_ir.py -- Execution via QEMU with newlib linking -``` - -### Conftest.py Changes - -```python -# tests/gcctestsuite/conftest.py - -# Removed from GCC_XFAIL_TESTS: -# - "20000822-1", "920612-2", "921017-1", "921215-1", "931002-1" -# - "nestfunc-1", "nestfunc-2", "nestfunc-3" - -# Removed skip pattern: -# - "dg-require-effective-target trampolines" (now supported) - -# Added skip pattern: -# - "dg-require-effective-target label_values" (computed goto not supported) -``` - -## 
Debugging Failed Tests - -```bash -# Dump IR for a failing test -./armv8m-tcc -dump-ir -c tests/ir_tests/nested_capture_read.c - -# Compile and run manually with QEMU -cd tests/ir_tests -python run.py -c nested_capture_read.c --dump-ir - -# Disassemble the ELF to inspect codegen -arm-none-eabi-objdump -d tests/ir_tests/build/nested_capture_read.elf - -# Check symbols -arm-none-eabi-objdump -t tests/ir_tests/build/nested_funcptr.elf | grep nested - -# GDB debug -python run.py -c nested_capture_read.c --gdb -# In another terminal: -arm-none-eabi-gdb tests/ir_tests/build/nested_capture_read.elf -ex "target remote :1234" -``` diff --git a/docs/plan_closing_gcc_gap.md b/docs/plan_closing_gcc_gap.md deleted file mode 100644 index ec1fb93e..00000000 --- a/docs/plan_closing_gcc_gap.md +++ /dev/null @@ -1,269 +0,0 @@ -# Plan: Closing the TCC–GCC Code Size Gap - -## Current State - -Benchmark of TCC -O2 vs GCC -O2 across IR test suite (ARM Thumb-2, Cortex-M33): - -| Test / Function | TCC | GCC | Ratio | Root Cause | -|-------------------------------|-----|-----|--------|--------------------------| -| test_llong_load_unsigned/main | 102 | 8 | 12.75x | Inlining + const fold | -| test_u64_shift_add/main | 117 | 26 | 4.50x | Inlining + const fold | -| test_fp_offset_cache/mixed | 15 | 5 | 3.00x | Const fold + DCE | -| test_return64/main | 38 | 14 | 2.71x | Inlining + const fold | -| test_dcmp/main | 21 | 8 | 2.62x | Inlining + const fold | -| test_fp_offset_cache/loop | 61 | 27 | 2.26x | Loop opts + addr reuse | -| test_double_arith/main | 49 | 22 | 2.23x | Inlining + const fold | -| test_fp_offset_cache/swap | 52 | 27 | 1.93x | Loop opts + cond exec | -| bubble_sort | 44 | 27 | 1.63x | Addr modes + cond exec | -| test_f2d_bits/main | 48 | 30 | 1.60x | Inlining | - -TCC already matches or beats GCC on leaf functions: test_simple_return (1.00x), -test_llong_mul_unsigned (0.88x), test_semihosting (0.60x), test_aeabi_dneg (0.65x). 
- -### What GCC does for 12.75x case - -`test_llong_load_unsigned` defines `load_through_ptr`, `store_through_ptr`, `check_u64` -(all static, <20 lines) and calls them from `main` with known global/constant args. - -GCC: inlines everything → propagates `load_through_ptr(&g1) == g1` → folds -`check_u64("g1", g1, g1)` to return 0 → eliminates all dead branches → only -two `puts` calls and `return 0` remain (8 instructions). - -### What TCC does today - -Token-stream auto-inlining IS working: `load_through_ptr` (len=13) and `check_u64` -(len=54) are registered as inline candidates and replayed at call sites. - -Constant evaluation also works for calls with all-VT_CONST args: -- `load_through_ptr(&g1)` → evaluated, folded ✓ (first two calls) -- `load_through_ptr(&arr[0])` → FAILS: stack address not VT_CONST ✗ -- `check_u64("g1", <inlined result>, g1)` → FAILS: inlined result in register, not VT_CONST ✗ - -`store_through_ptr` is not appearing in inline candidate list (cause TBD — likely -the void return + VT_LLONG param combination). - -After token-replay inlining, the full check_u64 body (including the printf error -path) stays in the IR. The IR optimizer cannot prove the comparison always succeeds -because it lacks store-load forwarding through memory: `arr[0] = g1; *(&arr[0])` -does not resolve to `g1` at the IR level. - ---- - -## Step 1: Improve Post-Inline Constant Propagation - -**Goal:** After token-replay inlining of `check_u64`, fold `got != exp` to false -when both operands trace back to the same value. - -**What to do:** -1. In `ir/opt.c`, extend `tcc_ir_opt_const_prop` to handle the pattern: - `STORE val → addr` followed by `LOAD addr → tmp` → replace tmp with val. - This is store-load forwarding for the *same* basic block (intra-BB). -2. Extend the existing `tcc_ir_opt_sl_forward` to handle 64-bit (LLONG) values - stored/loaded via `strd`/`ldrd` patterns. -3. After forwarding, existing branch folding + DCE eliminates the dead printf path. 
- -**Test:** `test_llong_load_unsigned` — first two `check_u64` calls (with global -addresses) should be fully eliminated from the IR. - -**Expected improvement:** 12.75x → ~4x (eliminates 2 of 5 check blocks). - -**Files:** `ir/opt.c` (store-load forwarding), `tccir.h` (if new flags needed) - ---- - -## Step 2: Propagate Constants Through Local Arrays - -**Goal:** After `arr[0] = g1`, resolve `load_through_ptr(&arr[0])` to `g1`. - -**What to do:** -1. Track stores to local array elements with constant indices in a shadow map - during constant propagation: `stack_offset + idx*size → stored_value`. -2. When a LOAD from a known stack address matches a previous STORE to the same - address (no intervening aliasing store), forward the value. -3. Handle the specific pattern: `LEA(stack, offset)` passed as arg to inlined - `load_through_ptr` which does `LOAD(arg)` — after inlining, this becomes - `LOAD(LEA(stack, offset))` which can resolve via the shadow map. - -**Test:** `test_llong_load_unsigned` — all `check_u64` calls with arr elements -should be eliminated. - -**Expected improvement:** 12.75x → ~2x (eliminates arr-based checks, only -`store_through_ptr` + final check remain). - -**Files:** `ir/opt.c` - ---- - -## Step 3: Fix store_through_ptr Not Being Inlined - -**Goal:** Ensure void functions with VT_LLONG parameters are auto-inlined. - -**What to do:** -1. Add INLINE_STRUCT logging around `auto_inline_sig_ok` rejection path to - identify exactly why `store_through_ptr` is being skipped. -2. Fix the rejection (likely in `auto_inline_sig_ok` parameter loop or the - void+LLONG combination). -3. After inlining `store_through_ptr(&local, arr[2])`, Step 2's forwarding can - propagate `local == 0xffffffffffffffff` to the final `check_u64`. - -**Test:** `test_llong_load_unsigned` — final code should match GCC: two `puts` -calls + `return 0`. - -**Expected improvement:** 12.75x → ~1.0x for this specific test. 
- -**Files:** `tccgen.c` (auto_inline_sig_ok, call-site inline logic) - ---- - -## Step 4: Fix LICM Instruction Index Bug - -**Goal:** Re-enable loop-invariant code motion. - -**Current state:** LICM is disabled at `tccgen.c:25176`. The old pattern-based -`hoist_from_loop` returns 0 unconditionally (`licm.c:590`). A new dominance-based -`tcc_ir_opt_licm_ex` exists but the old pass is dead. The bug is documented: -> instruction indices are not adjusted by total_inserted when reading original -> instructions during the insertion loop, causing operand_base corruption - -**What to do:** -1. The dominance-based LICM (`tcc_ir_opt_licm_ex`) is already implemented with - CFG + dominator tree. Verify it handles instruction index adjustment correctly. -2. Remove the `return 0` guard in `hoist_from_loop` OR remove the old pass - entirely and rely on the dominance-based version. -3. Enable LICM by removing the comment/guard at `tccgen.c:25176` (set - `opt_licm=1` at `-O1`+). -4. Run full test suite to validate: `make test -j16 && make test-gcc-torture-compile`. - -**Test:** `test_fp_offset_cache/test_loop_access` (2.26x), bubble_sort (1.63x). - -**Expected improvement:** ~15-25% reduction in loop-heavy functions. - -**Files:** `ir/licm.c`, `tccgen.c` (optimization pipeline) - ---- - -## Step 5: Copy Coalescing in Register Allocator - -**Goal:** Eliminate redundant `mov` instructions from ASSIGN IR ops. - -**Current state:** The linear scan allocator in `tccls.c` assigns physical registers -independently. The optimized IR contains many identity assigns like: -``` -R0(T1) <-- R5(V0) [ASSIGN] → mov r0, r5 -R1(T9) <-- R4(V0) [ASSIGN] → mov r1, r4 -``` - -**What to do:** -1. After liveness analysis (`ir/live.c`), add a coalescing pre-pass that merges - virtual register live ranges connected by ASSIGN when they don't interfere. -2. 
Specifically: for `Tx <-- Vy [ASSIGN]`, if Tx and Vy have non-overlapping live - ranges (or Vy dies at this instruction), assign the same physical register. -3. After coalescing, the ASSIGN becomes a no-op and can be eliminated by DCE. - -Alternative lighter approach: add a post-regalloc peephole in `arm-thumb-gen.c` -that eliminates `mov Rx, Rx` (same register). - -**Test:** Every function — count `mov` instructions before/after. - -**Expected improvement:** ~15-20% across the board. In bubble_sort: 44 → ~35. - -**Files:** `tccls.c` (register allocator), `ir/live.c` (liveness) - ---- - -## Step 6: If-Conversion for Small Conditional Blocks (IT Blocks) - -**Goal:** Replace short branch-over patterns with ARM IT conditional execution. - -**Current state:** TCC generates full branch diamonds even for single-instruction -if-then bodies. GCC uses IT blocks: -``` -; GCC bubble sort swap: -cmp r2, r1 -it gt -strdgt r1, r2, [r3, #-4] ; 1 conditional instruction, no branch - -; TCC bubble sort swap: -cmp r1, r2 -ble .skip -; ... 10 instructions for swap ... -.skip: -``` - -**What to do:** -1. Add an IR-level if-conversion pass that detects diamond/triangle patterns where - the "then" block has 1-4 instructions and no side effects beyond stores. -2. Convert to `SELECT` IR ops (already defined in `tccir.h`) or emit IT blocks - directly in `arm-thumb-gen.c`. -3. ARM Thumb-2 IT blocks support up to 4 conditional instructions. Focus on the - common pattern: compare + conditional store (swap, min/max). - -**Test:** bubble_sort, test_swap_pattern, any conditional move patterns. - -**Expected improvement:** ~10-15% in branch-heavy inner loops. Bubble sort: 35 → ~28. - -**Files:** `ir/opt.c` (new pass), `arm-thumb-gen.c` (IT block emission) - ---- - -## Step 7: Improved Induction Variable Strength Reduction - -**Goal:** Convert `base + i*4` recomputed each iteration into pointer increment. 
- -**Current state:** IV strength reduction exists (`tcc_ir_opt_iv_strength_reduction`) -but doesn't catch all patterns, especially when the same array index is used -multiple times in a loop body (like swap: `arr[j]`, `arr[j+1]` used in load, store, -and recomputed independently). - -**What to do:** -1. Extend IV SR to identify groups of array accesses sharing the same base and - induction variable: `arr[j]`, `arr[j+1]` → single pointer `p` with `p[0]`, - `p[1]`, incremented once per iteration. -2. After the pointer is introduced, existing indexed load fusion - (`LOAD_INDEXED`) handles the rest. -3. Requires LICM (Step 4) to hoist the base address first. - -**Test:** bubble_sort, test_loop_access, test_swap_pattern. - -**Expected improvement:** ~10% additional on loop-heavy code. - -**Files:** `ir/opt.c` (IV strength reduction) - ---- - -## Execution Order & Dependencies - -``` -Step 1 ──→ Step 2 ──→ Step 3 (inlining + const prop chain) - │ - │ Step 4 ──→ Step 7 (LICM enables better IV SR) - │ - │ Step 5 (independent: regalloc) - │ - │ Step 6 (independent: if-conversion) - ↓ - Steps 4-7 can run in parallel with Steps 1-3 -``` - -Steps 1-3 are the highest leverage: they address the 12.75x/4.50x/2.71x outliers. -Steps 4-7 improve the 1.5x-2.3x cases (loops, branches, register pressure). 
- -## Validation - -After each step, run: -```bash -make test -j16 # IR tests pass -make test-gcc-torture-compile # no regressions -python3 scripts/compare_disasm.py tests/ir_tests/test_llong_load_unsigned.c # track ratio -python3 scripts/compare_disasm.py bubble # track ratio -``` - -## Target - -| Test | Current | After Steps 1-3 | After All | -|-------------------------------|---------|------------------|-----------| -| test_llong_load_unsigned/main | 12.75x | ~1.0x | ~1.0x | -| test_u64_shift_add/main | 4.50x | ~2.0x | ~1.5x | -| test_return64/main | 2.71x | ~1.2x | ~1.0x | -| test_fp_offset_cache/loop | 2.26x | ~2.26x | ~1.3x | -| bubble_sort | 1.63x | ~1.63x | ~1.1x | diff --git a/docs/plan_iv_sr_rotated_loop.md b/docs/plan_iv_sr_rotated_loop.md deleted file mode 100644 index 8d9c5170..00000000 --- a/docs/plan_iv_sr_rotated_loop.md +++ /dev/null @@ -1,228 +0,0 @@ -# Plan: IV Strength Reduction for Rotated Loops with `arr[i*const]` - -## Context - -`test_llong_relops::run_signed` and `run_unsigned` are ~1.39x and ~1.41x larger -than GCC's output (139 vs 100, 128 vs 91). The gap is dominated by: - -1. The loop counter `i` is spilled to `[sp, #36]` and the address - `&cases[i]` is recomputed each iteration via `mla r9, r0, r1, r2`. -2. GCC instead uses a pointer-IV: `r4 = &cases[0]` in the preheader, - `r4 += 40` in the latch, eliminating both the multiply and an `i` reload. - -TCC already has an IV strength reduction pass -([`tcc_ir_opt_iv_strength_reduction`](ir/opt.c:20889)) that's designed for -exactly this pattern — but it doesn't fire in `test_llong_relops`. This plan -covers what blocks it and how to fix it. - -## Root Cause - -The fix has two distinct blockers. Either one alone keeps the pointer-IV -transform from firing. - -### Blocker 1: pre-SSA MLA fusion rejects immediate multipliers - -[`tcc_ir_opt_fusion_pass`](ir/opt.c:14461) fuses `T = a * b; V = base + T` -into `V = a MLA b + base`. 
The gate at [ir/opt.c:14523-14524](ir/opt.c#L14523) -excludes the case where `a` or `b` is an immediate: - -```c -!irop_is_immediate(ms1) && !irop_is_immediate(ms2) && ir_opt_du_uses(...) == 1 -``` - -For `T = i * 40; V = base + T`, `ms2` is `#40` (immediate), so MLA fusion -skips it. The MUL+ADD form survives until the ARM-specific SSA-stage MLA -fusion in [`arch/arm/ssa_opt_arm.c:100`](arch/arm/ssa_opt_arm.c#L100) — but -**that runs after IV-SR**, so IV-SR never sees an MLA to operate on. - -The pre-SSA gate was presumably added because MUL-by-power-of-2 gets -strength-reduced to SHL later, which would render the MLA wasteful. But for -non-power-of-2 immediates (40, 12, etc.) the strength reducer at -[ir/opt.c:18846](ir/opt.c#L18846) bails out (multi-instruction patterns -aren't supported), so the MUL stays as MUL and MLA fusion was the right call -all along. - -### Blocker 2: `loop->body_instrs` is too narrow for TCC's rotated layout - -`find_derived_ivs` ([ir/opt.c:19115](ir/opt.c#L19115)) has two scan passes: - -| Pass | What it finds | Scan range | -|------|---------------|------------| -| 1 (line 19164) | `ADD` with MUL/SHL src — i.e. unfused MUL+ADD | `loop->body_instrs` | -| 2 (line 19400) | `MLA` directly | `mla_scan_start..mla_scan_end` (extended) | - -The extended range walks forward jumps iteratively past the back-edge — it's -specifically designed to catch rotated loops with the body proper *after* the -latch in instruction order. But it's only wired to pass 2 (MLA-detection). - -In `test_llong_relops`, loop rotation produces: - -``` -op 3: CMP i, 10 ← header -op 4: JMP if >=U exit -op 5: JMP to 10 ← into body -op 6: T = i + 1 ← latch (increment) -op 8: i = T ← latch (write-back) -op 9: JMP to 3 ← back to header -op 10: T3 = i * 40 ← body proper (MUL) -op 11: V1 = base + T3 ← body proper (ADD) — this is the derived IV! -... 
-op 110: JMP to 6 ← back-edge to the latch -``` - -LICM's body detector ([ir/licm.c:228-264](ir/licm.c#L228-L264)) only follows -forward jumps one level deep when extending the body range, so -`loop->body_instrs` for this loop is `{2, 3, 4, 5, 6, 7, 8}` — it never -reaches op 11. Pass 1 misses the MUL+ADD. - -Once Blocker 1 is fixed (so the MUL+ADD becomes an MLA), Pass 2 catches -it, because Pass 2 uses the extended scan range. - -## What I Tried — and Why It Failed - -Lifted the immediate-operand gate on pre-SSA MLA fusion. IV-SR then *did* -fire and produced the textbook pointer-IV in the IR dump: - -``` -0002: R4(T27) <-- Addr[StackLoc[-48]] [ASSIGN] ← preheader: p = base -... -0013: R4(T27) <-- R4(T27) ADD #12 ← latch: p += stride -``` - -But the **emitted assembly didn't match the IR**: -[`bug_struct_array_index_mul_clobber`](tests/ir_tests/bug_struct_array_index_mul_clobber.c) -crashed in QEMU because `main`'s emitted code loaded from `[r4, #0]` without -ever initializing r4. The preheader `ASSIGN R4 <- Addr[...]` was in the IR -but absent from the machine code. The latch `R4 += 12` was also missing. - -So there's a third blocker hiding behind the first two: when IV-SR inserts -new instructions *outside the original loop range* (specifically into the -preheader/latch), something in the codegen path doesn't pick them up. - -I reverted the MLA fusion change. The peephole improvement in commit -`e76cee04` (which is an unrelated, smaller win) stands. - -## The Real Fix - -Three changes, in order. Land each on its own commit and run the full IR -suite (1026 tests) plus a regression-disasm diff between each. - -### Step 1 — Verify and fix the codegen-doesn't-honor-inserted-instructions bug - -Without this, Steps 2-3 produce miscompiles. - -1. Reproduce with a minimal case. Apply the immediate-allowing MLA fusion - from this session (`git show e76cee04^..HEAD` is the wrong base — apply - the change as a separate scratch commit). 
Compile - `tests/ir_tests/bug_struct_array_index_mul_clobber.c` with `-O2 -dump-ir`. - The "AFTER OPTIMIZATIONS" IR dump for `main` will show - `R4(T27) <-- Addr[StackLoc[-48]]` near the top and `R4 += 12` in the - latch. -2. Confirm the disassembly is missing both: there's no `add r4, sp, #N` in - `main`'s preheader and no `adds r4, #12` in the loop's bottom block. -3. Hypothesis: IV-SR's `transform_derived_iv` - ([ir/opt.c:~19500](ir/opt.c) — search for it) inserts via - `insert_instr_at` at `loop->preheader_idx + 1` and at the latch position. - Those inserts shift indices. Either: - - the inserts land in an IR slot that codegen skips (NOP-classified, or - marked unreachable), or - - the inserts happen *after* the SSA-renaming snapshot codegen uses, and - codegen runs from the pre-IV-SR snapshot. -4. The way to find out is to instrument `tcc_ir_codegen_generate` to print - `(i, op, dest_vreg, dest_alloc.r0)` for every IR instruction it dispatches - on, and compare against the dumped IR. The first divergence is the bug. - -Most likely fix is in `transform_derived_iv` (it needs to mark new -instructions with the right flags), or in the SSA construction pass (it -needs to rebuild after IV-SR runs). Don't guess — the trace will say. - -### Step 2 — Relax pre-SSA MLA fusion to accept non-power-of-2 immediates - -Once Step 1 is done, re-land the immediate-allowing MLA fusion. The patch -in [ir/opt.c:14523](ir/opt.c#L14523): - -```diff -+ int ms1_imm = irop_is_immediate(ms1); -+ int ms2_imm = irop_is_immediate(ms2); -+ int allow_one_imm = (ms1_imm ^ ms2_imm); -+ if (allow_one_imm) { -+ int64_t mval = ms1_imm ? irop_get_imm64_ex(ir, ms1) -+ : irop_get_imm64_ex(ir, ms2); -+ if (is_power_of_2(mval) >= 0 || mval == 0 || mval == 1) -+ allow_one_imm = 0; /* leave for strength reduction */ -+ } - if (... && -- !irop_is_immediate(ms1) && !irop_is_immediate(ms2) && ...) { -+ (allow_one_imm || (!ms1_imm && !ms2_imm)) && ...) 
{ -``` - -Forward-declare `is_power_of_2` near the top of `ir/opt.c`. - -Do **not** also drop the `STACKOFF && !is_lval` accumulator exclusion. That -exclusion is load-bearing (dropping it breaks `test_llong_relops` and -`bug_bitfield_packed10` in different ways — distinct from Step 1's bug). - -### Step 3 — Optional: extend Pass 1 of `find_derived_ivs` to the MLA scan range - -After Step 2, the test_llong_relops MUL+ADD becomes an MLA in pre-SSA, so -Pass 2 catches it. But other callers / code shapes may still have unfused -MUL+ADD outside `body_instrs`. The cleanest follow-up is to teach Pass 1 to -walk `mla_scan_start..mla_scan_end` as well, gated to only consider ADDs -whose matched MUL/SHL is *also* in the extended range. This preserves the -"don't extend body for SHR/AND chains" guarantee the comment at -[ir/opt.c:19126-19131](ir/opt.c#L19126-L19131) warns about. - -This is genuinely optional — Step 2 alone should close the test_llong_relops -gap once Step 1 is in place. - -## Expected Impact - -| Function | Before | After Steps 1-2 | GCC | -|---|---|---|---| -| `test_llong_relops::run_signed` | 138 | ~115 (-23) | 100 | -| `test_llong_relops::run_unsigned` | 127 | ~104 (-23) | 91 | -| (`bug_ull_mul10_loop`, others with `arr[i*c]`) | — | likely improves | — | - -The 23-instruction estimate per function comes from: -- Eliminate `mla r9, r0, r1, r2` plus its prep (`movs r1, #40; add r2, sp, - #40`) per iter → -3 insns in body, but body executes ×10/8 → counted as - static body shrink. -- Eliminate `i` spill (`str/ldr` to `[sp, #36]` ~6 times per iter once `i` - fits in a callee-saved reg, since one register is freed by the IV-SR - collapse) → ~6 insns gone from body. -- Net ~9 insns saved in the body, plus 14 in the prologue/preheader once the - computed-each-iter MLA collapses to a single preheader init + latch ADD. 
- -This won't close the gap entirely (GCC also uses cleaner long-long -relational comparisons — `sbcs`/`ite` patterns that TCC already produces but -spills around for the last comparison; see todo #3 from the original -analysis: `ne_s`/`ne_u` regalloc collision). - -## Out of Scope - -- The regalloc collision causing `ne_s`/`ne_u` to spill `got` and `exp` to - `[sp, #32]`/`[sp, #28]` (separate fix, ~6-8 insns). -- The dead intermediate `[sp, #24]` store from `i++` (would require DSE on - the post-codegen stack slot, or IR-level coalescing of T54 with T51). -- LICM body detection fix in `ir/licm.c` (a more thorough fix to Blocker 2 - but with broader regression surface — Step 3 above is the targeted - alternative). - -## Validation - -Per step: - -```bash -make cross -cd tests/ir_tests && source .venv/bin/activate -python -m pytest test_qemu.py -n auto # 1026 tests must pass -cd /home/mateusz/repos/tinycc -python scripts/regression_disasm.py --suite=ir -O2 # check function-level deltas -``` - -Specifically watch: -- `test_llong_relops::{run_signed,run_unsigned}` (target test) -- `bug_struct_array_index_mul_clobber::main` (Step 1 canary) -- `bug_bitfield_packed10::{check,main}` (was broken by dropping STACKOFF - exclusion — must stay passing) -- `110_iv_strength_reduction::*` (existing IV-SR test surface) diff --git a/docs/plan_opt_modularization.md b/docs/plan_opt_modularization.md deleted file mode 100644 index f4a0b162..00000000 --- a/docs/plan_opt_modularization.md +++ /dev/null @@ -1,494 +0,0 @@ -# Pre-SSA Optimization: Engine + Modularization Plan - -## Progress checklist - -### Phase 0 — Delete dead code -- [x] Remove `tcc_ir_opt_run_by_name` stub (opt.c, opt.h) -- [x] Remove `tcc_ir_opt_run_all` stub (opt.c, opt.h) -- [x] Remove `tcc_ir_opt_return` stub + call site in tccgen.c -- [x] Remove `opt_return_value` flag (tcc.h, libtcc.c) — was the only consumer of the deleted stub - -### Phase 1 — Extract shared analysis & primitives -- [x] **1.1** 
`ir/opt_du.{h,c}` — `IROptDU` + `ir_opt_du_build/idx/def/uses` -- [x] **1.2** `ir/opt_xform.{h,c}` — `ir_xform_nop` (inline), `ir_xform_same_block` (5/6 call sites migrated; 1 site keeps non-canonical NOP-boundary semantics) -- [x] **1.3** `ir/opt_utils.{h,c}` — constant evaluators, BB/CFG helpers, purity tables, expression equality, call-param helpers -- [x] **1.4** `ir/opt_alias.{h,c}` — stack-slot aliasing helpers -- [x] **1.5** `ir/opt_loop_utils.{h,c}` — IV analysis, loop bounds, loop transforms - -### Phase 2 — Build the pre-SSA engine -- [x] **2.1** `ir/opt_engine.{h,c}` — `IROptCtx`, `IROptGen`, `tcc_ir_opt_run_gens`, lazy analysis cache -- [x] **2.2** Build-only verify (no rules wired yet) - -### Phase 3 — Convert pass groups to generator tables -- [x] **3.1** Fusion group → `ir/opt_gens_fusion.c` (7 converted: rotate, mla, indexed_mem, deref_indexed, disp, indexed_chain, indexed_pair_reorder; hand-written: postinc, lea_fold, assign_fuse) -- [x] **3.2** Branch-folding group → `ir/opt_gens_branch.c` (branch_folding + setif_branch_fuse converted to generators; or_bool_diamond, stack_addr_nonnull_fold, stack_bool_diamond stay hand-written — flow-sensitive/CFG patterns) -- [x] **3.3** Boolean simplification → `ir/opt_gens_bool.c` (bool_idempotent + bool_simplify + idempotent half of bool_pass) -- [x] **3.4** BB-scoped hash CSE — `cse_bool` converted to `IROptHashTable`; remaining passes (cse_global_load, globalsym_cse, cse_param_add, local_load_cse, local_alu_cse, stackoff_addr_cse) use ≤32-entry flat arrays where linear scan is faster than hash overhead — no conversion needed -- [x] **3.5** Call-result dead group → `ir/opt_gens_call_result.c` (dead_call_result_elim, dead_sret_call_elim, fold_call_result_store converted; dead_init_via_call stays in opt.c — FWS dependency) - -### Phase 4 — Generic hash table -- [x] **4.1** `ir/opt_hash.{h,c}` — `IROptHashTable`, bump-allocated entry pool, applied to `bool_cse` (replaces malloc-per-entry `BoolCSEEntry`); 
remaining CSE passes use flat arrays that don't benefit from hashing - -### Phase 5 — Collect-then-transform engine variant (optional) -- [x] **5.1** `IROptCollectGen` 2-phase dispatch — evaluated and skipped: candidate passes (const_var_prop, dead_var_store_elim, redundant_var_assign) each use unique per-pass state types that can't be shared through a generic interface; shared boilerplate is only ~5 lines of iteration loop per pass, not worth a new abstraction - -### Phase 6 — Theme-based file split (optional, zero flash savings) -- [x] **6.1** Theme-based split started: `opt_loop.c` (1,052 lines — strength reduction, IV, unroll, rotation, decrement-to-zero), `opt_memory.c` (3,259 lines — sl_forward, entry_store_prop, store_redundant, deref_fwd); `opt.c` reduced from 28,973 → 17,861 lines - ---- - -## Current State (2026-05) - -`ir/opt.c` is **28,973 lines** containing **81 pass functions**. It is the single largest source file in the project. The SSA optimization engine (`ir/opt/`, 8,500 lines across 13 files) has been built and runs on SSA-renamed IR before SSA destruction — but it did **not** displace the pre-SSA monolith. Both layers exist in production and the pre-SSA layer keeps growing as new post-destruction peepholes are needed for address materialization, indexed-mode fusion, and stack-aware patterns. - -### Why the monolith keeps growing - -The expectation in the original plan — "as SSA passes mature, pre-SSA equivalents are removed" — has not held. The pre-SSA layer operates on flat IR after SSA destruction, where vregs are no longer single-assignment and stack/local layout is materialized. 
Several optimization classes only make sense at this layer: - -- ARM addressing-mode fusion (`LOAD_INDEXED`, `LOAD_POSTINC`, `MLA`, displacement folding) -- Stack-slot aliasing and forwarding (`sl_forward`, `stack_addr_cse`) -- 64-bit register-pair tracking (`pack64`, `pack64_tautology`) -- Call-result lifetime analysis (`dead_call_result_elim`, `dead_init_via_call`, `dead_sret_call_elim`, `fold_call_result_store`) - -Since the original plan was written, 21 new pre-SSA passes have been added (full list in the census below). The pre-SSA optimizer is **permanent infrastructure**, not a migration bridge. - -### Two goals driving this rewrite - -1. **Save flash memory.** The compiler ships on flash-constrained embedded targets. Each pass has ~30–50 lines of duplicated iteration boilerplate (forward loop, NOP skip, BB-boundary check, local DU-table build). Across 81 passes that's roughly **3,000–4,000 lines** of redundant code, plus 4 hand-rolled hash tables and 6+ inlined "same-block check" loops. -2. **Combine passes into single forward loops.** Many passes only differ in their trigger opcode and pattern body. Today the pipeline runs 7+ separate fusion forward-scans back-to-back (each rebuilding the DU table); they could all run in one scan. - -The SSA engine has already proven the answer: a generator-based dispatch (`IRSSAOptGen` in [ir/opt/ssa_opt.h:62-66](ir/opt/ssa_opt.h#L62-L66), `ssa_opt_run_gens` in [ir/opt/ssa_opt.c:604-622](ir/opt/ssa_opt.c#L604-L622)) lets a single `O(n)` engine pass dispatch dozens of rules. The pre-SSA layer needs the same shape, with a context that survives the dispatch loop and caches analyses. 
- ---- - -## Pass Census (current) - -`opt.c` pass functions, grouped by pattern affinity: - -### Cleanup / DCE -`dce`, `compact_nops`, `dead_var_store_elim`, `dead_addrvar_elim`, `redundant_var_assign`, `redundant_init_elim`, `dse`, `dead_loop_elim`, `dead_call_result_elim`, `dead_init_via_call`, `dead_sret_call_elim` - -### Constant / value propagation -`const_var_prop`, `global_init_prop`, `const_prop`, `const_prop_tmp`, `value_tracking`, `complex_const_param_fold`, `param_addrof_const_fold`, `local_addrof_const_fold`, `add_reassoc`, `cmp_expr_fold` - -### Memory -`sl_forward`, `entry_store_prop`, `store_redundant`, `block_copy_init`, `deref_fwd`, `fold_call_result_store` - -### Fusion & addressing -`fusion_pass` (mla+indexed), `rotate_fusion`, `deref_indexed_fusion`, `disp_fusion`, `lea_fold`, `postinc_fusion`, `loop_postinc_fusion`, `indexed_chain`, `indexed_pair_reorder`, `add_deref_fold`, `stackoff_addr_cse`, `call_chain_rename`, `assign_fuse` - -### CSE / copy propagation -`copy_prop`, `cse_global_load`, `globalsym_cse`, `cse_param_add`, `local_load_cse`, `local_alu_cse`, `stack_addr_cse` - -### Branch / boolean -`branch_folding`, `setif_branch_fuse`, `stack_addr_nonnull_fold`, `stack_bool_diamond`, `or_bool_diamond`, `nonneg_branch_fold`, `float_branch_fold`, `bool_idempotent`, `bool_simplify`, `bool_pass` - -### Loop -`loop_unroll`, `loop_rotation`, `loop_bound_remat`, `iv_strength_reduction`, `iv_strength_reduction_with_loops`, `decrement_to_zero`, `redundant_loop_check`, `backedge_phi_hoist` - -### Other / peephole -`vrp`, `var_tmp_fwd`, `var_to_tmp`, `float_narrowing`, `strength_reduction`, `select`, `postinc_assign_fold`, `returnvalue_merge`, `const_string_calls`, `const_call_replace`, `pack64`, `pack64_tautology`, `fp_cache_*` - -### Stubs (delete in Phase 0) -`tcc_ir_opt_return`, `tcc_ir_opt_run_by_name` - -The original plan's `tcc_ir_opt_run_all` is already gone. 
`opt_jump_thread.c` already lives outside `opt.c` and provides `tcc_ir_opt_jump_threading` + `tcc_ir_opt_eliminate_fallthrough`. - ---- - -## Architecture: mirror the SSA engine for pre-SSA - -``` -┌──────────────────────────── Pipeline (tccgen.c) ─────────────────────────────┐ -│ │ -│ SSA layer: IRSSAOptCtx + IRSSAOptGen + ssa_opt_run_gens() │ -│ ✓ shipped: 13 passes, generator-based dispatch │ -│ │ -│ Pre-SSA layer (this plan): │ -│ IROptCtx + IROptGen + tcc_ir_opt_run_gens() │ -│ one engine, ~25 fusion/branch/bool peepholes registered as gens │ -│ ~55 remaining passes call into shared infra but stay bespoke │ -│ │ -├─────────────────────────── Shared analysis cache ────────────────────────────┤ -│ IROptCtx { du, bb_starts, pred_count, merge_bitmap } — lazy, generational │ -├─────────────────────────────── Libraries ────────────────────────────────────┤ -│ opt_du opt_utils opt_alias opt_loop_utils opt_hash opt_xform │ -├──────────────────────────────── IR core ─────────────────────────────────────┤ -│ core.c ir.h cfg.c ssa.c vreg.c pool.c machine_op.c │ -└──────────────────────────────────────────────────────────────────────────────┘ -``` - -The pre-SSA engine deliberately mirrors the SSA engine's type and function naming: - -| SSA layer | Pre-SSA mirror | -|--------------------------|----------------------------| -| `IRSSAOptCtx` | `IROptCtx` | -| `IRSSAOptGen` | `IROptGen` | -| `ssa_opt_run_gens()` | `tcc_ir_opt_run_gens()` | -| `ssa_gen_*` functions | `ir_gen_*` functions | -| `ssa_opt_()` | `tcc_ir_opt_()` | - -Contributors who know one layer learn the other for free, and one implementation informs the other. - ---- - -## Flash savings estimate - -| Source of saving | Approx. 
lines removed | -|-------------------------------------------------------------------------|-----------------------| -| Iteration-loop boilerplate deduplicated across ~25 peephole passes | ~2,500 | -| DU-table builds: 20+ inline `ir_opt_du_build` call-sites → cache lookup | ~300 | -| Same-block check: 6+ inlined `for (j=...) if (JUMP/JUMPIF)` loops | ~200 | -| Pool-slot grow loops in fusion passes (`while (count <= n) pool_add`) | ~100 | -| `IROptHashTable` collapsing 4 hand-rolled CSE hash tables | ~400 | -| Constants in 2 idempotent/simplify boolean passes merged into one scan | ~150 | -| Branch-folding family (5 JUMPIF-triggered passes) merged into one scan | ~400 | -| **Total estimate** | **~4,000 lines (~14% of opt.c)** | - -Conservative because it counts only what duplication clearly costs; the engine creates new abstraction surface (~600 lines) that must be subtracted. **Net ~3,400 lines / ~12%.** - -The other win — not visible in line count — is **fewer O(n) scans** through the IR. The fusion group alone goes from 7+ separate forward scans (each rebuilding DU) to 1 scan with 1 DU build. For a function with 10,000 instructions that's 60,000–70,000 fewer dispatch-loop iterations per compile. - ---- - -## Migration phases - -The phase order has changed from the original plan. **Engine work goes first** because it produces all the flash savings; theme-based file splitting goes last because it produces zero flash savings (only readability). - -### Phase 0 — Delete dead code (15 min) - -1. Remove `tcc_ir_opt_run_by_name` ([opt.c:15131](ir/opt.c#L15131)) — empty stub. -2. Remove `tcc_ir_opt_return` ([opt.c:11202](ir/opt.c#L11202)) — 5-line stub never called from any pipeline path that needs it. -3. Delete `ir/opt_embedded_deref.c` if still present on disk (orphaned, not in `Makefile`). -4. Remove matching declarations from `ir/opt.h`. - -**Verify:** `make cross && make test -j16`. 
- ---- - -### Phase 1 — Extract shared analysis & primitives (4–6 h) - -This is the highest-leverage phase for flash savings. All subsequent phases depend on the libraries created here. - -#### 1.1 `ir/opt_du.h` + `ir/opt_du.c` (~200 lines) -- Move `IROptDU`, `ir_opt_du_build/def/uses/idx` from `opt.c`. -- Used by 20+ pass sites today; each currently writes its own `IROptDU du; ir_opt_du_build(ir, &du); …; tcc_free(du.def)` block (~10–15 lines per site). -- After extraction these collapse to `const IROptDU *du = ir_opt_ctx_require_du(&ctx);`. - -#### 1.2 `ir/opt_xform.h` + `ir/opt_xform.c` (~150 lines) -Six primitives, mirrors the most-duplicated patterns: -```c -static inline void ir_xform_nop(TCCIRState *ir, int idx); /* 81 sites */ -void ir_xform_replace_with_assign(TCCIRState *ir, int idx, IROperand src); /* ~40 sites */ -void ir_xform_replace_with_imm(TCCIRState *ir, int idx, int64_t v, int btype); -int ir_xform_same_block(TCCIRState *ir, int from, int to); /* 6+ sites */ -int ir_xform_alloc_pool(TCCIRState *ir, int n_slots); /* every fusion pass */ -void ir_xform_nop_with_du(TCCIRState *ir, int idx, IROptDU *du); -``` - -#### 1.3 `ir/opt_utils.h` + `ir/opt_utils.c` (~1,500 lines) -Extract from `opt.c`: -- Constant evaluators: `ir_opt_eval_const_u64`, `ir_opt_eval_const_string`, `evaluate_compare_condition`, `is_power_of_2`, condition-token helpers (`invert_cond_token`, `vrp_swap_cmp_tok`, `vrp_negate_cmp_tok`). -- BB / CFG helpers: `ir_opt_build_merge_bitmap`, `ir_opt_mark_block_starts`, `ir_opt_next_non_nop`, `ir_skip_nops_forward`, `ir_has_other_jump_to`, `ir_negate_condition`, `invert_condition`. -- Purity tables: `ir_opt_is_pure_helper_name`, `ir_opt_is_flag_cmp_helper_name`, `ir_opt_is_pure_fallthrough_instruction`, `tcc_ir_is_pure_aeabi`. -- Expression equality: `ir_opt_pure_expr_equal`, `ir_opt_pure_def_equal`, `ir_opt_nonvreg_expr_equal`. 
-- Call-param helpers: `ir_opt_get_call_param_operand` (27 sites), `ir_opt_nop_call_params` (15 sites), `ir_opt_nop_call_param`, `ir_opt_change_call_argc`. - -#### 1.4 `ir/opt_alias.h` + `ir/opt_alias.c` (~600 lines) -- `ir_opt_store_btype_size_bytes`, `ir_opt_stack_slot_range_for_offset`, `stackoff_same_slot`, `operand_references_slot`, `is_stack_address_operand`, `find_deref_use_operand`. - -#### 1.5 `ir/opt_loop_utils.h` + `ir/opt_loop_utils.c` (~1,800 lines) -- IV analysis (`find_induction_vars_ex`, `find_derived_ivs`, `transform_derived_iv`, `iv_strength_reduction_core`). -- Loop bounds (`find_loop_exit_condition`, `compute_trip_count`, `collect_body_instructions`). -- Loop transforms (`try_eliminate_loop`, `try_unroll_loop`, `try_rotate_loop`). -- Structs `InductionVar`, `DerivedIV`. - -**At end of Phase 1:** `opt.c` shrinks from 28,973 to ~24,000 lines. No pass logic moves yet; only their shared helpers. `static` → `extern` for everything pulled out. Build is verified after each step. 
- ---- - -### Phase 2 — Build the engine (3–4 h) - -#### 2.1 `ir/opt_engine.h` + `ir/opt_engine.c` - -Mirror the SSA engine's shape: - -```c -typedef struct IROptCtx { - TCCIRState *ir; - int n; /* cached ir->next_instruction_index */ - uint32_t generation; /* bumped on invalidation */ - - /* Lazy-built analyses — accessor builds on first use */ - IROptDU du; - uint32_t du_gen; - - int *pred_count; - uint32_t pred_gen; - - uint8_t *merge_bitmap; - uint32_t merge_gen; - - int changes; -} IROptCtx; - -typedef int (*ir_opt_gen_fn)(IROptCtx *ctx, int instr_idx); - -typedef struct IROptGen { - int op; /* trigger opcode; -1 = match any */ - ir_opt_gen_fn fn; - const char *name; - uint8_t needs_du; /* engine builds DU before dispatch if any gen requires */ - uint8_t same_block; /* engine wraps fn with same-BB check */ -} IROptGen; - -/* Lifecycle */ -void tcc_ir_opt_ctx_init(IROptCtx *ctx, TCCIRState *ir); -void tcc_ir_opt_ctx_free(IROptCtx *ctx); -void tcc_ir_opt_ctx_invalidate(IROptCtx *ctx); - -/* Lazy analysis accessors */ -const IROptDU *tcc_ir_opt_ctx_require_du(IROptCtx *ctx); -const int *tcc_ir_opt_ctx_require_pred(IROptCtx *ctx); -const uint8_t *tcc_ir_opt_ctx_require_merge(IROptCtx *ctx); - -/* Run a table of generators in a single forward pass */ -int tcc_ir_opt_run_gens(IROptCtx *ctx, const IROptGen *gens, int count); -``` - -Engine loop (mirrors `ssa_opt_run_gens` shape): -```c -int tcc_ir_opt_run_gens(IROptCtx *ctx, const IROptGen *gens, int count) -{ - TCCIRState *ir = ctx->ir; - int changes = 0; - - /* Ensure analyses are built once if any rule needs them */ - int any_du = 0; - for (int g = 0; g < count; g++) if (gens[g].needs_du) { any_du = 1; break; } - if (any_du) tcc_ir_opt_ctx_require_du(ctx); - - for (int i = 0; i < ir->next_instruction_index; i++) { - int op = ir->compact_instructions[i].op; - if (op == TCCIR_OP_NOP) continue; - for (int g = 0; g < count; g++) { - if (gens[g].op >= 0 && gens[g].op != op) continue; - int d = gens[g].fn(ctx, i); - if 
(d > 0) { changes += d; break; } /* first-match-wins */ - } - } - return changes; -} -``` - -**Same-block check:** When `gens[g].same_block` is set, the generator is wrapped by a helper that calls the user's `fn`, captures the matched instruction range, and calls `ir_xform_same_block` before allowing the transform. The cleanest place to put this check is inside the generator (it knows which range to test); a helper macro `IR_OPT_REQUIRE_SAME_BLOCK(ctx, from, to)` makes it one line. - -#### 2.2 Verify -Build only — no rules yet. Add `opt_engine.c`/`opt_du.c`/`opt_xform.c` to `Makefile` `IR_FILES`. Both engines coexist; pre-SSA passes still call the old way. - ---- - -### Phase 3 — Convert pass groups to generator tables - -Order is by **density of duplication** (highest payoff first), not by file location. - -#### 3.1 Fusion group → `ir/opt_gens_fusion.c` (4–6 h) - -Convert 7+ fusion passes into generators sharing one engine run. Current passes: - -| Pass | Trigger | Today's lines | After (match+transform) | -|---------------------------|----------------------|---------------|-------------------------| -| `fusion_pass` (mla+indexed) | `ADD`, `LOAD`, `STORE` | ~300 | ~120 | -| `rotate_fusion` | `ADD`/`OR` patterns | ~260 | ~100 | -| `deref_indexed_fusion` | ALU with deref | ~215 | ~100 | -| `disp_fusion` | `LOAD`/`STORE`/`ASSIGN` | ~260 | ~90 | -| `postinc_fusion` | `LOAD`/`STORE` | ~280 | ~90 | -| `lea_fold` | any deref source | ~420 | ~120 | -| `indexed_chain` | `LOAD_INDEXED`/`STORE_INDEXED` | ~150 | ~60 | -| `indexed_pair_reorder` | `LOAD_INDEXED` pairs | ~200 | ~70 | -| `assign_fuse` | `ASSIGN` chain | ~190 | ~70 | - -Hand-written exceptions: -- `add_deref_fold` (inserts new instructions, can't fit a same-index forward engine). -- `loop_postinc_fusion` (needs loop structure from `IRLoops`). -- `stackoff_addr_cse`, `call_chain_rename` (BB-scoped hash, see Phase 3.4). 
- -**Pipeline integration:** -```c -/* Before — 8 separate forward scans, 8 DU builds */ -tcc_ir_opt_rotate_fusion(ir); -tcc_ir_opt_fusion_pass(ir, opt_mla, opt_indexed); -tcc_ir_opt_deref_indexed_fusion(ir); -tcc_ir_opt_disp_fusion(ir); -tcc_ir_opt_indexed_chain(ir); -tcc_ir_opt_indexed_pair_reorder(ir); -tcc_ir_opt_assign_fuse(ir); -tcc_ir_opt_lea_fold(ir); -tcc_ir_opt_postinc_fusion(ir); - -/* After — 1 scan, 1 DU build */ -IROptCtx ctx; -tcc_ir_opt_ctx_init(&ctx, ir); -tcc_ir_opt_run_gens(&ctx, fusion_gens, FUSION_GENS_COUNT); -tcc_ir_opt_ctx_free(&ctx); - -tcc_ir_opt_add_deref_fold(ir); /* inserts → hand-written */ -tcc_ir_opt_loop_postinc_fusion(ir); /* needs IRLoops → hand-written */ -``` - -Convert one generator at a time, run `make test -j16` after each. Use existing IR tests (`tests/ir_tests/`) that exercise each pattern to catch ordering regressions. - -#### 3.2 Branch-folding group → `ir/opt_gens_branch.c` (3–4 h) - -All these trigger on `JUMPIF` and inspect the backward def chain. Currently 5 separate forward scans: - -| Pass | Trigger | Today | After | -|---------------------------|-------------|-------|-------| -| `branch_folding` | `JUMPIF` | ~160 | ~55 | -| `setif_branch_fuse` | `JUMPIF` | ~130 | ~65 | -| `stack_addr_nonnull_fold` | `JUMPIF` | ~470 | keep hand-written *or* split simple cases (~120) into generator and leave deep def-chain tracing (~350) in a helper | -| `or_bool_diamond` | `JUMPIF` | ~230 | ~80 | -| `stack_bool_diamond` | CFG diamond | ~270 | keep hand-written (4-instruction CFG pattern doesn't fit single-trigger dispatch) | - -Hand-written exceptions: `nonneg_branch_fold`, `float_branch_fold` (need merge-bitmap value tracking that doesn't fit per-instruction dispatch). - -#### 3.3 Boolean simplification → `ir/opt_gens_bool.c` (1–2 h) - -`bool_idempotent` + `bool_simplify` + the idempotent half of `bool_pass` collapse into 2–3 generators triggered on `BOOL_AND`/`BOOL_OR`. 
CSE half of `bool_pass` keeps its hash table and uses the new generic `IROptHashTable` from Phase 4. - -#### 3.4 BB-scoped hash CSE → use `opt_hash` (3–4 h) - -`cse_global_load`, `globalsym_cse`, `cse_param_add`, `local_load_cse`, `local_alu_cse`, `stackoff_addr_cse`, `cse_bool` all maintain a hash table that resets at BB boundaries. They are too varied for a single engine but they all reinvent the same hash-table lifecycle. - -**Phase 4 builds a shared `IROptHashTable`** (see below) — these passes are then rewritten to use it. Body logic stays per-pass; only the hash-table alloc/lookup/insert/clear/free becomes shared. ~400 lines saved across the 7 passes. - -#### 3.5 Call-result dead group → `ir/opt_gens_call_result.c` (2 h) - -`dead_call_result_elim`, `dead_init_via_call`, `dead_sret_call_elim`, `fold_call_result_store` all trigger on `FUNCCALLVAL` / `RETURNVALUE` and inspect the result's use chain. Collect-then-transform pattern fits the engine if a 2-phase variant is added (see Phase 5). - ---- - -### Phase 4 — Generic hash table (3–4 h) - -`ir/opt_hash.h` + `ir/opt_hash.c` (~200 lines) providing a bump-allocated CSE hash table. 
Drop-in replacement for 4 hand-rolled tables in `opt.c`: - -| Pass | Local struct | Buckets | -|---------------------|--------------------|---------| -| `cse_arith` (in `local_alu_cse`) | `ArithCSEEntry` | 256 | -| `cse_bool` (in `bool_pass`) | `BoolCSEEntry` | 64 | -| `sl_forward` | `StoreEntry` | 128 | -| `globalsym_cse` | `GSymCSEEntry` | linear-16 | - -API mirrors what `ssa_opt_load_cse` uses internally: - -```c -typedef struct IROptHashEntry { - uint32_t hash; - int instruction_idx; - int32_t result_vr; - int extra[4]; /* pass-specific payload */ - struct IROptHashEntry *next; -} IROptHashEntry; - -typedef struct IROptHashTable { - IROptHashEntry **buckets; - int n_buckets; - IROptHashEntry *pool; /* bump-allocated */ - int pool_count; -} IROptHashTable; - -void ir_opt_hash_init(IROptHashTable *, int n_buckets, int max_entries); -void ir_opt_hash_clear(IROptHashTable *); /* O(n_buckets), not O(entries) */ -void ir_opt_hash_free(IROptHashTable *); -IROptHashEntry *ir_opt_hash_lookup(IROptHashTable *, uint32_t hash, - int (*eq)(const IROptHashEntry *, const void *), - const void *key); -IROptHashEntry *ir_opt_hash_insert(IROptHashTable *, uint32_t hash); -``` - -`sl_forward`'s store-entry table has alias semantics that don't fit; **don't** touch it. The other 3 are straight rewrites. - ---- - -### Phase 5 — Collect-then-transform engine variant (optional, 2–3 h) - -Several passes (`const_var_prop`, `dead_call_result_elim`, `redundant_var_assign`, `dead_var_store_elim`) follow the pattern: forward pass to collect metadata, finalize, forward pass to transform. 
A 2-phase engine collapses their boilerplate: - -```c -typedef struct IROptCollectGen { - const char *name; - int op; - int (*collect)(IROptCtx *, int idx); /* phase 1 */ - int (*transform)(IROptCtx *, int idx); /* phase 2 */ -} IROptCollectGen; - -int tcc_ir_opt_run_collect_gens(IROptCtx *, const IROptCollectGen *, int n); -``` - -This is **optional** and should only be done after Phase 3 if the collect-transform passes still show significant boilerplate. If they don't, keep them hand-written and skip this phase. - ---- - -### Phase 6 — Theme-based file split (3–5 h, optional, zero flash savings) - -After Phases 0–5 the pre-SSA layer is: -- `opt.c` core (~16,000 lines of hand-written passes that don't fit any engine variant) -- `opt_engine.c`, `opt_du.c`, `opt_xform.c`, `opt_utils.c`, `opt_alias.c`, `opt_loop_utils.c`, `opt_hash.c` -- `opt_gens_fusion.c`, `opt_gens_branch.c`, `opt_gens_bool.c`, `opt_gens_call_result.c` - -Splitting the remaining `opt.c` by theme (cleanup / constprop / memory / loop / promote / peephole) is a pure-readability change and produces **zero flash savings**. It is worth doing once everything else is stable, mostly to make merge conflicts less painful. Don't block any of the earlier phases on this. - ---- - -## Pipeline driver changes - -The optimization driver lives in `tccgen.c` (~lines 25227–26230). Most changes are local one-block replacements where 7 sequential pass calls become 1 engine call: - -- Fusion section (~25446–25478): 9 calls → 1 engine call + 2 hand-written holdouts. -- Branch section (~25277–25291 and ~25535–25589 inside iterative loop): 3–4 calls → 1 engine call. -- Boolean section (~25480–25484): 2 calls → 1 engine call + 1 hand-written CSE. - -Inside the iterative `do { changes += … } while (changes)` loop, each engine invocation creates and destroys its own `IROptCtx` — the analysis cache must not span iterations because `compact_nops` and `dce` between iterations renumber instructions. 
- ---- - -## Risks - -- **Generator function-pointer dispatch overhead.** With ~10 fusion gens and 20K instructions, that's up to 200K indirect calls per engine run. Trigger-op filtering skips ~90% of gens per instruction. If profiling shows >5% overhead, switch to a `switch (op)` dispatch table generated at compile time. Mitigation already proven by `ssa_opt_run_gens` running in production with 14+ gens in `fold` alone. -- **Ordering changes when batching.** Today MLA fusion finishes the entire IR before disp fusion starts. After batching they run at the same instruction. First-match-wins + rule ordering (MLA before disp, indexed before plain disp, etc.) handles this, but every conversion needs a test verifying IR-dump equivalence on a representative input. -- **DU-table invalidation mid-pass.** When a generator changes `MUL→MLA` or `LOAD→LOAD_INDEXED`, the set of defined/used vregs around that index changes. NOP-only transforms preserve DU. Each generator must declare whether it changes opcodes; the engine refreshes DU between gens that need it. The SSA engine handles this via `tcc_ir_ssa_opt_rebuild` — borrow the same approach. -- **Pre-SSA passes that insert instructions.** `add_deref_fold` is the canonical example. Inserting shifts subsequent indices, invalidating the engine's loop counter. These stay hand-written and run **outside** the engine call. Document the rule: "generators must not change instruction count." 
- ---- - -## Estimated effort - -| Phase | What | Time | Net lines removed | -|------:|---------------------------------------------------|----------|-------------------| -| 0 | Delete dead stubs | 15 min | ~30 | -| 1 | Libraries: opt_du / opt_xform / opt_utils / opt_alias / opt_loop_utils | 4–6 h | ~500 (dedup) | -| 2 | Engine: opt_engine.c | 3–4 h | -600 (added) | -| 3.1 | Fusion gens | 4–6 h | ~1,400 | -| 3.2 | Branch gens | 3–4 h | ~500 | -| 3.3 | Bool gens | 1–2 h | ~200 | -| 3.4 | BB hash CSE rewrites | 3–4 h | ~400 | -| 3.5 | Call-result gens | 2 h | ~300 | -| 4 | Generic IROptHashTable | 3–4 h | (counted in 3.4) | -| 5 | Collect-transform engine variant (optional) | 2–3 h | ~250 | -| 6 | Theme-based split of remaining opt.c (optional) | 3–5 h | 0 | -| **Total (phases 0–4)** | **~20–28 h** | **~3,400 (~12%)** | - -Each phase produces a working build. Each can ship independently. If the project ships at any intermediate state, the result is strictly better than today. - ---- - -## Why this rewrite is different from the original plan - -| Original plan said… | This plan says… | -|---------------------------------------------------|--------------------------------------------------------------| -| opt.c is 22,712 lines, ~60 passes | opt.c is 28,973 lines, 81 passes (and growing) | -| Pre-SSA is a migration bridge — passes die as SSA matures | Pre-SSA is permanent infrastructure for post-destruction IR | -| Phase 4 (engine) is optional contingency | Phase 2 (engine) is the **primary** flash-saving mechanism | -| Phases 2 (theme split) first, then engine | Engine first; theme split last (or skip entirely) | -| Invent a fresh `IRPeepholeRule` API | **Mirror** the proven `IRSSAOptGen` / `ssa_opt_run_gens` API | -| Pass conversion is a 4–6 h side project | Pass conversion is **the whole point** — most of the work | \ No newline at end of file diff --git a/docs/plan_opt_predicate_framework.html b/docs/plan_opt_predicate_framework.html new file mode 100644 index 
00000000..aec45fc5 --- /dev/null +++ b/docs/plan_opt_predicate_framework.html @@ -0,0 +1,1067 @@ + + + + + +tinycc — optimizer predicate & guard framework + + + + +
+ + +
+
+

tinycc · armv8-m fork · optimizer proposal

+

Guards, not folklore — a predicate & query framework for the IR optimizer

+

Optimization passes are filters and selectors: scan, check conditions, + rewrite. Nearly every fuzzer miscompile was one missing guard condition. This plan makes + guards a shared, named, composable, observable vocabulary — one op-property + table, one operand iterator, one range engine, one fluent guard DSL, one mutation + funnel, one invalidation walker — so each class of fix lands once, centrally, + forever.

+
    +
  • ~300 opt functions + 15 SSA passes
  • +
  • 1,962 op == TCCIR_OP_* comparisons
  • +
  • ~75 whole-function scan loops
  • +
  • 220 is_jump_target guard sites
  • +
  • 110 op4 sites
  • +
  • ~82 invalidation sites
  • +
  • 10+ fixes that were missing guards
  • +
+
+ +
+

§1 The anatomy of a miscompile

+

every fix was a two-line guard; every sibling pass kept the landmine

+

The differential fuzzer finds an O1/O2 divergence; triage bisects to a pass; the root + cause is one absent condition — the transform was legal except when an MLA + accumulator, a barrel-shift annotation, a switch side-table, a spill-encoded stack + operand, or a join point was involved. The fix is a two-line guard. The same latent gap + usually survives in every sibling pass, because each pass re-derives its guards + privately.

+

The record, mapped to the layer of this framework that makes each class + structural:

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
Bug class | Tests | What went wrong | Layer that ends the class
MLA accumulator invisible to use/def scans | 257 · 267 · 285 | 4th operand at pool[operand_base+3] not advertised by irop_config | L2 ir_q_operands() includes op4 by construction
Barrel-shift annotation ignored | 280 · 281 | barrel_shifts[orig_index] check private to 2 files, absent elsewhere | L1/L2 ir_q_barrel_shifted() in the shared vocabulary
Missing invalidation on def/store/call | 243 · 248 · 266 | each tracking pass re-implements the event set, each missing one event | L6 the walker enumerates events; opting out is explicit
SWITCH_TABLE targets not renumbered on insert | 268 | private insert helper knew about jumps, not switch_tables[] | L5 one mutation funnel carries all remap invariants
Spill-encoded STACKOFF read as a real slot | pack64 | the vreg_type == 0 rule lived in a comment, not an accessor | L2 irop_is_direct_stack_slot()
Fusion across a jump target | 251 | is_jump_target clause forgotten in one peephole scan | L0/L3 join-point stop is default-on
Divergent purity/side-effect op-sets | latent | 8-op vs 30-op classifiers answer the same question differently | L1 one table, named masks, diffs greppable
+
+ good news first +

The raw material already exists: a def-use table (IROptDU, + ir/opt_du.h:46–97) and a flat def-count + (ir_opt_build_def_count), prefix-sum range queries in the register + allocator (ra_has_call_in_range, ir/regalloc.c:109), + a declarative pass pipeline with requires/invalidates + bitmasks (ir/opt_pipeline.c:338–521), and a central kill + switch (TCC_DISABLE_PASS). None of it is the default path — + ~75 loops still hand-roll what these facilities already answer. This plan finishes + plumbing that is 30% built; it does not start from zero.

+
+
+ +
+

§2 The shapes of optimizer code today

+

what ~75 scan loops, five classifiers and six fact-trackers re-derive by hand

+

Every pass opens with the same overture before its actual idea starts:

+
/* the shape that appears ~75 times across ir/ — bounds, NOP skip,
+ * join-point stop, then a hand-rolled op classification */
+for (k = lo + 1; k < hi; k++) {
+  IRQuadCompact *q = &ir->compact_instructions[k];
+  if (q->op == TCCIR_OP_NOP)
+    continue;
+  if (q->is_jump_target)            /* the clause test 251 was missing */
+    return 0;
+  switch (q->op) {
+  case TCCIR_OP_STORE:              /* ...a 30-case switch, different  */
+  case TCCIR_OP_STORE_INDEXED:      /*    in every copy...             */
+  /* ... */
+  }
+}
+

What the survey found (counts from the working tree, branch + heapOverflowBug):

+
    +
  • Range scans, ~35 of them. "Is [lo,hi] free of stores + / calls / joins / redefinitions?" re-implemented with different op sets and different + interval conventions: ir_xform_range_preserves_memory + (ir/opt_xform.c:28), ir_opt_pure_def_memory_stable + (ir/opt_utils.c:880), cse_cmp_op_may_clobber + (ir/opt.c:2332), loop_body_may_clobber_memory + (ir/licm.c:1633), ir_opt_vreg_has_def_in_range + (ir/opt_dce.c:577). Only the register allocator precomputes + prefix sums (ir/regalloc.c:84/125); everyone else re-scans + O(range) inside O(n) outer loops.
  • +
  • Op classifiers, duplicated and divergent. + has_side_effects (ir/licm.c:43) knows 8 ops; + ssa_opt_has_side_effects (ir/opt/ssa_opt.c:244) + knows 30 — including STORE_POSTINC, VLA ops, inline asm and setjmp, + which licm's copy simply does not. Plus gvn_is_pure_alu / + gvn_is_commutative, op_is_unsafe_for_reroll (27 cases), + lcs_op_supported (27 cases) — same concept, five op-sets. 1,962 raw + op == comparisons total.
  • +
  • Operand-kind folklore. 323 irop_is_immediate sites, + 882 is_lval reads, 809 TCCIR_DECODE_VREG_TYPE sites. The + header rule that a STACKOFF operand is a real stack slot only when + vreg_type == 0 (tccir_operand.h:55–66) is + honored by ~2 call sites; five near-identical stack-address predicates exist + (opt_alias.c:84 · core.c:327 · licm.c:34 · licm.c:1238 · + opt_knownbits.c:195) — not all apply the rule.
  • +
  • The 4th operand. pool[operand_base+3] is overloaded + per-op — MLA accumulator, indexed scale, SELECT condition + (tccir.h:813/800/833) — and irop_config + advertises only dest/src1/src2. 110 sites hand-handle it; the helper + ir_opt_mla_accum_vreg (ir/opt_constprop.c:353) + reached only 7 of them.
  • +
  • Use/def scans. ~34 ad-hoc "count uses of vreg X" full scans and + ~48 backward find-the-def scans, despite IROptDU, + DC_IS_SINGLE_DEF (ir/opt_du.h:104–107) and the + SSA per-vreg use lists all existing.
  • +
  • Duplicated annotation checks. + has_barrel_shift_annotation copy-pasted verbatim in + ssa_opt_fold.c:26 and + ssa_opt_reassoc.c:36.
  • +
  • Invalidation, hand-rolled six times. ~82 "drop cached facts on + def/store/call" sites across opt_memory.c (46), + opt_knownbits.c (15), opt_copyprop.c + (9), opt_constprop.c (6), ssa_opt_sccp.c, + ssa_opt_cprop.c.
  • +
  • Call purity by name. ir_opt_is_pure_helper_name and + siblings (ir/opt_utils.c:688+) — reasonable, but consulted + ad hoc rather than through one call-classification point.
  • +
+
+ +
+

§3 Design overview — seven layers, one vocabulary

+

pure additions over the existing representation; old helpers become wrappers, then die

+

Seven layers, L0–L6. Each is independently adoptable and lands as a + pure addition; an old helper becomes a one-line wrapper over the framework and is + deleted with its last caller. No IR redesign: everything operates on the existing flat + compact_instructions[], the operand pool, and the side tables keyed by + orig_index.

+ +
+ + + + + + + + + + + + + + ~300 opt passes · 15 SSA passes · licm · regalloc · codegen peepholes + what remains per pass: match → guard → transform + + + + L4 ir/guard.h + when(x) and(not(y)) + TCC_TRACE_GUARDS + + + L6 ir/track.c + event walker: def · mem + call · barrier · join + + + L5 ir/mutate.c + insert · delete · replace + all side-table remaps + + + + L3+L0 ir/query.c + ir_range_ok() · IRRangeIndex O(1) + IRCursor / IR_SCAN boilerplate + + + L2 ir/predicates.h + ir_q_operands() — op4 aware + irop_is_direct_stack_slot() + + + + L1 ir/predicates.c — ir_op_props[TCCIR_OP_COUNT] + one property table · IROP_M_* named masks · IROP_P_KNOWN selftest + + + + IROptDU · def_count + SSA use lists — existing; + becomes the default path (ph. 5) + + + opt_pipeline groups + requires/invalidates bitmasks + TCC_DISABLE_PASS — unchanged + + + regalloc prefix sums + generalize into IRRangeIndex + + + + representation — unchanged + compact_instructions[] · iroperand_pool · irop_config · switch_tables[] · + barrel_shifts[orig_index] · shift64_dead_half[] · bfi_params[] + + + + + + + + + + + + + + + + + + + + +
Fig. 1 — The layer stack. Amber layers are new; green blocks already exist + and get promoted to the default path (dashed arrows: DU serves the passes directly, + regalloc's prefix sums generalize into IRRangeIndex); the purple representation does not + change.
+
+
+ new framework layer + existing, promoted + representation (unchanged) + the passes +
+ +
+ + + + + + + + + + + + + + + + + + + + + +
File | Layer | Contents | Naming
tccir_operand.h (existing) | L2 | irop_is_direct_stack_slot() family — beside the prose rule it encodes | irop_*
ir/predicates.h + .c | L1+L2 | op-property table, masks, ir_q_* quad queries, selftest | ir_op_* · ir_q_*
ir/guard.h | L4 | the fluent guard DSL — opt-in include, never dragged in by ir/ir.h | when · and · and_not · not
ir/query.h + .c | L0+L3 | cursors, range engine, IRRangeIndex | ir_cursor_* · ir_range_*
ir/mutate.h + .c | L5 | insert/delete/replace funnel | tcc_ir_* (public)
ir/track.h + .c | L6 | tracking-pass event walker | ir_track_*
+
+ +
+

§4 L1 — one op-property table

+

op classification becomes data; unknown means dangerous

+

One table, orthogonal property bits, and named masks that reproduce + each legacy classifier so the historical differences become one greppable line each:

+
/* ir/predicates.h */
+typedef uint32_t IROpProps;
+#define IROP_P_KNOWN        (1u << 0)   /* entry was written on purpose  */
+#define IROP_P_WRITES_MEM   (1u << 1)   /* STORE*, BLOCK_COPY            */
+#define IROP_P_READS_MEM    (1u << 2)
+#define IROP_P_CALL_LIKE    (1u << 3)   /* FUNCCALL*, builtin apply, ... */
+#define IROP_P_TERMINATOR   (1u << 4)   /* JUMP, JUMPIF, SWITCH_*, RETURN* */
+#define IROP_P_ASM          (1u << 5)
+#define IROP_P_SP_EFFECT    (1u << 6)   /* VLA alloc / SP save-restore   */
+#define IROP_P_EH           (1u << 7)   /* setjmp/longjmp                */
+#define IROP_P_CALLSEQ      (1u << 8)   /* call-arg staging ops          */
+#define IROP_P_ALU          (1u << 9)   /* pure computation, incl. MLA   */
+#define IROP_P_COMMUTATIVE  (1u << 10)
+#define IROP_P_CMP          (1u << 11)
+#define IROP_P_HAS_OP4      (1u << 12)  /* MLA / *_INDEXED / SELECT      */
+
+extern const IROpProps ir_op_props[TCCIR_OP_COUNT];  /* new sentinel after
+                                                        TCCIR_OP_SMULL (tccir.h:229) */
+static inline IROpProps ir_op_p(TccIrOp op)
+{
+  IROpProps p = ir_op_props[op];
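+  /* unknown = every effect bit set, but none of the permissive bits
+   * (ALU / COMMUTATIVE / CMP) that would license a transform */
+  return (p & IROP_P_KNOWN)
+             ? p
+             : ~(IROP_P_ALU | IROP_P_COMMUTATIVE | IROP_P_CMP);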
+}
+static inline int ir_op_any(TccIrOp op, IROpProps mask)
+{
+  return (ir_op_p(op) & mask) != 0;
+}
+
+/* each legacy classifier, as one reviewable line: */
+#define IROP_M_CLOBBERS_MEM (IROP_P_WRITES_MEM|IROP_P_CALL_LIKE|IROP_P_ASM|\
+                             IROP_P_SP_EFFECT|IROP_P_EH)
+#define IROP_M_SIDE_EFFECT  (IROP_M_CLOBBERS_MEM|IROP_P_TERMINATOR|IROP_P_CALLSEQ)
+#define IROP_M_BARRIER      (IROP_M_CLOBBERS_MEM|IROP_P_TERMINATOR)
+

gvn_is_pure_alu (26 lines) becomes + ir_op_any(op, IROP_P_ALU). The licm/ssa_opt disagreement becomes a diff + between two IROP_M_* definitions instead of two 30-line switches in + different files.

+
+
Decision — unknown means dangerous
+

With designated initializers, a forgotten table entry reads as all-zero — + i.e. "pure", exactly the failure mode this framework exists to kill. The + IROP_P_KNOWN bit inverts it: an unclassified op behaves as + clobbers-everything, so forgetting an entry can only pessimize, never miscompile. + ir_predicates_selftest() — run under TCC_IR_SELFTEST=1 and + from the unit suite — asserts every op below TCCIR_OP_COUNT has + IROP_P_KNOWN and cross-checks IROP_P_HAS_OP4 against + irop_config.

+
recommended · fail conservative, fail loud
+
+
+ +
+

§5 L2 — operands without folklore

+

the 4th operand and the STACKOFF rule, as accessors instead of comments

+

Two representation subtleties caused five separate miscompiles. Both become + accessors.

+

The 4th operand

+ +
+ + + + + + + + iroperand_pool[q->operand_base + …] + + + + + +0 dest + +1 src1 + +2 src2 + +3 op4 + irop_config.has_dest + irop_config.has_src1 + irop_config.has_src2 + not advertised + + + covered by every irop_config-driven operand fan-out + + invisible to naïve scans + + + + per-op meaning of slot 3 + TCCIR_OP_MLA → accumulator — + a real VREG USE + LOAD/STORE_INDEXED → scale (immediate) + TCCIR_OP_SELECT → condition + +
Fig. 2 — The quad operand layout. irop_config advertises three + slots; the overloaded 4th is the "invisible use" behind tests 257, 267 and 285 — + ir_q_operands() makes it impossible to miss.
+
+ +
/* ir/predicates.h */
+typedef struct IROperandRef {
+  IROperand op;
+  uint8_t slot;         /* 0=dest 1=src1 2=src2 3=op4 */
+  uint8_t is_def;       /* writes a vreg (non-lval dest) */
+  uint8_t is_vreg_use;  /* reads a vreg: srcs, MLA accum, AND an lval
+                           dest — a store THROUGH dest reads its address */
+  uint8_t writes_mem;
+} IROperandRef;
+
+int ir_q_operands(const TCCIRState *ir, const IRQuadCompact *q,
+                   IROperandRef out[4]);                    /* returns count */
+int ir_q_vreg_uses(const TCCIRState *ir, const IRQuadCompact *q,
+                    int32_t out[4]);                        /* op4 included  */
+int32_t ir_q_def_vreg(const TCCIRState *ir, const IRQuadCompact *q); /* -1 if none */
+
+/* deduped from ssa_opt_fold.c:26 / ssa_opt_reassoc.c:36 (verbatim clones) */
+static inline int ir_q_barrel_shifted(const TCCIRState *ir, const IRQuadCompact *q)
+{
+  return ir->barrel_shifts && q->orig_index >= 0 &&
+         q->orig_index <= ir->max_orig_index &&
+         ir->barrel_shifts[q->orig_index];
+}
+ +

A use-count scan written against ir_q_vreg_uses cannot miss the + accumulator — the bug class of tests 257/267/285 stops being writable:

+
+
+

before — accum handled only if the author remembered

+
if (irop_config[q->op].has_src1 &&
+    irop_get_vreg(src1) == vr) uses++;
+if (irop_config[q->op].has_src2 &&
+    irop_get_vreg(src2) == vr) uses++;
+if (q->op == TCCIR_OP_MLA && ...)
+  /* often absent — tests 257/267/285 */
+
+
+

after — op4 included by construction

+
int32_t u[4];
+int n = ir_q_vreg_uses(ir, q, u);
+for (int k = 0; k < n; k++)
+  if (u[k] == vr)
+    uses++;
+
+
+ +

The STACKOFF rule

+

The vreg_type == 0 real-slot test moves from prose + (tccir_operand.h:55–66) into accessors that live right beside + it:

+
/* tccir_operand.h — the rule, as code */
+static inline int irop_is_direct_stack_slot(IROperand op)
+{ return irop_get_tag(op) == IROP_TAG_STACKOFF && op.vr.vreg_type == 0; }
+
+static inline int irop_is_stack_slot_addr(IROperand op)   /* Addr[StackLoc]  */
+{ return irop_is_direct_stack_slot(op) && !op.vr.is_lval; }
+static inline int irop_is_stack_slot_deref(IROperand op)  /* StackLoc deref  */
+{ return irop_is_direct_stack_slot(op) && op.vr.is_lval; }
+

The five scattered stack-address predicates become wrappers, then callers migrate, + then the wrappers go. The one in ir/licm.c:34 that + omits the vreg_type check gets the fix for free.

+
+ +
+

§6 L3 — range queries: one engine

+

one function answers "is this range safe?" — O(1) on the hot path

+

The stop-set is expressed in L1 masks, the common structural conditions are flags, + and an escape hatch exists for genuinely custom checks:

+
/* ir/query.h */
+#define IR_RANGE_NO_JUMP_TARGET   (1u << 0)  /* no join point inside — DEFAULT ON */
+#define IR_RANGE_NO_LVAL_DEST     (1u << 1)  /* no memory write via lval dest     */
+#define IR_RANGE_ALLOW_PURE_CALLS (1u << 2)  /* ir_opt_is_pure_helper_name carve-out */
+
+typedef struct IRRangeQuery {
+  IROpProps stop;          /* any matching op → fail (use IROP_M_* masks) */
+  uint32_t  flags;
+  int32_t   no_redef[4];   /* vregs that must not be (re)defined inside  */
+  int       n_redef;
+  int (*extra)(void *uctx, TCCIRState *ir, int idx, const IRQuadCompact *q);
+  void     *extra_ctx;     /* extra must be file-scope static — see §7   */
+} IRRangeQuery;
+
+int ir_range_ok(TCCIRState *ir, int lo, int hi, const IRRangeQuery *rq);
+int ir_range_ok_simple(TCCIRState *ir, int lo, int hi,
+                        IROpProps stop, uint32_t flags);
+

The six duplicated scanners become wrappers whose masks reproduce today's op sets + bit-exactly (semantic unification, where wanted, is a separate, + separately-swept commit):

+
int ir_range_preserves_memory(TCCIRState *ir, int lo, int hi)  /* opt_xform/utils/cse */
+{
+  return hi >= lo && ir_range_ok_simple(ir, lo, hi, IROP_M_BARRIER,
+                       IR_RANGE_NO_JUMP_TARGET | IR_RANGE_NO_LVAL_DEST);
+}
+int ir_range_no_redef(TCCIRState *ir, int lo, int hi, int32_t vreg);  /* opt_dce.c:577 */
+
+
Decision — the interval is the open interior (lo, hi)
+

Endpoints are never inspected; inclusive-end variants (the regalloc + backward-switch-target case, ra_has_switch_in_range) are explicit + wrappers, not flags. Today every scanner picks its own convention — off-by-one + differences between them are unauditable.

+
one convention, asserted · wrappers for the exceptions
+
+

Prefix sums by default for the hot path. IRRangeIndex + generalizes the register allocator's private ra_build_call_prefix / + ra_build_switch_prefix: per-class (CALL / STORE / JUMP_TARGET / SWITCH / + TERMINATOR) prefix counts, cached in IROptCtx behind a generation counter + exactly like the existing du_gen + (ir/opt_engine.h:24–31). + ir_range_ok_ctx(ctx, …) answers flags-only queries in O(1); only + no_redef/extra clauses walk instructions. Several O(n·range) + passes become O(n) with no caller restructuring.

+

L0 rides along in the same header — a cursor that owns the boilerplate overture:

+
IR_SCAN(c, ir) {                      /* bounds + NOP skip, nothing hidden:  */
+  if (c.q->op != TCCIR_OP_MUL)        /* c.i and c.q are plain fields,       */
+    continue;                         /* single-steppable in gdb             */
+  ...
+}
+IR_SCAN_BLOCK(c, ir, start) { ... }   /* additionally stops at is_jump_target
+                                         joins and after terminators */
+
+ +
+

§7 L4 — the guard DSL: when(x) and(not(y))

+

fluent surface, macro splicing, zero indirection — and observable rejections

+

The centerpiece. The composable conditions read fluently, but the mechanism is macro + splicing onto C's own short-circuiting && — fluent surface, zero + runtime indirection, every clause a plain expression you can breakpoint:

+
/* ir/guard.h — opt-in include for pass files, never pulled in by ir/ir.h */
+#define when(x)     (ir_guard_clause((x), #x, __FILE__, __LINE__))
+#define and(x)      && when(x)
+#define and_not(x)  && when(!(x))
+#define not(x)      (!(x))
+
+static inline int ir_guard_clause(int ok, const char *txt,
+                                    const char *file, int line)
+{
+  if (!ok && tcc_ir_guard_trace_match(file))     /* one cached-flag branch */
+    fprintf(stderr, "[GUARD] %s:%d rejected: %s\n", file, line, txt);
+  return ok;
+}
+

Usage — the reassoc guard that tests 280/281 retrofitted, as one legible unit:

+
if (when(ir_op_any(q->op, IROP_P_ALU))
+    and(ssa_single_use(ctx, t_vr))
+    and_not(ir_q_barrel_shifted(ir, q))
+    and_not(ir_q_barrel_shifted(ir, inner))
+    and(ir_range_ok_simple(ir, def_idx, use_idx, IROP_M_CLOBBERS_MEM,
+                           IR_RANGE_NO_JUMP_TARGET)))
+{
+  /* transform */
+}
+

Observability is the point. During fuzz triage, "which clause + admitted (or rejected) this transform" is the whole game. + TCC_TRACE_GUARDS=<substring> (matched against the file name, same + style as TCC_DISABLE_PASS) makes every failing clause print its own source + text and location — the bisect workflow gets clause-level resolution for free.

+

Nested functions: welcome, with one rule. Both host gcc (16.1.1, + -std=c11 -Werror, no -pedantic) and tcc itself support GNU + nested functions — this fork even implements the static chain for them — so + self-hosting survives. Used as directly-called, locally named guards + they cost nothing and keep guard logic next to the transform:

+
static int fuse_pair(IRSSAOptCtx *ctx, int i)
+{
+  TCCIRState *ir = ctx->ir;
+  int operand_ok(IROperand a) {                  /* local guard: direct calls
+                                                    only — no trampoline */
+    return !a.vr.is_lval && irop_get_tag(a) == IROP_TAG_VREG;
+  }
+  ...
+  if (when(operand_ok(s1)) and(operand_ok(s2)) ...) { ... }
+}
+

Taking a nested function's address is the line not to cross: that + materializes a trampoline and an executable stack. So: custom predicates passed + into scanners (IRRangeQuery.extra) must be file-scope + static; the rule is enforced mechanically by adding + -Wtrampolines to the build (with the existing -Werror it is a + hard error, and it fires exactly and only when a trampoline is generated).

+
+
Decision — language features
+

C11 + GNU extensions now (nested functions, statement expressions, + typeof); C23 conveniences (__VA_OPT__, + constexpr tables) may be adopted as the macro machinery wants them — with + the standing rule that anything the tcc frontend doesn't yet accept gets + implemented in tcc first, so the compiler always compiles itself. The host + toolchain (gcc 16) already accepts all of it; nothing in the build adds + -pedantic.

+
self-hosting is the invariant, not the standard revision
+
+
+ namespace caveat +

Lowercase when/and/and_not/not + is the requested aesthetic and is legal C provided <iso646.h> is + never included (it defines and, not as operator macros) and + no included header uses those identifiers. That is why ir/guard.h is an + explicit opt-in include for pass files, placed after system headers. If a collision + ever appears, the escape hatch is one sed to + WHEN/AND/AND_NOT/NOT — the design + does not depend on the casing.

+
+

Rejected alternatives, honestly: builder-struct method chaining + (ir_when(q)->is_op(..)->ok()) needs function-pointer fields or + closures, evaluates eagerly unless wrapped in macros anyway, and puts an indirection + between gdb and every clause. X-macro condition tables add indirection + without power — except where conditions genuinely are data, which is exactly the L1 + property table and the existing pass pipeline, and those stay.

+
+ +
+

§8 L5 — mutation is a funnel

+

insert/delete/replace with every side-table invariant in one place

+

Structural edits must maintain, atomically:

+
    +
  1. JUMP/JUMPIF absolute-index immediates,
+
  2. switch_tables[].targets and .default_target (and the + SWITCH_LOAD value tables),
+
  3. is_jump_target bits,
+
  4. orig_index stability — barrel_shifts[], + shift64_dead_half[], bfi_params[] are keyed by it.
+
+

tcc_ir_opt_compact_nops does all four correctly (the + old_to_new[] remap, ir/opt_dce.c:2618 onward). + licm's private insert_instruction_before + (ir/licm.c:477) knew about jumps but historically not switch + side-tables — that was test 268, and the ninth defect of the pure-call-hoist saga. The + framework makes the blessed path the only path:

+
/* ir/mutate.h */
+int  tcc_ir_insert_before(TCCIRState *ir, int idx, TccIrOp op,
+                           const IROperand *ops, int n_ops);
+     /* capacity, shift, +1 remap of jump immediates AND switch tables,
+        is_jump_target migration, FRESH orig_index (side tables grown) —
+        returns the new index */
+void tcc_ir_q_delete(TCCIRState *ir, int idx);
+     /* logical delete: NOP-out, operands cleared; indices stable.
+        Physical removal happens only in the one blessed compactor. */
+int  tcc_ir_q_replace_op(TCCIRState *ir, int idx, TccIrOp new_op);
+     /* asserts slot-count compatibility against irop_config — catches
+        "replaced MLA with MUL, orphaned the accumulator" edits */
+

All three bump ir->mutation_gen, so the IROptCtx caches + (DU, IRRangeIndex) can assert freshness instead of trusting pass + authors to invalidate. Implementation is mostly promotion: hoist licm's insert, add the + switch-table remap loop from compact_nops, delete the private copy.

+
+
Decision — inserts get a fresh orig_index
+

Not a -1 sentinel: annotation readers are already bounds-checked + against max_orig_index, and fresh IDs (growing the side tables) keep + "annotate the instruction you just created" a legal operation.

+
fresh ids · side tables grow · readers unchanged
+
+
+ +
+

§9 L6 — tracking passes share one walker

+

six passes, one event walker; "forgot to invalidate" becomes unwritable

+

The six value-tracking passes are the same machine with different fact tables: walk + forward, accumulate facts, drop facts on events (def, memory write, + call, barrier, join), act on what remains. Each re-implements the event set; tests 243, + 248 and 266 were each one forgotten event in one pass. The walker owns event enumeration + and ordering; the pass owns only its facts:

+
/* ir/track.h */
+typedef struct IRTrackHooks {
+  void (*on_def)(void *st, int idx, int32_t vreg, IROperand dest);
+  void (*on_mem_write)(void *st, int idx, const IRQuadCompact *q);
+  void (*on_call)(void *st, int idx, const IRQuadCompact *q, int purity);
+  void (*on_barrier)(void *st, int idx, const IRQuadCompact *q); /* asm/vla/eh */
+  void (*on_join)(void *st, int idx);         /* is_jump_target: paths merge  */
+  int  (*on_instr)(void *st, int idx, IRQuadCompact *q);  /* the pass's work,
+                                                 runs AFTER this index's events */
+} IRTrackHooks;
+
+int ir_track_walk(IROptCtx *ctx, const IRTrackHooks *hooks, void *state);
+ +
+ + + + + + + + + instruction stream → + + + + + #12 V3 ← 40 + #13 [S0+8] ← V3 + #14 call memcpy() + #15 (join) T2 ← … + definition + memory write + classified call + jump target + + + + + + + + + ir_track_walk — every event, in order; opting out = explicit track_ignore + on_def(V3) + on_mem_write + on_call(purity) + on_join + events fire before this index's on_instr — one ordering convention, + enforced by the walker (today each pass implicitly picks its own) + + + + + + + constprop facts + vreg → const · 6 invalidation + sites become hooks + + knownbits facts + vreg → bit lattice · 15 sites + become hooks + +
Fig. 3 — One walker fires the events; client passes only maintain fact + tables. on_def enumerates definitions via ir_q_operands, so + op4 is handled centrally; on_call arrives pre-classified through the + purity helpers.
+
+ +

Every hook is mandatory (the walker asserts non-NULL). A pass that + genuinely doesn't care about an event registers the documented no-op + track_ignore — "forgot to invalidate" becomes a visible, greppable, + reviewable decision instead of an absence. Cost: one indirect call per event on an O(n) + walk — noise next to the switch bodies these passes already execute; verified with the + existing TCC_PASS_TIMING infrastructure.

+

Pilot order by blast radius: opt_constprop (6 sites) → + opt_copyprop (9) → opt_knownbits (15) → + checkpoint. opt_memory.c (46 sites, phase-structured + entry-store machinery) is explicitly a stretch goal, not a plan dependency — if the + walker doesn't fit it, it keeps its hand-rolled loop and the plan still closes.

+
+ +
+

§10 What this deletes

+

net −300 lines now; the prize is the marginal cost of the next fix

+
+ + + + + + + + + + + + + + + + + +
Consolidation | Sites today | ≈ LOC out
Divergent side-effect/purity classifiers → L1 masks | 5 classifiers (licm, ssa_opt, cse, reroll, lcs) | −250
6 range scanners → L3 wrappers; ~25 more inline range loops | opt_xform · opt_utils · opt · licm · opt_dce · regalloc | −400
5 stack-addr predicates + 2 barrel-shift clones → L2 | opt_alias · core · licm ×2 · knownbits; fold + reassoc | −120
Ad-hoc use-count / find-def scans → IROptDU / DC_* | ~34 + ~48 sites | −500
Manual op4 handling → ir_q_operands | 110 sites (a subset are emitters that stay) | −130
Tracking-pass invalidation → L6 walker | ~82 sites, 3 pilot passes | −300 (−800 more if opt_memory converts)
New framework code | predicates · query · guard · mutate · track | +1,380
+
+ honest framing +

Net is only ≈ −300 lines on day one (≈ −1,100 if the stretch goal lands). The + prize is not the delta — it is the marginal cost of the next pass and the + next fix: guards written in vocabulary instead of re-derived 30-op switches, + and a fuzz fix that lands in one table row or one walker event instead of N passes. + Every row of the §1 table is a fix that was applied to one pass and stayed a landmine + in the others.

+
+
+ +
+

§11 Migration plan — seven phases, each shippable

+

standard gate: make test -j16 green + touched fuzz profiles swept clean · TCC_DISABLE_PASS names unchanged

+
+
0table
+

Op-property table + selftest — zero call-site changes

+

ir/predicates.{h,c}: ir_op_props[] + + IROP_P_KNOWN selftest + TCCIR_OP_COUNT sentinel. + Pure addition, ≈ +350 LOC. Risk ~nil.

+
selftest wired into unit suite / CI
+
+
1operands
+

Operand vocabulary

+

L2 accessors + ir_q_*; convert the 5 stack-addr predicates, the 2 + barrel-shift clones, and the manual op4 scan sites. ≈ +150/−250 LOC. Risk low.

+
regression tests 257/267/285 + pack64 suite
+
+
2ranges
+

Cursor + range engine + prefix sums

+

L0 cursor, ir_range_ok, IRRangeIndex; replace the 6 + named scanners with bit-exact wrappers; pilot ~10 inline range + loops. Semantic merges are separate, separately-swept commits. ≈ +300/−400 LOC. + Risk medium.

+
TCC_PASS_TIMING corpus run — no compile-time regression >2%
+
+
3guards
+

The guard DSL

+

ir/guard.h + TCC_TRACE_GUARDS; adopt across the 15 SSA + passes; add -Wtrampolines to CFLAGS. ≈ +80/−100 LOC. Risk low.

+
trace output exercised in the bisect/triage workflow
+
+
4mutate
+

The mutation funnel

+

ir/mutate.{h,c}; route licm and all inserters/deleters through the + funnel; mutation_gen freshness asserts. ≈ +200/−150 LOC. Risk + medium.

+
test 268 + switch-heavy fuzz seeds
+
+
5def-use
+

Def-use tables become the default path

+

Convert the ~34 use-count + ~48 find-def scans to + IROptDU/DC_IS_SINGLE_DEF/SSA use lists. ≈ +50/−500 LOC. + Risk medium (mechanical but wide).

+
per-pass commits · timing check (expected improvement)
+
+
6tracking
+

The event walker

+

ir/track.{h,c}; constprop → copyprop → knownbits → + checkpoint → (stretch) opt_memory. ≈ +250/−300 LOC. Risk high — + one pass per PR.

+
tests 243/248/266 · extended fuzz budget · one pass per PR
+
+
+
+ sequencing constraints +

Phases 0–1 are safe any time. Phase 2's wrapper masks must reproduce legacy op + sets bit-exactly — any intentional strengthening is its own commit with its own + sweep. Phase 6 is one pass per PR with a checkpoint before opt_memory. + Never run fuzz sweeps or reducers while the tree is mid-conversion — sweeps racing a + rebuild report phantom divergences, and the sweep cache misses header changes (clear + .sweep_cache after phases 0–2).

+
+
+ +
+

§12 Risks & open questions

+

what could bite, and the calls already made

+
+ + + + + + + + + + + + + + + + + + + + + + + +
Risk / question | Position
Generic scanner slower than inlined loops in the + O(n²)-ish big passes (opt_dce.c, + opt_memory.c). | The flags-only path is the same loop it replaces; IRRangeIndex + makes hot queries O(1). Every phase gates on a TCC_PASS_TIMING + corpus run.
Semantic drift while merging classifiers — the real hazard + of L1. | Phase-2 rule: wrappers reproduce each legacy op set bit-exactly; unification + is a separate, separately-swept commit per merge.
Table rot when opcodes are added. | IROP_P_KNOWN makes rot conservative, not wrong; the selftest + makes it loud.
Nested functions: portability. clang would reject them; a + future non-gcc host build breaks. | Build is gcc-only today (config.mak: CC=gcc) and tcc self-hosts + them. The DSL itself uses no nested functions — they are an allowed + pattern, fenced by -Wtrampolines -Werror.
Lowercase and/not/when macro + collisions. | Opt-in ir/guard.h, included last, ir/-internal + only; documented one-sed rename to uppercase as the escape hatch.
Guard-macro debuggability. | Clauses stay plain expressions — breakpointable, no interpreter. Bounded + splice, no recursive metaprogramming. TCC_TRACE_GUARDS actively + improves triage.
opt_memory.c may not fit the L6 walker + (phase-structured entry-store machine, 46 sites). | Explicit checkpoint after knownbits; converting it is stretch, not a + dependency.
Open: SSA passes — keep their vinfo use lists + or adopt IROptCtx caches? | predicates.h/guard.h are context-free (usable from both); query-ctx variants + stay pre-SSA; SSA keeps vinfo until proven otherwise.
Open: regalloc adopts IRRangeIndex? | Its bespoke prefix sums are already correct; converting is optional cleanup, + never a phase gate.
Open: C23 adoption pace. | Only as the macro machinery earns it, and tcc's frontend implements each + feature first (self-hosting invariant).
+
+ +
+ tinycc armv8-m fork · optimizer predicate & guard framework proposal · 2026-07-03 · + counts & line numbers from a source survey of the working tree (branch + heapOverflowBug) · markdown source: + plan_opt_predicate_framework.md +
+
+
+ + diff --git a/docs/plan_opt_predicate_framework.md b/docs/plan_opt_predicate_framework.md new file mode 100644 index 00000000..3c7622da --- /dev/null +++ b/docs/plan_opt_predicate_framework.md @@ -0,0 +1,639 @@ +# Guards, not folklore — a predicate & query framework for the IR optimizer + +> tinycc · armv8-m fork · optimizer proposal · 2026-07-03 +> +> Styled version with full diagrams: [plan_opt_predicate_framework.html](plan_opt_predicate_framework.html) +> (self-contained, open in a browser). This Markdown is the diff-friendly source of truth; +> Mermaid diagrams render on GitHub and in VS Code preview. + +Optimization passes are filters and selectors: scan instructions, check conditions, +rewrite. Nearly every fuzzer miscompile fixed in this fork was one **missing guard +condition** — a check that a sibling pass had already learned the hard way. This plan +turns guards from per-pass folklore into a shared, named, composable, *observable* +vocabulary — one op-property table, one operand iterator, one range engine, one fluent +guard DSL, one mutation funnel, one invalidation walker — so each class of fix lands +once, centrally, forever. + +| | | +|---|---| +| `tcc_ir_opt_*` functions | ~300, plus 15 SSA passes | +| `op == TCCIR_OP_*` comparisons | 1,962 | +| whole-function scan loops | ~75 | +| range-scan predicates | ~35 (only 2 use prefix sums) | +| `is_jump_target` guard sites | 220 | +| `operand_base+3` (op4) sites | 110 | +| invalidation sites in 6 tracking passes | ~82 | +| fuzz fixes that were missing guards | 10+ named regression tests | + +## Contents + +1. [The anatomy of a miscompile](#1-the-anatomy-of-a-miscompile) +2. [The shapes of optimizer code today](#2-the-shapes-of-optimizer-code-today) +3. [Design overview](#3-design-overview--seven-layers-one-vocabulary) +4. [L1 — one op-property table](#4-l1--one-op-property-table) +5. [L2 — operands without folklore](#5-l2--operands-without-folklore) +6. 
[L3 — range queries: one engine](#6-l3--range-queries-one-engine) +7. [L4 — the guard DSL](#7-l4--the-guard-dsl-whenx-andnoty) +8. [L5 — mutation is a funnel](#8-l5--mutation-is-a-funnel) +9. [L6 — tracking passes share one walker](#9-l6--tracking-passes-share-one-walker) +10. [What this deletes](#10-what-this-deletes) +11. [Migration plan](#11-migration-plan--seven-phases-each-shippable) +12. [Risks & open questions](#12-risks--open-questions) + +--- + +## §1 The anatomy of a miscompile + +The differential fuzzer finds an O1/O2 divergence; triage bisects to a pass; the root +cause is one absent condition — the transform was legal *except* when an MLA accumulator, +a barrel-shift annotation, a switch side-table, a spill-encoded stack operand, or a join +point was involved. The fix is a two-line guard. The same latent gap usually survives in +every sibling pass, because each pass re-derives its guards privately. + +The record, mapped to the layer of this framework that makes each class structural: + +| Bug class | Regression tests | What went wrong | Layer that ends the class | +|---|---|---|---| +| MLA accumulator invisible to use/def scans | 257, 267, 285 | 4th operand at `pool[operand_base+3]` not advertised by `irop_config` | **L2** — `ir_q_operands()` includes op4 by construction | +| Barrel-shift annotation ignored | 280, 281 | `ir->barrel_shifts[orig_index]` check private to 2 files, absent elsewhere | **L1/L2** — `ir_q_barrel_shifted()` in the shared vocabulary | +| Missing invalidation on def/store/call | 243, 248, 266 | each tracking pass re-implements the event set, each missing one event | **L6** — the walker enumerates events; opting out is explicit | +| SWITCH_TABLE targets not renumbered on insert | 268 | private insert helper knew about jumps, not `switch_tables[]` | **L5** — one mutation funnel carries all remap invariants | +| Spill-encoded STACKOFF read as a real slot | pack64 (longlong 7–85) | the `vreg_type == 0` rule lived in a comment, not an 
accessor | **L2** — `irop_is_direct_stack_slot()` | +| Fusion across a jump target | 251 | `is_jump_target` clause forgotten in one peephole scan | **L0/L3** — join-point stop is default-on | +| Divergent purity/side-effect op-sets | latent class | 8-op vs 30-op classifiers answer the same question differently | **L1** — one table, named masks, diffs greppable | + +> **Good news first.** The raw material already exists: a def-use table (`IROptDU`, +> `ir/opt_du.h:46–97`) and a flat def-count (`ir_opt_build_def_count`), prefix-sum range +> queries in the register allocator (`ra_has_call_in_range`, `ir/regalloc.c:109`), a +> declarative pass pipeline with `requires`/`invalidates` bitmasks +> (`ir/opt_pipeline.c:338–521`), and a central kill switch +> (`TCC_DISABLE_PASS` → `tcc_ir_opt_pass_disabled`, `ir/opt_utils.c:28`). None of it is +> the *default path* — ~75 loops still hand-roll what these facilities already answer. +> This plan finishes plumbing that is 30% built, it does not start from zero. + +## §2 The shapes of optimizer code today + +Every pass opens with the same overture before its actual idea starts: + +```c +/* the shape that appears ~75 times across ir/ — bounds, NOP skip, + * join-point stop, then a hand-rolled op classification */ +for (k = lo + 1; k < hi; k++) { + IRQuadCompact *q = &ir->compact_instructions[k]; + if (q->op == TCCIR_OP_NOP) + continue; + if (q->is_jump_target) /* the clause test 251 was missing */ + return 0; + switch (q->op) { + case TCCIR_OP_STORE: /* ...a 30-case switch, different */ + case TCCIR_OP_STORE_INDEXED: /* in every copy... */ + /* ... */ + } +} +``` + +What the survey found (counts from the working tree, branch `heapOverflowBug`): + +- **Range scans, ~35 of them.** "Is `[lo,hi]` free of stores / calls / joins / + redefinitions?" 
re-implemented with different op sets and different interval + conventions: `ir_xform_range_preserves_memory` (`ir/opt_xform.c:28`), + `ir_opt_pure_def_memory_stable` (`ir/opt_utils.c:880`), `cse_cmp_op_may_clobber` + (`ir/opt.c:2332`), `loop_body_may_clobber_memory` (`ir/licm.c:1633`), + `ir_opt_vreg_has_def_in_range` (`ir/opt_dce.c:577`). Only the register allocator + precomputes prefix sums (`ra_build_call_prefix` / `ra_build_switch_prefix`, + `ir/regalloc.c:84/125`); everyone else re-scans O(range) inside O(n) outer loops. +- **Op classifiers, duplicated and divergent.** `has_side_effects` (`ir/licm.c:43`) + knows 8 ops; `ssa_opt_has_side_effects` (`ir/opt/ssa_opt.c:244`) knows 30 — including + `STORE_POSTINC`, VLA ops, inline asm, and setjmp, which licm's copy simply does not. + Plus `gvn_is_pure_alu` / `gvn_is_commutative` (`ir/opt/ssa_opt_gvn.c:44/66`), + `op_is_unsafe_for_reroll` (27 cases), `lcs_op_supported` (27 cases) — same concept, + five op-sets. 1,962 raw `op ==` comparisons total. +- **Operand-kind folklore.** 323 `irop_is_immediate` sites, 882 `is_lval` reads, 809 + `TCCIR_DECODE_VREG_TYPE` sites. The header rule that a STACKOFF operand is a *real* + stack slot only when `vreg_type == 0` (`tccir_operand.h:55–66`, in bold prose: *"New + passes that inspect stack operands MUST check vreg_type == 0"*) is honored by ~2 call + sites (`kb_is_direct_stackoff`, `ir/opt_knownbits.c:153`). Five near-identical + stack-address predicates exist (`ir/opt_alias.c:84`, `ir/core.c:327`, `ir/licm.c:34`, + `ir/licm.c:1238`, `ir/opt_knownbits.c:195`) — not all of them apply the rule. +- **The 4th operand.** `pool[operand_base+3]` is overloaded per-op: MLA accumulator, + indexed-addressing scale, SELECT condition (`tcc_ir_op_get_accum/scale/cond`, + `tccir.h:813/800/833`). 
`irop_config` advertises only dest/src1/src2, so every naïve + operand fan-out misses it — 110 sites hand-handle it today; the helper + `ir_opt_mla_accum_vreg` (`ir/opt_constprop.c:353`) exists but reached only 7 call sites. +- **Use/def scans.** ~34 ad-hoc "count uses of vreg X" full scans and ~48 backward + find-the-def scans, despite `IROptDU`, `DC_IS_SINGLE_DEF` (`ir/opt_du.h:104–107`), and + the SSA per-vreg use lists all existing. +- **Duplicated annotation checks.** `has_barrel_shift_annotation` copy-pasted verbatim in + `ir/opt/ssa_opt_fold.c:26` and `ir/opt/ssa_opt_reassoc.c:36`. +- **Invalidation, hand-rolled six times.** ~82 "drop cached facts on def/store/call" + sites across `opt_memory.c` (46), `opt_knownbits.c` (15), `opt_copyprop.c` (9), + `opt_constprop.c` (6), `ssa_opt_sccp.c`, `ssa_opt_cprop.c`. +- **Call purity by name.** `ir_opt_is_pure_helper_name` and siblings + (`ir/opt_utils.c:688+`) — reasonable, but consulted ad hoc rather than through one + call-classification point. + +## §3 Design overview — seven layers, one vocabulary + +Seven layers, L0–L6. Each is **independently adoptable** and lands as a pure addition; +an old helper becomes a one-line wrapper over the framework and is deleted with its last +caller. No IR redesign: everything operates on the existing flat +`ir->compact_instructions[0 .. next_instruction_index)`, the operand pool, and the +side tables keyed by `orig_index`. + +```mermaid +flowchart TB + passes["~300 opt passes · 15 SSA passes · licm · regalloc · codegen peepholes<br/>what remains per pass: match → guard → transform"]:::fe + + guard["L4 · ir/guard.h — fluent guard DSL<br/>when(x) and(not(y)) · TCC_TRACE_GUARDS"]:::seam + track["L6 · ir/track.c — event walker<br/>def / mem-write / call / barrier / join"]:::seam + mutate["L5 · ir/mutate.c — insert · delete · replace<br/>one funnel for all side-table remaps"]:::seam + range["L3 · ir/query.c — ir_range_ok()<br/>IRRangeIndex prefix sums"]:::seam + quad["L2 · ir/predicates.h — ir_q_operands()<br/>op4-aware · STACKOFF rule as code"]:::seam + props["L1 · ir/predicates.c — ir_op_props[]<br/>one property table, named masks"]:::seam + cursor["L0 · IRCursor — NOP skip, block stop"]:::seam + + du["IROptDU · def_count · SSA use lists<br/>(existing — becomes the default path)"]:::arch + pipe["opt_pipeline requires/invalidates<br/>TCC_DISABLE_PASS (existing)"]:::arch + repr["compact_instructions[] · iroperand_pool · irop_config<br/>switch_tables[] · barrel_shifts[orig_index] · bfi_params[]"]:::ir + + passes --> guard + passes --> track + passes --> mutate + guard --> range + guard --> quad + track --> quad + range --> props + range --> cursor + quad --> props + du -.-> passes + pipe -.-> passes + cursor --> repr + props --> repr + quad --> repr + mutate --> repr + + classDef fe stroke:#2C5E8F,stroke-width:2px + classDef seam stroke:#A8672A,stroke-width:2px + classDef arch stroke:#0E7B5B,stroke-width:2px + classDef ir stroke:#6B4E9E,stroke-width:2px +``` + +*Fig. 1 — The layer stack. Amber layers are new; green blocks already exist and get +promoted to the default path; the representation (purple) does not change.* + +| File | Layer | Contents | Naming | +|------|-------|----------|--------| +| `tccir_operand.h` (existing) | L2 | `irop_is_direct_stack_slot()` family — beside the prose rule it encodes | `irop_*` | +| `ir/predicates.h` + `.c` | L1+L2 | op-property table, masks, `ir_q_*` quad queries, selftest | `ir_op_*`, `ir_q_*` | +| `ir/guard.h` | L4 | the fluent guard DSL — **opt-in include**, never dragged in by `ir/ir.h` | `when`/`and`/`and_not`/`not` | +| `ir/query.h` + `.c` | L0+L3 | cursors, range engine, `IRRangeIndex` | `ir_cursor_*`, `ir_range_*` | +| `ir/mutate.h` + `.c` | L5 | insert/delete/replace funnel | public `tcc_ir_*` | +| `ir/track.h` + `.c` | L6 | tracking-pass event walker | `ir_track_*` | + +Internal functions keep the `ir_<module>_<verb>()` convention; public mutations use +the `tcc_ir_<verb>()` prefix, mirroring `tcc_ir_opt_compact_nops`. + +## §4 L1 — one op-property table + +Op classification becomes data. 
One table, orthogonal property bits, and **named masks** +that reproduce each legacy classifier so the historical differences become one greppable +line each: + +```c +/* ir/predicates.h */ +typedef uint32_t IROpProps; +#define IROP_P_KNOWN (1u << 0) /* entry was written on purpose */ +#define IROP_P_WRITES_MEM (1u << 1) /* STORE*, BLOCK_COPY */ +#define IROP_P_READS_MEM (1u << 2) +#define IROP_P_CALL_LIKE (1u << 3) /* FUNCCALL*, builtin apply, ... */ +#define IROP_P_TERMINATOR (1u << 4) /* JUMP, JUMPIF, IJUMP, SWITCH_*, RETURN* */ +#define IROP_P_ASM (1u << 5) +#define IROP_P_SP_EFFECT (1u << 6) /* VLA alloc / SP save-restore */ +#define IROP_P_EH (1u << 7) /* setjmp/longjmp */ +#define IROP_P_CALLSEQ (1u << 8) /* call-arg staging ops */ +#define IROP_P_ALU (1u << 9) /* pure computation, incl. MLA */ +#define IROP_P_COMMUTATIVE (1u << 10) +#define IROP_P_CMP (1u << 11) +#define IROP_P_HAS_OP4 (1u << 12) /* MLA / *_INDEXED / SELECT */ + +extern const IROpProps ir_op_props[TCCIR_OP_COUNT]; /* new sentinel after + TCCIR_OP_SMULL (tccir.h:229) */ +static inline IROpProps ir_op_p(TccIrOp op) +{ + IROpProps p = ir_op_props[op]; + return (p & IROP_P_KNOWN) ? p : ~0u; /* unknown = has every effect */ +} +static inline int ir_op_any(TccIrOp op, IROpProps mask) +{ + return (ir_op_p(op) & mask) != 0; +} + +/* each legacy classifier, as one reviewable line: */ +#define IROP_M_CLOBBERS_MEM (IROP_P_WRITES_MEM|IROP_P_CALL_LIKE|IROP_P_ASM|\ + IROP_P_SP_EFFECT|IROP_P_EH) +#define IROP_M_SIDE_EFFECT (IROP_M_CLOBBERS_MEM|IROP_P_TERMINATOR|IROP_P_CALLSEQ) +#define IROP_M_BARRIER (IROP_M_CLOBBERS_MEM|IROP_P_TERMINATOR) +``` + +`gvn_is_pure_alu` (26 lines) becomes `ir_op_any(op, IROP_P_ALU)`. The licm/ssa_opt +disagreement becomes a diff between two `IROP_M_*` definitions instead of two 30-line +switches in different files. + +> **Decision: unknown means dangerous.** With designated initializers, a *forgotten* +> table entry reads as all-zero — i.e. 
"pure", exactly the failure mode this framework +> exists to kill. The `IROP_P_KNOWN` bit inverts it: an unclassified op behaves as +> clobbers-everything, so forgetting an entry can only pessimize, never miscompile. +> `ir_predicates_selftest()` — run under `TCC_IR_SELFTEST=1` and from the unit suite — +> asserts every op below `TCCIR_OP_COUNT` has `IROP_P_KNOWN` and cross-checks +> `IROP_P_HAS_OP4` against `irop_config`. + +## §5 L2 — operands without folklore + +Two representation subtleties caused five separate miscompiles. Both become accessors. + +**The 4th operand.** The quad layout is `[dest, src1, src2, op4]` where `op4`'s meaning +is per-op — MLA accumulator (a real vreg **use**), indexed scale, SELECT condition — and +`irop_config` doesn't know it exists: + +```text + iroperand_pool[q->operand_base + ...] + ┌────────┬────────┬────────┬─────────────────────────┐ + │ 0 dest │ 1 src1 │ 2 src2 │ 3 op4 │ + └────────┴────────┴────────┴─────────────────────────┘ +irop_config → has_dest has_src1 has_src2 ── not advertised ── + MLA → accum (VREG USE!) 
+ *_INDEXED→ scale (imm) + SELECT → cond +``` + +```c +/* ir/predicates.h */ +typedef struct IROperandRef { + IROperand op; + uint8_t slot; /* 0=dest 1=src1 2=src2 3=op4 */ + uint8_t is_def; /* writes a vreg (non-lval dest) */ + uint8_t is_vreg_use; /* reads a vreg: srcs, MLA accum, AND an lval + dest — a store THROUGH dest reads its address */ + uint8_t writes_mem; +} IROperandRef; + +int ir_q_operands(const TCCIRState *ir, const IRQuadCompact *q, + IROperandRef out[4]); /* returns count */ +int ir_q_vreg_uses(const TCCIRState *ir, const IRQuadCompact *q, + int32_t out[4]); /* op4 included */ +int32_t ir_q_def_vreg(const TCCIRState *ir, const IRQuadCompact *q); /* -1 if none */ + +/* deduped from ssa_opt_fold.c:26 / ssa_opt_reassoc.c:36 (verbatim clones) */ +static inline int ir_q_barrel_shifted(const TCCIRState *ir, const IRQuadCompact *q) +{ + return ir->barrel_shifts && q->orig_index >= 0 && + q->orig_index <= ir->max_orig_index && + ir->barrel_shifts[q->orig_index]; +} +``` + +A use-count scan written against `ir_q_vreg_uses` *cannot* miss the accumulator — the +bug class of tests 257/267/285 stops being writable: + +```c +/* before — misses MLA accum unless the /* after */ + author remembered (3 didn't) */ +if (irop_config[q->op].has_src1 && int32_t u[4]; + irop_get_vreg(src1) == vr) uses++; int n = ir_q_vreg_uses(ir, q, u); +if (irop_config[q->op].has_src2 && for (int k = 0; k < n; k++) + irop_get_vreg(src2) == vr) uses++; if (u[k] == vr) uses++; +if (q->op == TCCIR_OP_MLA && /* often absent */) + ... 
+``` + +**The STACKOFF rule.** The `vreg_type == 0` real-slot test moves from prose +(`tccir_operand.h:55–66`) into accessors that live right beside it: + +```c +/* tccir_operand.h — the rule, as code */ +static inline int irop_is_direct_stack_slot(IROperand op) +{ return irop_get_tag(op) == IROP_TAG_STACKOFF && op.vr.vreg_type == 0; } + +static inline int irop_is_stack_slot_addr(IROperand op) /* Addr[StackLoc] */ +{ return irop_is_direct_stack_slot(op) && !op.vr.is_lval; } +static inline int irop_is_stack_slot_deref(IROperand op) /* StackLoc deref */ +{ return irop_is_direct_stack_slot(op) && op.vr.is_lval; } +``` + +The five scattered stack-address predicates become wrappers, then callers migrate, then +the wrappers go. The one in `ir/licm.c:34` that *omits* the `vreg_type` check gets the +fix for free. + +## §6 L3 — range queries: one engine + +One function answers "is this range safe?", with the stop-set expressed in L1 masks, the +common structural conditions as flags, and an escape hatch for genuinely custom checks: + +```c +/* ir/query.h */ +#define IR_RANGE_NO_JUMP_TARGET (1u << 0) /* no join point inside — DEFAULT ON */ +#define IR_RANGE_NO_LVAL_DEST (1u << 1) /* no memory write via lval dest */ +#define IR_RANGE_ALLOW_PURE_CALLS (1u << 2) /* pure-helper carve-out + (ir_opt_is_pure_helper_name) */ +typedef struct IRRangeQuery { + IROpProps stop; /* any matching op → fail (use IROP_M_* masks) */ + uint32_t flags; + int32_t no_redef[4]; /* vregs that must not be (re)defined inside */ + int n_redef; + int (*extra)(void *uctx, TCCIRState *ir, int idx, const IRQuadCompact *q); + void *extra_ctx; /* extra must be file-scope static — see §7 */ +} IRRangeQuery; + +int ir_range_ok(TCCIRState *ir, int lo, int hi, const IRRangeQuery *rq); +int ir_range_ok_simple(TCCIRState *ir, int lo, int hi, + IROpProps stop, uint32_t flags); +``` + +The six duplicated scanners become wrappers whose masks reproduce today's op sets +**bit-exactly** (semantic unification, where 
wanted, is a separate, separately-swept +commit): + +```c +int ir_range_preserves_memory(TCCIRState *ir, int lo, int hi) /* opt_xform/utils/cse */ +{ + return hi >= lo && ir_range_ok_simple(ir, lo, hi, IROP_M_BARRIER, + IR_RANGE_NO_JUMP_TARGET | IR_RANGE_NO_LVAL_DEST); +} +int ir_range_no_redef(TCCIRState *ir, int lo, int hi, int32_t vreg); /* opt_dce.c:577 */ +``` + +> **Decision: the interval is the open interior `(lo, hi)`.** Endpoints are never +> inspected; inclusive-end variants (the regalloc backward-switch-target case, +> `ra_has_switch_in_range`) are explicit wrappers, not flags. Today every scanner picks +> its own convention — off-by-one differences between them are unauditable. + +**Prefix sums by default for the hot path.** `IRRangeIndex` generalizes the register +allocator's private `ra_build_call_prefix` / `ra_build_switch_prefix`: per-class +(CALL / STORE / JUMP_TARGET / SWITCH / TERMINATOR) prefix counts, cached in `IROptCtx` +behind a generation counter exactly like the existing `du_gen` +(`ir/opt_engine.h:24–31`). `ir_range_ok_ctx(ctx, ...)` answers flags-only queries in +O(1); only `no_redef`/`extra` clauses walk instructions. Several O(n·range) passes +become O(n) with no caller restructuring. + +L0 rides along in the same header — a cursor that owns the boilerplate overture: + +```c +IR_SCAN(c, ir) { /* bounds + NOP skip, nothing hidden: */ + if (c.q->op != TCCIR_OP_MUL) /* c.i and c.q are plain fields, */ + continue; /* single-steppable in gdb */ + ... +} +IR_SCAN_BLOCK(c, ir, start) { ... } /* additionally stops at is_jump_target + joins and after terminators */ +``` + +## §7 L4 — the guard DSL: when(x) and(not(y)) + +The centerpiece. 
The composable conditions read fluently, but the mechanism is macro +splicing onto C's own short-circuiting `&&` — fluent surface, zero indirection, every +clause a plain expression you can breakpoint: + +```c +/* ir/guard.h — opt-in include for pass files, never pulled in by ir/ir.h */ +#define when(x) (ir_guard_clause((x), #x, __FILE__, __LINE__)) +#define and(x) && when(x) +#define and_not(x) && when(!(x)) +#define not(x) (!(x)) + +static inline int ir_guard_clause(int ok, const char *txt, + const char *file, int line) +{ + if (!ok && tcc_ir_guard_trace_match(file)) /* one cached-flag branch */ + fprintf(stderr, "[GUARD] %s:%d rejected: %s\n", file, line, txt); + return ok; +} +``` + +Usage — the reassoc guard that tests 280/281 retrofitted, as one legible unit: + +```c +if (when(ir_op_any(q->op, IROP_P_ALU)) + and(ssa_single_use(ctx, t_vr)) + and_not(ir_q_barrel_shifted(ir, q)) + and_not(ir_q_barrel_shifted(ir, inner)) + and(ir_range_ok_simple(ir, def_idx, use_idx, IROP_M_CLOBBERS_MEM, + IR_RANGE_NO_JUMP_TARGET))) +{ + /* transform */ +} +``` + +**Observability is the point.** During fuzz triage, "which clause admitted (or rejected) +this transform" is the whole game. `TCC_TRACE_GUARDS=<pattern>` (matched against the +file name, same style as `TCC_DISABLE_PASS`) makes every failing clause print its own +source text and location — the bisect workflow gets clause-level resolution for free. + +**Nested functions: welcome, with one rule.** Both host gcc (16.1.1, `-std=c11 -Werror`, +no `-pedantic`) and tcc itself support GNU nested functions — this fork even implements +the static chain for them — so self-hosting survives. Used as **directly-called, locally +named guards** they cost nothing and keep guard logic next to the transform: + +```c +static int fuse_pair(IRSSAOptCtx *ctx, int i) +{ + TCCIRState *ir = ctx->ir; + int operand_ok(IROperand a) { /* local guard: direct calls + only — no trampoline */ + return !a.vr.is_lval && irop_get_tag(a) == IROP_TAG_VREG; + } + ... 
+ if (when(operand_ok(s1)) and(operand_ok(s2)) ...) { ... } +} +``` + +Taking a nested function's **address** is the line not to cross: that materializes a +trampoline and an executable stack. So: custom predicates passed *into* scanners +(`IRRangeQuery.extra`) must be file-scope `static`; the rule is enforced mechanically by +adding `-Wtrampolines` to the build (with the existing `-Werror` it is a hard error, and +it fires exactly and only when a trampoline is generated). + +> **Decision: language features.** C11 + GNU extensions now (nested functions, statement +> expressions, `typeof`); C23 conveniences (`__VA_OPT__`, `constexpr` tables) may be +> adopted as the macro machinery wants them — with the standing rule that **anything the +> tcc frontend doesn't yet accept gets implemented in tcc first**, so the compiler always +> compiles itself. The host toolchain (gcc 16) already accepts all of it; nothing in the +> build adds `-pedantic`. + +> **Namespace caveat.** Lowercase `when`/`and`/`and_not`/`not` is the requested +> aesthetic and is legal C provided `<iso646.h>` is never included (it defines `and`, +> `not` as operator macros) and no included header uses those identifiers. That is why +> `ir/guard.h` is an explicit opt-in include for pass files, placed after system +> headers. If a collision ever appears, the escape hatch is one sed to `WHEN`/`AND`/ +> `AND_NOT`/`NOT` — the design does not depend on the casing. + +Rejected alternatives, honestly: **builder-struct method chaining** +(`ir_when(q)->is_op(..)->ok()`) needs function-pointer fields or closures, evaluates +eagerly unless wrapped in macros anyway, and puts an indirection between gdb and every +clause. **X-macro condition tables** add indirection without power — except where +conditions genuinely are data, which is exactly the L1 property table and the existing +pass pipeline, and those stay. + +## §8 L5 — mutation is a funnel + +Structural edits must maintain, atomically: + +1. 
`JUMP`/`JUMPIF` absolute-index immediates, +2. `switch_tables[].targets` and `.default_target` (and the SWITCH_LOAD value tables), +3. `is_jump_target` bits, +4. `orig_index` stability — `barrel_shifts[]`, `shift64_dead_half[]`, `bfi_params[]` + are keyed by it. + +`tcc_ir_opt_compact_nops` does all four correctly (the `old_to_new[]` remap, +`ir/opt_dce.c:2618` onward). licm's private `insert_instruction_before` +(`ir/licm.c:477`) knew about jumps but historically not switch side-tables — that was +test 268, and the ninth defect of the pure-call-hoist saga. The framework makes the +blessed path the only path: + +```c +/* ir/mutate.h */ +int tcc_ir_insert_before(TCCIRState *ir, int idx, TccIrOp op, + const IROperand *ops, int n_ops); + /* capacity, shift, +1 remap of jump immediates AND switch tables, + is_jump_target migration, FRESH orig_index (side tables grown) — + returns the new index */ +void tcc_ir_q_delete(TCCIRState *ir, int idx); + /* logical delete: NOP-out, operands cleared; indices stable. + Physical removal happens only in the one blessed compactor. */ +int tcc_ir_q_replace_op(TCCIRState *ir, int idx, TccIrOp new_op); + /* asserts slot-count compatibility against irop_config — catches + "replaced MLA with MUL, orphaned the accumulator" edits */ +``` + +All three bump `ir->mutation_gen`, so the `IROptCtx` caches (DU, `IRRangeIndex`) can +*assert* freshness instead of trusting pass authors to invalidate. Implementation is +mostly promotion: hoist licm's insert, add the switch-table remap loop from +`compact_nops`, delete the private copy. + +> **Decision: inserts get a fresh `orig_index`** (growing the side tables), not a `-1` +> sentinel. Annotation readers are already bounds-checked against `max_orig_index`, and +> fresh IDs keep "annotate the instruction you just created" a legal operation. 
+ +## §9 L6 — tracking passes share one walker + +The six value-tracking passes are the same machine with different fact tables: walk +forward, accumulate facts, **drop facts on events** (def, memory write, call, barrier, +join), act on what remains. Each re-implements the event set; tests 243, 248, 266 were +each one forgotten event in one pass. The walker owns event enumeration and ordering; +the pass owns only its facts: + +```c +/* ir/track.h */ +typedef struct IRTrackHooks { + void (*on_def)(void *st, int idx, int32_t vreg, IROperand dest); + void (*on_mem_write)(void *st, int idx, const IRQuadCompact *q); + void (*on_call)(void *st, int idx, const IRQuadCompact *q, int purity); + void (*on_barrier)(void *st, int idx, const IRQuadCompact *q); /* asm/vla/eh */ + void (*on_join)(void *st, int idx); /* is_jump_target: paths merge */ + int (*on_instr)(void *st, int idx, IRQuadCompact *q); /* the pass's work, + runs AFTER this index's events */ +} IRTrackHooks; + +int ir_track_walk(IROptCtx *ctx, const IRTrackHooks *hooks, void *state); +``` + +```mermaid +flowchart LR + subgraph stream ["instruction stream"] + direction LR + i1["#12 V3 ← 40"] --> i2["#13 [S0+8] ← V3"] --> i3["#14 call memcpy"] --> i4["#15 (join) T2 ← …"] + end + subgraph walk ["ir_track_walk — every event, in order, or an explicit track_ignore"] + e1["on_def(V3)"] + e2["on_mem_write"] + e3["on_call(purity)"] + e4["on_join · on_instr"] + end + i1 --> e1 + i2 --> e2 + i3 --> e3 + i4 --> e4 + walk --> cp["constprop facts"] + walk --> kb["knownbits facts"] + + classDef default stroke:#DCE2DC +``` + +*Fig. 2 — One walker fires the events; client passes only maintain fact tables. +`on_def` enumerates definitions via `ir_q_operands`, so op4 is handled centrally; +`on_call` arrives pre-classified through the L1/A8 purity helpers.* + +**Every hook is mandatory** (the walker asserts non-NULL). 
A pass that genuinely doesn't +care about an event registers the documented no-op `track_ignore` — "forgot to +invalidate" becomes a visible, greppable, reviewable decision instead of an absence. +Cost: one indirect call per event on an O(n) walk — noise next to the switch bodies +these passes already execute; verified with the existing `TCC_PASS_TIMING` +infrastructure. + +Pilot order by blast radius: `opt_constprop` (6 sites) → `opt_copyprop` (9) → +`opt_knownbits` (15) → **checkpoint** → `opt_memory.c` (46 sites, phase-structured +entry-store machinery) is explicitly a stretch goal, not a plan dependency — if the +walker doesn't fit it, it keeps its hand-rolled loop and the plan still closes. + +## §10 What this deletes + +| Consolidation | Sites today | ≈ LOC out | +|---|---|--:| +| Divergent side-effect/purity classifiers → L1 masks | 5 classifiers (licm, ssa_opt, cse, reroll, lcs) | −250 | +| 6 range scanners → L3 wrappers; ~25 more inline range loops | opt_xform, opt_utils, opt, licm, opt_dce, regalloc | −400 | +| 5 stack-addr predicates + 2 barrel-shift clones → L2 | opt_alias, core, licm ×2, knownbits; fold+reassoc | −120 | +| Ad-hoc use-count / find-def scans → `IROptDU` / `DC_*` | ~34 + ~48 sites | −500 | +| Manual op4 handling → `ir_q_operands` | 110 sites (a subset are emitters that stay) | −130 | +| Tracking-pass invalidation → L6 walker | ~82 sites, 3 pilot passes | −300 (−800 more if `opt_memory` converts) | +| New framework code | predicates, query, guard, mutate, track | **+1,380** | + +> **Honest framing.** Net is only ≈ −300 lines on day one (≈ −1,100 if the stretch goal +> lands). The prize is not the delta — it is the **marginal cost of the next pass and +> the next fix**: guards written in vocabulary instead of re-derived 30-op switches, and +> a fuzz fix that lands in one table row or one walker event instead of N passes. Every +> row of the §1 table is a fix that was applied to one pass and stayed a landmine in the +> others. 
+ +## §11 Migration plan — seven phases, each shippable + +Standard gate for every phase: `make test -j16` green + the touched fuzz profiles swept +clean. The framework sits *under* passes, so every existing `TCC_DISABLE_PASS` name +keeps working unchanged. Convention: the pure-addition commit lands first, then per-pass +conversion commits, each individually revertible. + +| Phase | Content | Risk | ΔLOC | Gate extras | +|-------|---------|------|------|-------------| +| **0** table | `ir/predicates.{h,c}`: op-props + `IROP_P_KNOWN` selftest + `TCCIR_OP_COUNT` sentinel; zero call-site changes | ~nil | +350 | selftest wired into unit suite / CI | +| **1** operands | L2 accessors + `ir_q_*`; convert the 5 stack-addr predicates, 2 barrel-shift clones, manual op4 scan sites | low | +150 −250 | regression tests 257/267/285 + pack64 suite | +| **2** ranges | L0 cursor + `ir_range_ok` + `IRRangeIndex`; replace the 6 named scanners with bit-exact wrappers; pilot ~10 inline range loops | med | +300 −400 | `TCC_PASS_TIMING` corpus run — no compile-time regression >2% | +| **3** guards | `ir/guard.h` + `TCC_TRACE_GUARDS`; adopt across the 15 SSA passes; add `-Wtrampolines` to CFLAGS | low | +80 −100 | trace output exercised in the bisect/triage workflow | +| **4** mutate | `ir/mutate.{h,c}`; route licm + all inserters/deleters through the funnel; `mutation_gen` asserts | med | +200 −150 | test 268 + switch-heavy fuzz seeds | +| **5** def-use | Convert the ~34 use-count + ~48 find-def scans to `IROptDU`/`DC_IS_SINGLE_DEF`/SSA use lists | med | +50 −500 | per-pass commits; timing check (expected improvement) | +| **6** tracking | `ir/track.{h,c}`; constprop → copyprop → knownbits → checkpoint → (stretch) opt_memory | high | +250 −300 | one pass per PR; tests 243/248/266; extended fuzz budget | + +> **Sequencing constraints.** Phases 0–1 are safe any time. 
Phase 2's wrapper masks must +> reproduce legacy op sets bit-exactly — any intentional strengthening is its own commit +> with its own sweep. Phase 6 is one pass per PR with a checkpoint before `opt_memory`. +> Never run fuzz sweeps or reducers while the tree is mid-conversion — sweeps racing a +> rebuild report phantom divergences, and the sweep cache misses header changes (clear +> `.sweep_cache` after phases 0–2). + +## §12 Risks & open questions + +| Risk / question | Position | +|---|---| +| **Generic scanner slower than inlined loops** in the O(n²)-ish big passes (`opt_dce.c`, `opt_memory.c`). | The flags-only path is the same loop it replaces; `IRRangeIndex` makes hot queries O(1). Every phase gates on a `TCC_PASS_TIMING` corpus run. | +| **Semantic drift while merging classifiers** — the real hazard of L1. | Phase-2 rule: wrappers reproduce each legacy op set bit-exactly; unification is a separate, separately-swept commit per merge. | +| **Table rot when opcodes are added.** | `IROP_P_KNOWN` makes rot conservative, not wrong; the selftest makes it loud. | +| **Nested functions: portability.** clang would reject them; a future non-gcc host build breaks. | Build is gcc-only today (`config.mak: CC=gcc`) and tcc self-hosts them. The DSL itself uses no nested functions — they are an *allowed pattern*, fenced by `-Wtrampolines -Werror`. | +| **Lowercase `and`/`not`/`when` macro collisions.** | Opt-in `ir/guard.h`, included last, `ir/`-internal only; documented one-sed rename to uppercase as the escape hatch. | +| **Guard-macro debuggability.** | Clauses stay plain expressions — breakpointable, no interpreter. Macro is a bounded foreach (≤10 clauses), no recursive metaprogramming. `TCC_TRACE_GUARDS` actively improves triage. | +| **`opt_memory.c` may not fit the L6 walker** (phase-structured entry-store machine, 46 sites). | Explicit checkpoint after knownbits; converting it is stretch, not a dependency. 
| +| **Open: SSA passes** — keep their `vinfo` use lists or adopt `IROptCtx` caches? | predicates.h/guard.h are context-free (usable from both); query-ctx variants stay pre-SSA; SSA keeps `vinfo` until proven otherwise. | +| **Open: regalloc adopts `IRRangeIndex`?** | Its bespoke prefix sums are already correct; converting is optional cleanup, never a phase gate. | +| **Open: C23 adoption pace.** | Only as the macro machinery earns it, and tcc's frontend implements each feature first (self-hosting invariant). | + +--- + +*Counts and line numbers from a source survey of the working tree (branch +`heapOverflowBug`), 2026-07-03. Styled HTML version: +[plan_opt_predicate_framework.html](plan_opt_predicate_framework.html).* diff --git a/docs/plan_opt_split.md b/docs/plan_opt_split.md deleted file mode 100644 index 006a1968..00000000 --- a/docs/plan_opt_split.md +++ /dev/null @@ -1,362 +0,0 @@ -# Plan: Split `ir/opt.c` Into Themed Modules - -## Current State - -`ir/opt.c` is **17,861 lines** (down from 28,973 after Phase 6.1 extracted `opt_loop.c` and `opt_memory.c`). It still contains **67 functions** spanning 6+ distinct optimization themes. The already-extracted modules total ~13,200 lines across 14 files — so the remaining monolith is still the single largest source file. 
- -### Already extracted (for reference) - -| File | Lines | Contents | -|------|-------|----------| -| `opt_loop_utils.c` | 3,498 | IV analysis, loop bounds, loop transforms | -| `opt_memory.c` | 3,259 | sl_forward, entry_store_prop, store_redundant, deref_fwd | -| `opt_loop.c` | 1,052 | Strength reduction, unroll, rotation, decrement-to-zero | -| `opt_utils.c` | 978 | Constant evaluators, BB/CFG helpers, purity tables | -| `opt_gens_fusion.c` | 818 | Engine-based fusion generators | -| `opt_gens_call_result.c` | 301 | Dead call result generators | -| `opt_jump_thread.c` | 203 | Jump threading + fallthrough elimination | -| `opt_gens_branch.c` | 176 | Branch folding generators | -| `opt_alias.c` | 127 | Stack-slot aliasing helpers | -| `opt_engine.c` | 100 | IROptCtx, IROptGen, tcc_ir_opt_run_gens | -| `opt_du.c` | 98 | Def-use build/query | -| `opt_hash.c` | 63 | Generic hash table for CSE | -| `opt_gens_bool.c` | 57 | Boolean simplification generators | -| `opt_xform.c` | 24 | Transform primitives | - ---- - -## Proposed Split - -Split the remaining 17,861 lines into **7 new themed files** + a slim residual `opt.c` (~1,600 lines). - ---- - -### 1. `ir/opt_dce.c` — Dead Code & Cleanup (~2,200 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `tcc_ir_opt_dce` | 122 | 97–218 | -| `tcc_ir_opt_compact_nops` | 203 | 219–421 | -| `tcc_ir_opt_dead_var_store_elim` | 131 | 2985–3115 | -| `tcc_ir_opt_dead_addrvar_elim` | 330 | 3348–3677 | -| `tcc_ir_opt_redundant_var_assign` | 157 | 3678–3834 | -| `tcc_ir_opt_redundant_init_elim` | 156 | 14531–14686 | -| `tcc_ir_opt_dead_loop_elim` | 228 | 15500–15727 | -| `tcc_ir_opt_dse` | 1,269 | 1716–2984 | - -**Rationale:** All these passes remove dead/redundant IR — NOPs, unreachable code, dead stores, dead variables. `dse` is the largest single pass (1,269 lines) and is purely elimination logic. Grouping gives a single file for "what can I safely delete." 
- -**Internal dependencies:** -- `dse` uses `ir_opt_build_def_count` (shared static helper → move or expose via `opt_du.h`) -- All use `ir_xform_nop` (already in `opt_xform.h`) -- `dead_addrvar_elim` and `dse` use alias helpers (already in `opt_alias.h`) - ---- - -### 2. `ir/opt_constprop.c` — Constant & Value Propagation (~4,100 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `tcc_ir_opt_const_var_prop` | 253 | 422–674 | -| `tcc_ir_opt_global_init_prop` | 137 | 675–811 | -| `tcc_ir_opt_complex_const_param_fold` | 177 | 812–988 | -| `tcc_ir_opt_const_prop` | 1,235 | 3835–5069 | -| `tcc_ir_opt_value_tracking` | 1,647 | 5070–6716 | -| `tcc_ir_opt_const_prop_tmp` | 368 | 7928–8295 | -| `tcc_ir_opt_add_reassoc` | 125 | 8330–8454 | -| `tcc_ir_opt_cmp_expr_fold` | 166 | 8455–8620 | -| `ir_opt_build_def_count` (static) | 34 | 8296–8329 | - -**Rationale:** These are the "what values do I know at this point" passes. `const_prop` (1,235 lines) and `value_tracking` (1,647 lines) are the two biggest passes remaining in opt.c and they share constant-evaluation infrastructure. Together they form the core analysis engine. - -**Internal dependencies:** -- `const_prop` and `value_tracking` share evaluation helpers from `opt_utils.h` -- `ir_opt_build_def_count` is used by `add_reassoc` and `copy_prop` → make non-static, expose from header -- `value_tracking` uses VRP slot helpers (`vrp_get_slot`, `vrp_fold_cmp`) — move with it - ---- - -### 3. 
`ir/opt_copyprop.c` — Copy Propagation & CSE (~1,500 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `tcc_ir_opt_copy_prop` | 449 | 8621–9069 | -| `tcc_ir_opt_cse_global_load` | 214 | 9104–9317 | -| `tcc_ir_opt_globalsym_cse` | 133 | 9362–9494 | -| `gsym_cse_insert_before` (static) | 44 | 9318–9361 | -| `tcc_ir_opt_cse_param_add` | 194 | 9495–9688 | -| `tcc_ir_opt_local_load_cse` | 189 | 13737–13925 | -| `tcc_ir_opt_local_alu_cse` | 255 | 13926–14180 | -| `bool_cse_hash` / `bool_cse_eq` (statics) | 34 | 9070–9103 | - -**Rationale:** All these passes identify redundant computations (copy chains, repeated loads, repeated ALU ops) and eliminate them via forwarding or CSE. They share the same flat-array or hash-table BB-scoped pattern. - -**Internal dependencies:** -- Uses `IROptHashTable` from `opt_hash.h` -- `copy_prop` uses `ir_opt_build_def_count` (from opt_constprop.c or made public) -- `gsym_cse_insert_before` inserts instructions — unique to this group - ---- - -### 4. `ir/opt_branch.c` — Branch & Boolean Optimization (~2,200 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `tcc_ir_opt_float_branch_fold` | 252 | 7178–7429 | -| `ir_opt_match_zero_test` (static) | 35 | 7143–7177 | -| `tcc_ir_opt_vrp` | 330 | 7430–7759 | -| `vrp_get_slot` / `vrp_fold_cmp` (statics) | 29 | 6717–6745 | -| `tcc_ir_opt_nonneg_branch_fold` | 365 | 9720–10084 | -| `nonneg_func_names` / `flag_cmp_funcs` (tables) | 31 | 9689–9719 | -| `tcc_ir_opt_branch_folding` | 30 | 12447–12476 | -| `tcc_ir_opt_stack_addr_nonnull_fold` | 423 | 12477–12899 | -| `tcc_ir_opt_setif_branch_fuse` | 39 | 12900–12938 | -| `tcc_ir_opt_stack_bool_diamond` | 268 | 12939–13206 | -| `tcc_ir_opt_or_bool_diamond` | 232 | 13207–13438 | -| `tcc_ir_opt_bool_cse` | 75 | 12324–12398 | - -**Rationale:** All passes that reason about conditional branches, VRP (value-range propagation), boolean CSE, and control-flow diamonds. 
They share `JUMPIF`-triggered pattern matching and backward def-chain tracing. `vrp` and `nonneg_branch_fold` both use the VRP slot/fold helpers. - -**Internal dependencies:** -- `vrp` range tables are self-contained -- `nonneg_branch_fold` uses `change_callee_sym` (shared with float_narrowing → move to opt_utils or keep in residual) -- Branch passes use `ir_opt_match_zero_test` → move together - ---- - -### 5. `ir/opt_fusion.c` — Fusion & Addressing Mode (hand-written) (~2,050 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `tcc_ir_opt_add_deref_fold` | 232 | 3116–3347 | -| `tcc_ir_opt_postinc_fusion` | 278 | 10673–10950 | -| `tcc_ir_opt_loop_postinc_fusion` | 476 | 10951–11426 | -| `tcc_ir_barrel_shift_fusion` | 146 | 11427–11572 | -| `tcc_ir_opt_call_chain_rename` | 155 | 11573–11727 | -| `tcc_ir_opt_stackoff_addr_cse` | 176 | 11728–11903 | -| `tcc_ir_opt_lea_fold` | 420 | 11904–12323 | -| `tcc_ir_opt_assign_fuse` | 184 | 17486–17669 | - -**Rationale:** Hand-written fusion passes that couldn't be converted to engine generators (they insert instructions, need loop structure, or use BB-scoped hash tables). These are the ARM addressing-mode optimization passes — `LOAD_INDEXED`, `LOAD_POSTINC`, barrel-shift folding, LEA elimination, displacement fusion. Distinct from `opt_gens_fusion.c` which holds the engine-compatible generators. - -**Internal dependencies:** -- `loop_postinc_fusion` uses `IRLoops` from `opt_loop_utils.h` -- `lea_fold` uses def-use from `opt_du.h` -- `call_chain_rename` uses `change_callee_sym` helpers - ---- - -### 6. 
`ir/opt_promote.c` — Variable-to-Temp Promotion & Forwarding (~1,600 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `tcc_ir_opt_var_tmp_fwd` | 298 | 13439–13736 | -| `tcc_ir_opt_var_to_tmp` | 350 | 14181–14530 | -| `tcc_ir_opt_select` | 410 | 14687–15096 | -| `tcc_ir_opt_postinc_assign_fold` | 145 | 15303–15447 | -| `tcc_ir_opt_returnvalue_merge` | 52 | 15448–15499 | -| `tcc_ir_opt_backedge_phi_hoist` | 205 | 15920–16124 | -| `tcc_ir_opt_redundant_loop_check` | 168 | 7760–7927 | - -**Rationale:** These passes promote stack variables to temporaries, forward values through variable stores/loads, and select-ify simple if/else diamonds. They bridge the gap between flat variable-based IR (post-SSA destruction) and the register allocator which needs temporaries. `select` is the largest (410 lines) — it converts store-to-var-in-both-branches into a conditional move. - ---- - -### 7. `ir/opt_constfold.c` — Constant String/Call/Addrof Folding (~1,800 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `ir_opt_eval_const_string_operand` (static) | 70 | 6746–6815 | -| `ir_opt_fold_strcmp_result` (static) | 13 | 6816–6828 | -| `ir_opt_fold_strncmp_result` (static) | 16 | 6829–6844 | -| `ir_opt_fold_memcmp_result` (static) | 15 | 6845–6859 | -| `ir_opt_fold_memchr_offset` (static) | 20 | 6860–6879 | -| `tcc_ir_opt_const_string_calls` | 263 | 6880–7142 | -| `tcc_ir_opt_const_call_replace` | 90 | 15830–15919 | -| `tcc_ir_detect_const_result` | 73 | 15728–15800 | -| `tcc_ir_cache_const_result` | 15 | 15801–15815 | -| `tcc_ir_lookup_const_result` | 14 | 15816–15829 | -| `tcc_ir_opt_param_addrof_const_fold` | 435 | 16125–16559 | -| `tcc_ir_opt_local_addrof_const_fold` | 471 | 16560–17030 | -| `tcc_ir_opt_float_narrowing` | 307 | 10151–10457 | -| `float_narrow_table` / `change_callee_sym*` | 66 | 10085–10150 | - -**Rationale:** These passes evaluate calls and expressions at compile time when 
arguments are known constants — string library folding (`strcmp`, `strlen`, `memcmp`), memoized pure-function results, address-of-parameter constant propagation, and float type narrowing (e.g., `double→float` when precision allows). All share the "trace constant operands backward, fold result" pattern. - -**Internal dependencies:** -- `change_callee_sym` / `change_callee_sym_keep_type` → used by both `float_narrowing` and `nonneg_branch_fold`. Move to this file (it's defined here at line 10106) or to `opt_utils.c` if needed by `opt_branch.c` too. - ---- - -### 8. `ir/opt_pack64.c` — 64-bit Register Pair Optimization (~650 lines) - -Functions to move: - -| Function | Lines | Range | -|----------|-------|-------| -| `tcc_ir_opt_pack64` | 179 | 17031–17209 | -| `p64taut_trace_back` (static) | 51 | 17210–17260 | -| `tcc_ir_opt_pack64_tautology` | 225 | 17261–17485 | -| `tcc_ir_opt_cmp_narrow_64` | 192 | 17670–17861 | - -**Rationale:** ARM-specific 64-bit register-pair tracking. These passes combine/split `PACK64` pseudo-ops and eliminate redundant 64→32→64 conversions. Self-contained logic with no significant shared state. - ---- - -### 9. 
Residual `ir/opt.c` (~1,600 lines) - -What stays: - -| Function | Lines | Why stays | -|----------|-------|-----------| -| FP cache wrappers | 40 | Thin delegation layer, trivial | -| `tcc_ir_analyze_pure_via_sret` | 250 | Cross-cutting interprocedural analysis | -| FWS (func write summary) block | 400 | `fws_*` + `tcc_ir_compute_func_write_summary` — interprocedural, used by `dead_init_via_call` | -| `tcc_ir_opt_dead_init_via_call` | 116 | Depends on FWS, tight coupling | -| `tcc_ir_opt_stack_addr_cse` | 215 | Doesn't fit cleanly elsewhere (BB hash + stack aliasing hybrid) | -| `tcc_ir_opt_block_copy_init` | 206 | Memory/struct init hybrid | -| `tcc_ir_find_defining_instruction` | 18 | Small utility, widely used | -| `tcc_ir_vreg_has_single_use` | 30 | Small utility, widely used | -| Forward decls, includes, macros | ~50 | Boilerplate | - -The residual `opt.c` becomes a "miscellaneous + interprocedural" file. As these grow, they can be split further (e.g., `opt_interproc.c` for FWS + sret analysis). - ---- - -## Dependency Graph - -``` -opt.c (residual, 1.6K) - ├── opt_dce.c (2.2K) → opt_xform, opt_alias, opt_utils - ├── opt_constprop.c (4.1K) → opt_utils, opt_du - ├── opt_copyprop.c (1.5K) → opt_hash, opt_du, opt_utils - ├── opt_branch.c (2.2K) → opt_utils, opt_du - ├── opt_fusion.c (2.0K) → opt_du, opt_loop_utils, opt_alias - ├── opt_promote.c (1.6K) → opt_du, opt_utils - ├── opt_constfold.c (1.8K) → opt_utils - └── opt_pack64.c (0.6K) → (self-contained) -``` - -No circular dependencies. Each new file includes `ir.h` (which pulls in `tccir.h` + core types) plus the specific `opt_*.h` headers it needs. 
- ---- - -## Shared Helpers To Expose - -Before splitting, these currently-`static` helpers need to become non-static (add to appropriate header): - -| Helper | Current location | Move to | -|--------|-----------------|---------| -| `ir_opt_build_def_count` | opt.c:8296 | `opt_du.h` / `opt_du.c` | -| `change_callee_sym` | opt.c:10106 | `opt_utils.h` / `opt_utils.c` | -| `change_callee_sym_keep_type` | opt.c:10133 | `opt_utils.h` / `opt_utils.c` | -| `vrp_get_slot` / `vrp_fold_cmp` | opt.c:6717 | `opt_branch.c` (file-local) | -| `ir_opt_match_zero_test` | opt.c:7143 | `opt_branch.c` (file-local) | -| `ir_opt_eval_const_string_operand` | opt.c:6746 | `opt_constfold.c` (file-local) | -| `ir_opt_fold_str*` / `ir_opt_fold_mem*` | opt.c:6816–6879 | `opt_constfold.c` (file-local) | -| `p64taut_trace_back` | opt.c:17210 | `opt_pack64.c` (file-local) | -| `gsym_cse_insert_before` | opt.c:9318 | `opt_copyprop.c` (file-local) | -| `bool_cse_hash` / `bool_cse_eq` | opt.c:9070 | `opt_copyprop.c` (file-local) | - ---- - -## Execution Plan - -### Step 1: Expose shared helpers (30 min) -- [ ] Move `ir_opt_build_def_count` → `opt_du.c` / `opt_du.h` -- [ ] Move `change_callee_sym` + `change_callee_sym_keep_type` → `opt_utils.c` / `opt_utils.h` -- [ ] Verify: `make cross && make test -j16` - -### Step 2: Extract `opt_pack64.c` (30 min) -- [ ] Create `ir/opt_pack64.c` with `#define USING_GLOBALS` + `#include "ir.h"` -- [ ] Move `tcc_ir_opt_pack64`, `p64taut_trace_back`, `tcc_ir_opt_pack64_tautology`, `tcc_ir_opt_cmp_narrow_64` -- [ ] Add to `Makefile` `IR_FILES` -- [ ] Verify: `make cross && make test -j16` - -### Step 3: Extract `opt_dce.c` (45 min) -- [ ] Create `ir/opt_dce.c` -- [ ] Move 8 functions: `dce`, `compact_nops`, `dead_var_store_elim`, `dead_addrvar_elim`, `redundant_var_assign`, `redundant_init_elim`, `dead_loop_elim`, `dse` -- [ ] Create `ir/opt_dce.h` with public declarations -- [ ] Verify: `make cross && make test -j16` - -### Step 4: Extract `opt_constfold.c` (45 
min) -- [ ] Create `ir/opt_constfold.c` -- [ ] Move 14 functions: string fold helpers, `const_string_calls`, `const_call_replace`, `detect_const_result`, `cache_const_result`, `lookup_const_result`, `param_addrof_const_fold`, `local_addrof_const_fold`, `float_narrowing`, `float_narrow_table` -- [ ] Verify: `make cross && make test -j16` - -### Step 5: Extract `opt_branch.c` (45 min) -- [ ] Create `ir/opt_branch.c` -- [ ] Move 12 functions: `float_branch_fold`, `match_zero_test`, `vrp`, VRP statics, `nonneg_branch_fold`, name tables, `branch_folding`, `stack_addr_nonnull_fold`, `setif_branch_fuse`, `stack_bool_diamond`, `or_bool_diamond`, `bool_cse` -- [ ] Verify: `make cross && make test -j16` - -### Step 6: Extract `opt_copyprop.c` (45 min) -- [ ] Create `ir/opt_copyprop.c` -- [ ] Move 8 functions: `copy_prop`, `cse_global_load`, `globalsym_cse`, `gsym_cse_insert_before`, `cse_param_add`, `local_load_cse`, `local_alu_cse`, `bool_cse_hash`/`bool_cse_eq` -- [ ] Verify: `make cross && make test -j16` - -### Step 7: Extract `opt_fusion.c` (45 min) -- [ ] Create `ir/opt_fusion.c` -- [ ] Move 8 functions: `add_deref_fold`, `postinc_fusion`, `loop_postinc_fusion`, `barrel_shift_fusion`, `call_chain_rename`, `stackoff_addr_cse`, `lea_fold`, `assign_fuse` -- [ ] Verify: `make cross && make test -j16` - -### Step 8: Extract `opt_promote.c` (30 min) -- [ ] Create `ir/opt_promote.c` -- [ ] Move 7 functions: `var_tmp_fwd`, `var_to_tmp`, `select`, `postinc_assign_fold`, `returnvalue_merge`, `backedge_phi_hoist`, `redundant_loop_check` -- [ ] Verify: `make cross && make test -j16` - -### Step 9: Extract `opt_constprop.c` (45 min) -- [ ] Create `ir/opt_constprop.c` -- [ ] Move 9 functions: `const_var_prop`, `global_init_prop`, `complex_const_param_fold`, `const_prop`, `value_tracking`, `const_prop_tmp`, `add_reassoc`, `cmp_expr_fold`, `ir_opt_build_def_count` -- [ ] Verify: `make cross && make test -j16` - -### Step 10: Final cleanup (30 min) -- [ ] Verify residual `opt.c` is 
~1,600 lines -- [ ] Update `opt.h` — ensure all public function declarations reference correct headers -- [ ] Audit includes in each new file — remove unnecessary ones -- [ ] Final: `make cross && make test -j16 && make test-asm -j16` - ---- - -## Result Summary - -| File | Lines | Theme | -|------|-------|-------| -| `opt.c` (residual) | ~1,600 | Interprocedural (FWS, sret), misc | -| `opt_constprop.c` | ~4,100 | Constant/value propagation | -| `opt_dce.c` | ~2,200 | Dead code/store elimination | -| `opt_branch.c` | ~2,200 | Branch/VRP/boolean | -| `opt_fusion.c` | ~2,050 | Hand-written addressing-mode fusion | -| `opt_constfold.c` | ~1,800 | Compile-time call/string/addrof folding | -| `opt_promote.c` | ~1,600 | Variable→temp promotion | -| `opt_copyprop.c` | ~1,500 | Copy propagation & CSE | -| `opt_pack64.c` | ~650 | 64-bit register pair | - -**Total estimated effort: ~6 hours** (mechanical moves, no logic changes). - -**No flash savings** — this is purely a readability/maintainability refactor. The engine work (Phases 2–5 in the parent plan) is what saves flash. - ---- - -## Risks & Mitigations - -1. **Compilation unit boundaries change optimizer behavior.** Static functions that were previously inlinable across passes become extern calls. Mitigation: critical hot helpers stay `static inline` in headers (e.g., `ir_xform_nop` already is). - -2. **Include order sensitivity.** `opt.c` currently relies on `#define USING_GLOBALS` at the top. Each new file needs this + `#include "ir.h"`. Verify with `-Werror` that no implicit declarations creep in. - -3. **`change_callee_sym` used by 2 target files.** Moving it to `opt_utils.c` means both `opt_branch.c` and `opt_constfold.c` can call it. Alternative: duplicate in each file (worse) or keep in residual `opt.c` (limits extraction). - -4. **Build time.** More `.o` files = more linker inputs but better incremental build (touching one pass doesn't recompile 17K lines). Net positive for development velocity. 
diff --git a/docs/plan_ssa.md b/docs/plan_ssa.md deleted file mode 100644 index 292b9f08..00000000 --- a/docs/plan_ssa.md +++ /dev/null @@ -1,315 +0,0 @@ -# SSA Conversion Plan - -## Goal - -Insert a mandatory SSA (Static Single Assignment) construction pass between IR generation and optimization. The current `ir/opt.c` will be rewritten against SSA form. This document covers only the SSA infrastructure — no new optimizations yet. - -## Current IR Summary - -- Flat array of `IRQuadCompact` instructions -- Three vreg namespaces: VAR (locals), TEMP (compiler-generated), PARAM (function args) -- VARs can be assigned multiple times (not SSA) -- TEMPs are mostly single-def but not enforced -- Basic block boundaries are implicit: instructions following a JUMP/JUMPIF target (`is_jump_target` flag) start a new block -- No explicit CFG data structure — passes scan linearly and track jump targets -- Operands stored in a pool indexed by `operand_base` - -## Design - -### Phase 1: CFG Construction - -Build an explicit control flow graph from the flat instruction stream. - -**Data structures:** - -```c -typedef struct IRBasicBlock { - int start_idx; /* first instruction index (inclusive) */ - int end_idx; /* last instruction index (inclusive) */ - int id; /* block index */ - - int *preds; /* predecessor block IDs */ - int nb_preds; - int *succs; /* successor block IDs */ - int nb_succs; - - int idom; /* immediate dominator block ID */ - int *dom_frontier; /* dominance frontier set */ - int nb_dom_frontier; - int *dom_children; /* children in dominator tree */ - int nb_dom_children; -} IRBasicBlock; -``` - -**Algorithm:** -1. Scan instruction array; every `is_jump_target` or instruction following a JUMP/JUMPIF/RETURNVALUE/RETURNVOID starts a new block -2. Build successor edges: JUMP → target block, JUMPIF → target + fallthrough, RETURN → (none), IJUMP → all possible targets -3. 
Build predecessor edges (reverse of successors) - -**File:** `ir/cfg.c` - -### Phase 2: Dominator Tree - -Compute immediate dominators using the Cooper-Harvey-Kennedy algorithm (simple iterative, efficient for reducible CFGs which TCC always produces). - -**Algorithm:** "A Simple, Fast Dominance Algorithm" (Keith D. Cooper, Timothy J. Harvey, Ken Kennedy, 2001) - -1. Initialize idom[entry] = entry, all others undefined -2. Iterate in reverse postorder until fixed point: - - For each block b (except entry), idom[b] = intersect(idom of all preds) -3. Compute dominance frontier from idom tree - -**File:** `ir/cfg.c` (same file, closely coupled with CFG) - -### Phase 3: SSA Construction - -Convert VARs and TEMPs into SSA form using the standard algorithm: - -1. **Phi placement** (iterated dominance frontier): - - For each variable v, find all blocks that define v - - Place phi nodes at the dominance frontier of those blocks - - Iterate until no new phis are added - -2. **Renaming** (dominator tree walk): - - Walk dominator tree in preorder - - Maintain a rename stack per variable - - At each use: replace vreg with current SSA name from stack - - At each def: push new SSA name onto stack - - At each phi in successor: fill the phi operand for this edge - -**Phi node representation:** - -```c -typedef struct IRPhiNode { - int32_t dest_vreg; /* SSA vreg being defined */ - int nb_operands; - struct { - int32_t vreg; /* SSA vreg from this predecessor */ - int pred_block_id; /* which predecessor edge */ - } *operands; -} IRPhiNode; -``` - -Phi nodes are stored per-block (array at the top of each `IRBasicBlock`), not as regular instructions. This avoids disturbing the compact instruction array. 
- -**What gets SSA-renamed:** -- VAR vregs (locals) — these are the primary multi-def case -- TEMP vregs — already mostly single-def, but SSA enforces it -- PARAM vregs — treated as a single def at function entry - -**What does NOT get SSA-renamed:** -- StackLoc stores/loads (memory operations through pointers) -- Global symbol references -- Immediate constants - -**File:** `ir/ssa.c` - -### Phase 4: SSA Destruction (before regalloc) - -Convert out of SSA form for the register allocator (`tccls.c`) which expects the current flat IR format. - -**Algorithm:** naive phi elimination (sufficient for now, can optimize later with copy coalescing): - -1. For each phi node `v_i = phi(v_a, v_b, ...)`: - - Insert `ASSIGN v_i ← v_a` at end of predecessor block for edge a - - Insert `ASSIGN v_i ← v_b` at end of predecessor block for edge b -2. Remove all phi nodes -3. Flatten CFG back to linear instruction array - -Lost-copy and swap problems are rare in practice with linear scan; can add parallel-copy resolution later if needed. 
- -**File:** `ir/ssa.c` (destruction is the inverse of construction) - -## Integration Points - -### Pipeline position - -Current pipeline at -O1+ (SSA regalloc is default): -``` -tccgen.c (IR emission) - → ir/opt.c: pre-SSA optimizations (iterative loop) - → ir/regalloc.c: SSA-based register allocation - internally: build CFG → construct SSA → rename - → ir/opt/: SSA optimization engine (cprop → dce → target generators) - → build intervals → linear scan → phi resolution - → ir/codegen.c + arm-thumb-gen.c: code generation -``` - -Fallback pipeline at -O0 (or `-fno-ssa-regalloc`): -``` -tccgen.c (IR emission) - → ir/cfg.c + ir/ssa.c: construct SSA → rename - → ir/opt/: SSA optimization engine - → ir/ssa.c: destroy SSA - → ir/opt.c: pre-SSA optimizations - → tccls.c: legacy liveness + linear scan - → ir/codegen.c + arm-thumb-gen.c: code generation -``` - -Final pipeline (step 7 done — SSA is default, legacy removed): -``` -tccgen.c (IR emission) - → ir/opt.c: pre-SSA optimizations (iterative loop) - → ir/regalloc.c: SSA-based register allocation - internally: build CFG → construct SSA → rename - → ir/opt/: SSA optimization engine (SCCP, GVN, DCE, target generators) - → build intervals → linear scan → phi resolution - → ir/codegen.c + arm-thumb-gen.c: code generation -``` - -### Interface to existing code - -- `tccgen.c`: orchestrates SSA pipeline (build CFG → construct → rename → optimize → destroy) -- `ir/opt/`: SSA optimization engine — target-independent passes + registered target generators -- `arch/arm/ssa_opt_arm.c`: ARM target-specific generators, registered via `tcc_ir_ssa_opt_register_target()` -- `ir/opt.c`: pre-SSA optimization passes — run after SSA destruction on flat IR -- `tccls.c`: unchanged (receives flat IR after SSA destruction); replaced by `ir/regalloc.c` in step 5 -- `ir/codegen.c`: unchanged — operates post-regalloc - -### New API surface - -```c -/* ir/cfg.c */ -typedef struct IRCFG { ... 
} IRCFG; -IRCFG *tcc_ir_cfg_build(TCCIRState *ir); -void tcc_ir_cfg_free(IRCFG *cfg); - -/* ir/ssa.c */ -void tcc_ir_ssa_construct(TCCIRState *ir, IRCFG *cfg); -void tcc_ir_ssa_destroy(TCCIRState *ir, IRCFG *cfg); -``` - -### vreg numbering - -SSA creates new vregs (each def gets a unique name). Options: - -**Option A: Extend existing vreg encoding.** -Use TCCIR_VREG_TYPE_TEMP with new positions beyond the original max. Phi dests and renamed defs get fresh positions. Simple, no encoding changes. - -**Option B: New TCCIR_VREG_TYPE_SSA.** -Add a 4th vreg type. Cleaner separation, easier to assert "is this SSA?" but uses one of the few remaining type bits. - -Recommendation: **Option A** — reuse TEMP namespace. SSA vregs are just temps with the invariant that each position has exactly one def. No encoding changes needed. - -## Implementation Order - -### Done - -1. **`ir/cfg.c`** — CFG + dominator tree + dominance frontier ✓ - - CFG build, RPO, CHK dominators, dominance frontier all working - - Infinite-loop guard + bitset dedup optimization applied - - All tests pass with SSA phi placement enabled at -O1+ - -2. **`ir/ssa.c` phi placement** ✓ - - Only VARs with multi-block defs (skips TEMPs/PARAMs) - - Single-scan, bulk allocation, early-exit for trivial functions - - Wired into pipeline at -O1+ (`-fssa` / `-fno-ssa`) - -3. **SSA renaming** ✓ - - `tcc_ir_ssa_rename()` implemented and produces correct SSA form - - Enabled in pipeline with SSA construct → rename → optimize → destroy flow - - SSA destruction inserts phi-resolution copies at predecessor block ends - -4. 
**SSA optimization engine** ✓ (initial passes implemented) - - Modular engine in `ir/opt/` with generator-based dispatch (like `thop_*` instruction builders) - - Target-independent passes in `ir/opt/`, target-specific generators in `arch/arm/` - - Backend registers generators via `tcc_ir_ssa_opt_register_target()` — generic code knows nothing about the target - - **Infrastructure (`ir/opt/ssa_opt.h` + `ir/opt/ssa_opt.c`):** - - `IRSSAOptCtx` — shared context with use-def chains per TEMP vreg - - `IRSSAOptGen` — per-opcode generator descriptor (opcode → rewrite function) - - `IRSSAOptPass` — pass descriptor (custom function or generator table) - - Use-def chain builder: scans instructions + phi nodes in one pass - - Helpers: `ssa_opt_nop_instr()`, `ssa_opt_replace_all_uses()`, `ssa_opt_run_gens()` - - **DCE (`ir/opt/ssa_opt_dce.c`):** worklist-based, use-count == 0 → NOP defining instruction → cascade - - **Copy propagation (`ir/opt/ssa_opt_cprop.c`):** generators `ssa_gen_cprop_assign` (vreg→vreg) and `ssa_gen_cprop_imm` (vreg→immediate) - - **ARM generators (`arch/arm/ssa_opt_arm.c`):** `ssa_gen_arm_fuse_mul_add_to_mla`, `ssa_gen_arm_fuse_shl_add_to_load_indexed`, `ssa_gen_arm_fuse_shl_add_to_store_indexed`, `ssa_gen_arm_reduce_mul_to_shift` - -5. **SSA-based register allocator** ✓ - - `ir/regalloc.c` (1633 lines) — arch-independent SSA-aware linear scan - - `arch/arm/arm_regalloc.c` — ARM register tables (AAPCS, VFP) - - Consumes SSA-renamed IR + phi nodes directly (no SSA destruction step) - - Algorithm: linear scan on SSA with precoloring, call-crossing, 64-bit pairs - - Phi resolution: topological sort, cycle breaking, ASSIGN insertion - - Enabled at -O1+ via `-fssa-regalloc` (default on) - - SSA optimization engine now wired in: runs between SSA rename and interval building - -### Next - -6. 
**Port remaining opts to SSA** - - Constant propagation → sparse conditional constant propagation (SCCP) - - CSE → dominator-tree-based value numbering (GVN) - - Dead store elimination → SSA + alias analysis - - Dead pure call elimination → use-count on call result vreg - -7. **SSA default + legacy cleanup** - - Make SSA the mandatory path — remove `-fssa` / `-fno-ssa` toggle, SSA always runs - - Remove SSA destruction (`tcc_ir_ssa_destroy`) — regalloc consumes SSA directly - - Delete legacy allocator: `tccls.c`, `ir/live.c`, associated headers - - Delete pre-SSA passes replaced by SSA equivalents from `ir/opt.c`: - - `tcc_ir_opt_dce` (replaced by `ssa_opt_dce`) - - `tcc_ir_opt_copy_prop` (replaced by `ssa_opt_cprop`) - - `tcc_ir_opt_mla_fusion`, `tcc_ir_opt_indexed_memory_fusion` (replaced by ARM generators) - - `tcc_ir_opt_const_prop`, `tcc_ir_opt_const_prop_tmp`, `tcc_ir_opt_value_tracking` (replaced by SCCP) - - `tcc_ir_opt_cse_arith`, `tcc_ir_opt_cse_global_load` (replaced by GVN) - - Remove `IROptDU` infrastructure in `ir/opt.c` (superseded by `IRSSAVregInfo` use-def chains) - - Clean up `tccgen.c` pipeline: single path through SSA construct → optimize → regalloc → codegen - - Remove `opt_ssa` / `opt_ssa_regalloc` flags from `TCCState` - - Update Makefile: remove deleted files from `IR_FILES` / `CORE_FILES` - -## Complexity Estimates - -| Component | Lines (est.) 
| Algorithm complexity | Status | -|-----------|-------------|---------------------|--------| -| CFG build | ~150 | O(n) — single scan | ✓ | -| Dominator tree (CHK) | ~120 | O(n * d) — fast for structured code | ✓ | -| Dominance frontier | ~80 | O(n_blocks^2) worst case, O(n) typical | ✓ | -| Phi placement | ~100 | O(vars * blocks) | ✓ | -| SSA renaming | ~150 | O(instructions) | ✓ | -| SSA destruction | ~120 | O(phi_nodes) — interim until SSA regalloc | ✓ | -| SSA opt engine | ~400 | O(n * passes) — iterative convergence | ✓ | -| SSA opt DCE | ~80 | O(n) — worklist-based | ✓ | -| SSA opt copy prop | ~120 | O(n) — generator-based | ✓ | -| ARM generators | ~400 | O(n) — per-instruction pattern match | ✓ | -| SSA linear scan regalloc | ~400 | O(n) — single pass over live intervals | | -| SCCP | ~300 | O(n) — lattice-based worklist | | -| GVN | ~400 | O(n) — dominator-tree value numbering | | -| Legacy cleanup | negative | deletion of tccls.c, live.c, redundant opt.c passes | | -| **Total** | **~2820** | | | - -## Risks and Mitigations - -| Risk | Mitigation | -|------|-----------| -| IJUMP (computed goto) makes CFG imprecise | Already handled: functions with IJUMP skip advanced opts. For SSA, treat IJUMP as jumping to all known label targets (same as today). | -| Address-taken locals can't be SSA-renamed | Don't rename them. If a VAR has its address taken (LEA of that VAR), keep it as a memory operation. Only promote non-address-taken scalars to SSA vregs. | -| Critical edges (pred has multiple succs, succ has multiple preds) | Insert empty split blocks during phi elimination. Simple, adds at most O(edges) blocks. | -| Compile-time regression | All algorithms are near-linear. CHK dominators is O(n^2) worst case on irreducible CFGs, but TCC always generates reducible CFGs (no `goto` into loops from outside). | - -## Current Status (2026-05-04) - -All IR tests (`make test -j16`) and GCC torture tests pass. 
- -**What is live in the pipeline at -O1+:** -- CFG construction + dominator tree + dominance frontier (`ir/cfg.c`) -- SSA phi placement + renaming for multi-block VAR defs (`ir/ssa.c`) -- SSA optimization engine (`ir/opt/`): copy propagation, DCE, ARM target generators -- SSA destruction with phi-resolution copies (`ir/ssa.c`) -- Pre-SSA optimizations including `opt_cse` / `cse_arith` (`ir/opt.c`) -- Existing liveness + linear scan register allocator (`tccls.c` + `ir/live.c`) - -**SSA optimization engine architecture:** -- Target-independent infrastructure in `ir/opt/` — use-def chains, generator dispatch, pass table -- Target-specific generators in `arch/arm/` — registered via `tcc_ir_ssa_opt_register_target()` -- Generic code has no knowledge of the underlying hardware -- Each generator is an explicit named function (like `thop_*` instruction builders) - -**Next steps:** -- Port remaining optimizations to SSA: SCCP, GVN (step 6) -- Legacy cleanup: make SSA default, remove tccls.c + ir/live.c + redundant opt.c passes (step 7) - -## Non-Goals (explicitly out of scope for current phase) - -- Mem2Reg / SROA (needed eventually, not for current phase) -- Pruned SSA (full SSA is simpler to implement, prune later) -- Incremental SSA updates (rebuild from scratch each time is fine) -- Spill weight heuristics (use simple "most uses = least spill priority" initially) diff --git a/docs/plan_ssa_regalloc.md b/docs/plan_ssa_regalloc.md deleted file mode 100644 index b815801d..00000000 --- a/docs/plan_ssa_regalloc.md +++ /dev/null @@ -1,201 +0,0 @@ -# SSA-Based Register Allocator — Implementation Plan - -## Context - -Step 4 of `plan_ssa.md`: replace `tcc_ir_liveness_analysis()` + `tcc_ls_allocate_registers()` with a clean SSA-aware register allocator. The current allocator (`tccls.c`) works on flat IR after SSA destruction. 
The new allocator operates directly on SSA-renamed IR with phi nodes — simpler liveness, no lossy SSA destruction, and cleanly separated from the old code. - -## Pipeline - -Current: -``` -SSA construct → rename → destroy → optimize → liveness(ir/live.c) → allocate(tccls.c) → codegen -``` - -New (when `-fssa-regalloc` enabled): -``` -[SKIP first SSA pass] → optimize → [build SSA] → SSA regalloc → codegen -``` - -Skip the first SSA pass when SSA regalloc is enabled. Optimizations work without it (they did before SSA was added). After optimization, VARs still have multi-defs, and the existing `ir/ssa.c` handles VARs natively. - -When disabled: pipeline unchanged. - -## File Layout — Arch-Independent vs Arch-Dependent - -### Arch-independent: `ir/regalloc.c` + `ir/regalloc.h` - -Core SSA register allocator — no ARM-specific knowledge: - -- **SSA live interval building**: scan SSA instructions + phi nodes → `[start, end]` per vreg -- **Linear scan allocation**: sort intervals by start, sweep, assign from abstract register pools -- **Phi resolution**: sequentialize parallel copies, insert ASSIGN instructions -- **Instruction array rebuild**: fix jump targets, remap indices - -The allocator receives register constraints through an abstract interface: - -```c -/* Arch-independent register class descriptor */ -typedef struct RegAllocClass { - int num_regs; /* total registers in class */ - const int *caller_saved; /* caller-saved register list */ - int num_caller_saved; - const int *callee_saved; /* callee-saved register list */ - int num_callee_saved; - int pair_align; /* 1 = pairs must be even-aligned (AAPCS) */ -} RegAllocClass; - -/* Arch-independent allocation target */ -typedef struct RegAllocTarget { - RegAllocClass int_class; /* integer registers */ - RegAllocClass fp_class; /* float/VFP registers */ - int param_regs; /* number of parameter registers (e.g. 
4) */ - int static_chain_reg; /* -1 if none */ -} RegAllocTarget; -``` - -Entry point: -```c -void tcc_ir_ssa_regalloc(TCCIRState *ir, const RegAllocTarget *target, int spill_base); -``` - -### Arch-dependent: `arch/arm/arm_regalloc.c` + `arch/arm/arm_regalloc.h` - -ARM-specific register set definitions: - -```c -/* Provides the RegAllocTarget for ARM Thumb-2 */ -const RegAllocTarget *arm_get_regalloc_target(void); -``` - -Contains: -- R0-R3 as caller-saved, R4-R11 as callee-saved (AAPCS) -- VFP register set (S0-S15 caller-saved) -- Even-aligned pair rule for 64-bit (R0:R1, R2:R3, etc.) -- Parameter register count (4) -- Static chain register (R10) - -Small file (~50 lines) — just data tables, no algorithms. - -## Algorithm Details - -### SSA Live Interval Building - -For each vreg in SSA-renamed IR, compute `[start, end]`: - -1. **Scan instructions**: For each instruction `i`: - - Each USE vreg: extend `end = max(end, i)` - - Each DEF vreg: set `start = i` (single-def in SSA) - -2. **Process phi nodes**: For each block `b`, for each phi: - - `phi.dest_vreg`: set `start = b.start_idx` - - For each operand `(vreg_k, pred_k)`: extend `vreg_k.end = pred_block.end_idx - 1` - -3. **FUNCPARAMVAL chains**: Extend parameter vreg intervals from FUNCPARAMVAL to corresponding FUNCCALL - -4. **Call crossings**: Build call-site prefix-sum array, check if interval spans any call - -5. **PARAMs**: Start at instruction 0, precolored to parameter registers - -6. **Address-taken VARs**: Not SSA-renamed; mark `addrtaken=1`, force stack - -### Linear Scan Allocation - -New implementation, independent of `tccls.c`: - -1. Sort intervals by start point (params first for precoloring) -2. 
Sweep in order, maintain active set (sorted by end point): - - Expire intervals ending before current start → free their registers - - If address-taken: force spill to stack - - If crosses call: prefer callee-saved register - - If 64-bit: allocate aligned pair (from `RegAllocTarget` pair rules) - - If float: allocate from float register class - - If no register available: spill (evict interval with fewest uses / longest range) -3. Track dirty_registers bitmap for prologue/epilogue - -Output: write directly to `IRLiveInterval.allocation` (r0, r1, offset) via `tcc_ir_stack_reg_assign()` — same output format consumed by `machine_op_from_ir()`. - -### Phi Resolution (after allocation) - -For each predecessor block, collect all phi copies `(dest_reg, src_reg)`: -1. Filter identity copies (dest == src) -2. Topological sort for dependency order -3. For cycles: break with scratch register or temp stack slot -4. Insert ASSIGN instructions before block terminator - -### Instruction Array Rebuild - -Same pattern as `tcc_ir_ssa_destroy()`: -1. Build `old_to_new[]` index mapping -2. Fix JUMP/JUMPIF targets, switch table targets, `is_jump_target` flags -3. Remap `IRLiveInterval.start/end` -4. Build `live_regs_by_instruction` table from final intervals - -## Pipeline Integration (`tccgen.c`) - -```c -/* SSA for optimizations — skip when SSA regalloc handles it later */ -if (tcc_state->opt_ssa && !tcc_state->opt_ssa_regalloc) { - /* existing: construct → rename → destroy */ -} - -/* ... optimizations as today ... */ - -/* Register allocation */ -if (tcc_state->opt_ssa_regalloc) { - const RegAllocTarget *target = arm_get_regalloc_target(); - tcc_ir_ssa_regalloc(ir, target, loc); -} else { - tcc_ir_liveness_analysis(ir); - tcc_ls_allocate_registers(&ir->ls, ...); -} - -/* ... rest unchanged: move coalescing, patch, params, stack, codegen ... 
*/ -``` - -## Files to Create/Modify - -| File | Change | -|------|--------| -| `ir/regalloc.c` | **NEW** — arch-independent SSA regalloc (~400 lines) | -| `ir/regalloc.h` | **NEW** — `RegAllocTarget`, `tcc_ir_ssa_regalloc()` | -| `arch/arm/arm_regalloc.c` | **NEW** — ARM register set tables (~50 lines) | -| `arch/arm/arm_regalloc.h` | **NEW** — `arm_get_regalloc_target()` | -| `ir/ir.h` | Add `#include "regalloc.h"` | -| `tccgen.c` | Route to SSA regalloc when flag enabled (~20 lines) | -| `tcc.h` | Add `opt_ssa_regalloc` field to `TCCState` (near line 1144) | -| `libtcc.c` | Add `"ssa-regalloc"` to `-f` flag table (near line 1738) | -| `Makefile` | Add `ir/regalloc.c` + `arch/arm/arm_regalloc.c` to build | - -Files NOT modified: `tccls.c`, `ir/ssa.c`, `ir/cfg.c`, `ir/live.c`, `ir/codegen.c`, `arm-thumb-gen.c`, `ir/machine_op.c` - -## Functions to Reuse (read-only) - -- `tcc_ir_cfg_build()`, `tcc_ir_cfg_compute_dominators()`, `tcc_ir_cfg_compute_dom_frontiers()` — `ir/cfg.c` -- `tcc_ir_ssa_construct()`, `tcc_ir_ssa_rename()`, `tcc_ir_ssa_free()` — `ir/ssa.c` -- `tcc_ir_stack_reg_assign()` — `ir/stack.c` (writes `IRLiveInterval.allocation`) -- `tcc_ir_mark_return_value_incoming_regs()` — `ir/codegen.c` -- `tcc_ir_vreg_live_interval()` — `ir/vreg.c` -- `irop_config[]`, `tcc_ir_op_get_dest/src1/src2()`, `irop_get_vreg()` — `tccir_operand.h` - -## Implementation Order - -1. Create `arch/arm/arm_regalloc.h` + `arch/arm/arm_regalloc.c` — ARM register tables -2. Create `ir/regalloc.h` — `RegAllocTarget` structs + `tcc_ir_ssa_regalloc()` declaration -3. Create `ir/regalloc.c` — skeleton entry point, SSA build, live interval computation -4. Implement linear scan allocation (writes `IRLiveInterval.allocation` directly) -5. Implement phi resolution + instruction array rebuild -6. Wire into pipeline: `tccgen.c`, `tcc.h`, `libtcc.c`, `Makefile`, `ir/ir.h` -7. 
Test: `make test -j16`, `make test-gcc-torture-compile` - -## Verification - -```bash -make cross -# Test at -O0 with SSA regalloc -cd tests/ir_tests && python run.py -c 01_hello_world.c --cflags="-fssa-regalloc" -# Test at -O1 -cd tests/ir_tests && python run.py -c 01_hello_world.c --cflags="-O1 -fssa-regalloc" -# Full suites -make test -j16 -make test-gcc-torture-compile -``` diff --git a/docs/plan_vfp_hard_float.md b/docs/plan_vfp_hard_float.md new file mode 100644 index 00000000..0a30c37c --- /dev/null +++ b/docs/plan_vfp_hard_float.md @@ -0,0 +1,175 @@ +# Plan: Add ARMv8-M hard-float VFP support (`-mfloat-abi=hard`) + +## Context + +The YasOS TinyCC fork already parses `-mfloat-abi=hard` and `-mfpu=…`, sets `TCCState::float_abi` / `fpu_type`, and even configures `architecture_config.fpu` and VFP register allocation in `arm_init()`. The VFP Thumb encoder (`arch/arm/thumb/thop_vfp.c`) is complete for the operations we need. + +What is missing is the **codegen path**: `tcc_gen_machine_fp_mop()` in `arm-thumb-gen.c` unconditionally lowers every FP IR operation (`FADD`, `FSUB`, `FMUL`, `FDIV`, `FCMP`, `FNEG`, `CVT_ITOF`, `CVT_FTOI`, `CVT_FTOF`) to soft-float `__aeabi_*` library calls. As a result, `fp_select.c` compiled with `-mfloat-abi=hard -mfpu=fpv5-sp-d16` still calls `__aeabi_fadd`/`__aeabi_dadd`/`__aeabi_fmul` and passes floats in integer registers. + +The goal is to make `-mfloat-abi=hard` emit VFP instructions and use the VFP register bank for FP values, parameters, and return values, while keeping soft-float behavior unchanged. 
+ +## Current state summary + +| Layer | State | +|---|---| +| Command-line parsing | `-mfloat-abi=hard` and `-mfpu=fpv{4,5}*dp{16,32}` parsed into `float_abi` / `fpu_type` | +| Feature resolution | `thumb_resolve_features()` in `arch/arm/thumb/thumb.c` maps `-mfpu=…` to `vfp_sp` / `vfp_dp` / `fp_armv8` bits | +| VFP encoder | `thop_vfp.c` has `th_vadd_f`, `th_vsub_f`, `th_vmul_f`, `th_vdiv_f`, `th_vcmp_f`, `th_vneg_f`, `th_vcvt_*`, `th_vmov_*`, `th_vpush`/`th_vpop`, `th_vmrs` | +| Allocator hint | `ir/vreg.c` sets `interval->use_vfp = (float_abi == ARM_HARD_FLOAT)` | +| FPU config | `arm_determine_fpu_config()` and `architecture_config.fpu` configured in `arm_init()` | +| Register bank | `s->float_registers_for_allocator` set to FPU register count when hard-float | +| **Missing** | Backend `tcc_gen_machine_fp_mop()` has no hard-float branch | +| **Missing** | AAPCS call layout (`thumb_build_call_layout_from_ir`) does not place FP args in `s0-s15`/`d0-d7` for hard-float | +| **Missing** | Return-value path does not use `s0`/`d0` for hard-float | + +## Goal + +When `float_abi == ARM_HARD_FLOAT` and the selected FPU supports the operation: + +1. FP values live in VFP registers (`s0-s15` for single, `d0-d7` for double on `fpv5-d16`). +2. FP arithmetic/compare/negate/conversion lower to VFP instructions instead of `__aeabi_*` calls. +3. FP function arguments and return values follow the AAPCS hard-float convention (`s0-s15` / `d0-d7`, then stack). +4. Spills, reloads, and moves between GPR and VFP registers use `vldr`/`vstr`/`vmov`. +5. Existing soft-float (`-mfloat-abi=soft` / `softfp`) output is byte-for-byte unchanged. + +## Approach + +A single incremental approach: teach the existing MachineOperand-based backend (`arm-thumb-gen.c`) to handle `MACH_OP_REG` operands whose register is a VFP register, and branch `tcc_gen_machine_fp_mop()` to VFP instruction emission when in hard-float mode. 
+ +This is preferred over rewriting the legacy (non-MOP) FP path because: +- The IR pipeline already routes FP ops through `tcc_gen_machine_fp_mop()`. +- The VFP encoder is already available and unit-tested (`test_thop_vfp.c`). +- The MOP abstraction already distinguishes operand kind, register, spill, immediate, etc. + +## Phases + +### Phase 1 — VFP operand materialization helpers + +Add small helpers in `arm-thumb-gen.c` analogous to the existing `mach_ensure_in_reg()` family, but for VFP registers: + +- `vfp_ensure_in_sreg(MachineOperand src, int sreg)` — load SPILL/IMM/SYMBOL into VFP single register `sreg`. +- `vfp_ensure_in_dreg(MachineOperand src, int dreg)` — same for double register pair / `dreg`. +- `vfp_spill_sreg(int sreg, int frame_offset)` / `vfp_reload_sreg(...)` — `vstr`/`vldr` with SP-relative addressing. +- `vfp_move_ss(int dst, int src)` / `vfp_move_dd(...)` — `vmov.f32`/`vmov.f64`. +- `vfp_mov_gp_sp(int rt, int sn, int to_arm)` / `vfp_mov_2gp_dp(...)` — GPR ↔ VFP moves for parameter/return edges and int↔float conversions. + +Key files: +- `libs/tinycc/arm-thumb-gen.c` + +Tests: +- `tests/unit/arm/armv8m/test_thop_vfp.c` already covers the encoders; extend it with a few GPR↔VFP move cases if gaps are found. +- Add `tests/ir_tests/asm/fp_hard_basic.c` and a passing assertion in `test_codegen_asm.py` that `vadd.f32`/`vmul.f32` appear. 
+ +### Phase 2 — Hard-float branch in `tcc_gen_machine_fp_mop()` + +At the top of `tcc_gen_machine_fp_mop()`, add: + +```c +if (float_abi == ARM_HARD_FLOAT && architecture_config.fpu->has_fadd) + return tcc_gen_machine_fp_mop_hard(src1, src2, dest, op, is_complex); +``` + +Implement `tcc_gen_machine_fp_mop_hard()`: + +| IR op | VFP sequence | +|---|---| +| `FADD`/`FSUB`/`FMUL`/`FDIV` | ensure operands in `s/d` regs, emit `vadd.f32`/`vsub.f32`/`vmul.f32`/`vdiv.f32` (or `.f64`), write back | +| `FNEG` | `vneg.f32` / `vneg.f64` | +| `FCMP` | `vcmp.f32` / `vcmp.f64`, then `vmrs apsr_nzcv, fpscr` | +| `CVT_ITOF` | `vcvt.f32.s32` / `vcvt.f64.s32` (unsigned variants via `u32`) | +| `CVT_FTOI` | `vcvt.s32.f32` / `vcvt.s32.f64` (unsigned/truncation variants) | +| `CVT_FTOF` | `vcvt.f64.f32` / `vcvt.f32.f64` | + +Guard each operation by the FPU config flags (`has_fadd`, `has_fmul`, `has_ftoi`, etc.); fall back to the existing soft-float path if the selected FPU lacks support. + +Key files: +- `libs/tinycc/arm-thumb-gen.c` + +Tests: +- Extend `tests/ir_tests/asm/fp_select.c` or add `fp_hard_ops.c` covering `+`, `-`, `*`, `/`, compare, negate, int↔float, float↔double. +- Update `test_fp_hard_float_uses_vfp` to pass and add `test_fp_hard_float_all_ops`. + +### Phase 3 — AAPCS hard-float parameter passing + +Modify the call-layout builder (`thumb_build_call_layout_from_ir()` and related helpers) so that when `float_abi == ARM_HARD_FLOAT`: + +- `float` args use `s0, s1, …` up to `s15`. +- `double` args use `d0, d1, …` up to `d7` (each consumes two single slots). +- Mixed int/FP args consume independent GPR and VFP register banks (AAPCS rule). +- Variadic functions continue using the soft-float layout (AAPCS requirement). +- Once VFP registers are exhausted, FP values spill to the stack argument area. 
+ +Also update the caller side that marshals `FUNCPARAM` operands into argument locations so it knows how to move a VFP-register operand into `sN` (`vmov` or direct if already allocated there). + +Key files: +- `libs/tinycc/arm-thumb-gen.c` (call layout and param marshalling) +- Possibly `arch/arm/arm_aapcs.c` if the layout logic is split there + +Tests: +- `tests/ir_tests/asm/call_fp_args.c`: functions with `float`/`double`/mixed int+FP args; assert the right `vmov`/`vldr` into `s0-s7`/`d0-d3` and no `__aeabi_*` calls. + +### Phase 4 — Hard-float return values + +Update `tcc_gen_machine_return_value_mop()` and `gfunc_sret()`: + +- `float` return → `s0`. +- `double` return → `d0`. +- Callee writes directly to `s0`/`d0`; caller reads from there. + +Key files: +- `libs/tinycc/arm-thumb-gen.c` + +Tests: +- `tests/ir_tests/asm/call_fp_return.c`. + +### Phase 5 — Spill / reload / prolog / epilog + +Ensure the register allocator's VFP register bank (`float_registers_for_allocator`) is actually used for FP vregs when `use_vfp` is set, and that spills are emitted via `vstr`/`vldr`: + +- Verify `ir/regalloc.c` allocates VFP registers to intervals with `use_vfp == 1`. +- Verify spill code in `arm-thumb-gen.c` emits `vstr`/`vldr` for VFP physical registers. +- Save/restore callee-saved VFP registers in prolog/epilog if any are used (usually `d8-d15` / `s16-s31`, but `fpv5-sp-d16` only has `s0-s15` caller-saved; confirm per AAPCS). + +Key files: +- `libs/tinycc/ir/regalloc.c` +- `libs/tinycc/arm-thumb-gen.c` (spill emitter, prolog/epilog) + +Tests: +- `tests/ir_tests/asm/fp_spill_pressure.c`: a function with many live `float` locals forcing spills; assert `vstr`/`vldr` and no helper calls. + +### Phase 6 — Regression and integration + +- Run `make ut`. +- Run `pytest tests/ir_tests/test_codegen_asm.py`. +- Run the QEMU smoke suite (`scripts/run_qemu_smoke.sh`) on FP-heavy cases. 
+- Run a self-host FAT-drive round-trip compiling tinycc itself with `-mfloat-abi=hard` once the basic cases pass. +- Regenerate `SOURCE_COVERAGE.md` if any newly-covered files change status. + +## Key files / deliverables + +Modify: +- `libs/tinycc/arm-thumb-gen.c` — VFP materialization, `tcc_gen_machine_fp_mop_hard()`, call/return layout, spills. +- `libs/tinycc/ir/regalloc.c` — confirm VFP register allocation honors `use_vfp`. +- `libs/tinycc/tests/ir_tests/test_codegen_asm.py` — new assertions for hard-float codegen. +- `libs/tinycc/docs/plan_whole_tinycc_coverage.md` — close the FP gap finding once fixed. + +New test inputs: +- `tests/ir_tests/asm/fp_hard_basic.c` +- `tests/ir_tests/asm/fp_hard_ops.c` +- `tests/ir_tests/asm/call_fp_args.c` +- `tests/ir_tests/asm/call_fp_return.c` +- `tests/ir_tests/asm/fp_spill_pressure.c` + +## Risks and mitigations + +| Risk | Mitigation | +|---|---| +| Mixed int/FP AAPCS layout is subtle | Add dedicated tests with every permutation of GPR/VFP/stack args; compare with `arm-none-eabi-gcc -mfloat-abi=hard` disassembly for a reference corpus. | +| Soft-float regression | Keep the existing soft-float path untouched; gate every new branch on `float_abi == ARM_HARD_FLOAT`. Run the full QEMU `ir_tests` corpus with `-mfloat-abi=soft` before and after. | +| VFP register allocation bugs | Start with `-O0`/`-O1` only; the existing `use_vfp` flag already guides the RA. If RA mis-allocates, add targeted unit tests in `test_ra_*.c`. | +| Self-host miscompile | The cross compiler itself is built with soft-float, so this change only affects user code compiled with `-mfloat-abi=hard`. Still, run the FAT-drive self-host with a hard-float test subset. | +| Double-precision on `fpv5-sp-d16` | `fpv5-sp-d16` has no DP hardware, so `double` ops on that FPU must fall back to `__aeabi_d*` even under `-mfloat-abi=hard`. The plan honors `architecture_config.fpu->has_dadd` etc. 
| + +## Stop criterion + +`pytest tests/ir_tests/test_codegen_asm.py -k fp_hard` passes, `make ut` is green, and the QEMU smoke suite shows no new failures when run with both `-mfloat-abi=soft` and `-mfloat-abi=hard -mfpu=fpv5-sp-d16`. diff --git a/docs/register_allocator_improvements.md b/docs/register_allocator_improvements.md deleted file mode 100644 index 3c1d93c4..00000000 --- a/docs/register_allocator_improvements.md +++ /dev/null @@ -1,105 +0,0 @@ -# Register Allocator Improvement Opportunities - -## Current State (25 vs 19 instructions for bench_array_sum) - -The remaining 6-instruction gap is entirely register allocation and stack layout quality: - -| Gap | TCC | GCC | Root Cause | -|---|---|---|---| -| 2 instr | `push/pop {r4}` | no callee-save | r4 used for inner loop temp; r12 not available | -| 2 instr | `add r3,sp,#8; add.w r3,#1024` | `add r1,sp,#1020` | End pointer computed in 2 instructions | -| 1 instr | `mov r0, r1` | sum already in r0 | Return value not in r0 | -| 1 instr | `subw sp,#1036` (wide) | `sub.w sp,#1024` | 12 extra bytes frame padding | - ---- - -## 1. R12 (IP) for Allocation - -### Goal -Add r12 to the allocator pool as a caller-saved register. This gives 5 caller-saved registers (r0-r3, r12) instead of 4, eliminating callee-save push/pop when register pressure is 5. - -### Current Blocker -~30 places in `arm-thumb-gen.c` hardcode `R_IP`/`R12`/`ARM_R12` without going through the scratch allocator. These would clobber any value the allocator placed in r12. 
- -### Hardcoded R12 uses that need conversion to scratch allocator: - -**Stack manipulation (prologue/epilogue):** -- `arm-thumb-gen.c:3116-3117` — `MOV R_IP, R_SP` for dynamic stack alloc -- `arm-thumb-gen.c:3131-3132` — Load via R_IP for stack restore -- `arm-thumb-gen.c:7881-7892` — Argument area setup uses R12 directly -- `arm-thumb-gen.c:7910-7912` — Vararg store uses R_IP - -**Struct handling:** -- `arm-thumb-gen.c:8577-8590` — `get_struct_base_addr_mop` defaults to ARM_R12 -- `arm-thumb-gen.c:9035` — Same pattern in store path -- `arm-thumb-gen.c:9106` — Returns R_IP as fallback - -**Direct scratch use:** -- `arm-thumb-gen.c:8100` — `int temp = R_IP` for parameter copy -- `arm-thumb-gen.c:9654-9655` — Stack load uses ARM_R12 for offset - -**PIC/GOT/text-data separation:** -- `arm-thumb-gen.c:6721,7298,7376` — POP uses R12 for GOT reload - -### Required changes: -1. Convert each hardcoded R12 use to call `get_scratch_reg_with_save()` instead -2. Ensure each converted site properly saves/restores if r12 is live -3. Add r12 to `caller_saved_registers` bitmap -4. Change `registers_for_allocator = 13` -5. Cap `tcc_ls_assign_callee_saved_register` to r4-r11 (exclude r12) -6. Update `tcc_ls_assign_any_register` allocation order: r0-r3, r12, r4-r11 - -### Risk -High — each hardcoded site needs careful analysis of what registers are excluded and whether the scratch save/restore interacts with the surrounding code correctly. - ---- - -## 2. Return Value Precolor Priority (Eviction) - -### Goal -When the allocator processes a precolored interval (e.g., return value hinted to r0) and the preferred register is already taken by an uncolored interval, evict the uncolored interval to a different register. - -### Current Blocker -Linear scan processes intervals in start-point order. The return value vreg (V0, start=10) is processed AFTER the loop counter (V3, start=9). V3 gets r0 first. When V0 tries r0, it's taken and falls back to r1. Result: `mov r0, r1` at return. 
- -### Failed Approach: Retroactive Eviction -Attempted: when precolored V0 can't get r0, find V3 in the active set, release r0, and reassign V3 to a different register. - -**Why it fails:** Retroactive reassignment changes the register for V3's ENTIRE interval. If another interval (V2) was assigned r1 during [7,12] while V3 was in r0 during [9,21], moving V3 to r1 creates an overlap [9,12] where both V3 and V2 are in r1. This produces incorrect codegen. - -### Correct Approaches (not yet implemented): - -**A. Interval Splitting:** -Split the conflicting interval at the eviction point. V3 stays in r0 for [9, eviction_point], then moves to r1 for [eviction_point, 21]. Requires inserting a MOV at the split point and managing two sub-intervals. - -**B. Priority-Based Sorting:** -Sort intervals so precolored ones are processed first among those with the same start point. Doesn't help when start points differ (V3=9 vs V0=10). - -**C. Second-Chance Allocation:** -After all intervals are processed, scan for precolored intervals that didn't get their preferred register. Try to swap with the conflicting interval if safe (no overlap with other intervals in the new register). - -**D. Graph Coloring:** -Replace linear scan with a graph-coloring allocator that handles preferences natively. Significant complexity increase. - -### Recommendation -Approach C (second-chance) is safest and simplest. After the main allocation loop, for each precolored interval that missed its hint: -1. Find the interval currently holding the desired register -2. Check if the desired register is free for the blocker's entire range (scan all intervals) -3. If safe, swap registers -4. If not safe, leave as-is - ---- - -## 3. Loop Bound Rematerialization Without Calls - -### Goal -The inner sum loop computes `end = SP+8+1024` in 2 instructions and keeps it in r3 for the entire loop. 
If rematerialized inside the loop (1 instruction per iteration), r3 is freed for the loaded value, avoiding r4 (callee-save). - -### Current State -`tcc_ir_opt_loop_bound_remat` only fires for loops containing function calls. The inner sum loop has no calls, so it's skipped. - -### Required Change -Relax the `has_calls` guard to also allow remat when register pressure exceeds caller-saved capacity (>4 simultaneous live values). Requires estimating live count at the IR level before register allocation. - -### Trade-off -Adds 1 instruction per inner loop iteration (the remat ADD) but saves 2 instructions total (push/pop r4). Net benefit depends on loop trip count — beneficial for loops with many iterations. diff --git a/docs/restructure_architecture.html b/docs/restructure_architecture.html new file mode 100644 index 00000000..2d54e250 --- /dev/null +++ b/docs/restructure_architecture.html @@ -0,0 +1,978 @@ + + + + + +tinycc — source restructure architecture + + + + +
+ + +
+
+

tinycc · armv8-m fork · architecture proposal

+

Restructuring the source tree for multi-architecture support

+

A source/ root with generic compiler layers, one machine + contract, and self-contained backends under source/arch/ — plus a test tree + that mirrors it. Designed so the next architecture is a directory, not a rewrite.

+
    +
  • 91k lines top-level C
  • +
  • 97k lines in ir/
  • +
  • tccgen.c 33,407 lines
  • +
  • arm-thumb-gen.c 13,534 lines
  • +
  • tcc.h 2,892 lines, included everywhere
  • +
  • seam already ~80% in place
  • +
+
+ +
+

§1 Goals & ground rules

+

what this restructure must achieve — and what it must not break

+
    +
  • Physical layout matches logical layers. Everything moves under + source/; architecture-specific code lives only in + source/arch/<name>/; generic code never includes an arch header.
  • +
  • Huge files become functional blocks. tccgen.c (33k) + splits into ~10 files, arm-thumb-gen.c (13.5k) into ~13, along the block + boundaries mapped in §6–§7.
  • +
  • A second architecture drops in. One written contract (§5) is the + complete list of what a backend implements. Register facts, ABI classification, and + relocations all flow through it.
  • +
  • Every phase keeps make test green. The plan (§10) is a + sequence of mechanical, individually verifiable steps — no big-bang branch.
  • +
  • History survives. Moves are pure git mv commits, + separate from content edits, so git blame -C stays useful.
  • +
+
+ good news first +

This is not a greenfield redesign. The amalgamation build is already gone (every + .c compiles separately; only tcc.c includes + tcctools.c). arch/arm/ already exists with clean pieces — + AAPCS classification, a RegAllocTarget descriptor, 29 Thumb-2 encoder + modules — and the build system already documents how to add an architecture. The IR + operand seam (MachineOperand, machine_op_from_ir) is fully + target-neutral. What remains is finishing a boundary that is ~80% built.

+
+
+ +
+

§2 Where the code is today

+

two half-finished abstraction layers, six hard leaks, one god-header

+

The load-bearing backend interface today is a flat set of ~90 + tcc_gen_machine_* / tcc_machine_* symbols declared in + tcc.h:2576–2726 and resolved at link time. A second, aspirational vtable + (TCCMachineInterface in tccmachine.h/.c) exists but is dead: + tcc_machine_register() is never called. Meanwhile the largest ARM files + still sit at the repo root, outside arch/.

+ +
+ + + + + + + + + + tcc.h — 2,892-line god-header + included by every TU · pulls arm-thumb-defs.h + + + + tccgen.c + parser + sema + IR emission + 5 functions > 1,000 lines each + 33,407 ln + + + generic core + tccpp.c · tccelf.c · tccld.c + tccdbg.c · tccasm.c · libtcc.c + tccyaff.c · tcc.c · tccls.c … + mostly clean; islands of #ifdef ARM + + + tccmachine.c — dead vtable + TCCMachineInterface never registered; + real dispatch = ~90 link-time symbols + declared in tcc.h:2576–2726 + + + + ir/ — 70 files + 97k ln + core · cfg · ssa · dump · vreg + opt_*.c + opt/ssa_opt_*.c passes + regalloc.c (RegAllocTarget ✓) + machine_op.c (MachineOperand ✓) + codegen.c — two-pass dispatch + 109 direct tcc_gen_machine_* calls + already target-neutral except one + direct call into ARM call-site code + + + + ARM files still at repo root + arm-thumb-gen.c (13,534 ln) + arm-thumb-asm.c · arm-link.c + callsite.c · defs.h · thumb-tok.h + + + arm-thumb-scratch.c + orphaned — in no Makefile, included + by nothing. Delete. + + + arch/arm/ — already exists + 7.6k ln + arm.c · arm_aapcs.c · arm_regalloc.c + ssa_opt_arm.c · thumb/thop_*.c ×29 + arch/fpu/arm: present but unbuilt + + + + + + + + + ④⑤⑥ + +
Fig. 1 — Today's top level. Red dashed arrows are the hard couplings that + break the generic/arch boundary; numbers key into the table below. Grey dashed boxes are + dead code.
+
+ +

The six hard leaks (generic → ARM)

+
+ + + + + + + + + + + + + + + + + + + + + +
| # | Where | Leak | Fix |
| ① | tcc.h:358 | Unconditionally includes arm-thumb-defs.h — every + generic TU compiles against NB_REGS, TREG_*, + RC_*, ARM reloc aliases. | Backend defs come in via the machine contract header only. |
| ② | tccgen.c:38, 1028, 30983 | Includes arch/arm/arm_regalloc.h; calls + arm_init() and arm_get_regalloc_target() directly. | Generic tcc_backend_init() + + tcc_backend_regalloc_target() hooks. |
| ③ | ir/codegen.c:1915 | Generic dispatcher calls thumb_build_call_layout_from_ir() + by name. | Add call-layout entry point to the contract. |
| ④ | tccls.c:125–320 | Hardcodes SP=R13 mask, R12 special case, "scratch from R0–R3", 16-register + bounds in nominally generic linear-scan code. | Read all register facts from RegAllocTarget. |
| ⑤ | tccgen.c:30891 | registers_for_allocator = 12 hardcoded (backend sets 13 + elsewhere — duplicated magic). | Single source of truth in RegAllocTarget. |
| ⑥ | tccir.h:718 | Generic IR header declares arm_fpu_supports_double(). | Replace with tcc_target_has() capability query + (already exists in tcc_target.h). |
+

Beyond these, #ifdef TCC_TARGET_ARM_THUMB appears at only ~17 sites in + generic code — mostly benign option-parsing and section-name islands in + libtcc.c, tccelf.c, tccdbg.c that can migrate to + contract hooks gradually. The relocation engine is already split correctly: + tccelf.c drives, arm-link.c implements + relocate/code_reloc/gotplt_entry_type.

+
+ +
+

§3 Target source tree

+

source/ as root · arch/ for backends · generic layers in the rest

+

Files that move unchanged keep their basename; where the mapping is not obvious + (a rename, or a move between layers) a ← annotation shows the origin. Entirely new names appear only where a + file is split. Repo root keeps include/ (headers shipped to compiled + programs), lib/ (runtime library), tests/, scripts/, + docs/.

+ +
+source/
+├── driver/                        # entry points & public API
+│   ├── tcc.c                      # CLI main + tool dispatch
+│   ├── tcctools.c                 # ar / cross-prefix tools
+│   └── libtcc.c                   # TCCState lifecycle, options, compile/link driver
+├── frontend/                      # C language → IR
+│   ├── tccpp.c                    # preprocessor + tokenizer
+│   ├── tccasm.c                   # GAS-style asm frontend (arch-neutral core)
+│   ├── tcctok.h · tccdefs.h
+│   └── gen/                       # tccgen.c split — see §6
+│       ├── gen_priv.h             # shared vstack/scope/switch state (the linchpin)
+│       ├── gen_core.c  gen_sym.c  gen_vstack.c  gen_ops.c  gen_types.c
+│       ├── gen_expr.c  gen_builtins.c  gen_stmt.c  gen_init.c
+│       └── gen_decl.c             # decl, nested fns, gen_function IR-pipeline driver
+├── ir/                            # target-independent IR — moves largely as-is
+│   ├── core.c  cfg.c  ssa.c  dump.c  vreg.c  stack.c  live.c  licm.c
+│   ├── operand.c                  ← tccir_operand.c (SValue ↔ IROperand)
+│   ├── passes.c                   ← tccopt.c (pass registry)
+│   ├── opt/                       # all opt_*.c + ssa_opt_*.c consolidated
+│   ├── regalloc.c                 # SSA regalloc — parameterized by RegAllocTarget ✓
+│   └── codegen.c                  # two-pass dry-run/real-run dispatch loop
+├── machine/                       # THE seam — generic side of the backend boundary
+│   ├── machine.h                  # the written contract: every symbol a backend implements (§5)
+│   ├── machine_op.c/.h            ← ir/machine_op.* (MachineOperand — already clean)
+│   ├── target.h                   ← tcc_target.h (ArchitectureConfig, capabilities)
+│   ├── abi.h                      ← tccabi.h (TCCAbiArgDesc / CallLayout)
+│   └── ls.c                       ← tccls.c, de-ARM'd (leak ④)
+├── obj/                           # object containers, linking, debug info
+│   ├── elf.c                      ← tccelf.c (ARM islands → reloc hooks)
+│   ├── ld.c                       ← tccld.c (linker scripts — already 100% generic)
+│   ├── yaff.c                     ← tccyaff.c (R_ARM_* enums → contract reloc kinds)
+│   ├── dwarf.c                    ← tccdbg.c (DWARF/stabs)
+│   └── elf.h · dwarf.h · stab.h
+├── support/
+│   ├── log.h  tcc-chained-hash.h
+│   └── tccdebug.c                 # SValue/Sym pretty-printers
+└── arch/
+    ├── arm/
+    │   ├── arm.c  arm.h           # target init: ArchitectureConfig, capabilities
+    │   ├── defs.h                 ← arm-thumb-defs.h (NB_REGS, TREG_*, RC_*)
+    │   ├── aapcs.c                ← arm_aapcs.c (ABI classification)
+    │   ├── regalloc.c             ← arm_regalloc.c (RegAllocTarget tables)
+    │   ├── ssa_opt.c              ← ssa_opt_arm.c (MLA fusion, shl+add→indexed, …)
+    │   ├── gen/                   # arm-thumb-gen.c split — see §7
+    │   ├── thumb/                 # thop_*.c encoders ×29 — unchanged
+    │   ├── asm.c                  ← arm-thumb-asm.c (mnemonic parser → thop_*)
+    │   ├── tok.h                  ← thumb-tok.h
+    │   ├── link.c                 ← arm-link.c (R_ARM_* relocations)
+    │   └── fpu/                   ← arch/fpu/arm — finally wired into arm_resolve_fpu()
+    └── riscv/                     # future — implements machine/machine.h, nothing else
+
+ +
+ deletions, not moves +

arm-thumb-scratch.c is orphaned (built by nothing, included by nothing — + the live scratch logic is inside arm-thumb-gen.c): delete it. The dead + TCCMachineInterface vtable in tccmachine.h/.c is superseded by + the contract header (§5 decision): delete it too. Legacy upstream test drivers + (tcctest.c, abitest.c, tests/Makefile suite) + are quarantined under tests/legacy/.

+
+
+ +
+

§4 Layered architecture

+

dependency rules the directory layout enforces

+ +
+ + + + + + + + + + + + + + driver/ + tcc.c (CLI) · libtcc.c (API, options, compile/link orchestration) + + + + obj/ + elf.c — sections, symbols, + GOT/PLT, output writer + ld.c — linker scripts, regions + yaff.c — YAFF object format + dwarf.c — debug info + calls the reloc contract: + relocate() · code_reloc() + gotplt_entry_type() + — implemented by arch/*/link.c + + + + frontend/ + tccpp.c → gen/* (parse · types · sema · vstack → IR emission) + tccasm.c (inline-asm frontend → arch asm parser) + + + + ir/ + core · cfg · ssa · opt pipeline · licm — target-independent + regalloc.c ⟵ RegAllocTarget codegen.c: two-pass dispatch + machine_op_from_ir(): IROperand → MachineOperand + + + + machine/ — the contract + machine.h (~90 entry points, §5) · machine_op · target.h + abi.h · ls.c — the only headers arch code and generic code share + + + + arch/arm/ + gen/ (mop handlers) · thumb/ (encoders) + link.c · asm.c · aapcs.c · regalloc.c + ssa_opt.c · fpu/ · defs.h + sees: machine/ + ir headers it needs + + + arch/riscv/ + future backend: + implement machine.h, + add Makefile stanza, + nothing else changes + + + + + + mop dispatch (per IR op) + + + syms/relocs + + + + + implements + +
Fig. 2 — Target layers. Solid arrows are calls (always downward or into + obj/); the dashed green arrows are backends implementing the machine contract. The copper + band is the only crossing point between generic and architecture-specific code.
+
+ +

Dependency rules (CI-enforceable)

+
    +
  • driver/ → frontend/, ir/, obj/, machine/ — orchestration only.
  • +
  • frontend/ → ir/, machine/ (emission primitives, ABI queries); + frontend/ → obj/ only for symbol/section glue + (put_extern_sym, relocations on initializers).
  • +
  • ir/ → machine/ — the dispatch loop and regalloc consume only + contract types.
  • +
  • obj/ → machine/ — reloc/attribute hooks; never + R_ARM_* by name.
  • +
  • arch/<t>/ → machine/, ir/ headers, support/ — a backend may see + generic types, never frontend internals.
  • +
  • Nothing outside arch/ includes anything inside + arch/. One grep in CI keeps this true forever: + grep -rn '#include "arch/' source/ --exclude-dir=arch must return + empty.
  • +
+
+ +
+

§5 The backend contract — machine/machine.h

+

one header that is the complete definition of "a backend"

+

The interface already exists in practice — it is just scattered and unwritten. The + contract header collects the ~90 entry points, grouped and documented, so "port tinycc" + becomes "implement this file". Group sizes below are from the live + tcc.h:2576–2726 surface plus the reloc backend.

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Group | Size | Entry points (representative) | Notes |
| materialization | 10 | acquire/release_scratch · load/store_spill_slot · + load_constant · addr_of_stack_slot · can_encode_stack_offset | Integer-arg primitives used by ir/mat + regalloc. |
| mop handlers | ~45 | data_processing_mop · load/store[_indexed|_postinc]_mop · + muldiv_mop · mla/umull/smull_mop · fp_mop · func_call_mop · select_mop · + block_copy_mop · vla_mop · setjmp/trap/prefetch… | One per IR op family; all take MachineOperand. |
| frame | 8 | prolog · epilog · finish_noreturn · store_to_stack/sp · + number_of_registers · gfunc_sret · nested-fn trampoline | Frame layout + return-value classification. |
| branches | 8 | jump_mop · conditional_jump_mop · cbz_jump_mop · + backpatch_jump · switch_table/load_mop · *_dry_run_size | Relaxation policy stays in the backend. |
| two-pass hooks | ~15 | dry_run_init/start/end · insn_scratch_reset/count/saves_mask · + branch_opt_init/analyze · *_cache_reset · pending_pool_size · + reserve_pool_bytes · end_instruction | Driven by ir/codegen.c's dry-run/real-run loop. |
| ABI | 5 | abi_classify_argument · abi_assign_call_args · + build_call_layout (fixes leak ③) · abi_softcall_name | Types in machine/abi.h. |
| target | 5 | backend_init/deinit (fixes leak ②) · regalloc_target() · + target_has(cap) · resolve_fpu() | Fills ArchitectureConfig + RegAllocTarget. |
| relocations | 4 | relocate · relocate_plt · code_reloc · gotplt_entry_type | Already the upstream xxx-link.c shape; obj/elf.c is the driver. |
| asm (optional) | 2 | asm_opcode parser hook · asm token table | Backends without inline-asm support stub these. |
+ +
+

Decision: link-time binding, not a vtable

+

recommended — matches how the fork already works

+

Each target is a separate cross-compiler binary (armv8m-tcc), exactly + one backend linked in — so link-time symbol resolution is a zero-cost dispatch that the + hot two-pass codegen loop already relies on (109 call sites in + ir/codegen.c). Formalize the symbol set in machine/machine.h + and delete the dead TCCMachineInterface vtable rather than + reviving it. If a multi-target single binary is ever wanted, the contract header is + precisely the struct definition a vtable would need — nothing is lost by waiting.

+
+ +
+ generic logic to hoist out of the backend — later, §10 phase 6 +

Five pieces of genuinely generic machinery currently live inside + arm-thumb-gen.c and would otherwise be rewritten by every new backend: + the dry-run scratch-discovery state machine, 64-bit-as-register-pair lowering, the + parallel-move solver for call arguments, mul-by-constant strength reduction, and the + peephole cache frameworks (MOV-equivalence, immediate reuse, STR→LDR forwarding). + Hoist them into machine/ as shared engines parameterized by backend + callbacks — but only when the second architecture arrives and proves the + parameterization, not speculatively.

+
+
+ +
+

§6 Splitting tccgen.c — 33,407 lines → 10 files

+

frontend/gen/ · shared state declared once in gen_priv.h

+

The file's ~35 internal blocks condense into ten modules. The entangling state is + well-understood: vtop/_vstack (2,160 refs), tok + (322), loc, nocode_wanted, scope/switch stacks. + gen_priv.h declares all of it (definitions live in + gen_core.c), plus the current forward-decl list — the file's own lines + 746–789 are the seed set.

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
File | ≈ lines | Contents (today's blocks) | Coupling
gen_core.c | 1,200 | Global state definitions, lifecycle (tccgen_init/compile/finish), + type predicates, code-suppression (nocode_wanted) | owns state
gen_sym.c | 1,100 | Symbol table, labels, ELF symbol glue (put_extern_sym, + greloc), attribute merge/patch, aliases | clean — move first
gen_vstack.c | 1,700 | Value-stack ops, gv/gv2, long-long expand, addressing, + bitfields, bounds | vstack core
gen_ops.c | 2,300 | gen_opl/opic/opif, gen_op, complex arithmetic, + vector extensions | heavy vtop
gen_types.c | 2,900 | Type compare/compat, casts, type_size, struct layout + + declaration, parse_btype, declarators | AAPCS alignment here
gen_expr.c | 6,500 | Unary/primary/postfix, unary_funcall, binary precedence, + ternary, gexpr, const-expr | vtop + tok
gen_builtins.c | 3,800 | unary_builtin_*, string-builtin folding, auto-inline + heuristics, try_inline_const_eval | clean — move early
gen_stmt.c | 1,900 | block/block_1, switch codegen, return, scopes/cleanups, VLA + scope handling | tok + vtop
gen_init.c | 3,200 | vstore, inc, initializers, designators, + decl_initializer_alloc | vtop
gen_decl.c | 5,300 | decl, nested functions/trampolines, + gen_function (the IR-pipeline driver: SSA, opt passes, regalloc, + codegen), inline stash, late reopt | IR-facing — owns ir/ includes
+
    +
  • Extraction order: gen_sym → gen_builtins + → gen_decl (only 7 vtop refs in the gen_function region) → + gen_types → gen_stmt → gen_init → + gen_ops → gen_expr → what remains is + gen_core + gen_vstack.
  • +
  • Watch one handshake: the AAPCS invisible-copy state + (aapcs_last_const_init, set during parameter typing, consumed in + unary_funcall) crosses the types/expr boundary — promote it to an + explicit field in gen_priv.h, not a bare static.
  • +
  • After the split, gen_decl.c is the only frontend file that sees the + IR optimization pipeline; the rest use emission primitives from + tccir.h only.
  • +
+
+ +
+

§7 Splitting arm-thumb-gen.c — 13,534 lines → 13 files

+

arch/arm/gen/ · split by mop family, matching the contract groups

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
File | ≈ lines | Contents (today's line ranges)
state.c | 600 | ThumbGeneratorState, reg classes, frame-offset helpers, MachineCodegenContext + allocators (54–740)
scratch.c | 900 | Scratch acquire/release/spill, push-window bias, dry-run scratch discovery + hooks (1419–1794 + snapshot state)
caches.c | 700 | mov_equiv, imm_cache, strldr_cache, spill cache — peephole trackers + (1812–2287)
litpool.c | 700 | Literal pools: init/reserve/flush/find, IT-window guard + (2345–2782, 4112–4200)
emit.c | 900 | o()/ot(), opcode validation, branch patching/relaxation + (decbranch, th_patch_call), dry-run/branch-opt state + machine (853–1272, 2509–3402)
alu.c | 1,000 | Data-processing mops 32/64-bit, shifts, ubfx/bfi + (4993–5950)
muldiv.c | 1,050 | mul/div/MLA/UMULL/SMULL/MLAL, mul-by-const strength reduction, pack64, + cmp_eq64 (5952–6921)
mem.c | 1,500 | load/store/indexed/postinc mops, spill slots, strd/ldrd pairing, assign, + setif/bool (3647–4530, 6987–8463)
fp.c | 1,000 | VFP + soft-float dispatch (get_softfp_func_name), complex + lowering (8463–9476)
frame.c | 900 | prolog/epilog, noreturn finish, VLA alloc, gfunc_sret, LEA, + stack stores (3564–3646, 9476–10347)
call.c | 2,100 | Call generation, AAPCS arg placement, ThumbArgMove parallel-move solver; + absorb arm-thumb-callsite.c + (10348–12120)
branch.c | 500 | jump/conditional/cbz mops, chain slots, switch tables, indirect jump + (3413–3555, 12120–12280)
intrin.c | 700 | select/block-copy mops, trap/prefetch/setjmp/longjmp, builtin_apply, + trampolines (12281–13534)
+

The cut lines are unusually clean because handlers already communicate through + MachineOperand and the shared generator state — the split is mostly moving + functions plus one gen_priv.h-style internal header + (arch/arm/gen/gen.h) for the state struct and cross-file statics.

+
+ +
+

§8 Header topology — dismantling tcc.h

+

the 2,892-line god-header already contains its own split map

+

Lines 1870–2532 of tcc.h are per-module prototype banners + (/* ---- tccpp.c ---- */ …) — each becomes that module's own header. What + remains is a small set of genuinely shared headers:

+
+ + + + + + + + + + + + + + + + + +
New header | From tcc.h | Contents
source/config.h | :26–243 | Platform shims, target-select ladder, threading
source/core_types.h | :384–914 | The shared data model: CType, CValue, + SValue, Sym, Section, TokenString — + needed by every layer
source/state.h | :915–1524 | TCCState + extended symbol attributes
frontend/tokens.h | :1636–1869 | Token constants (wraps tcctok.h)
per-module headers | :1870–2532 | pp.h, gen.h, elf.h, asm.h, + dwarf.h, yaff.h… — each owns its banner
machine/machine.h | :2576–2726 | The backend contract (§5); replaces the xxx-gen.c banner
arch/arm/defs.h | :358 include | NB_REGS, TREG_*, RC_*, + PTR_SIZE, float-ABI enums — no longer included by generic + code; generic layers read register facts from + RegAllocTarget/ArchitectureConfig at runtime
+

A transitional tcc.h that includes the new pieces keeps every TU + compiling during the split; it shrinks to a compatibility shim and is deleted at the + end. The one behavioral constant to preserve: PTR_SIZE and + LDOUBLE_SIZE are compile-time constants per target binary — they stay + macros, provided by the per-arch defs through the build system's + -DTCC_TARGET_* defines, not through a generic include of an arch + header.

+
+ +
+

§9 Target test tree — mirroring source/

+

generic = host-runnable with plain gcc · arch = needs the cross toolchain/QEMU

+ +
+tests/
+├── generic/                       # zero QEMU/newlib dependency — runs anywhere
+│   ├── frontend/                  ← tests/frontend (diagnostics · pp · types)
+│   ├── ir/                        # unit tests for ir/ passes (from tests/unit split)
+│   ├── golden_ir/                 ← tests/ir_tests/golden — pass-level IR snapshots
+│   ├── linker/                    ← tests/linker (readelf/objdump goldens)
+│   └── debug/                     ← tests/debug (DWARF/STAB goldens)
+├── arch/
+│   └── arm/
+│       ├── unit/                  # thop_*, aapcs, arm_link, backend gen_* byte-exact tests
+│       ├── asm/                   ← tests/thumb/armv8m (encode vs arm-none-eabi-gcc)
+│       ├── qemu/                  ← tests/ir_tests execution suite + mps2-an505 board + newlib
+│       ├── runtime/               ← tests/runtime/cross (aeabi, soft-fp)
+│       ├── gcc_torture/           ← ir_tests/test_gcc_torture_ir.py
+│       └── selfhost/              # compiles the compiler with armv8m-tcc
+├── fuzz/                          # differential fuzzer — stays, opt-in as today
+├── host/                          # native aeabi/soft-fp checks (test-aeabi-host)
+├── support/ut.h                   # shared unit harness + coverage scripts
+├── externals/ · benchmarks/       # opt-in corpora, unchanged
+└── legacy/                        # quarantined upstream suite (tcctest.c, abitest…)
+
+ +
    +
  • The one real complication: today's single + tests/unit/arm/armv8m build links generic IR-pass tests and ARM encoder + tests into one binary. Split it along the existing UT_MODULE_SRCS + boundary: tests/generic/ir/ links only source/ir/ (+ + operand/svalue) with stubs; tests/arch/arm/unit/ links + arch/arm/** + the backend. source_coverage_map.json + already keys every test by source path — it is the machine-readable migration + map.
  • +
  • Axis chosen deliberately: the tree mirrors source layout + (the stated goal). Note the imperfect overlap with runnability — linker/debug + goldens contain ARM ELF yet run host-side; they stay in generic/ + because they test generic drivers (obj/elf.c, obj/dwarf.c) + whose goldens are per-target files.
  • +
  • Wiring to update in lockstep (all grep-able single points): + top-level Makefile suite dirs + NEWLIB_* paths, + tests/run_tests.py, per-suite conftest.py, + gen_source_coverage.py, .gitignore, and the fuzz infra's + cached paths (.sweep_cache keys miss header moves — clear it).
  • +
  • Keep the gates: make check-pass-coverage (pass ↔ + test ledger) and the selfhost suite (sole coverage for + arch/arm/arm.c/aapcs.c) survive the move unchanged.
  • +
+
+ +
+

§10 Migration plan — seven phases, each shippable

+

every phase ends with make test green on a clean branch off mob

+
+
0 · moves
+

Pure git-mv restructure + dead-code deletion

+

Create source/; move files with names unchanged; move the six + root-level ARM files into arch/arm/; delete + arm-thumb-scratch.c; quarantine tests/legacy/. Update + Makefile path lists (CORE_FILES, armv8m_FILES, + -I paths, the LIBTCC_INC rebuild wart) and include paths — + zero code changes otherwise.

+
make test green · git blame -C intact · one commit = moves only
+
+
1 · headers
+

Split tcc.h; stop leaking arm-thumb-defs.h

+

Extract config.h, core_types.h, state.h, + per-module headers from the prototype banners (§8). Generic TUs stop including + arch/arm/defs.h; register facts flow through + RegAllocTarget/ArchitectureConfig.

+
CI grep: no '#include "arch/' outside source/arch/
+
+
2 · seam
+

Seal the contract

+

Write machine/machine.h (§5); fix leaks ②③④⑤⑥; delete the dead + vtable; wire arch/fpu/arm into arm_resolve_fpu(); de-ARM + machine/ls.c; single-source registers_for_allocator.

+
make test green · unit suites for regalloc/ls pass unmodified
+
+
3 · tccgen
+

Split tccgen.c → frontend/gen/ (10 files)

+

Create gen_priv.h; extract in dependency order + (sym → builtins → decl → types → stmt → init → ops → expr), one file + per commit, running the frontend + IR suites each step.

+
make test after every extraction · no new ST_DATA globals
+
+
4 · arm gen
+

Split arm-thumb-gen.c → arch/arm/gen/ (13 files)

+

Same discipline; the byte-exact backend unit tests + (test_gen_*) pin emitted Thumb-2 encodings across the split.

+
test-asm + backend unit suite byte-identical output
+
+
5 · tests
+

Restructure tests/ to mirror source/

+

Move suites per §9; split the unit binary generic-vs-arch; update Makefile, + run_tests.py, coverage generator; regenerate + source_coverage_map.json.

+
make test green · check-pass-coverage --strict passes · fuzz smoke (batch_sweep) clean
+
+
6 · hoist
+

Hoist generic engines out of the backend (deferred)

+

Parallel-move solver, 64-bit pair lowering, dry-run scratch protocol, peephole + cache frameworks → machine/. Do this when the second backend starts, + so real requirements drive the parameterization.

+
triggered by arch #2 — not before
+
+
+
+ sequencing constraints +

Start from a clean tree — the current branch (heapOverflowBug) carries + a large in-flight diff; land or stash it first. Never run fuzz sweeps or bisects while + the tree is mid-restructure (the sweep cache keys miss header moves, and + reducers/sweeps racing a rebuild report phantom divergences). Phases 3 and 4 are + independent and can interleave with normal bug-fix work — each extraction commit is + small and revertible.

+
+
+ +
+

§11 Adding an architecture — the checklist

+

what arch/<name>/ must provide once the restructure lands

+
    +
  1. defs.h — register names/counts, PTR_SIZE, + float-ABI constants (seen only by this backend and the build defines).
  2. +
  3. <name>.c — backend_init(): fill + ArchitectureConfig (capabilities, FP feature bits via + fpu/ tables).
  4. +
  5. regalloc.c — a RegAllocTarget: int/FP + register classes, caller/callee-saved sets, param regs, static-chain reg.
  6. +
  7. abi.c — abi_classify_argument() + + call-layout builder for the target's calling convention.
  8. +
  9. gen/ — the ~90 contract entry points (§5). Start + with the ~25 that the two-pass loop requires to emit straight-line code (mop + handlers for ALU/load/store/call/branch + frame + materialization); the rest — + peephole hooks, dry-run size estimators — have safe conservative defaults.
  10. +
  11. link.c — relocate, + code_reloc, gotplt_entry_type for the target's reloc + types.
  12. +
  13. Optional: asm.c + token table (inline assembly), + ssa_opt.c (target peephole generators registered into the SSA + pipeline), fpu/ feature tables.
  14. +
  15. Build: one Makefile stanza (<target>_FILES, + DEF-<target>, <target>_ARCH) — the + arch/Makefile dispatcher already documents this.
  16. +
  17. Tests: tests/arch/<name>/ — unit encoders + first, then an execution board under qemu/ mirroring + mps2-an505.
  18. +
+
+ +
+

§12 Risks & open decisions

+

what could bite, and the calls already made

+
+ + + + + + + + + + + + + + + + + +
Risk / decision | Position
Golden churn. IR goldens, byte-exact backend tests, and + objdump goldens are path- and layout-sensitive. | Phases 0–2 change no codegen output by construction; goldens act as the + regression oracle, never regenerate during a move phase.
Fuzz infrastructure paths. Sweep caches, triage scripts, + bisect_opt.py reference file paths. | Clear .sweep_cache after each phase; run a 500-seed + batch_sweep smoke across profiles as the phase-5 gate.
Where does tccasm.c sit? The GAS frontend is generic but + exists to feed arch mnemonic parsers. | Frontend, with the mnemonic parser behind the contract's optional asm hooks + (matches today's tccasm.c → arm-thumb-asm.c split).
YAFF reloc coupling. obj/yaff.c hardcodes + R_ARM_* enums. | Phase 2 introduces contract reloc-kind mapping; until then YAFF is de-facto + ARM-only (as today).
Dispatch mechanism. Vtable vs link-time symbols. | Link-time (§5) — one backend per binary, zero-cost, delete the dead + vtable.
Naming. Keep tcc* basenames or re-name on + move? | Phase 0 keeps basenames (pure moves); renames happen only where files split + anyway (§6–§7). Directory names carry the taxonomy.
PTR_SIZE as a macro. Generic code has 43 + #if PTR_SIZE sites. | Acceptable: it's a per-binary constant delivered by build defines. Do not + convert to runtime queries — codegen constant-folds on it.
+
+ +
+ tinycc armv8-m fork · restructure architecture · 2026-07-03 · figures & counts from + source survey of the working tree (branch heapOverflowBug) · markdown source: + restructure_architecture.md +
+
+
+ + diff --git a/docs/restructure_architecture.md b/docs/restructure_architecture.md new file mode 100644 index 00000000..31bba100 --- /dev/null +++ b/docs/restructure_architecture.md @@ -0,0 +1,456 @@ +# Restructuring the source tree for multi-architecture support + +> tinycc · armv8-m fork · architecture proposal · 2026-07-03 +> +> Styled version with full diagrams: [restructure_architecture.html](restructure_architecture.html) +> (self-contained, open in a browser). This Markdown is the diff-friendly source of truth; +> Mermaid diagrams render on GitHub and in VS Code preview. + +A `source/` root with generic compiler layers, one machine contract, and self-contained +backends under `source/arch/` — plus a test tree that mirrors it. Designed so the next +architecture is a directory, not a rewrite. + +| | | +|---|---| +| Top-level C | 91k lines | +| `ir/` | 97k lines | +| `tccgen.c` | 33,407 lines | +| `arm-thumb-gen.c` | 13,534 lines | +| `tcc.h` | 2,892 lines, included everywhere | +| Backend seam | already ~80% in place | + +## Contents + +1. [Goals & ground rules](#1-goals--ground-rules) +2. [Where the code is today](#2-where-the-code-is-today) +3. [Target source tree](#3-target-source-tree) +4. [Layered architecture](#4-layered-architecture) +5. [The backend contract](#5-the-backend-contract--machinemachineh) +6. [Splitting tccgen.c](#6-splitting-tccgenc--33407-lines--10-files) +7. [Splitting arm-thumb-gen.c](#7-splitting-arm-thumb-genc--13534-lines--13-files) +8. [Header topology](#8-header-topology--dismantling-tcch) +9. [Target test tree](#9-target-test-tree--mirroring-source) +10. [Migration plan](#10-migration-plan--seven-phases-each-shippable) +11. [Adding an architecture](#11-adding-an-architecture--the-checklist) +12. 
[Risks & decisions](#12-risks--open-decisions) + +--- + +## §1 Goals & ground rules + +- **Physical layout matches logical layers.** Everything moves under `source/`; + architecture-specific code lives only in `source/arch/<name>/`; generic code never + includes an arch header. +- **Huge files become functional blocks.** `tccgen.c` (33k) splits into ~10 files, + `arm-thumb-gen.c` (13.5k) into ~13, along the block boundaries mapped in §6–§7. +- **A second architecture drops in.** One written contract (§5) is the complete list of + what a backend implements. Register facts, ABI classification, and relocations all flow + through it. +- **Every phase keeps `make test` green.** The plan (§10) is a sequence of mechanical, + individually verifiable steps — no big-bang branch. +- **History survives.** Moves are pure `git mv` commits, separate from content edits, so + `git blame -C` stays useful. + +> **Good news first.** This is not a greenfield redesign. The amalgamation build is +> already gone (every `.c` compiles separately; only `tcc.c` includes `tcctools.c`). +> `arch/arm/` already exists with clean pieces — AAPCS classification, a `RegAllocTarget` +> descriptor, 29 Thumb-2 encoder modules — and the build system already documents how to +> add an architecture. The IR operand seam (`MachineOperand`, `machine_op_from_ir`) is +> fully target-neutral. What remains is finishing a boundary that is ~80% built. + +## §2 Where the code is today + +The load-bearing backend interface today is a flat set of **~90 `tcc_gen_machine_*` / +`tcc_machine_*` symbols** declared in `tcc.h:2576–2726` and resolved at link time. A +second, aspirational vtable (`TCCMachineInterface` in `tccmachine.h/.c`) exists but is +dead: `tcc_machine_register()` is never called. Meanwhile the largest ARM files still sit +at the repo root, outside `arch/`. + +```mermaid +flowchart TB + tcch["tcc.h — 2,892-line god-header
included by every TU"]:::bad + + subgraph root ["repo root (generic)"] + tccgen["tccgen.c — 33,407 ln
parser + sema + IR emission
5 functions > 1,000 lines"] + core["generic core
tccpp · tccelf · tccld · tccdbg
tccasm · libtcc · tccyaff · tccls …"] + vtable["tccmachine.c — DEAD vtable
never registered; real dispatch =
~90 link-time symbols"]:::dead + end + + subgraph irdir ["ir/ — 70 files, 97k ln"] + ir["core · cfg · ssa · opt passes
regalloc (RegAllocTarget ✓)
machine_op (MachineOperand ✓)"] + codegen["codegen.c — two-pass dispatch
109 direct tcc_gen_machine_* calls"] + end + + subgraph armroot ["ARM still at repo root"] + armgen["arm-thumb-gen.c — 13,534 ln
arm-thumb-asm.c · arm-link.c
callsite.c · defs.h · thumb-tok.h"] + scratch["arm-thumb-scratch.c
ORPHANED — delete"]:::dead + end + + archarm["arch/arm/ — already exists, 7.6k ln
arm.c · aapcs.c · regalloc.c
ssa_opt_arm.c · thumb/thop_* ×29
(arch/fpu/arm present but unbuilt)"]:::arch + + tcch -. "① pulls arm-thumb-defs.h
into every TU" .-> armgen + tccgen -. "② arm_init() +
arch/arm/arm_regalloc.h" .-> archarm + codegen -. "③ thumb_build_call_layout_from_ir()" .-> armgen + core -. "④⑤⑥ hardcoded regs (tccls.c),
registers_for_allocator=12,
arm_fpu_supports_double in tccir.h" .-> armgen + + classDef bad stroke:#B3402E,stroke-width:2px + classDef dead stroke-dasharray:6 4,color:#8A948C + classDef arch stroke:#0E7B5B,stroke-width:2px +``` + +*Fig. 1 — Today's top level. Dashed arrows ①–⑥ are the hard couplings that break the +generic/arch boundary; dead boxes are code to delete.* + +### The six hard leaks (generic → ARM) + +| # | Where | Leak | Fix | +|---|-------|------|-----| +| ① | `tcc.h:358` | Unconditionally includes `arm-thumb-defs.h` — every generic TU compiles against `NB_REGS`, `TREG_*`, `RC_*`, ARM reloc aliases | Backend defs come in via the machine contract header only | +| ② | `tccgen.c:38, 1028, 30983` | Includes `arch/arm/arm_regalloc.h`; calls `arm_init()` and `arm_get_regalloc_target()` directly | Generic `tcc_backend_init()` + `tcc_backend_regalloc_target()` hooks | +| ③ | `ir/codegen.c:1915` | Generic dispatcher calls `thumb_build_call_layout_from_ir()` by name | Add call-layout entry point to the contract | +| ④ | `tccls.c:125–320` | Hardcodes SP=R13 mask, R12 special case, "scratch from R0–R3", 16-register bounds in nominally generic linear-scan code | Read all register facts from `RegAllocTarget` | +| ⑤ | `tccgen.c:30891` | `registers_for_allocator = 12` hardcoded (backend sets 13 elsewhere — duplicated magic) | Single source of truth in `RegAllocTarget` | +| ⑥ | `tccir.h:718` | Generic IR header declares `arm_fpu_supports_double()` | Replace with `tcc_target_has()` capability query (already exists in `tcc_target.h`) | + +Beyond these, `#ifdef TCC_TARGET_ARM_THUMB` appears at only ~17 sites in generic code — +mostly benign option-parsing and section-name islands in `libtcc.c`, `tccelf.c`, +`tccdbg.c` that can migrate to contract hooks gradually. The relocation engine is already +split correctly: `tccelf.c` drives, `arm-link.c` implements +`relocate`/`code_reloc`/`gotplt_entry_type`. 
+ +## §3 Target source tree + +File basenames keep their identity where the file moves unchanged (`←` annotations show +origin); new names appear only where a file is split. Repo root keeps `include/` (headers +shipped to compiled programs), `lib/` (runtime library), `tests/`, `scripts/`, `docs/`. + +```text +source/ +├── driver/ # entry points & public API +│ ├── tcc.c # CLI main + tool dispatch +│ ├── tcctools.c # ar / cross-prefix tools +│ └── libtcc.c # TCCState lifecycle, options, compile/link driver +├── frontend/ # C language → IR +│ ├── tccpp.c # preprocessor + tokenizer +│ ├── tccasm.c # GAS-style asm frontend (arch-neutral core) +│ ├── tcctok.h · tccdefs.h +│ └── gen/ # tccgen.c split — see §6 +│ ├── gen_priv.h # shared vstack/scope/switch state (the linchpin) +│ ├── gen_core.c gen_sym.c gen_vstack.c gen_ops.c gen_types.c +│ ├── gen_expr.c gen_builtins.c gen_stmt.c gen_init.c +│ └── gen_decl.c # decl, nested fns, gen_function IR-pipeline driver +├── ir/ # target-independent IR — moves largely as-is +│ ├── core.c cfg.c ssa.c dump.c vreg.c stack.c live.c licm.c +│ ├── operand.c # ← tccir_operand.c (SValue ↔ IROperand) +│ ├── passes.c # ← tccopt.c (pass registry) +│ ├── opt/ # all opt_*.c + ssa_opt_*.c consolidated +│ ├── regalloc.c # SSA regalloc — parameterized by RegAllocTarget ✓ +│ └── codegen.c # two-pass dry-run/real-run dispatch loop +├── machine/ # THE seam — generic side of the backend boundary +│ ├── machine.h # the written contract: every symbol a backend implements (§5) +│ ├── machine_op.c/.h # ← ir/machine_op.* (MachineOperand — already clean) +│ ├── target.h # ← tcc_target.h (ArchitectureConfig, capabilities) +│ ├── abi.h # ← tccabi.h (TCCAbiArgDesc / CallLayout) +│ └── ls.c # ← tccls.c, de-ARM'd (leak ④) +├── obj/ # object containers, linking, debug info +│ ├── elf.c # ← tccelf.c (ARM islands → reloc hooks) +│ ├── ld.c # ← tccld.c (linker scripts — already 100% generic) +│ ├── yaff.c # ← tccyaff.c (R_ARM_* enums → contract reloc kinds) +│ ├── 
dwarf.c # ← tccdbg.c (DWARF/stabs) +│ └── elf.h · dwarf.h · stab.h +├── support/ +│ ├── log.h tcc-chained-hash.h +│ └── tccdebug.c # SValue/Sym pretty-printers +└── arch/ + ├── arm/ + │ ├── arm.c arm.h # target init: ArchitectureConfig, capabilities + │ ├── defs.h # ← arm-thumb-defs.h (NB_REGS, TREG_*, RC_*) + │ ├── aapcs.c # ← arm_aapcs.c (ABI classification) + │ ├── regalloc.c # ← arm_regalloc.c (RegAllocTarget tables) + │ ├── ssa_opt.c # ← ssa_opt_arm.c (MLA fusion, shl+add→indexed, …) + │ ├── gen/ # arm-thumb-gen.c split — see §7 + │ ├── thumb/ # thop_*.c encoders ×29 — unchanged + │ ├── asm.c # ← arm-thumb-asm.c (mnemonic parser → thop_*) + │ ├── tok.h # ← thumb-tok.h + │ ├── link.c # ← arm-link.c (R_ARM_* relocations) + │ └── fpu/ # ← arch/fpu/arm — finally wired into arm_resolve_fpu() + └── riscv/ # future — implements machine/machine.h, nothing else +``` + +> **Deletions, not moves.** `arm-thumb-scratch.c` is orphaned (built by nothing, included +> by nothing — the live scratch logic is inside `arm-thumb-gen.c`): delete it. The dead +> `TCCMachineInterface` vtable in `tccmachine.h/.c` is superseded by the contract header +> (§5 decision): delete it too. Legacy upstream test drivers (`tcctest.c`, `abitest.c`, +> `tests/Makefile` suite) quarantine under `tests/legacy/`. + +## §4 Layered architecture + +```mermaid +flowchart TB + driver["driver/
tcc.c (CLI) · libtcc.c (API, options, orchestration)"]:::drv + frontend["frontend/
tccpp.c → gen/* (parse · types · sema · vstack → IR)
tccasm.c (inline-asm frontend)"]:::fe + ir["ir/
core · cfg · ssa · opt pipeline · licm — target-independent
regalloc.c ⟵ RegAllocTarget · codegen.c: two-pass dispatch
machine_op_from_ir(): IROperand → MachineOperand"]:::ir + machine["machine/ — THE CONTRACT
machine.h (~90 entry points, §5) · machine_op · target.h · abi.h · ls.c
the only headers arch code and generic code share"]:::seam + obj["obj/
elf.c — sections, symbols, GOT/PLT, output
ld.c — linker scripts · yaff.c · dwarf.c"]:::obj + arm["arch/arm/
gen/ (mop handlers) · thumb/ (encoders)
link.c · asm.c · aapcs.c · regalloc.c · ssa_opt.c · fpu/ · defs.h"]:::arch + next["arch/riscv/ — future backend:
implement machine.h, add Makefile stanza,
nothing else changes"]:::ghost + + driver --> frontend + frontend --> ir + ir -->|"mop dispatch (per IR op)"| machine + driver --> obj + frontend -->|"syms/relocs"| obj + obj -->|"reloc contract: relocate() ·
code_reloc() · gotplt_entry_type()"| machine + arm -.implements.-> machine + next -.implements.-> machine + + classDef drv stroke:#4A5568,stroke-width:2px + classDef fe stroke:#2C5E8F,stroke-width:2px + classDef ir stroke:#6B4E9E,stroke-width:2px + classDef seam stroke:#A8672A,stroke-width:3px + classDef obj stroke:#A34D5E,stroke-width:2px + classDef arch stroke:#0E7B5B,stroke-width:2px + classDef ghost stroke-dasharray:7 5,color:#8A948C +``` + +*Fig. 2 — Target layers. Solid arrows are calls; dashed arrows are backends implementing +the machine contract. `machine/` is the only crossing point between generic and +architecture-specific code.* + +### Dependency rules (CI-enforceable) + +- `driver/ → frontend/, ir/, obj/, machine/` — orchestration only. +- `frontend/ → ir/, machine/` (emission primitives, ABI queries); `frontend/ → obj/` only + for symbol/section glue (`put_extern_sym`, relocations on initializers). +- `ir/ → machine/` — the dispatch loop and regalloc consume only contract types. +- `obj/ → machine/` — reloc/attribute hooks; never `R_ARM_*` by name. +- `arch/<name>/ → machine/, ir/ headers, support/` — a backend may see generic types, never + frontend internals. +- **Nothing outside `arch/` includes anything inside `arch/`.** One grep in CI keeps this + true forever: + + ```sh + grep -rn '#include "arch/' source/ --exclude-dir=arch # must return empty + ``` + +## §5 The backend contract — `machine/machine.h` + +The interface already exists in practice — it is just scattered and unwritten. The +contract header collects the ~90 entry points, grouped and documented, so "port tinycc" +becomes "implement this file". Group sizes below are from the live `tcc.h:2576–2726` +surface plus the reloc backend.
+ +| Group | ≈ | Entry points (representative) | Notes | +|-------|---|-------------------------------|-------| +| materialization | 10 | `acquire/release_scratch` · `load/store_spill_slot` · `load_constant` · `addr_of_stack_slot` · `can_encode_stack_offset` | Integer-arg primitives used by ir/mat + regalloc | +| mop handlers | ~45 | `data_processing_mop` · `load/store[_indexed\|_postinc]_mop` · `muldiv_mop` · `mla/umull/smull_mop` · `fp_mop` · `func_call_mop` · `select_mop` · `block_copy_mop` · `vla_mop` · `setjmp/trap/prefetch`… | One per IR op family; all take `MachineOperand` | +| frame | 8 | `prolog` · `epilog` · `finish_noreturn` · `store_to_stack/sp` · `number_of_registers` · `gfunc_sret` · nested-fn trampoline | Frame layout + return-value classification | +| branches | 8 | `jump_mop` · `conditional_jump_mop` · `cbz_jump_mop` · `backpatch_jump` · `switch_table/load_mop` · `*_dry_run_size` | Relaxation policy stays in the backend | +| two-pass hooks | ~15 | `dry_run_init/start/end` · `insn_scratch_reset/count/saves_mask` · `branch_opt_init/analyze` · `*_cache_reset` · `pending_pool_size` · `reserve_pool_bytes` · `end_instruction` | Driven by ir/codegen.c's dry-run/real-run loop | +| ABI | 5 | `abi_classify_argument` · `abi_assign_call_args` · `build_call_layout` (fixes leak ③) · `abi_softcall_name` | Types in `machine/abi.h` | +| target | 5 | `backend_init/deinit` (fixes leak ②) · `regalloc_target()` · `target_has(cap)` · `resolve_fpu()` | Fills `ArchitectureConfig` + `RegAllocTarget` | +| relocations | 4 | `relocate` · `relocate_plt` · `code_reloc` · `gotplt_entry_type` | Already the upstream xxx-link.c shape; `obj/elf.c` is the driver | +| asm (optional) | 2 | asm opcode parser hook · asm token table | Backends without inline-asm support stub these | + +> **Decision: link-time binding, not a vtable** *(recommended — matches how the fork +> already works).* Each target is a separate cross-compiler binary (`armv8m-tcc`), exactly +> one backend linked in — so 
link-time symbol resolution is a zero-cost dispatch that the +> hot two-pass codegen loop already relies on (109 call sites in `ir/codegen.c`). +> Formalize the symbol set in `machine/machine.h` and **delete the dead +> `TCCMachineInterface` vtable** rather than reviving it. If a multi-target single binary +> is ever wanted, the contract header is precisely the struct definition a vtable would +> need — nothing is lost by waiting. + +> **Generic logic to hoist out of the backend — later, §10 phase 6.** Five pieces of +> genuinely generic machinery currently live inside `arm-thumb-gen.c` and would otherwise +> be rewritten by every new backend: the dry-run scratch-discovery state machine, +> 64-bit-as-register-pair lowering, the parallel-move solver for call arguments, +> mul-by-constant strength reduction, and the peephole cache frameworks (MOV-equivalence, +> immediate reuse, STR→LDR forwarding). Hoist them into `machine/` as shared engines +> parameterized by backend callbacks — but only when the second architecture arrives and +> proves the parameterization, not speculatively. + +## §6 Splitting tccgen.c — 33,407 lines → 10 files + +Location: `frontend/gen/` · shared state declared once in `gen_priv.h`. + +The file's ~35 internal blocks condense into ten modules. The entangling state is +well-understood: `vtop`/`_vstack` (2,160 refs), `tok` (322), `loc`, `nocode_wanted`, +scope/switch stacks. `gen_priv.h` declares all of it (definitions live in `gen_core.c`), +plus the current forward-decl list — the file's own lines 746–789 are the seed set. 
+ +| File | ≈ lines | Contents (today's blocks) | Coupling | +|------|--------:|---------------------------|----------| +| `gen_core.c` | 1,200 | Global state definitions, lifecycle (`tccgen_init/compile/finish`), type predicates, code-suppression (`nocode_wanted`) | owns state | +| `gen_sym.c` | 1,100 | Symbol table, labels, ELF symbol glue (`put_extern_sym`, `greloc`), attribute merge/patch, aliases | clean — move first | +| `gen_vstack.c` | 1,700 | Value-stack ops, `gv/gv2`, long-long expand, addressing, bitfields, bounds | vstack core | +| `gen_ops.c` | 2,300 | `gen_opl/opic/opif`, `gen_op`, complex arithmetic, vector extensions | heavy vtop | +| `gen_types.c` | 2,900 | Type compare/compat, casts, `type_size`, struct layout + declaration, `parse_btype`, declarators | AAPCS alignment here | +| `gen_expr.c` | 6,500 | Unary/primary/postfix, `unary_funcall`, binary precedence, ternary, `gexpr`, const-expr | vtop + tok | +| `gen_builtins.c` | 3,800 | `unary_builtin_*`, string-builtin folding, auto-inline heuristics, `try_inline_const_eval` | clean — move early | +| `gen_stmt.c` | 1,900 | `block/block_1`, switch codegen, return, scopes/cleanups, VLA scope handling | tok + vtop | +| `gen_init.c` | 3,200 | `vstore`, `inc`, initializers, designators, `decl_initializer_alloc` | vtop | +| `gen_decl.c` | 5,300 | `decl`, nested functions/trampolines, `gen_function` (the IR-pipeline driver: SSA, opt passes, regalloc, codegen), inline stash, late reopt | IR-facing — owns ir/ includes | + +- **Extraction order:** `gen_sym` → `gen_builtins` → `gen_decl` (only 7 vtop refs in the + `gen_function` region) → `gen_types` → `gen_stmt` → `gen_init` → `gen_ops` → + `gen_expr` → what remains is `gen_core` + `gen_vstack`. +- **Watch one handshake:** the AAPCS invisible-copy state (`aapcs_last_const_init`, set + during parameter typing, consumed in `unary_funcall`) crosses the types/expr boundary — + promote it to an explicit field in `gen_priv.h`, not a bare static. 
+- After the split, `gen_decl.c` is the only frontend file that sees the IR optimization + pipeline; the rest use emission primitives from `tccir.h` only. + +## §7 Splitting arm-thumb-gen.c — 13,534 lines → 13 files + +Location: `arch/arm/gen/` · split by mop family, matching the contract groups. + +| File | ≈ lines | Contents (today's line ranges) | +|------|--------:|-------------------------------| +| `state.c` | 600 | ThumbGeneratorState, reg classes, frame-offset helpers, MachineCodegenContext allocators *(54–740)* | +| `scratch.c` | 900 | Scratch acquire/release/spill, push-window bias, dry-run scratch discovery hooks *(1419–1794 + snapshot state)* | +| `caches.c` | 700 | mov_equiv, imm_cache, strldr_cache, spill cache — peephole trackers *(1812–2287)* | +| `litpool.c` | 700 | Literal pools: init/reserve/flush/find, IT-window guard *(2345–2782, 4112–4200)* | +| `emit.c` | 900 | `o()/ot()`, opcode validation, branch patching/relaxation (`decbranch`, `th_patch_call`), dry-run/branch-opt state machine *(853–1272, 2509–3402)* | +| `alu.c` | 1,000 | Data-processing mops 32/64-bit, shifts, ubfx/bfi *(4993–5950)* | +| `muldiv.c` | 1,050 | mul/div/MLA/UMULL/SMULL/MLAL, mul-by-const strength reduction, pack64, cmp_eq64 *(5952–6921)* | +| `mem.c` | 1,500 | load/store/indexed/postinc mops, spill slots, strd/ldrd pairing, assign, setif/bool *(3647–4530, 6987–8463)* | +| `fp.c` | 1,000 | VFP + soft-float dispatch (`get_softfp_func_name`), complex lowering *(8463–9476)* | +| `frame.c` | 900 | prolog/epilog, noreturn finish, VLA alloc, `gfunc_sret`, LEA, stack stores *(3564–3646, 9476–10347)* | +| `call.c` | 2,100 | Call generation, AAPCS arg placement, ThumbArgMove parallel-move solver; absorb `arm-thumb-callsite.c` *(10348–12120)* | +| `branch.c` | 500 | jump/conditional/cbz mops, chain slots, switch tables, indirect jump *(3413–3555, 12120–12280)* | +| `intrin.c` | 700 | select/block-copy mops, trap/prefetch/setjmp/longjmp, builtin_apply, trampolines *(12281–13534)* | + 
+The cut lines are unusually clean because handlers already communicate through +`MachineOperand` and the shared generator state — the split is mostly moving functions +plus one `gen_priv.h`-style internal header (`arch/arm/gen/gen.h`) for the state struct +and cross-file statics. + +## §8 Header topology — dismantling tcc.h + +The 2,892-line god-header already contains its own split map: lines 1870–2532 are +per-module prototype banners (`/* ---- tccpp.c ---- */` …) — each becomes that module's +own header. What remains is a small set of genuinely shared headers: + +| New header | From tcc.h | Contents | +|------------|-----------:|----------| +| `source/config.h` | :26–243 | Platform shims, target-select ladder, threading | +| `source/core_types.h` | :384–914 | The shared data model: `CType`, `CValue`, `SValue`, `Sym`, `Section`, TokenString — needed by every layer | +| `source/state.h` | :915–1524 | `TCCState` + extended symbol attributes | +| `frontend/tokens.h` | :1636–1869 | Token constants (wraps `tcctok.h`) | +| per-module headers | :1870–2532 | `pp.h`, `gen.h`, `elf.h`, `asm.h`, `dwarf.h`, `yaff.h`… — each owns its banner | +| `machine/machine.h` | :2576–2726 | The backend contract (§5); replaces the xxx-gen.c banner | +| `arch/arm/defs.h` | :358 include | `NB_REGS`, `TREG_*`, `RC_*`, `PTR_SIZE`, float-ABI enums — **no longer included by generic code**; generic layers read register facts from `RegAllocTarget`/`ArchitectureConfig` at runtime | + +A transitional `tcc.h` that includes the new pieces keeps every TU compiling during the +split; it shrinks to a compatibility shim and is deleted at the end. The one behavioral +constant to preserve: `PTR_SIZE` and `LDOUBLE_SIZE` are compile-time constants per target +binary — they stay macros, provided by the per-arch defs through the build system's +`-DTCC_TARGET_*` defines, not through a generic include of an arch header. 
+ +## §9 Target test tree — mirroring source/ + +generic = host-runnable with plain gcc · arch = needs the cross toolchain/QEMU. + +```text +tests/ +├── generic/ # zero QEMU/newlib dependency — runs anywhere +│ ├── frontend/ # ← tests/frontend (diagnostics · pp · types) +│ ├── ir/ # unit tests for ir/ passes (from tests/unit split) +│ ├── golden_ir/ # ← tests/ir_tests/golden — pass-level IR snapshots +│ ├── linker/ # ← tests/linker (readelf/objdump goldens) +│ └── debug/ # ← tests/debug (DWARF/STAB goldens) +├── arch/ +│ └── arm/ +│ ├── unit/ # thop_*, aapcs, arm_link, backend gen_* byte-exact tests +│ ├── asm/ # ← tests/thumb/armv8m (encode vs arm-none-eabi-gcc) +│ ├── qemu/ # ← tests/ir_tests execution suite + mps2-an505 board + newlib +│ ├── runtime/ # ← tests/runtime/cross (aeabi, soft-fp) +│ ├── gcc_torture/ # ← ir_tests/test_gcc_torture_ir.py +│ └── selfhost/ # compiles the compiler with armv8m-tcc +├── fuzz/ # differential fuzzer — stays, opt-in as today +├── host/ # native aeabi/soft-fp checks (test-aeabi-host) +├── support/ut.h # shared unit harness + coverage scripts +├── externals/ · benchmarks/ # opt-in corpora, unchanged +└── legacy/ # quarantined upstream suite (tcctest.c, abitest…) +``` + +- **The one real complication:** today's single `tests/unit/arm/armv8m` build links + generic IR-pass tests and ARM encoder tests into one binary. Split it along the + existing `UT_MODULE_SRCS` boundary: `tests/generic/ir/` links only `source/ir/` (+ + operand/svalue) with stubs; `tests/arch/arm/unit/` links `arch/arm/**` + the backend. + `source_coverage_map.json` already keys every test by source path — it is the + machine-readable migration map. +- **Axis chosen deliberately:** the tree mirrors *source layout* (the stated goal). Note + the imperfect overlap with runnability — linker/debug goldens contain ARM ELF yet run + host-side; they stay in `generic/` because they test generic drivers (`obj/elf.c`, + `obj/dwarf.c`) whose goldens are per-target files. 
+- **Wiring to update in lockstep** (all grep-able single points): top-level Makefile + suite dirs + `NEWLIB_*` paths, `tests/run_tests.py`, per-suite `conftest.py`, + `gen_source_coverage.py`, `.gitignore`, and the fuzz infra's cached paths + (`.sweep_cache` keys miss header moves — clear it). +- **Keep the gates:** `make check-pass-coverage` (pass ↔ test ledger) and the selfhost + suite (sole coverage for `arch/arm/arm.c`/`aapcs.c`) survive the move unchanged. + +## §10 Migration plan — seven phases, each shippable + +Every phase ends with `make test` green on a clean branch off `mob`. + +| Phase | Title | Work | Gate | +|-------|-------|------|------| +| **0** moves | Pure git-mv restructure + dead-code deletion | Create `source/`; move files with names unchanged; move the six root-level ARM files into `arch/arm/`; delete `arm-thumb-scratch.c`; quarantine `tests/legacy/`. Update Makefile path lists (`CORE_FILES`, `armv8m_FILES`, `-I` paths, the `LIBTCC_INC` rebuild wart) and include paths — zero code changes otherwise | `make test` green · `git blame -C` intact · one commit = moves only | +| **1** headers | Split tcc.h; stop leaking arm-thumb-defs.h | Extract `config.h`, `core_types.h`, `state.h`, per-module headers from the prototype banners (§8). 
Generic TUs stop including `arch/arm/defs.h`; register facts flow through `RegAllocTarget`/`ArchitectureConfig` | CI grep: no `#include "arch/` outside `source/arch/` | +| **2** seam | Seal the contract | Write `machine/machine.h` (§5); fix leaks ②③④⑤⑥; delete the dead vtable; wire `arch/fpu/arm` into `arm_resolve_fpu()`; de-ARM `machine/ls.c`; single-source `registers_for_allocator` | `make test` green · unit suites for regalloc/ls pass unmodified | +| **3** tccgen | Split tccgen.c → frontend/gen/ (10 files) | Create `gen_priv.h`; extract in dependency order (`sym → builtins → decl → types → stmt → init → ops → expr`), one file per commit, running the frontend + IR suites each step | `make test` after every extraction · no new ST_DATA globals | +| **4** arm gen | Split arm-thumb-gen.c → arch/arm/gen/ (13 files) | Same discipline; the byte-exact backend unit tests (`test_gen_*`) pin emitted Thumb-2 encodings across the split | test-asm + backend unit suite byte-identical output | +| **5** tests | Restructure tests/ to mirror source/ | Move suites per §9; split the unit binary generic-vs-arch; update Makefile, run_tests.py, coverage generator; regenerate `source_coverage_map.json` | `make test` green · `check-pass-coverage --strict` passes · fuzz smoke (batch_sweep) clean | +| **6** hoist | Hoist generic engines out of the backend (deferred) | Parallel-move solver, 64-bit pair lowering, dry-run scratch protocol, peephole cache frameworks → `machine/`. Do this when the second backend starts, so real requirements drive the parameterization | triggered by arch #2 — not before | + +> **Sequencing constraints.** Start from a clean tree — the current branch +> (`heapOverflowBug`) carries a large in-flight diff; land or stash it first. Never run +> fuzz sweeps or bisects while the tree is mid-restructure (the sweep cache keys miss +> header moves, and reducers/sweeps racing a rebuild report phantom divergences). 
Phases +> 3 and 4 are independent and can interleave with normal bug-fix work — each extraction +> commit is small and revertible. + +## §11 Adding an architecture — the checklist + +What `arch/<arch>/` must provide once the restructure lands: + +1. **`defs.h`** — register names/counts, `PTR_SIZE`, float-ABI constants (seen only by + this backend and the build defines). +2. **`<arch>.c`** — `backend_init()`: fill `ArchitectureConfig` (capabilities, FP feature + bits via `fpu/` tables). +3. **`regalloc.c`** — a `RegAllocTarget`: int/FP register classes, caller/callee-saved + sets, param regs, static-chain reg. +4. **`abi.c`** — `abi_classify_argument()` + call-layout builder for the target's calling + convention. +5. **`gen/`** — the ~90 contract entry points (§5). Start with the ~25 that the two-pass + loop requires to emit straight-line code (mop handlers for ALU/load/store/call/branch + + frame + materialization); the rest — peephole hooks, dry-run size estimators — have + safe conservative defaults. +6. **`link.c`** — `relocate`, `code_reloc`, `gotplt_entry_type` for the target's reloc + types. +7. **Optional:** `asm.c` + token table (inline assembly), `ssa_opt.c` (target peephole + generators registered into the SSA pipeline), `fpu/` feature tables. +8. **Build:** one Makefile stanza (`<arch>_FILES`, `DEF-<arch>`, `<arch>_ARCH`) — + the `arch/Makefile` dispatcher already documents this. +9. **Tests:** `tests/arch/<arch>/` — unit encoders first, then an execution board under + `qemu/` mirroring `mps2-an505`. 
+ +## §12 Risks & open decisions + +| Risk / decision | Position | +|-----------------|----------| +| **Golden churn.** IR goldens, byte-exact backend tests, and objdump goldens are path- and layout-sensitive | Phases 0–2 change no codegen output by construction; goldens act as the regression oracle, never regenerate during a move phase | +| **Fuzz infrastructure paths.** Sweep caches, triage scripts, bisect_opt.py reference file paths | Clear `.sweep_cache` after each phase; run a 500-seed batch_sweep smoke across profiles as the phase-5 gate | +| **Where does tccasm.c sit?** The GAS frontend is generic but exists to feed arch mnemonic parsers | Frontend, with the mnemonic parser behind the contract's optional asm hooks (matches today's tccasm.c → arm-thumb-asm.c split) | +| **YAFF reloc coupling.** `obj/yaff.c` hardcodes `R_ARM_*` enums | Phase 2 introduces contract reloc-kind mapping; until then YAFF is de-facto ARM-only (as today) | +| **Dispatch mechanism.** Vtable vs link-time symbols | Link-time (§5) — one backend per binary, zero-cost, delete the dead vtable | +| **Naming.** Keep `tcc*` basenames or re-name on move? | Phase 0 keeps basenames (pure moves); renames happen only where files split anyway (§6–§7). Directory names carry the taxonomy | +| **PTR_SIZE as a macro.** Generic code has 43 `#if PTR_SIZE` sites | Acceptable: it's a per-binary constant delivered by build defines. Do not convert to runtime queries — codegen constant-folds on it | + +--- + +*Figures & counts from a source survey of the working tree (branch `heapOverflowBug`), +2026-07-03. 
Styled HTML version: [restructure_architecture.html](restructure_architecture.html).* diff --git a/docs/selfhost_miscompile_debugging.md b/docs/selfhost_miscompile_debugging.md deleted file mode 100644 index a1a6b2dc..00000000 --- a/docs/selfhost_miscompile_debugging.md +++ /dev/null @@ -1,270 +0,0 @@ -# Debugging self-host miscompiles (armv8m-tcc) - -A **self-host miscompile** is when the **cross** compiler (`bin/armv8m-tcc`, an x86 -binary built by gcc that *emits* ARM Thumb-2) compiles tinycc's own source into a -**native** compiler (the ARM `armv8m-tcc` that runs on the device) whose machine -code is subtly wrong. The source is correct — the same tinycc logic compiles a -test correctly when run as the cross, but wrong when run as the self-hosted -native binary. Symptom: a test program built **on the device** misbehaves -(infinite loop, wrong output, HardFault) even though the host cross builds it -fine. - -Most remaining `tests2` failures are this class. This guide is the repeatable -workflow to nail them. Worked example throughout: `09_do_while` (do-while loop -ran forever — fixed in `ir/regalloc.c ra_resolve_phis`). - ---- - -## 0. The mental model (read this first) - -``` -gcc ──compiles──> bin/armv8m-tcc (CROSS: x86 host binary, emits ARM) - │ - │ compiles tinycc's own *.c ← a bug HERE is the culprit - ▼ - native armv8m-tcc (rootfs/usr/bin/tcc: ARM, runs on device) - │ - │ compiles tests2/NN.c - ▼ - /tmp/NN (device binary that misbehaves) -``` - -Two independent facts pin it as a self-host bug: -1. **Host cross compiles the test correctly** — so the test source and tinycc - *logic* are fine. -2. **Device (native) compiles it wrong** — so the native binary's code for some - tinycc function `F` is wrong, i.e. **the cross miscompiled `F`**. - -There are two fix strategies (both valid, §6): -- **(A) Source workaround** in the tinycc function `F`: rewrite `F` so the cross - happens to compile it correctly. Fast, local, low-risk. (What `09` used.) 
-- **(B) Fix the cross codegen bug** itself: find the wrong ARM the cross emits and - fix the cross's optimizer/backend. Harder, but fixes *every* test that trips the - same bug at once. Prefer this when the same bug class recurs. - ---- - -## 1. Fast device round-trips: the FAT drive (use this, not RAM-scan) - -The slow/flaky way (`scripts/qemu_capture_yaff.py`) scans guest RAM for binaries. -The fast way is the host-readable FAT drive mounted at **`/mnt`** on the QEMU -guest — drop sources in, pull device-compiled binaries out, **no kernel rebuild**. -See [memory: yasos-qemu-fatdisk-host-drive] for the full design. One-liner: - -```bash -.qemu_smoke_venv/bin/python3 scripts/qemu_fatdisk_run.py \ - --put libs/tinycc/tests/tests2/09_do_while.c:IN.C \ - --cmd 'tcc -x c /mnt/IN.C -o /mnt/OUT; echo CC=$?; /mnt/OUT; echo RC=$?' \ - --get OUT:.cache/09_dev.elf \ - --backing .cache/bk.bin --img .cache/fd.img --boot-wait 7 --timeout 14 -``` - -- `--put HOST:FATNAME` puts a file on the drive; `--get FATNAME:HOST` pulls one out. -- `--cmd` runs on the guest shell; stdout/stderr stream live to the log (a runaway - guest is bounded by `--timeout`, not infinite). -- **8.3 UPPERCASE names only** (FatFs `FF_USE_LFN=0`): a source lands as `IN.C`; - tcc rejects `.C` → **always pass `tcc -x c`**. -- **Don't `ls /mnt`** — a kernel FatFs readdir bug panics ("invalid enum value"). - Compiling (open/read/write) is fine. -- It needs the QEMU kernel built with the `/mnt` drive support (already in tree: - `hal/.../ramflash.zig`, `linker_script.ld` fatdisk window, `main.zig` mount). - -Carve + disassemble the captured YAFF binary (`main` is after the crt0 stub — -look for `push {r4,...}` / `movs r4,#1`): - -```bash -python3 - <<'PY' -import struct; d=open('.cache/09_dev.elf','rb').read() -cl=struct.unpack_from('` -macro bug) and often won't even compile. Don't trust `-O0`-native as a bisector. - ---- - -## 3. Localize the miscompiled tinycc function - -This is the heart of the work. 
Narrow from "the test is wrong" to "tinycc -function `F`, this exact computation". - -### 3a. Narrow the *language feature* (cheap, FAT-drive) -Build one test program exercising several constructs and see which misbehaves. -`09` narrowed to **do-while only** (a `for`+`while`+`do-while` program: `for`/`while` -exited, `do-while` ran forever) → the bug is on the do-while codegen path. - -### 3b. See the IR and which *pass* transforms it (host, instant) -Build a **debug cross** (dumps IR; no device needed). Clean stale objects first — -a prior native build leaves ARM `.o`s that break the x86 cross link -("file in wrong format"): - -```bash -cd libs/tinycc -rm -rf armv8m-arch armv8m-ir armv8m-*.o *.o arm-eabi-*.o -SR=$PWD/../../rootfs -./configure --extra-cflags="-DTCC_DEBUG=1 -DCONFIG_TCC_DEBUG=1 -g -O1 -DTARGETOS_YasOS=1 -DCONFIG_TCC_BCHECK=0" \ - --enable-cross --config-asm=yes --config-pie=yes --config-pic=yes --debug --enable-O1 \ - --prefix=$PWD --sysroot=$SR --sysincludepaths="{B}/include:$SR/usr/include" \ - --crtprefix="$SR/usr/lib" --libpaths="$SR/usr/lib:$SR/lib" -make armv8m-tcc -j8 -./armv8m-tcc -dump-ir -c tests/tests2/09_do_while.c -o /tmp/x.o # 3 checkpoints -./armv8m-tcc -dump-ir-passes=all -c tests/tests2/09_do_while.c -o /tmp/x.o # after every pass -``` - -Diff the IR across passes to find the one that produces the wrong shape. For `09` -the inverted exit branch only appears in the **"AFTER OPTIMIZATIONS"** dump using -`R`-registers → it's introduced during **register allocation** (after the last -`-dump-ir-passes` checkpoint), specifically the phi-copy insertion in -`ra_resolve_phis`. (NB this debug cross is correct — it shows the *intended* IR, -e.g. exit target = 18. The device computes a different value; the gap localizes it.) - -### 3c. 
Get the *device's* actual values (one native rebuild) -When the IR transform is the suspect, add a one-off `fprintf(stderr, ...)` to the -relevant pass dumping the indices/targets it computes, rebuild the native tcc, -and run on the device via the FAT drive. For `09`, instrumenting -`tcc_ir_codegen_backpatch_jumps` printed `target_ir=15` (should be 18) for the -exit JUMPIF — proving the **target index in the IR was already wrong**, not the -address encoding. Remove the instrumentation afterwards. - -Rebuild native + kernel (the device tcc lives in the incbin'd romfs): -```bash -rm -f libs/tinycc/.yasos-build/native-stage1.stamp libs/tinycc/.yasos-build/native-stage2.stamp -./build_rootfs.sh -o rootfs.img # cross unchanged → only native rebuilds (~3-5 min) -rm -rf .zig-cache && zig build -Doptimize=ReleaseSafe # re-embed romfs (~1 min) -``` -(If you changed a file compiled into the *cross* too, also `rm .yasos-build/cross.stamp` -and the whole thing rebuilds, ~8-10 min.) - ---- - -## 4. Spot the cross's miscompile (disassembly) - -Once you know function `F` (e.g. `ra_resolve_phis` in `ir/regalloc.c`), look at the -ARM the **cross** emits for it. The cross compiles each tinycc TU; reproduce that -exact compile and disassemble `F`: - -```bash -cd libs/tinycc -# flags taken from the native build log line "armv8m-tcc -o armv8m-... -c ir/regalloc.c ..." -./bin/armv8m-tcc -o /tmp/F.o -c ir/regalloc.c \ - -DCONFIG_TCC_CROSSPREFIX='"armv8m-"' -I. 
-I./ir -I./ir/opt -DTCC_DEBUG=0 -g -O1 \ - -DTCC_ARM_VFP -DTCC_ARM_EABI=1 -DCONFIG_TCC_BCHECK=0 -DTCC_ARM_HARDFLOAT \ - -DTCC_TARGET_ARM_ARCHV8M -DTARGETOS_YasOS=1 -DTCC_TARGET_ARM_THUMB -DTCC_TARGET_ARM \ - -DTCC_IS_NATIVE -I$PWD/../../rootfs/usr/include -fpie -fPIE -mcpu=cortex-m33 \ - -fvisibility=hidden -std=c11 -Wno-declaration-after-statement -arm-none-eabi-objdump -dr /tmp/F.o | awk '/:/{f=1} f{print} f&&/^$/{exit}' -``` - -**How to know which instruction is wrong** (you need a notion of "correct"): -- **Golden ARM reference**: compile the same TU with `arm-none-eabi-gcc -O1 -mcpu=cortex-m33` - and diff the disassembly of `F`. Divergence that changes semantics = the cross bug. -- **Cross at -O0 vs -O1**: `./bin/armv8m-tcc -O0 -c …` vs `-O1`; the bug usually - rides an optimization, so `-O0` shows the intended behavior. -- **Reason from source**: e.g. for `09` the wrong value implied a stale register - read of an address-taken local across a call. - -Known good-vs-bad patterns already found this way (all in MEMORY.md): dropped -`< 0` branch - (~line 3168): a loop back-edge needing phi copies is rewritten from - `JUMPIF(cond)→top` into `JUMPIF(!cond)→exit; ; JUMP→top`. -- **Wrong computation**: the skip/exit target was stored as - `skip_dest.u.imm32 = -(wp + 2)` **before** `ra_emit_scheduled_phi_copies(…,&wp,…)` - advanced `wp`. `wp` is an **address-taken local** (`&wp` passed to the call). -- **Cross bug**: the cross cached `wp` in a register and did **not reload it after - the call** for that one expression (the adjacent JUMP-write *did* reload it) → - native used the stale pre-copies `wp` → exit target landed mid-body (IR 15) not - the epilogue (18) → `bge 0xee` → infinite loop. -- **Fix (strategy A, source)**: move the skip-target store to **after** the JUMP - write, using the now-fresh `wp`: `skip_dest.u.imm32 = -(wp + 1)`. Logically - identical on the host; sidesteps the stale-register read on the device. 
-- The deeper cross bug (call not invalidating a cached address-taken local) is - **latent** — strategy B would fix it for all callers. - ---- - -## 6. Fix, then verify - -**Strategy A (source workaround)** — edit `F`, rebuild (§3c), FAT-run the test: -the program must now behave (e.g. `09` prints `1..89` then `RC=0`; log ~400 B, not -~800 KB of runaway output). - -**Strategy B (fix the cross)** — fix the cross's codegen/optimizer, `rm -.yasos-build/cross.stamp`, full rebuild, retest. This is preferred when the same -bug class blocks several tests: fix once, many tests pass. - -**Always regression-test** — the official suite, reusing the current build: -```bash -./scripts/run_qemu_smoke.sh --no-build tcc_suite_test.py # full suite -./scripts/run_qemu_smoke.sh --no-build tcc_suite_test.py -k 09_do_while # one test -``` -A regalloc/codegen fix can affect unrelated loops — run the whole suite, not just -the target. - ---- - -## 7. Gotchas (each cost real time) - -- **`pkill -f qemu-system-arm` SELF-KILLS your shell** — the pattern string is in - the shell's own command line. Kill genuine QEMU by `comm`: - `ps -eo pid,comm | awk '$2=="qemu-system-arm"{print $1}' | xargs -r kill -9`. - Likewise never write `until ! pgrep -f qemu_fatdisk_run; do …` — the loop's own - cmdline matches the pattern, so it never exits. -- **Stale ARM objects break the x86 cross link** — after a native build, the cross - build fails with "file in wrong format". `rm -rf armv8m-arch armv8m-ir armv8m-*.o *.o`. -- **`config.mak` flips between cross and native** — `build_rootfs.sh` reconfigures - each as needed; if building manually, reconfigure for the mode you want - (`--enable-cross` for the cross). -- **Native rebuild is the slow loop** (~3-5 min) + kernel re-embed (~1 min). The - device tcc (~2 MB) does **not** fit the 1 MB `/mnt` window, so you can't swap - just the tcc binary — rebuild the romfs+kernel. Minimize native rebuilds: do all - the host-side localization (§2, §3b, §4) first. 
-- **`-O0` native shifts the bug** — don't use it as a clean bisector. -- **`NATIVE_TCC_OPT_OVERRIDE`** env var (added to `build_rootfs.sh`) overrides the - native opt level (default `-O1`) for experiments without editing the script. -- The bump commit is **not** automatically the cause — verify by reverting it; for - `09`, reverting `e65f29d0` did not fix it (long-standing bug). - ---- - -## 8. Checklist per test - -1. FAT-run the failing test; capture device binary + behavior (§1). -2. Confirm host cross is correct → self-host (§2). -3. Narrow the feature (§3a), then the pass via `-dump-ir-passes=all` on a debug - cross (§3b); if needed, instrument the pass for the device's actual values (§3c). -4. Disassemble `F` as the cross compiles it; find the wrong instruction vs a golden - reference (§4). -5. Fix (A source workaround, or B cross codegen) (§6). -6. FAT-verify the test, then run the **full** smoke suite (§6). -7. Update MEMORY.md / the per-bug memory with root cause + fix. diff --git a/docs/tcc_speedup_plan.md b/docs/tcc_speedup_plan.md deleted file mode 100644 index 3c31f526..00000000 --- a/docs/tcc_speedup_plan.md +++ /dev/null @@ -1,91 +0,0 @@ -# Plan — speed up device tcc by closing the inlining gap - -Companion to [tcc_vs_gcc_O2_codegen_report.md](./tcc_vs_gcc_O2_codegen_report.md). Goal: cut -device compile CPU by inlining the hot `static inline` helpers tcc currently emits out-of-line. - -## Facts the plan is built on - -- tcc has **no C-function inliner**; `static inline` → one out-of-line copy per TU, never inlined. -- `IROperand` is **9 bytes**, passed/returned **by value** → every accessor call does an sret - struct copy + table lookups + bounds checks, and none of it CSEs across calls. -- Call-site counts (the leverage): `irop_get_vreg` **1351**, `tcc_ir_op_get_src1` **924**, - `tcc_ir_op_get_dest` **871**, `tcc_ir_op_get_src2` **557**, `irop_make_imm32` **175**. 
-- The accessors are **branchy / multi-statement** (table lookup + bounds guard + sentinel - handling) — so they are *not* trivially macro-izable; a real inliner or careful - statement-expression macros are needed. -- `tccpp.c` (lexer/preprocess) is **~60% of compile CPU**; the IR accessors dominate the backend. - -## Build/validation harness (applies to every phase) - -- **★ Clean rebuild after header edits.** The tinycc Makefile has no header dependency tracking; - editing `tccir_operand.h` / `tccir.h` / `tcc.h` requires `rm *.o ir/*.o ir/opt/*.o` (or - `make distclean`) or you get stale-object SEGVs. (Known gotcha, see memory.) -- **CPU measurement:** `scripts/tcc_profile.py -n 30` (device-representative `Ir`), plus - `--save`/`--compare` for before/after deltas. Also profile `-O1`/`-O2` compiles, not just `-O0`. -- **Size:** `arm-none-eabi-nm -S bin/armv8m-tcc.elf` totals + per-helper copy counts. -- **Correctness:** QEMU smoke suite (must stay 412 pass / 0 undefined) + the tcc test suite; - confirm self-host rebuild is byte-stable (cross-built tcc and self-built tcc agree). - -## Phase 0 — Validate the lever (½ day, throwaway branch, no compiler change) - -Prove the predicted win before investing in an inliner. - -1. Force-inline the single hottest cluster only — `tcc_ir_op_get_src1/src2/get_dest` + - `irop_get_vreg` — by rewriting them as GNU statement-expression macros (`({ ... })`, which tcc - supports) **or** `__attribute__((always_inline))` if tcc honors it (check first; likely not). -2. `rm` objects, rebuild the **cross** `armv8m-tcc` (x86), re-run `scripts/tcc_profile.py - --compare base.json` on `129_scopes.c` at `-O0` and `-O1`. -3. **Decision gate:** if total `Ir` drops materially (expect several %), continue to Phase 1. - If not, the cost is elsewhere (struct-by-value ABI, table lookups) → pivot to Phase 1-B. - -Capture `base.json` from the *current* tree first so the comparison is honest. 
- -## Phase 1 — Pick the implementation path (decision gate after Phase 0) - -### Path A — minimal inliner in tcc (preferred if Phase 0 win is broad) -Highest leverage, compounds (an inlining tcc builds a faster tcc), fixes the 226 KB duplication -too. Higher risk given this fork's history of self-host miscompiles — so keep it **conservative -and gated**: -- Inline only functions that are: marked `inline`/`static inline`, single `return` or - straight-line + ≤1 branch, below an IR-instruction-count threshold, non-recursive, no varargs, - no address-taken. Everything else untouched. -- Implement at the IR/frontend boundary (where call lowering happens), behind a flag - (`-finline` / config define) defaulted off until validated, so it can be bisected like every - other opt pass in this tree. -- Validate with the full self-host + QEMU loop after **every** increment. - -### Path B — targeted, no new pass (fallback / lower risk) -- Macro-ize (statement-expression) the top ~8 hottest accessors from the report: - `irop_get_vreg`, `irop_set_vreg`, `tcc_ir_op_get_src1/2`, `tcc_ir_op_get_dest`, `irop_get_tag`, - `irop_make_imm32`, `irop_init_phys_regs`. -- **Plus** the orthogonal ABI win: change the worst by-value-9-byte-struct accessors to take - `const IROperand *` / write through an out-pointer, killing the sret copy even where inlining - doesn't reach. (Invasive across call sites — script the rewrite, do one accessor at a time.) -- Do the lexer helpers too (`cstr_ccat`, `tok_str_add2`, `token_lookup_cache_find`, - `default_reallocator`) — they sit in the 60%-CPU bucket. - -Recommendation: **start Path B** (safe, incremental, immediately shippable), and pursue Path A -only if Phase 0 shows the general inliner is worth the miscompile risk. - -## Phase 2 — Correctness & stability - -- QEMU smoke 412/0; tcc suite green; self-host byte-stability check. 
-- Watch for the known traps: stale-object SEGVs (clean rebuild), `build_rootfs.sh` not - fail-fast on cross `-Werror` (grep build.log for `error:`), statement-expression macros - double-evaluating arguments with side effects (audit each macro's args). - -## Phase 3 — Measure, report, decide next lever - -- Before/after: profiler `Ir` (total + per-fn), `.text` size, helper copy counts, and a real - device compile-time round-trip on a representative source. -- Update the report with measured deltas. Next lever after inlining is the §4 +19% codegen - quality (jump tables for dense enum switches, machine-level CSE of struct-field reloads). - -## Deliverables checklist - -- [ ] `base.json` profiler baseline committed/saved -- [ ] Phase 0 experiment branch + measured `Ir` delta -- [ ] Path decision recorded (A vs B) with the numbers behind it -- [ ] Implementation behind a flag, validated incrementally -- [ ] QEMU smoke + self-host stability green -- [ ] Report updated with before/after diff --git a/docs/tcc_vs_gcc_O2_codegen_report.md b/docs/tcc_vs_gcc_O2_codegen_report.md deleted file mode 100644 index a48af09d..00000000 --- a/docs/tcc_vs_gcc_O2_codegen_report.md +++ /dev/null @@ -1,156 +0,0 @@ -# tcc -O2 (self-host) vs arm-none-eabi-gcc -O2 — codegen comparison - -**Date:** 2026-06-23 · **Target:** Cortex-M33 / armv8m thumb · **Question:** where is the -device tcc leaving compile-time performance on the table, measured against a "good codegen" -reference? - -## Method - -The device compiler `bin/armv8m-tcc.elf` is built **by tcc compiling its own sources** with -`-O2 -mcpu=cortex-m33` (the self-host stage in `build_rootfs.sh`). To get a reference for how -good that codegen *could* be, I compiled the **same 81 translation units** (CORE + IR + arm -backend, from the Makefile's `armv8m_FILES`) with `arm-none-eabi-gcc -O2 -mcpu=cortex-m33 --mthumb -fpie`, same TCC defines. The gcc build is **not linked or run** — it only exists to -diff codegen quality per function. 
All 81 TUs compiled (2 needed `-fpermissive` / a `dlfcn.h` -stub; neither is a hotspot). - -I then matched functions **by name across both builds** (the `.elf` carries ~3900 symbols incl. -libc/native code the gcc objects don't; comparing only the 1547 functions present in **both** -keeps it apples-to-apples) and weighted everything by `scripts/tcc_profile.py` — the -device-representative CPU profile (callgrind `Ir` on the x86 cross, which runs the identical -codegen path) for the default `-O0` compile of `129_scopes.c`. - -**Caveats (read before acting):** -- Code size is a *proxy* for cycles. On the M33 (no data cache) instruction-fetch ∝ size is a - fair proxy, but data traffic also costs — so the profiler `Ir` weighting, not raw size, is the - authority on "what's hot." -- gcc and tcc inline differently, which **confounds per-function size** (see §3). I call this out - where it matters rather than letting it mislead. -- The gcc build drops `TCC_IS_NATIVE` and forces `CONFIG_TCC_STATIC` / `CONFIG_TCC_SEMLOCK=0` to - build under newlib. These only touch `tcc_run`/threading glue — none of the hot codegen. - -## 1. Headline numbers - -| metric | value | -|---|---| -| `.text` of device `armv8m-tcc.elf` | **2.26 MB** | -| matched-function total, **gcc -O2** | 1,152,516 B | -| matched-function total, **tcc -O2** | 1,368,164 B | -| **tcc / gcc ratio** | **1.19×** (tcc emits +19% more code on equal functions) | -| `.text` that is **duplicated inline-helper copies** | **~226 KB (10% of .text)** | - -Two distinct, independently-actionable problems fall out: a **systemic inlining gap** (§2, -the big one) and a **per-function codegen-quality gap** (§4, the steady +19%). - -## 2. Root cause #1 — tcc has *no* function inliner (biggest lever) - -There is **no C-function inlining pass anywhere in tcc** (the IR optimizer's only "inline" -references are inline-*asm*). 
`static inline` in a header is compiled as an ordinary function: -**emitted once per TU that references it, and never inlined into a call site.** - -The IR operand layer (`tccir_operand.h`) is *designed* around tiny by-value struct accessors -that assume the compiler inlines them. It doesn't. Measured copies in the two binaries: - -| helper (`static inline`, hot IR loops) | tcc copies | gcc copies | -|---|---|---| -| `irop_set_vreg` | **42** | 0 (fully inlined) | -| `irop_init_phys_regs` | **37** | 0 (fully inlined) | -| `irop_get_vreg` | **53** | 14 | -| `tcc_ir_op_get_src1` | **55** | 20 | -| `irop_make_imm32` | **31** | 1 | - -Same function, per-function size blowups (tcc ÷ gcc): `irop_make_imm32` **49×**, -`tcc_ir_op_get_dest` **9.4×**, `tcc_ir_op_get_src2` **9.1×**, `irop_get_imm64_ex` **5.3×**, -`irop_get_vreg` **5.1×**. - -This costs **twice**: -1. **CPU (the point of this exercise):** every IR operand touched during codegen pays a real - `bl`/return + struct-by-value copy instead of a few inlined instructions. These accessors run - per-operand, per-instruction, across the whole backend — and the backend is run by the device - tcc on every compile. -2. **Flash:** ~226 KB of `.text` (10%) is redundant duplicated copies of 30 such helpers. - `thop_emit` alone is **128 KB across 27 copies**; the `irop_*`/`tcc_ir_op_*` accessors add - another ~70 KB. - -The same root cause explains why several **hot lexer functions look "smaller" in tcc** in §3 -(`next` 0.22×, `macro_subst_tok` 0.40×): gcc inlined their helpers *into* them (work shows up in -the caller), tcc left the helpers as out-of-line calls. It's the same missing optimization seen -from the other side — and the lexer/preprocessor is **>50% of device compile CPU** (§3), so it's -exactly where the call overhead hurts most. - -## 3. Hot functions: CPU weight vs codegen size - -Top of the device-representative profile (`-O0` compile, the default). 
`ratio` = tcc ÷ gcc size; -**<1 means gcc inlined helpers into the caller**, not that tcc is better. - -``` -fn CPU% gccB tccB ratio note -next_nomacro 24.6% 4752 4396 0.93x -macro_subst_tok 11.5% 4092 1644 0.40x gcc inlined helpers in -tok_str_add2 8.0% 282 666 2.36x tcc bloat -next 6.5% 3428 764 0.22x gcc inlined helpers in -tccpp_new 6.5% 692 644 0.93x -macro_subst 4.5% 364 524 1.44x -parse_btype 3.2% 2348 3444 1.47x tcc bloat -cstr_ccat 2.5% 68 98 1.44x -token_lookup_cache_find 2.2% 76 108 1.42x -default_reallocator 2.2% 64 124 1.94x -post_type 1.8% 1660 2644 1.59x -svalue_to_iroperand 1.8% 1924 2548 1.32x -sym_push 1.4% 588 1180 2.01x -unary_funcall 1.4% 15392 20860 1.36x -``` - -Takeaway: **`tccpp.c` (lex + preprocess) is the CPU, by a wide margin** — `next_nomacro`, -`next`, `macro_subst_tok`, `macro_subst`, `tccpp_new`, `tok_str_add2` together are ~60% of the -profile. Whatever we do, it has to make the lexer hot path cheaper. - -## 4. Root cause #2 — steady +19% per-function codegen quality - -Beyond inlining, on functions where both builds emit one real copy, tcc is ~1.2–2× larger. The -gaps cluster around: -- **Dense switches over op/tag enums** compiled as linear compare chains instead of jump tables - (`tcc_ir_op_get_*`, `thumb_generate_opcode_for_data_processing` 3.2×). -- **Repeated struct-field reloads** — weak CSE/value-numbering at the machine level means a field - like `op->vr` is re-loaded instead of kept in a register across uses. -- **Spill-happy register allocation** in the big functions (`tcc_ir_codegen_generate` +10 KB, - `gen_function` +5.8 KB, `unary_funcall` +5.5 KB). - -This is the broad, always-on tax. Each fix is smaller per-unit than inlining but applies to the -whole binary (and to every program the device compiles). - -## 5. Recommendations, ranked by expected speedup ÷ effort - -1. 
**Inline the hot IR-operand accessors — do this first.** No new compiler pass required: - convert the handful of hottest `static inline` helpers in `tccir_operand.h` - (`irop_get_vreg`/`irop_set_vreg`, `irop_init_phys_regs`, `tcc_ir_op_get_src1/2/dest`, - `irop_get_tag`, `irop_make_imm32`) into **macros** (or hand-inline at the few hottest call - sites). tcc *will* emit macro bodies inline. Expected: removes the per-operand call+struct-copy - overhead from the entire backend **and** reclaims a chunk of the 226 KB. Low risk, mechanical. -2. **Inline the hot lexer helpers** the same way: `cstr_ccat`, `tok_str_add2`, - `token_lookup_cache_find`, `default_reallocator` are tiny, hot, and called in the >50%-CPU - lexer loop. gcc inlines them; tcc can via macro-ization. Targets the single biggest CPU bucket. -3. **A minimal real inliner** (medium effort, highest ceiling): inline single-return leaf - functions marked `inline`/`static inline` below an instruction-count threshold. This solves - #1 and #2 generally, eliminates the 226 KB duplication, and compounds — *a tcc that inlines - compiles a faster tcc*. Worth it if macro-ization proves too piecemeal. -4. **De-duplicate out-of-line copies** (link-time / single-definition fold). Reclaims ~226 KB - flash but **not** the call overhead — strictly worse than inlining for speed; do it only if - flash is the binding constraint and an inliner isn't. -5. **Jump tables for dense enum switches** in `tcc_ir_op_get_*` and the thumb opcode emitters — - attacks the §4 +19% at its largest contributors. - -The leverage multiplier worth remembering: the device tcc runs **its own compiled code**. Every -codegen improvement here makes the next self-host build of tcc itself faster, on top of speeding -up every user program it compiles. 
#!/usr/bin/env python3
"""Fast O0-vs-O1 checker for a single C file via the QEMU harness.

Usage: fastcheck.py SOURCE.c

Runs SOURCE.c through the fuzz harness once at -O0 and once at -O1 and prints
"OK" when both runs succeed with the same signature, "DIVERGE" otherwise,
followed by the stripped stdout of each run.
"""
import os
import sys
from pathlib import Path

# Set before the harness is imported so it is already in the environment of
# anything the harness launches.
# NOTE(review): presumably this silences LeakSanitizer reports from a
# sanitizer-built tcc -- confirm against the harness.
os.environ["ASAN_OPTIONS"] = "detect_leaks=0"

# The harness module lives in tests/fuzz; the path is relative to the current
# working directory, so the script is expected to be run from the repo root.
sys.path.insert(0, str(Path("tests/fuzz")))
import fuzz_harness as H  # noqa: E402  (import must follow the sys.path tweak)

# Scratch directory handed to the harness for both runs.
WORK_DIR = Path("/tmp/opencode/_wd")


def run(source):
    """Run `source` at -O0 and -O1 and compare the two results.

    Args:
        source: path (str or Path) of the C file to check.

    Returns:
        A 3-tuple ``(ok, r0, r1)`` where ``r0``/``r1`` are the harness results
        for -O0/-O1 and ``ok`` is truthy only when both results report ``ok``
        and their ``signature`` values are equal.
    """
    wd = WORK_DIR
    # parents=True: /tmp/opencode may not exist yet (fresh machine, cleaned
    # /tmp); without it mkdir raises FileNotFoundError before anything runs.
    wd.mkdir(parents=True, exist_ok=True)
    src = Path(source)
    r0 = H.run_with_tcc(src, "-O0", wd)
    r1 = H.run_with_tcc(src, "-O1", wd)
    return (r0.ok and r1.ok and r0.signature == r1.signature), r0, r1


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Same exit status (1) as the IndexError the unguarded argv access
        # used to raise, but with an actionable message instead of a traceback.
        sys.exit("usage: %s SOURCE.c" % sys.argv[0])
    src = sys.argv[1]
    ok, r0, r1 = run(src)
    print("OK" if ok else "DIVERGE", "|", repr(r0.stdout.strip()), "vs", repr(r1.stdout.strip()))
Per-seed serial repro: +`python3 scripts/diff_olevels.py --seed N --require-qemu` diff --git a/ir/cfg.c b/ir/cfg.c index c00c858c..f837c7c7 100644 --- a/ir/cfg.c +++ b/ir/cfg.c @@ -57,6 +57,26 @@ IRCFG *tcc_ir_cfg_build(TCCIRState *ir) is_leader[target] = 1; } } + /* SWITCH_TABLE case/default targets are jump targets too. A case body + * reached by fall-through from the previous case is NOT otherwise a + * leader; without splitting there, instr_to_block[] maps the case entry + * to the middle of the merged block and every switch edge lands at that + * block's START — SCCP then const-folds values along the wrong case + * chain (switch fuzz seed 18613: selector 6 folded via case 3's body). */ + if (q->op == TCCIR_OP_SWITCH_TABLE) { + IROperand src2 = tcc_ir_op_get_src2(ir, q); + int table_id = (int)irop_get_imm64_ex(ir, src2); + if (table_id >= 0 && table_id < ir->num_switch_tables) { + TCCIRSwitchTable *table = &ir->switch_tables[table_id]; + for (int ti = 0; ti < table->num_entries; ti++) { + int target = table->targets[ti]; + if (target >= 0 && target < n) + is_leader[target] = 1; + } + if (table->default_target >= 0 && table->default_target < n) + is_leader[table->default_target] = 1; + } + } if (q->op == TCCIR_OP_JUMP || q->op == TCCIR_OP_JUMPIF || q->op == TCCIR_OP_RETURNVALUE || q->op == TCCIR_OP_RETURNVOID || q->op == TCCIR_OP_IJUMP || q->op == TCCIR_OP_SWITCH_TABLE) { diff --git a/ir/codegen.c b/ir/codegen.c index 21ef60c4..5dde5f90 100644 --- a/ir/codegen.c +++ b/ir/codegen.c @@ -1120,12 +1120,16 @@ static int try_reassign_scratch_conflict(TCCIRState *ir, int r, int insn_i) * and tcc_ls_find_free_scratch_reg). */ ls_iv->r0 = (int16_t)new_r; - /* 3. Patch live_regs_by_instruction for the interval's full range. */ + /* 3. Patch live_regs_by_instruction for the interval's full range. 
+ * r's bit may be shared with another interval that move-coalescing put on + * the same register (in-place two-address ops) — only clear positions + * where no other claimant is still live. */ if (ls->live_regs_by_instruction) { for (int j = (int)ls_iv->start; j <= (int)ls_iv->end && j < ls->live_regs_by_instruction_size; j++) { - ls->live_regs_by_instruction[j] &= ~(1u << r); + if (!tcc_ls_reg_held_by_other(ls, r, j, ls_iv)) + ls->live_regs_by_instruction[j] &= ~(1u << r); ls->live_regs_by_instruction[j] |= (1u << new_r); } } @@ -1601,6 +1605,34 @@ static inline void ir_codegen_track_scratch(int is_dry_run, int i, TccIrOp op, i ir_codegen_check_scratch(i, op, dry_insn_scratch, dry_insn_saves); } +/* Find the next non-NOP instruction after `i`, for peepholes that fuse `i` with + * a later partner (STRD/LDRD spill pairs, MLAL, spill block copy) and advance + * the loop counter past the gap. Returns -1 when there is none, OR when a + * skipped NOP between `i` and the partner is a branch target: fusing `i` and + * the partner into a single instruction is illegal if a branch can land on that + * NOP, because the partner would then be reachable without executing `i` (and + * vice-versa). This is the ternary-merge case where both arms store to + * adjacent spill slots — the merge NOP sits between the true-arm store and the + * post-merge store (signed fuzz seed 2987, O0 HardFault: the fused STRD landed + * on the true-arm path only, so the false arm both mis-stored and left the + * merge label with no code address -> `b.w 0`). branch_target_reset[] catches + * targets that the is_jump_target bit misses at -O0. 
*/ +static int ir_codegen_next_nonnop_no_label(TCCIRState *ir, const uint8_t *branch_target_reset, int i) +{ + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + IRQuadCompact *q = &ir->compact_instructions[j]; + if (q->op == TCCIR_OP_NOP) + { + if (q->is_jump_target || (branch_target_reset && branch_target_reset[j])) + return -1; + continue; + } + return j; + } + return -1; +} + static int ir_codegen_count_vreg_uses(TCCIRState *ir, int32_t vreg) { if (vreg < 0) @@ -1754,15 +1786,39 @@ static inline MopArgs ir_decode_cached(int is_dry_run, int use_mop_cache, MopArg IRQuadCompact *cq, const IROperand *src1_ir, const IROperand *src2_ir, const IROperand *dest_ir, MopSpec spec) { - /* Real-run cache hit: scale/accum not needed, cache is valid. */ - if (!is_dry_run && use_mop_cache && !spec.scale && !spec.accum) - return mop_cache[i]; + /* Real-run cache hit: replay the dry-run decode. This must cover ALL + * specs, including scale/accum (LOAD_INDEXED/STORE_INDEXED/MLA): the + * decode-time peepholes (ir_codegen_before_ret_peephole) PATCH interval + * allocations, and those patches persist from the dry-run into the + * real-run. A fresh real-run decode can therefore make a peephole + * decision the dry-run did not — e.g. a LOAD_INDEXED whose following + * ASSIGN's dest was only retargeted to a register later in the dry-run + * fires the coalesce peephole in the real-run only, retargeting the load + * while every cached consumer still reads the source's pre-patch register + * (ptr fuzz seed 30436: `ldr r8, [...]` immediately clobbered by the + * stale-cache copy `mov r8, ip`). */ + if (!is_dry_run && use_mop_cache) + { + MopArgs cached = mop_cache[i]; + /* A peephole that skips an instruction (i = next_i; break) can fire in the + * dry-run but not the real-run when its decision depends on pass-varying + * state. 
The STRD-spill fusion is one such case: it keys on the + * SP-relative offset via fp_adjust_local_offset(), whose allocated_stack_size + * term is 0 during the dry-run (the prologue that sets it runs only before + * the real pass) but final during the real-run. A large frame can therefore + * make the dry-run fuse-and-skip instruction i while the real-run does not, + * leaving mop_cache[i] never written (zero-initialised → all MACH_OP_NONE). + * A genuinely decoded store/load always materialises dest or src1, so an + * all-NONE pair marks an unpopulated slot: re-decode instead of returning + * the stale sentinel (which would trip the MACH_OP_NONE codegen assert). */ + if (cached.dest.kind != MACH_OP_NONE || cached.src1.kind != MACH_OP_NONE) + return cached; + } MopArgs a = decode_mop_args(ir, cq, src1_ir, src2_ir, dest_ir, i, spec); - /* Dry-run: store decoded dest/src1/src2 for reuse, unless scale/accum are - * involved (those instructions re-decode cheaply in the real-run). */ - if (is_dry_run && mop_cache && !spec.scale && !spec.accum) + /* Dry-run: store the decoded operands for real-run replay. */ + if (is_dry_run && mop_cache) mop_cache[i] = a; return a; @@ -1808,40 +1864,9 @@ void tcc_ir_codegen_generate(TCCIRState *ir) memset(orig_ir_to_code_mapping, 0xFF, sizeof(uint32_t) * ir->orig_ir_to_code_mapping_size); /* Track addresses of return jumps for later backpatching to epilogue */ int *return_jump_addrs = tcc_malloc(sizeof(int) * ir->next_instruction_index); + ir->codegen_return_jump_addrs = return_jump_addrs; int num_return_jumps = 0; - /* --- DEBUG: catch codegen-time corruption of a spilled temp's allocation.r0. - * The HW-only 90_struct c[1].y=5 bug: a temp that regalloc spilled - * (allocation.r0 == 0x3f) is overwritten to a register number during codegen, - * so machine_op_from_ir later reads it as "lives in R8". Snapshot now - * (post-regalloc) and report the first instruction at which any spilled temp - * flips to a register. 
--- */ - static uint8_t *dbg_alloc_snap = NULL; - static int dbg_alloc_snap_n = 0; - static int dbg_alloc_active = 0; - static int dbg_alloc_reported = 0; - dbg_alloc_active = 0; - if (funcname && !strcmp((const char *)funcname, "test_init_struct_from_struct")) - { - dbg_alloc_snap_n = ir->temporary_variables_live_intervals_size; - dbg_alloc_snap = tcc_realloc(dbg_alloc_snap, (size_t)dbg_alloc_snap_n + 1); - for (int p = 0; p < dbg_alloc_snap_n; p++) - dbg_alloc_snap[p] = (uint8_t)ir->temporary_variables_live_intervals[p].allocation.r0; - dbg_alloc_active = 1; - dbg_alloc_reported = 0; - fprintf(stderr, "ALLOCSNAP n=%d\n", dbg_alloc_snap_n); - /* Snapshot the liveness bitmap at the printf-arg LEA indices at codegen - * START. Compare with the FSR trace (printed at the find_free call): if - * these are correct here but wrong at find_free, the bitmap is corrupted - * during codegen; if already wrong here, ra_build_live_regs_bitmap - * miscomputed it. */ - uint32_t *lrb = ir->ls.live_regs_by_instruction; - int lrbn = ir->ls.live_regs_by_instruction_size; - fprintf(stderr, "LRBSNAP arr=%p sz=%d [70]=0x%x [72]=0x%x [75]=0x%x [80]=0x%x\n", (void *)lrb, lrbn, - (lrb && 70 < lrbn) ? lrb[70] : 0xDEADu, (lrb && 72 < lrbn) ? lrb[72] : 0xDEADu, - (lrb && 75 < lrbn) ? lrb[75] : 0xDEADu, (lrb && 80 < lrbn) ? lrb[80] : 0xDEADu); - } - /* Clear spill cache at function start */ tcc_ir_spill_cache_clear(&ir->spill_cache); @@ -2108,6 +2133,8 @@ void tcc_ir_codegen_generate(TCCIRState *ir) * Both arrays are declared before #if so they are visible in both passes. 
*/ int *dry_insn_scratch = tcc_mallocz(ir->next_instruction_index * sizeof(int)); uint16_t *dry_insn_saves = tcc_mallocz(ir->next_instruction_index * sizeof(uint16_t)); + ir->codegen_dry_insn_scratch = dry_insn_scratch; + ir->codegen_dry_insn_saves = dry_insn_saves; /* ============================================================================ * OPTION A: Skip dry-run for scratch-conflict-free functions @@ -2283,9 +2310,13 @@ void tcc_ir_codegen_generate(TCCIRState *ir) * ============================================================================ */ /* Option B: allocate per-instruction MopArgs cache for the dry-run. * Not used when the dry-run is skipped (can_skip_dry_run). */ + /* Zero-initialised: an unwritten slot reads back as all-MACH_OP_NONE, which + * ir_decode_cached() treats as "not populated in the dry-run" and re-decodes + * (see the cache-hit path there). */ MopArgs *mop_cache = (!can_skip_dry_run && ir->next_instruction_index > 0) - ? tcc_malloc(ir->next_instruction_index * sizeof(MopArgs)) + ? tcc_mallocz(ir->next_instruction_index * sizeof(MopArgs)) : NULL; + ir->codegen_mop_cache = mop_cache; int use_mop_cache = 0; const int pass_start = can_skip_dry_run ? 1 : 0; @@ -2306,6 +2337,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) if (ir->next_instruction_index > 0) { branch_target_reset = tcc_mallocz((size_t)ir->next_instruction_index); + ir->codegen_branch_target_reset = branch_target_reset; int has_indirect_jump = 0; for (int bi = 0; bi < ir->next_instruction_index; bi++) { @@ -2407,27 +2439,6 @@ void tcc_ir_codegen_generate(TCCIRState *ir) /* Track current instruction for scratch register allocation */ ir->codegen_instruction_idx = i; - /* DEBUG: report the first spilled temp whose allocation.r0 was overwritten - * to a register since codegen start (corruption happened at instr <= i-1, - * or in the dry-run pass if i is 0). 
*/ - if (dbg_alloc_active && !dbg_alloc_reported) - { - int lim = ir->temporary_variables_live_intervals_size; - if (lim > dbg_alloc_snap_n) - lim = dbg_alloc_snap_n; - for (int p = 0; p < lim; p++) - { - uint8_t now = (uint8_t)ir->temporary_variables_live_intervals[p].allocation.r0; - if (dbg_alloc_snap[p] == 0x3f && now != 0x3f) - { - fprintf(stderr, "ALLOCCORRUPT T%d r0 0x3f->0x%x by codegen idx<=%d (this op=%d)\n", - p, now, i, (int)cq->op); - dbg_alloc_reported = 1; - break; - } - } - } - /* Debug tracking: update current op for ot_check failure reporting */ g_debug_current_op = (int)cq->op; @@ -2522,11 +2533,16 @@ void tcc_ir_codegen_generate(TCCIRState *ir) } if (imm_op) { - int next_j = i + 1; - while (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_NOP) - next_j++; - if (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_ADD && - !ir->compact_instructions[next_j].is_jump_target) + int next_j = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); + /* The ADD may carry a barrel-shift side-table annotation + * (ir->barrel_shifts[orig_index], set by tcc_ir_barrel_shift_fusion): + * its src2 is then a value that must still be shifted when the ADD + * executes. The fused emission below bypasses the annotated path + * entirely, silently dropping that shift, so skip the fusion. */ + if (next_j >= 0 && ir->compact_instructions[next_j].op == TCCIR_OP_ADD && + !ir->compact_instructions[next_j].is_jump_target && + !(ir->barrel_shifts && ir->compact_instructions[next_j].orig_index >= 0 && + ir->barrel_shifts[ir->compact_instructions[next_j].orig_index])) { IRQuadCompact *nq = &ir->compact_instructions[next_j]; IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); @@ -2671,10 +2687,8 @@ void tcc_ir_codegen_generate(TCCIRState *ir) * accumulator pair maps directly to (S/U)MLAL. 
*/ if (a.dest.vreg >= 0 && ir_codegen_count_vreg_uses(ir, a.dest.vreg) == 1) { - int next_j = i + 1; - while (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_NOP) - next_j++; - if (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_ADD && + int next_j = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); + if (next_j >= 0 && ir->compact_instructions[next_j].op == TCCIR_OP_ADD && !ir->compact_instructions[next_j].is_jump_target) { IRQuadCompact *nq = &ir->compact_instructions[next_j]; @@ -2704,10 +2718,8 @@ void tcc_ir_codegen_generate(TCCIRState *ir) if (accum && accum->is_64bit && irop_get_vreg(n_dest_ir) >= 0) { - int store_j = next_j + 1; - while (store_j < ir->next_instruction_index && ir->compact_instructions[store_j].op == TCCIR_OP_NOP) - store_j++; - if (store_j < ir->next_instruction_index && ir->compact_instructions[store_j].op == TCCIR_OP_STORE && + int store_j = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, next_j); + if (store_j >= 0 && ir->compact_instructions[store_j].op == TCCIR_OP_STORE && !ir->compact_instructions[store_j].is_jump_target) { IRQuadCompact *sq = &ir->compact_instructions[store_j]; @@ -3005,15 +3017,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) (a.src1.u.spill.offset & 3) == 0) { int first_load_reg = a.dest.u.reg.r0; - int store_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - store_i = j; - break; - } - } + int store_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); if (store_i >= 0 && ir->compact_instructions[store_i].op == TCCIR_OP_STORE && !ir->compact_instructions[store_i].is_jump_target) { @@ -3037,15 +3041,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) while (count < 32) { - int next_load_i = -1; - for (int j = last_i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - 
next_load_i = j; - break; - } - } + int next_load_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, last_i); if (next_load_i < 0 || ir->compact_instructions[next_load_i].op != TCCIR_OP_LOAD || ir->compact_instructions[next_load_i].is_jump_target) break; @@ -3066,15 +3062,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) if (la.dest.kind != MACH_OP_REG || la.dest.u.reg.r0 != first_load_reg) break; - int next_store_i = -1; - for (int j = next_load_i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_store_i = j; - break; - } - } + int next_store_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, next_load_i); if (next_store_i < 0 || ir->compact_instructions[next_store_i].op != TCCIR_OP_STORE || ir->compact_instructions[next_store_i].is_jump_target) break; @@ -3119,24 +3107,26 @@ void tcc_ir_codegen_generate(TCCIRState *ir) /* STRD peephole: if this is a 32-bit store to a spill slot and the * very next non-NOP instruction is also a 32-bit store to an adjacent - * (+4) spill slot, emit STRD for both and skip the second. */ + * (+4) spill slot, emit STRD for both and skip the second. + * + * The value operand (src1) must NOT be a deref: STORE's src1 can carry + * needs_deref, meaning "dereference this pointer register to obtain the + * value to store" (slot = *ptr). Pairing such a store into STRD would + * feed the address register straight to try_strd_spill as if it were + * the value, silently dropping the required load. 
*/ if (a.dest.kind == MACH_OP_SPILL && !a.dest.needs_deref && - a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && + a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && !a.src1.needs_deref && (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32) && (a.dest.u.spill.offset & 3) == 0) { /* Find next non-NOP instruction */ - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); + /* is_jump_target misses some branch targets (see branch_target_reset); + * consuming a branch-target store removes the label's only emission + * point, so branches to it backpatch against code address 0. */ if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE && - !ir->compact_instructions[next_i].is_jump_target) + !ir->compact_instructions[next_i].is_jump_target && + !(branch_target_reset && branch_target_reset[next_i])) { /* Decode the next store's operands */ IRQuadCompact *nq = &ir->compact_instructions[next_i]; @@ -3148,7 +3138,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) (MopSpec){.dest = 1, .src1 = 2}); if (b.dest.kind == MACH_OP_SPILL && !b.dest.needs_deref && - b.src1.kind == MACH_OP_REG && !b.src1.is_64bit && + b.src1.kind == MACH_OP_REG && !b.src1.is_64bit && !b.src1.needs_deref && (b.dest.btype == IROP_BTYPE_INT32 || b.dest.btype == IROP_BTYPE_FLOAT32) && (b.dest.u.spill.offset & 3) == 0) { @@ -3157,18 +3147,18 @@ void tcc_ir_codegen_generate(TCCIRState *ir) int reg1 = a.src1.u.reg.r0; int reg2 = b.src1.u.reg.r0; - if (off1 + 4 == off2) + if (reg1 != reg2 && off1 + 4 == off2) { - if (tcc_gen_machine_try_strd_spill(reg1, reg2, off1, off2)) + if (tcc_gen_machine_try_strd_spill(reg1, off1, reg2, off2)) { /* Skip the next store — advance i past NOPs and the paired store */ i = next_i; break; } } - else if (off2 + 4 == off1) + else if (reg1 != reg2 && off2 + 
4 == off1) { - if (tcc_gen_machine_try_strd_spill(reg2, reg1, off2, off1)) + if (tcc_gen_machine_try_strd_spill(reg2, off2, reg1, off1)) { i = next_i; break; @@ -3186,15 +3176,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32) && (a.dest.u.spill.offset & 3) == 0) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE && !ir->compact_instructions[next_i].is_jump_target) { @@ -3242,18 +3224,10 @@ void tcc_ir_codegen_generate(TCCIRState *ir) * STORE_INDEXED, but the off=0 store stays plain STORE — so the * existing STORE_INDEXED-only peephole misses the pair. */ if (a.dest.kind == MACH_OP_REG && a.dest.needs_deref && - a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && + a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && !a.src1.needs_deref && (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32)) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); /* is_jump_target misses some branch targets (see branch_target_reset); * consuming a branch-target store removes the label's only emission * point, so branches to it backpatch against code address 0. 
*/ @@ -3268,7 +3242,9 @@ void tcc_ir_codegen_generate(TCCIRState *ir) MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); - if (!b.src1.is_64bit && b.src1.kind == MACH_OP_REG && + /* src1 is the value being stored; a deref there (value = *ptr) would + * feed the pointer register to try_strd_base as the value. */ + if (!b.src1.is_64bit && b.src1.kind == MACH_OP_REG && !b.src1.needs_deref && b.scale.kind == MACH_OP_IMM && b.scale.u.imm.val == 0 && b.src2.kind == MACH_OP_IMM && b.dest.kind == MACH_OP_REG && !b.dest.needs_deref && @@ -3302,15 +3278,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) a.src1.kind == MACH_OP_IMM && !a.src1.is_64bit && (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32)) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); /* is_jump_target misses some branch targets (see branch_target_reset); * consuming a branch-target store removes the label's only emission * point, so branches to it backpatch against code address 0. 
*/ @@ -3353,15 +3321,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && (a.dest.u.spill.offset & 3) == 0) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_LOAD && !ir->compact_instructions[next_i].is_jump_target) { @@ -3404,15 +3364,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) a.src1.kind == MACH_OP_REG && !a.src1.needs_deref && (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32)) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_LOAD_INDEXED && !ir->compact_instructions[next_i].is_jump_target) { @@ -3473,7 +3425,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) * Only for REG sources — IMM STRD through generic base registers is * unsafe because STRD requires 4-byte aligned addresses while * individual STR tolerates unaligned access on ARMv8-M. */ - if (!a.src1.is_64bit && a.src1.kind == MACH_OP_REG && + if (!a.src1.is_64bit && a.src1.kind == MACH_OP_REG && !a.src1.needs_deref && a.scale.kind == MACH_OP_IMM && a.scale.u.imm.val == 0 && a.src2.kind == MACH_OP_IMM && a.dest.kind == MACH_OP_REG && !a.dest.needs_deref && @@ -3483,6 +3435,12 @@ void tcc_ir_codegen_generate(TCCIRState *ir) for (int j = i + 1; j < ir->next_instruction_index; j++) { int jop = ir->compact_instructions[j].op; + /* A branch target between the two STORE_INDEXEDs — even a code-less + * NOP or identity move — means a jump can land between them, so they + * cannot share one STRD. 
Bail rather than fuse across the label. */ + if (ir->compact_instructions[j].is_jump_target || + (branch_target_reset && branch_target_reset[j])) + break; if (jop == TCCIR_OP_NOP) continue; /* An ASSIGN or pure-vreg LOAD whose src and dst materialise to @@ -3525,7 +3483,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); - if (!b.src1.is_64bit && b.src1.kind == MACH_OP_REG && + if (!b.src1.is_64bit && b.src1.kind == MACH_OP_REG && !b.src1.needs_deref && b.scale.kind == MACH_OP_IMM && b.scale.u.imm.val == 0 && b.src2.kind == MACH_OP_IMM && b.dest.kind == MACH_OP_REG && !b.dest.needs_deref && @@ -3568,15 +3526,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) a.dest.kind == MACH_OP_REG && !a.dest.needs_deref && (a.src1.btype == IROP_BTYPE_INT32 || a.src1.btype == IROP_BTYPE_FLOAT32)) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); /* is_jump_target misses some branch targets (see branch_target_reset); * consuming a branch-target store removes the label's only emission * point, so branches to it backpatch against code address 0. 
*/ @@ -3641,15 +3591,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) for (int k = 1; k <= 3; k++) { - int next_i = -1; - for (int j = last_i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, last_i); if (next_i < 0 || ir->compact_instructions[next_i].op != TCCIR_OP_STORE_INDEXED || ir->compact_instructions[next_i].is_jump_target) @@ -3817,6 +3759,18 @@ void tcc_ir_codegen_generate(TCCIRState *ir) { MopArgs a = DECODE(.dest = 2, .src1 = 1); + /* A bare immediate destination (no deref) is malformed IR: you cannot + * assign into a literal, so the instruction is a dead no-op. It can + * survive when const-prop folds a value to a constant and a fusion then + * consumes the real consumer, leaving a stranded `#K <- #M` assign + * (seed 2966: the UDIV accumulator folds to 9 and is fused into the + * MLA, but the now-dead `#9 <- #-9733` def keeps an immediate dest). + * Drop it rather than aborting in mach_get_dest_reg ("unexpected kind + * 3"); the live value already resides in the fused op, so the computed + * result is unchanged. */ + if (a.dest.kind == MACH_OP_IMM && !a.dest.needs_deref) + break; + /* LDRD peephole: two adjacent 32-bit assigns loading from adjacent * spill slots into registers → single LDRD instruction. 
*/ if (a.src1.kind == MACH_OP_SPILL && !a.src1.needs_deref && @@ -3824,15 +3778,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) (a.src1.btype == IROP_BTYPE_INT32 || a.src1.btype == IROP_BTYPE_FLOAT32) && (a.src1.u.spill.offset & 3) == 0) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_ASSIGN && !ir->compact_instructions[next_i].is_jump_target) { @@ -3881,15 +3827,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32) && (a.dest.u.spill.offset & 3) == 0) { - int next_i = -1; - for (int j = i + 1; j < ir->next_instruction_index; j++) - { - if (ir->compact_instructions[j].op != TCCIR_OP_NOP) - { - next_i = j; - break; - } - } + int next_i = ir_codegen_next_nonnop_no_label(ir, branch_target_reset, i); if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_ASSIGN && !ir->compact_instructions[next_i].is_jump_target) { @@ -4218,6 +4156,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) if (cbz_dry_mapping) tcc_free(cbz_dry_mapping); cbz_dry_mapping = tcc_malloc(ir->ir_to_code_mapping_size * sizeof(uint32_t)); + ir->codegen_cbz_dry_mapping = cbz_dry_mapping; memcpy(cbz_dry_mapping, ir_to_code_mapping, ir->ir_to_code_mapping_size * sizeof(uint32_t)); /* Check if LR was pushed during dry run in a leaf function */ @@ -4303,6 +4242,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) /* Interval table was mutated: cached MopArgs are stale, discard. 
*/ tcc_free(mop_cache); mop_cache = NULL; + ir->codegen_mop_cache = NULL; } use_mop_cache = (mop_cache != NULL); } @@ -4381,10 +4321,13 @@ void tcc_ir_codegen_generate(TCCIRState *ir) } tcc_free(mop_cache); + ir->codegen_mop_cache = NULL; if (cbz_dry_mapping) tcc_free(cbz_dry_mapping); + ir->codegen_cbz_dry_mapping = NULL; if (branch_target_reset) tcc_free(branch_target_reset); + ir->codegen_branch_target_reset = NULL; ir_to_code_mapping[ir->next_instruction_index] = ind; orig_ir_to_code_mapping[ir->orig_ir_to_code_mapping_size - 1] = ind; @@ -4420,8 +4363,11 @@ void tcc_ir_codegen_generate(TCCIRState *ir) } tcc_free(return_jump_addrs); + ir->codegen_return_jump_addrs = NULL; tcc_free(dry_insn_saves); + ir->codegen_dry_insn_saves = NULL; tcc_free(dry_insn_scratch); + ir->codegen_dry_insn_scratch = NULL; } /* ============================================================================ diff --git a/ir/core.c b/ir/core.c index de082370..b7e0813c 100644 --- a/ir/core.c +++ b/ir/core.c @@ -86,6 +86,7 @@ TCCIRState *tcc_ir_alloc(void) block->processing_if = 0; block->basic_block_start = 1; block->prevent_coalescing = 0; + block->func_has_label_addr = 0; /* Nested function / static chain fields */ block->has_static_chain = 0; @@ -207,6 +208,35 @@ void tcc_ir_free(TCCIRState *ir) tcc_free(ir->parameters_live_intervals); } + if (ir->barrel_shifts) + { + tcc_free(ir->barrel_shifts); + ir->barrel_shifts = NULL; + } + if (ir->shift64_dead_half) + { + tcc_free(ir->shift64_dead_half); + ir->shift64_dead_half = NULL; + } + if (ir->bfi_params) + { + tcc_free(ir->bfi_params); + ir->bfi_params = NULL; + } + + tcc_free(ir->codegen_return_jump_addrs); + ir->codegen_return_jump_addrs = NULL; + tcc_free(ir->codegen_dry_insn_scratch); + ir->codegen_dry_insn_scratch = NULL; + tcc_free(ir->codegen_dry_insn_saves); + ir->codegen_dry_insn_saves = NULL; + tcc_free(ir->codegen_mop_cache); + ir->codegen_mop_cache = NULL; + tcc_free(ir->codegen_cbz_dry_mapping); + ir->codegen_cbz_dry_mapping = 
NULL; + tcc_free(ir->codegen_branch_target_reset); + ir->codegen_branch_target_reset = NULL; + if (ir->stack_layout.slots != NULL) { tcc_free(ir->stack_layout.slots); @@ -2136,3 +2166,31 @@ IRLiveInterval *tcc_ir_get_live_interval(TCCIRState *ir, int vreg) } return NULL; /* unreachable, silences -Werror with old compiler */ } + +/* Non-fatal sibling of tcc_ir_get_live_interval(): returns NULL instead of + * aborting the process when vreg is negative, carries an unknown type, or + * addresses a position past the allocated interval array. Use this from + * callers that must tolerate an unmapped vreg (e.g. tcc_ir_stack_reg_get). */ +IRLiveInterval *tcc_ir_try_get_live_interval(TCCIRState *ir, int vreg) +{ + if (!ir || vreg < 0) + return NULL; + int decoded_vreg_position = TCCIR_DECODE_VREG_POSITION(vreg); + switch (TCCIR_DECODE_VREG_TYPE(vreg)) + { + case TCCIR_VREG_TYPE_VAR: + if (decoded_vreg_position >= ir->variables_live_intervals_size) + return NULL; + return &ir->variables_live_intervals[decoded_vreg_position]; + case TCCIR_VREG_TYPE_TEMP: + if (decoded_vreg_position >= ir->temporary_variables_live_intervals_size) + return NULL; + return &ir->temporary_variables_live_intervals[decoded_vreg_position]; + case TCCIR_VREG_TYPE_PARAM: + if (decoded_vreg_position >= ir->parameters_live_intervals_size) + return NULL; + return &ir->parameters_live_intervals[decoded_vreg_position]; + default: + return NULL; + } +} diff --git a/ir/dump.c b/ir/dump.c index 862d1137..9511a7c7 100644 --- a/ir/dump.c +++ b/ir/dump.c @@ -431,9 +431,9 @@ void tcc_dump_quadruple_to(FILE *out, const TACQuadruple *q, int pc) fprintf(out, "JMP to %d ", (int)q->dest.c.i); break; case TCCIR_OP_IJUMP: + /* Mnemonic only; the generic has_src1 block below prints src1 once. + See docs/bugs.md #5 (matching the fix in tcc_print_quadruple_irop). 
*/ fprintf(out, "IJMP "); - tcc_dump_svalue_short_to(out, &q->src1); - fprintf(out, " "); break; default: tcc_dump_svalue_short_to(out, &q->dest); @@ -622,6 +622,59 @@ void tcc_ir_dump_set_show_physical_regs(int show) show_physical_regs = show; } +/* Returns 1 if `pass_name` is selected by the comma-separated -dump-ir-passes= + * list in s->dump_ir_passes (or the list contains the special token "all"). */ +int tcc_ir_dump_passes_match(TCCState *s, const char *pass_name) +{ + if (!s || !s->dump_ir_passes || !pass_name) + return 0; + const char *p = s->dump_ir_passes; + size_t name_len = strlen(pass_name); + while (*p) + { + const char *comma = strchr(p, ','); + size_t tok_len = comma ? (size_t)(comma - p) : strlen(p); + if (tok_len == 3 && !memcmp(p, "all", 3)) + return 1; + if (tok_len == name_len && !memcmp(p, pass_name, name_len)) + return 1; + if (!comma) + break; + p = comma + 1; + } + return 0; +} + +/* If pass_name is selected by -dump-ir-passes=, print the IR labeled with the + * pass name as "=== AFTER ===" ... "=== END AFTER ===". Shared by + * the legacy optimize loop (tccgen.c RUN_PASS / dump_ir_after_pass) and the SSA + * optimizer driver (ir/opt/ssa_opt.c) so every pass is observable the same way. + * A no-op unless built with CONFIG_TCC_DEBUG. */ +void tcc_ir_dump_after_pass(TCCIRState *ir, const char *pass_name) +{ +#ifdef CONFIG_TCC_DEBUG + if (!tcc_ir_dump_passes_match(tcc_state, pass_name)) + return; + tcc_ir_dump_set_show_physical_regs(0); + printf("=== AFTER %s ===\n", pass_name); + tcc_ir_show(ir); + /* Switch side tables are absolute-index consumers that renumbering passes + * must keep in sync — print them so a stale target is visible in the dump. 
*/ + for (int t = 0; t < ir->num_switch_tables; t++) { + TCCIRSwitchTable *tbl = &ir->switch_tables[t]; + printf("SWTAB %d: min=%lld max=%lld default=%d targets=[", t, + (long long)tbl->min_val, (long long)tbl->max_val, tbl->default_target); + for (int j = 0; j < tbl->num_entries; j++) + printf("%s%d", j ? "," : "", tbl->targets[j]); + printf("]\n"); + } + printf("=== END AFTER %s ===\n", pass_name); +#else + (void)ir; + (void)pass_name; +#endif +} + /* Get the short prefix for a vreg type: V, T, or P */ static char vreg_type_prefix(int vreg) { @@ -952,9 +1005,11 @@ void tcc_print_quadruple_irop(TCCIRState *ir, IRQuadCompact *q, int pc) printf("JMP to %ld ", (long)irop_get_imm64_ex(ir, dest)); break; case TCCIR_OP_IJUMP: + /* Only print the mnemonic here; the generic has_src1 block below prints + src1 (the target register) exactly once. Printing it here too produced + a double "IJMP T4 T4" (docs/bugs.md #5). Unlike JUMPIF/MLA, IJUMP is + not excluded from that block, so this case must not print src1 itself. */ printf("IJMP "); - print_iroperand_short(ir, src1); - printf(" "); break; case TCCIR_OP_MLA: /* MLA has 4 operands: dest = src1 * src2 + accum */ diff --git a/ir/licm.c b/ir/licm.c index d1210730..8abe4435 100644 --- a/ir/licm.c +++ b/ir/licm.c @@ -10,6 +10,7 @@ #include "licm.h" #include "opt.h" +#include "opt_utils.h" #include "cfg.h" #include "core.h" #include "pool.h" @@ -177,8 +178,11 @@ IRLoops *tcc_ir_detect_loops(TCCIRState *ir) IROperand dest = tcc_ir_op_get_dest(ir, q); int target = (int)irop_get_imm64_ex(ir, dest); - /* Check if this is a backward jump (loop back edge) */ - if (target < i) + /* Check if this is a backward jump (loop back edge). The target must be + * a valid non-negative instruction index: an unresolved/sentinel dest can + * decode negative, which would make the loop body range [target, i] index + * before compact_instructions. 
*/ + if (target >= 0 && target < i) { /* Found a loop */ if (loops->num_loops >= loops->capacity) @@ -510,6 +514,28 @@ static int insert_instruction_before(TCCIRState *ir, int before_idx, IRQuadCompa } } + /* SWITCH_TABLE case targets live in a side table independent of the IR + * array; without this they silently desynchronize on every insertion + * (docs/bugs.md #7, combo fuzz seeds 52/80/187/311/333/392/460: hoisting a + * pure call out of a loop containing a switch left every case target stale + * by the insertion count — downstream reachability-based passes then + * deleted live FUNCPARAMVALs, and at runtime the dispatch jumped into the + * middle of the wrong case). Mirrors gsym_cse_insert_before. */ + for (int t = 0; t < ir->num_switch_tables; t++) + { + TCCIRSwitchTable *table = &ir->switch_tables[t]; + if (table->default_target >= before_idx) + table->default_target += 1; + if (table->targets) + { + for (int j = 0; j < table->num_entries; j++) + { + if (table->targets[j] >= before_idx) + table->targets[j] += 1; + } + } + } + return before_idx; } @@ -598,7 +624,7 @@ typedef struct int hoisted; /* Whether we've created the ASSIGN yet */ } HoistedStackAddr; -static int hoist_from_loop(TCCIRState *ir, IRLoop *loop) +__attribute__((unused)) static int hoist_from_loop(TCCIRState *ir, IRLoop *loop) { if (!ir || !loop || loop->preheader_idx < 0) return 0; @@ -1064,59 +1090,6 @@ static int hoist_const_exprs_from_loop(TCCIRState *ir, IRLoop *loop) return total_inserted; } -int tcc_ir_hoist_loop_invariants(TCCIRState *ir, IRLoops *loops) -{ - if (!ir || !loops) - return 0; - - /* Hoisting is now done by the dominance-based LICM in tcc_ir_opt_licm_ex. 
*/ - return 0; - - /* Old implementation below (unreachable but compiles): */ - if (!ir || !loops) - return 0; - - int total_hoisted = 0; - - for (int i = 0; i < loops->num_loops; i++) - { - IRLoop *loop = &loops->loops[i]; - int hoisted = hoist_from_loop(ir, loop); - total_hoisted += hoisted; - - /* If we hoisted any instructions, update indices for all subsequent loops */ - if (hoisted > 0) - { - LOG_LICM("Loop %d hoisted %d instrs, loop[%d].preheader=%d, updating later loops", i, hoisted, i, - loop->preheader_idx); - /* Indices of subsequent loops need to be shifted by number of inserted instructions */ - for (int j = i + 1; j < loops->num_loops; j++) - { - IRLoop *later_loop = &loops->loops[j]; - - /* Update loop boundary indices if they are after the insertion point */ - if (later_loop->header_idx >= loop->preheader_idx) - later_loop->header_idx += hoisted; - if (later_loop->start_idx >= loop->preheader_idx) - later_loop->start_idx += hoisted; - if (later_loop->end_idx >= loop->preheader_idx) - later_loop->end_idx += hoisted; - if (later_loop->preheader_idx >= loop->preheader_idx) - later_loop->preheader_idx += hoisted; - - /* Update body instruction indices */ - for (int k = 0; k < later_loop->num_body_instrs; k++) - { - if (later_loop->body_instrs[k] >= loop->preheader_idx) - later_loop->body_instrs[k] += hoisted; - } - } - } - } - - return total_hoisted; -} - /* ============================================================================ * Pure Function Detection and LICM for Function Calls (Phase 1) * ============================================================================ */ @@ -1429,8 +1402,12 @@ int tcc_ir_get_func_purity(TCCIRState *ir, Sym *sym) if (!sym) return TCC_FUNC_PURITY_UNKNOWN; - /* Check if this is a function */ - if (!(sym->type.t & VT_FUNC)) + /* Check if this is a function. Must mask VT_BTYPE first: VT_FUNC (6) shares + * bits with other basic types (e.g. 
VT_INT==3, 3 & 6 == 2 != 0), so a bare + * `sym->type.t & VT_FUNC` wrongly passes non-function symbols through to the + * purity lookup. This matches the `(t & VT_BTYPE) == VT_FUNC` idiom used + * everywhere else in the codebase (tccgen.c, tccdbg.c, ir/opt.c). */ + if ((sym->type.t & VT_BTYPE) != VT_FUNC) return TCC_FUNC_PURITY_IMPURE; /* Not a function = not pure */ /* Get function name from symbol */ @@ -1514,6 +1491,40 @@ int tcc_ir_get_func_purity(TCCIRState *ir, Sym *sym) * The hoisted_vregs array contains vregs that were hoisted in previous iterations. * These are considered loop-invariant even if they have an ASSIGN in the loop body. */ +/* True if the address of `vreg`'s stack slot is taken anywhere in the + * function: a SOURCE operand carrying this vreg with STACKOFF tag and + * is_lval == 0 (the IR's `&V` form — see the dump printer). Once the + * address escapes, any store through any pointer may mutate the variable, + * so a "no direct def in the loop" scan is not sufficient for invariance + * (docs/bugs.md #7: ptr fuzz seeds 500/517 — helper3(#imm, V7) hoisted out + * of a loop that mutated V7 through the pointers p14 and p15). 
*/ +static int vreg_addr_taken_anywhere(TCCIRState *ir, int32_t vreg) +{ + for (int i = 0; i < ir->next_instruction_index; i++) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) + continue; + IROperand srcs[3]; + int nsrcs = 0; + if (irop_config[q->op].has_src1) + srcs[nsrcs++] = tcc_ir_op_get_src1(ir, q); + if (irop_config[q->op].has_src2) + srcs[nsrcs++] = tcc_ir_op_get_src2(ir, q); + if (q->op == TCCIR_OP_MLA) + srcs[nsrcs++] = tcc_ir_op_get_accum(ir, q); + for (int s = 0; s < nsrcs; s++) + { + if (irop_get_tag(srcs[s]) == IROP_TAG_STACKOFF && !srcs[s].is_lval && + irop_get_vreg(srcs[s]) == vreg) + return 1; + } + } + return 0; +} + +static int loop_body_may_clobber_memory(TCCIRState *ir, IRLoop *loop); + static int is_operand_loop_invariant_ex(TCCIRState *ir, IROperand op, IRLoop *loop, int32_t *hoisted_vregs, int num_hoisted_vregs) { @@ -1593,6 +1604,15 @@ static int is_operand_loop_invariant_ex(TCCIRState *ir, IROperand op, IRLoop *lo } } + /* No direct def in the loop. The value can STILL change across iterations + * if the variable's address has been taken: any store through a pointer or + * any non-CONST call inside the loop may then mutate its stack slot without + * a visible def of the vreg (docs/bugs.md #7, ptr seeds 500/517). Only + * accept an address-taken variable when the loop provably cannot write + * memory at all. */ + if (vreg_addr_taken_anywhere(ir, vreg) && loop_body_may_clobber_memory(ir, loop)) + return 0; + /* Vreg not defined in loop - it's loop-invariant */ return 1; } @@ -1603,10 +1623,54 @@ __attribute__((unused)) static int is_operand_loop_invariant(TCCIRState *ir, IRO return is_operand_loop_invariant_ex(ir, op, loop, NULL, 0); } +/* Does the loop body contain anything that could modify memory a PURE function + * might read? PR20100: a PURE function (as opposed to CONST) reads global/heap + * memory, so its result is only loop-invariant if that memory is unchanged + * across iterations. 
Any store, or any call that is not itself CONST (an + * IMPURE/UNKNOWN callee — or an indirect call — may write memory), can change + * what a PURE callee observes, so hoisting it would be a miscompile. CONST + * callees read no memory and are unaffected by this. */ +static int loop_body_may_clobber_memory(TCCIRState *ir, IRLoop *loop) +{ + for (int i = 0; i < loop->num_body_instrs; i++) + { + IRQuadCompact *q = &ir->compact_instructions[loop->body_instrs[i]]; + switch (q->op) + { + case TCCIR_OP_NOP: + continue; + case TCCIR_OP_STORE: + case TCCIR_OP_STORE_INDEXED: + case TCCIR_OP_STORE_POSTINC: + case TCCIR_OP_BLOCK_COPY: + return 1; + case TCCIR_OP_INLINE_ASM: + case TCCIR_OP_ASM_OUTPUT: + return 1; /* inline asm may write arbitrary memory */ + case TCCIR_OP_FUNCCALLVAL: + case TCCIR_OP_FUNCCALLVOID: + { + Sym *callee = irop_get_sym_ex(ir, tcc_ir_op_get_src1(ir, q)); + if (!callee || tcc_ir_get_func_purity(ir, callee) < TCC_FUNC_PURITY_CONST) + return 1; /* indirect / impure / merely-pure call may write memory */ + continue; + } + default: + /* A memory store through a non-STORE op shows up as an lval destination. */ + if (irop_config[q->op].has_dest && tcc_ir_op_get_dest(ir, q).is_lval) + return 1; + continue; + } + } + return 0; +} + /* Check if a function call instruction can be hoisted * Requirements: * 1. Function is pure or const * 2. All arguments are loop-invariant (considering already-hoisted vregs) + * 3. If the function is PURE (reads memory) rather than CONST, the loop must + * not modify any memory it could read (PR20100) */ static int tcc_ir_is_hoistable_call_ex(TCCIRState *ir, int instr_idx, IRLoop *loop, int32_t *hoisted_vregs, int num_hoisted_vregs) @@ -1650,6 +1714,15 @@ static int tcc_ir_is_hoistable_call_ex(TCCIRState *ir, int instr_idx, IRLoop *lo return 0; } + /* A merely-PURE function reads memory; hoisting it is only safe if the loop + * cannot change what it reads. 
A CONST function reads nothing and is always + * safe (PR20100 / docs/bugs.md #7). */ + if (purity < TCC_FUNC_PURITY_CONST && loop_body_may_clobber_memory(ir, loop)) + { + LOG_LICM("Call at %d: PURE (not CONST) and loop clobbers memory — not hoistable", instr_idx); + return 0; + } + LOG_LICM("Call at %d: function is pure (purity=%d), checking args...", instr_idx, purity); /* Find all FUNCPARAMVAL instructions for this call */ @@ -1697,7 +1770,12 @@ typedef struct int is_hoisted; } HoistableCallInfo; -/* Collect all FUNCPARAMVAL instructions belonging to a call */ +/* Collect all param markers belonging to a call. BOTH FUNCPARAMVAL (a value + * argument) and FUNCPARAMVOID (the marker a zero-argument or void call still + * carries, and which the backend pairs with the CALL by call_id) must be + * collected — otherwise hoisting the CALL but leaving its FUNCPARAMVOID behind + * orphans the marker ("no call site found for call_id=N") and the hoisted call + * loses its marker ("missing FUNCPARAMVAL"). See docs/bugs.md #7. 
*/ static int collect_call_params(TCCIRState *ir, int call_idx, int *param_indices, int max_params) { IRQuadCompact *call_q = &ir->compact_instructions[call_idx]; @@ -1705,11 +1783,11 @@ static int collect_call_params(TCCIRState *ir, int call_idx, int *param_indices, int call_id = TCCIR_DECODE_CALL_ID(irop_get_imm64_ex(ir, call_src2)); int num_params = 0; - /* Scan all instructions for params with matching call_id */ + /* Scan all instructions for params/markers with matching call_id */ for (int i = 0; i < ir->next_instruction_index && num_params < max_params; i++) { IRQuadCompact *q = &ir->compact_instructions[i]; - if (q->op == TCCIR_OP_FUNCPARAMVAL) + if (q->op == TCCIR_OP_FUNCPARAMVAL || q->op == TCCIR_OP_FUNCPARAMVOID) { IROperand src2 = tcc_ir_op_get_src2(ir, q); int param_call_id = TCCIR_DECODE_CALL_ID(irop_get_imm64_ex(ir, src2)); @@ -1731,12 +1809,28 @@ int tcc_ir_hoist_pure_calls(TCCIRState *ir, IRLoops *loops) if (!ir || !loops) return 0; + /* Re-enabled 2026-07-02 after the ninth (and final) defect fix: the + * combo-profile residue (seeds 52/80/187/311/333/392/460) was + * insert_instruction_before desynchronizing SWITCH_TABLE side-table + * targets — not the linear-index call bookkeeping suspected earlier. + * Full history in docs/bugs.md #7 (resolved). */ + + /* Kill-switch for bisection: TCC_DISABLE_PASS=pure_call_hoist. */ + if (tcc_ir_opt_pass_disabled("pure_call_hoist")) + return 0; + int total_hoisted = 0; for (int loop_idx = 0; loop_idx < loops->num_loops; loop_idx++) { IRLoop *loop = &loops->loops[loop_idx]; + /* total_hoisted accumulates across ALL loops; the post-loop index fix-up + * below must shift by only THIS loop's insertions, otherwise a later loop + * (already shifted by an earlier loop's insertions) is over-shifted. + * Snapshot the running total to recover the per-loop delta. 
*/ + int total_hoisted_at_loop_start = total_hoisted; + if (loop->preheader_idx < 0) continue; /* No preheader - can't hoist */ @@ -1772,6 +1866,82 @@ int tcc_ir_hoist_pure_calls(TCCIRState *ir, IRLoops *loops) if (preheader_in_other_loop) continue; + /* The hoist inserts at preheader_idx+1 and relies on control FALLING + * THROUGH from the preheader into the loop header. tcc_ir_detect_loops' + * preheader walk skips backward over JUMP/JUMPIF, so preheader_idx may + * belong to a block that never reaches this loop. docs/bugs.md #7, + * combo fuzz seed 18: the hoisted call landed on a bypass path just + * ahead of an unconditional JMP while the loop itself was entered by a + * jump straight to the header — the loop then read the hoisted result + * vreg UNDEFINED (wrong checksum; an undefined loop bound turns into an + * infinite loop). Two requirements make the insertion point sound: + * 1. the preheader is the header's immediate predecessor (nothing was + * skipped — control genuinely falls from it into the header), and + * 2. no jump from OUTSIDE the loop targets the header (such an entry + * edge would bypass the inserted preheader code). Back-edges and + * `continue`-style jumps from inside are fine: on any path that + * reaches them, the hoisted call has already executed. */ + if (loop->preheader_idx != loop->header_idx - 1) + { + LOG_LICM("Skipping loop %d: preheader %d is not the header %d's immediate predecessor", loop_idx, + loop->preheader_idx, loop->header_idx); + continue; + } + { + /* Reject any entry edge from outside the loop's linear range into ANY + * instruction of [header, end] — not just the header. An edge into + * the header bypasses the inserted preheader code (docs/bugs.md #7, + * combo seed 18); an edge into the middle of the range would break the + * header's dominance over the call sites we rewrite (the hoisted + * result vreg could be read on a path that never ran the preheader). 
+ * Note this also skips increment-trampoline rotated loops, whose + * physical body jumps back into [header, end] from linearly outside — + * their linear range holds only guard/increment code, so nothing + * hoistable is lost. */ + int external_entry = 0; + for (int j = 0; j < ir->next_instruction_index && !external_entry; j++) + { + if (j >= loop->start_idx && j <= loop->end_idx) + continue; /* jumps from inside the loop are fine */ + IRQuadCompact *jq = &ir->compact_instructions[j]; + if (jq->op == TCCIR_OP_JUMP || jq->op == TCCIR_OP_JUMPIF) + { + int jt = (int)irop_get_imm64_ex(ir, tcc_ir_op_get_dest(ir, jq)); + if (jt >= loop->header_idx && jt <= loop->end_idx) + external_entry = 1; + } + else if (jq->op == TCCIR_OP_SWITCH_TABLE) + { + /* A switch outside the loop with a case/default target in the + * range is an entry edge, same as a plain JUMP. */ + int table_id = (int)irop_get_imm64_ex(ir, tcc_ir_op_get_src2(ir, jq)); + if (table_id >= 0 && table_id < ir->num_switch_tables) + { + TCCIRSwitchTable *table = &ir->switch_tables[table_id]; + if (table->default_target >= loop->header_idx && table->default_target <= loop->end_idx) + external_entry = 1; + for (int ti = 0; table->targets && ti < table->num_entries && !external_entry; ti++) + { + if (table->targets[ti] >= loop->header_idx && table->targets[ti] <= loop->end_idx) + external_entry = 1; + } + } + } + else if (jq->op == TCCIR_OP_IJUMP) + { + /* Indirect jump: target unknowable — conservatively treat it as a + * possible entry edge into the loop. */ + external_entry = 1; + } + } + if (external_entry) + { + LOG_LICM("Skipping loop %d: header %d is entered by a jump from outside the loop", loop_idx, + loop->header_idx); + continue; + } + } + /* Skip loops containing VLA allocations. * VLAs have special stack semantics - the size is computed at runtime * and SP is adjusted dynamically. 
Hoisting a pure function call that @@ -1805,6 +1975,23 @@ int tcc_ir_hoist_pure_calls(TCCIRState *ir, IRLoops *loops) for (int i = 0; i < loop->num_body_instrs && num_all_calls < MAX_HOISTABLE_CALLS; i++) { int instr_idx = loop->body_instrs[i]; + + /* body_instrs is an OVER-approximation: tcc_ir_detect_loops' forward- + * jump "extension" (any jump out of [start,end] whose target is within + * +50 of the header extends the body, with no path-back-to-header + * check) can swallow post-loop code. volatile fuzz seeds 3583/6116: a + * rotated for-loop's exit jump pulled the else arm of the enclosing + * if/else into body_instrs, and the two else-arm calls were "hoisted" + * above the loop — onto the then-path — while the else path entered at + * its own label and read both result vregs UNDEFINED. The over- + * approximation is conservative (correct) for the clobber/invariance + * scans, but calls may only be hoisted from the certain linear range: + * with the preheader fall-through + no-external-entry guards above, + * every instruction in [start,end] is dominated by the header, so the + * preheader insertion point dominates the rewritten call site. */ + if (instr_idx < loop->start_idx || instr_idx > loop->end_idx) + continue; + IRQuadCompact *q = &ir->compact_instructions[instr_idx]; if (q->op == TCCIR_OP_FUNCCALLVAL || q->op == TCCIR_OP_FUNCCALLVOID) @@ -1943,18 +2130,29 @@ int tcc_ir_hoist_pure_calls(TCCIRState *ir, IRLoops *loops) int64_t new_call_encoded = TCCIR_ENCODE_CALL(new_call_id, argc); IROperand new_call_src2 = irop_make_imm32(-1, (int32_t)new_call_encoded, IROP_BTYPE_INT32); - /* Reallocate operand pool for call copy with updated call_id */ - IROperand call_dest = tcc_ir_op_get_dest(ir, &call_copy); + /* Reallocate operand pool for the call copy with the updated call_id. + * The operand layout is [dest?, src1, src2] where the dest slot exists + * ONLY when irop_config[op].has_dest is set (FUNCCALLVAL). 
A + * FUNCCALLVOID has no dest, so it must be laid out as [src1, src2]; + * emitting a spurious dest operand for it shifts src1/src2 down by one + * and makes the accessors read the call_id/argc encoding out of the + * callee-symref slot instead (decoding to a bogus argc -> the backend + * then reports "missing FUNCPARAMVAL for call_id=N"). See bugs.md #7. */ IROperand call_src1 = tcc_ir_op_get_src1(ir, &call_copy); + int call_has_dest = irop_config[call_copy.op].has_dest; - if (hoistable[i].hoisted_vreg >= 0) + if (call_has_dest) { - /* Update destination to use hoisted vreg */ - call_dest = irop_make_vreg(hoistable[i].hoisted_vreg, IROP_BTYPE_INT32); + IROperand call_dest = tcc_ir_op_get_dest(ir, &call_copy); + if (hoistable[i].hoisted_vreg >= 0) + call_dest = irop_make_vreg(hoistable[i].hoisted_vreg, IROP_BTYPE_INT32); + call_copy.operand_base = tcc_ir_pool_add(ir, call_dest); + tcc_ir_pool_add(ir, call_src1); + } + else + { + call_copy.operand_base = tcc_ir_pool_add(ir, call_src1); } - - call_copy.operand_base = tcc_ir_pool_add(ir, call_dest); - tcc_ir_pool_add(ir, call_src1); tcc_ir_pool_add(ir, new_call_src2); insert_instruction_before(ir, loop->preheader_idx + 1, &call_copy); @@ -1973,11 +2171,21 @@ int tcc_ir_hoist_pure_calls(TCCIRState *ir, IRLoops *loops) int64_t new_param_encoded = TCCIR_ENCODE_PARAM(new_call_id, param_idx); IROperand new_param_src2 = irop_make_imm32(-1, (int32_t)new_param_encoded, IROP_BTYPE_INT32); - /* Allocate operands in pool according to irop_config for FUNCPARAMVAL: - * has_dest=0, has_src1=1, has_src2=1 - * So operands are: src1 at base+0, src2 at base+1 (NO dest!) */ - int new_operand_base = tcc_ir_pool_add(ir, tcc_ir_op_get_src1(ir, ¶m_copies[p])); - tcc_ir_pool_add(ir, new_param_src2); + /* Allocate operands per the marker's own irop_config (both have + * has_dest=0, has_src2=1). FUNCPARAMVAL has has_src1=1 (the value): + * layout [src1, src2]. 
FUNCPARAMVOID has has_src1=0: layout [src2] + * only — writing a spurious src1 for it would push src2 down a slot + * and misdecode the call_id (docs/bugs.md #7). */ + int new_operand_base; + if (irop_config[param_copies[p].op].has_src1) + { + new_operand_base = tcc_ir_pool_add(ir, tcc_ir_op_get_src1(ir, ¶m_copies[p])); + tcc_ir_pool_add(ir, new_param_src2); + } + else + { + new_operand_base = tcc_ir_pool_add(ir, new_param_src2); + } param_copies[p].operand_base = new_operand_base; insert_instruction_before(ir, loop->preheader_idx + 1, ¶m_copies[p]); @@ -2049,33 +2257,49 @@ int tcc_ir_hoist_pure_calls(TCCIRState *ir, IRLoops *loops) } } - } while (hoisted_this_iteration > 0); - - /* Update loop indices for subsequent loops */ - if (total_hoisted > 0) + /* Single pass only: transitive-invariance chaining (hoisting a call + * whose argument is the RESULT of another call just hoisted in this same + * loop) is deliberately NOT performed. That path required rewriting the + * copied FUNCPARAMVAL's operand from the loop-body result vreg to the + * hoisted temp AND inserting the dependent call after its producer; + * the copy-verbatim / insert-at-preheader+1 logic below does neither, so + * a second iteration produced a preheader call reading an undefined vreg + * (or one defined by a later-in-preheader call), corrupting argument + * linkage (docs/bugs.md #7). A single pass with num_hoisted_vregs left + * at 0 during the hoistability checks only hoists calls whose arguments + * are loop-invariant in the strict sense, which is correct by + * construction. (void)hoisted_this_iteration keeps the counter live for + * the trace logging above without re-looping.) */ + } while (0); + (void)hoisted_this_iteration; + + /* Update loop indices for subsequent loops, shifting by ONLY the number of + * instructions inserted while processing THIS loop (see snapshot above). 
*/ + int hoisted_this_loop = total_hoisted - total_hoisted_at_loop_start; + if (hoisted_this_loop > 0) { for (int j = loop_idx + 1; j < loops->num_loops; j++) { IRLoop *later_loop = &loops->loops[j]; if (later_loop->start_idx >= loop->preheader_idx) - later_loop->start_idx += total_hoisted; + later_loop->start_idx += hoisted_this_loop; if (later_loop->end_idx >= loop->preheader_idx) - later_loop->end_idx += total_hoisted; + later_loop->end_idx += hoisted_this_loop; if (later_loop->preheader_idx >= loop->preheader_idx) - later_loop->preheader_idx += total_hoisted; + later_loop->preheader_idx += hoisted_this_loop; for (int k = 0; k < later_loop->num_body_instrs; k++) { if (later_loop->body_instrs[k] >= loop->preheader_idx) - later_loop->body_instrs[k] += total_hoisted; + later_loop->body_instrs[k] += hoisted_this_loop; } } /* Update this loop's indices too */ - loop->header_idx += total_hoisted; - loop->start_idx += total_hoisted; + loop->header_idx += hoisted_this_loop; + loop->start_idx += hoisted_this_loop; for (int k = 0; k < loop->num_body_instrs; k++) { - loop->body_instrs[k] += total_hoisted; + loop->body_instrs[k] += hoisted_this_loop; } } } @@ -2131,10 +2355,24 @@ static IRLoops *tcc_ir_opt_licm_ex__timed(TCCIRState *ir) * because VLAs have special stack semantics - the size computation must * happen at the VLA allocation point, not in the preheader. */ - /* Pure call hoisting disabled for now — the call_id renumbering - * corrupts argument linkage in chained-call patterns. - * TODO: fix tcc_ir_hoist_pure_calls index tracking and re-enable. */ - int hoisted_calls = 0; + /* Pure/const call hoisting — RE-ENABLED (docs/bugs.md #7, fixed 2026-07-02). + * Five defects were fixed to make this safe: + * 1. Multi-loop index fix-up now shifts later loops by per-loop insertion + * counts, not the cumulative total (which over-shifted a 3rd+ loop). + * 2. 
Transitive-invariance CHAINING (a call whose arg is another just- + * hoisted call's result) is not attempted — a single non-chaining pass; + * that path copied a FUNCPARAMVAL referencing a body-only vreg and + * ordered the dependent call before its producer. + * 3. The copied CALL / param markers are laid out per each op's irop_config + * (FUNCCALLVOID has no dest; FUNCPARAMVOID has no src1) — the old code + * always emitted a dest+src1, misdecoding a void call's call_id. + * 4. collect_call_params gathers FUNCPARAMVOID markers too, so a call's + * end-of-args marker travels with it. + * 5. A merely-PURE (memory-reading) call is hoisted only when the loop + * cannot modify the memory it reads (PR20100); CONST calls always may. + * Change is signalled to the pipeline via num_loops > 0 (see tcc_ir_opt_licm), + * same as the dominance-based LICM below. */ + int hoisted_calls = tcc_ir_hoist_pure_calls(ir, loops); int hoisted = 0; (void)hoisted_calls; diff --git a/ir/licm.h b/ir/licm.h index 8a16fe5c..7c0a6dfa 100644 --- a/ir/licm.h +++ b/ir/licm.h @@ -95,10 +95,6 @@ void tcc_ir_free_loops(IRLoops *loops); /* Check if an instruction index is inside a loop */ int tcc_ir_is_in_loop(IRLoop *loop, int instr_idx); -/* Identify and hoist loop-invariant stack address computations - * Returns number of instructions hoisted */ -int tcc_ir_hoist_loop_invariants(TCCIRState *ir, IRLoops *loops); - /* Estimate how many values can be safely hoisted out of a loop without * starving the loop body of registers. 
Scans the loop body to estimate * register pressure and returns the number of registers available for diff --git a/ir/opt.c b/ir/opt.c index 3176b7b9..77446517 100644 --- a/ir/opt.c +++ b/ir/opt.c @@ -2995,6 +2995,25 @@ int tcc_ir_opt_memmove_to_indexed_stores(TCCIRState *ir) st_src = tcc_ir_op_get_src1(ir, sq); if (irop_get_tag(st_dest) == IROP_TAG_STACKOFF && st_dest.is_local && st_dest.is_lval) { + /* A STORE whose STACKOFF dest also carries a *named* local (VAR / + * PARAM) vreg identity cannot be relocated by stack offset alone: the + * backend keys the store off that vreg, so the rewritten copy would + * still target the original (source) local, leaving the memcpy + * destination unwritten. This shows up after the small-function + * inliner expands `T f(T x){ T u; memcpy(&u,&x,sizeof u); return u; }` + * — the param/result become named VARs and the fold dropped the + * copy entirely (fuzz float_seed*). Bail; anonymous stack temps + * (vreg == -1) relocate cleanly and are unaffected. */ + int32_t dvr = irop_get_vreg(st_dest); + if (dvr >= 0) + { + int vt = TCCIR_DECODE_VREG_TYPE(dvr); + if (vt == TCCIR_VREG_TYPE_VAR || vt == TCCIR_VREG_TYPE_PARAM) + { + aborted = 1; + break; + } + } st_off = (int)irop_get_imm64_ex(ir, st_dest); st_off_found = 1; st_size = ir_opt_store_btype_size_bytes(irop_get_btype(st_dest)); @@ -3516,6 +3535,51 @@ int tcc_ir_opt_memmove_to_indexed_stores(TCCIRState *ir) return changes; } +/* Carry a narrow plain STORE's access width onto its value operand. + * + * A plain STORE (`*p = v`) derives its store width from the DEST (lvalue) + * btype — the codegen ignores the value operand's width. But several later + * transforms rewrite a plain STORE into a STORE_INDEXED, which instead derives + * its width from the VALUE operand's btype. When the value carries a wider + * (INT32) expression type — e.g. 
after copy-propagation forwards a wider temp + * into a char/short store, as happens collapsing `(v & ~field) | x` for a + * packed bitfield — the converted indexed store widens to a full word and + * clobbers the adjacent bytes (e.g. a packed-bitfield byte store overwriting + * the next array element). + * + * Clamping the value operand's btype to the access width here keeps every + * later STORE_INDEXED conversion narrow. It is a no-op for the plain STORE + * itself (whose width still comes from the dest), and only narrows (never + * widens) so a correctly-narrow value is left untouched. Runs just before + * register allocation, after all value forwarding has settled. */ +int tcc_ir_opt_narrow_store_value_btype(TCCIRState *ir) +{ + int n = ir ? ir->next_instruction_index : 0; + int changes = 0; + for (int i = 0; i < n; i++) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op != TCCIR_OP_STORE) + continue; + IROperand dest = tcc_ir_op_get_dest(ir, q); + if (!dest.is_lval) + continue; + int dbt = irop_get_btype(dest); + if (dbt != IROP_BTYPE_INT8 && dbt != IROP_BTYPE_INT16) + continue; + IROperand src = tcc_ir_op_get_src1(ir, q); + int sbt = irop_get_btype(src); + /* Only act when the value is wider than the access; leave already-narrow + * and non-integer (struct/float/64-bit) values alone. */ + if (sbt != IROP_BTYPE_INT32) + continue; + src.btype = (uint8_t)dbt; + tcc_ir_set_src1(ir, i, src); + changes++; + } + return changes; +} + /* ============================================================================ * Per-pass timing instrumentation (opt-in via TCC_PASS_TIMING env var). * Accumulates wall-clock microseconds per named pass across a whole compile diff --git a/ir/opt.h b/ir/opt.h index aa403c2e..bb1cee1c 100644 --- a/ir/opt.h +++ b/ir/opt.h @@ -287,6 +287,11 @@ int tcc_ir_opt_cmp_narrow_64(struct TCCIRState *ir); * or high word is provably unread, so codegen skips the dead half-write. 
*/ int tcc_ir_opt_shift64_dead_half(struct TCCIRState *ir); +/* Clamp a narrow plain STORE's value-operand btype to its access width so a + * later STORE_INDEXED conversion (which takes width from the value) does not + * widen a char/short store to a word and clobber adjacent memory. */ +int tcc_ir_opt_narrow_store_value_btype(struct TCCIRState *ir); + /* Global LOAD value CSE - deduplicate loads from the same global within a BB */ int tcc_ir_opt_cse_global_load(struct TCCIRState *ir); diff --git a/ir/opt/ssa_opt.c b/ir/opt/ssa_opt.c index 2d35f97a..bc07f232 100644 --- a/ir/opt/ssa_opt.c +++ b/ir/opt/ssa_opt.c @@ -13,6 +13,8 @@ #include "ssa_opt.h" #include +extern int tcc_ir_opt_pass_disabled(const char *name); + /* ============================================================================ * Target-Specific Generator Registration * ============================================================================ */ @@ -77,7 +79,7 @@ static void ssa_opt_record_use(IRSSAOptCtx *ctx, int32_t vreg, int instr_idx) ssa_opt_add_use_instr(vi, instr_idx); } -static void ssa_opt_scan_instr_uses(IRSSAOptCtx *ctx, int i, IRQuadCompact *q) +void ssa_opt_scan_instr_uses(IRSSAOptCtx *ctx, int i, IRQuadCompact *q) { TCCIRState *ir = ctx->ir; @@ -357,6 +359,24 @@ static void ssa_opt_rewrite_operand(IRSSAOptCtx *ctx, int instr_idx, } } +static int ssa_opt_use_is_barrel_shift_src2(IRSSAOptCtx *ctx, IRSSAUse use, + int32_t old_vr) +{ + if (use.kind != SSA_USE_INSTR) + return 0; + + TCCIRState *ir = ctx->ir; + IRQuadCompact *q = &ir->compact_instructions[use.idx]; + if (!ir->barrel_shifts || q->orig_index < 0 || + q->orig_index > ir->max_orig_index || + ir->barrel_shifts[q->orig_index] == 0 || + !irop_config[q->op].has_src2) + return 0; + + IROperand src2 = tcc_ir_op_get_src2(ir, q); + return irop_get_vreg(src2) == old_vr; +} + static void ssa_opt_rewrite_phi_operand(IRSSAOptCtx *ctx, int block, int slot, int32_t old_vr, int32_t new_vr) @@ -379,6 +399,16 @@ int 
ssa_opt_replace_all_uses(IRSSAOptCtx *ctx, int32_t old_vr, int32_t new_vr) if (!old_vi) return 0; + /* ARM barrel-shift fusion encodes a hidden shift on an instruction's src2 + * in ir->barrel_shifts[orig_index]. Replacing that src2 with another vreg + * or an immediate drops the implicit "this operand must be shifted" value + * identity from SSA's point of view. Leave such defs in place so codegen + * still materializes the shift source exactly as fusion recorded it. */ + for (int i = 0; i < old_vi->use_count; i++) { + if (ssa_opt_use_is_barrel_shift_src2(ctx, old_vi->uses[i], old_vr)) + return 0; + } + int count = 0; while (old_vi->use_count > 0) { IRSSAUse use = old_vi->uses[--old_vi->use_count]; @@ -405,9 +435,21 @@ int ssa_opt_replace_all_uses(IRSSAOptCtx *ctx, int32_t old_vr, int32_t new_vr) * ============================================================================ */ int ssa_opt_resolve_lea_stackloc(IRSSAOptCtx *ctx, int32_t vr) +{ + return ssa_opt_resolve_lea_stackloc_ex(ctx, vr, NULL); +} + +/* The address-source operand at a resolution terminal carries the location's + * identity in its vreg: irop_get_vreg(src) is -1 for a real direct stack slot + * (vreg_type == 0, offset authoritative) and the VAR/PARAM vreg for a `&VAR` + * spill-encoded address (offset is a shared placeholder). Report it so callers + * can tell distinct address-taken locals apart at SSA time. */ +int ssa_opt_resolve_lea_stackloc_ex(IRSSAOptCtx *ctx, int32_t vr, int32_t *out_base_var) { TCCIRState *ir = ctx->ir; int acc = 0; + if (out_base_var) + *out_base_var = -1; /* Bound on chain length; chains longer than this (e.g. degenerate va_arg * pointer arithmetic) bail to INT_MIN. Without a cap the recursive form * blew the host stack on pathological inputs. 
*/ @@ -421,15 +463,21 @@ int ssa_opt_resolve_lea_stackloc(IRSSAOptCtx *ctx, int32_t vr) if (dq->op == TCCIR_OP_LEA) { IROperand src = tcc_ir_op_get_src1(ir, dq); - if (src.tag == IROP_TAG_STACKOFF || src.is_local) + if (src.tag == IROP_TAG_STACKOFF || src.is_local) { + if (out_base_var) + *out_base_var = irop_get_vreg(src); return irop_get_stack_offset(src) + acc; + } return INT_MIN; } if (dq->op == TCCIR_OP_ASSIGN) { IROperand src = tcc_ir_op_get_src1(ir, dq); - if (src.tag == IROP_TAG_STACKOFF && !src.is_lval) + if (src.tag == IROP_TAG_STACKOFF && !src.is_lval) { + if (out_base_var) + *out_base_var = irop_get_vreg(src); return irop_get_stack_offset(src) + acc; + } int32_t sv = irop_get_vreg(src); if (sv >= 0 && !src.is_lval) { vr = sv; @@ -445,8 +493,11 @@ int ssa_opt_resolve_lea_stackloc(IRSSAOptCtx *ctx, int32_t vr) IROperand dest = tcc_ir_op_get_dest(ir, dq); if (!dest.is_lval) { IROperand src = tcc_ir_op_get_src1(ir, dq); - if (src.tag == IROP_TAG_STACKOFF && !src.is_lval) + if (src.tag == IROP_TAG_STACKOFF && !src.is_lval) { + if (out_base_var) + *out_base_var = irop_get_vreg(src); return irop_get_stack_offset(src) + acc; + } int32_t sv = irop_get_vreg(src); if (sv >= 0 && !src.is_lval) { vr = sv; @@ -566,6 +617,12 @@ int ssa_opt_resolve_temp_to_base_off(IRSSAOptCtx *ctx, int32_t vr, } int ssa_opt_indirect_stack_offset(IRSSAOptCtx *ctx, const IRQuadCompact *q, int side) +{ + return ssa_opt_indirect_stack_offset_ex(ctx, q, side, NULL); +} + +int ssa_opt_indirect_stack_offset_ex(IRSSAOptCtx *ctx, const IRQuadCompact *q, int side, + int32_t *out_base_var) { TCCIRState *ir = ctx->ir; IROperand base; @@ -573,6 +630,9 @@ int ssa_opt_indirect_stack_offset(IRSSAOptCtx *ctx, const IRQuadCompact *q, int int require_lval = 0; IROperand idx = IROP_NONE, scale = IROP_NONE; + if (out_base_var) + *out_base_var = -1; + if (side == SSA_OPT_INDIRECT_DEST) { base = tcc_ir_op_get_dest(ir, q); if (q->op == TCCIR_OP_STORE_INDEXED) { @@ -604,9 +664,12 @@ int 
ssa_opt_indirect_stack_offset(IRSSAOptCtx *ctx, const IRQuadCompact *q, int int32_t bvr = irop_get_vreg(base); if (bvr < 0 || TCCIR_DECODE_VREG_TYPE(bvr) != TCCIR_VREG_TYPE_TEMP) return INT_MIN; - int base_off = ssa_opt_resolve_lea_stackloc(ctx, bvr); - if (base_off == INT_MIN) + int base_off = ssa_opt_resolve_lea_stackloc_ex(ctx, bvr, out_base_var); + if (base_off == INT_MIN) { + if (out_base_var) + *out_base_var = -1; return INT_MIN; + } if (!has_index) return base_off; if (!irop_is_immediate(idx) || !irop_is_immediate(scale)) @@ -652,45 +715,43 @@ int tcc_ir_ssa_opt_run(IRSSAOptCtx *ctx) const int max_iterations = 5; int changes; + /* Run one SSA pass, accumulate its change count, then make it observable: + * dbg_scan_imm_dest() for the SCAN_IMM_DEST bug hunt and + * tcc_ir_dump_after_pass() for -dump-ir-passes= golden snapshots + * (mirrors the legacy RUN_PASS macro in tccgen.c). */ +#define SSA_RUN(name, call) \ + do \ + { \ + if (!tcc_ir_opt_pass_disabled(name)) \ + changes += (call); \ + dbg_scan_imm_dest(ctx->ir, name); \ + tcc_ir_dump_after_pass(ctx->ir, name); \ + } while (0) + do { changes = 0; iteration++; /* target-independent passes */ - changes += ssa_opt_var_const_fold(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:var_const_fold"); - changes += ssa_opt_sccp(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:sccp"); - changes += ssa_opt_cprop(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:cprop"); + SSA_RUN("ssa:var_const_fold", ssa_opt_var_const_fold(ctx)); + SSA_RUN("ssa:sccp", ssa_opt_sccp(ctx)); + SSA_RUN("ssa:cprop", ssa_opt_cprop(ctx)); /* Collapse `V <- val [STORE]; ... PARAM V` into `... PARAM val` when V * has a single def and that lone PARAM as its only use. Catches the * inlined-check1 pattern that spills printf args into VARs ahead of * the conditional branch even when only the FAIL path reads them. 
*/ - changes += ssa_opt_var_to_param_forward(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:var_to_param_forward"); - changes += ssa_opt_fold(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:fold"); - changes += ssa_opt_load_cse(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:load_cse"); - changes += ssa_opt_branch(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:branch"); - changes += ssa_opt_cmp_eq_prop(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:cmp_eq_prop"); - changes += ssa_opt_reassoc(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:reassoc"); - changes += ssa_opt_strength(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:strength"); - changes += ssa_opt_narrow(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:narrow"); - changes += ssa_opt_gvn(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:gvn"); - changes += ssa_opt_phi_simplify(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:phi_simplify"); - changes += ssa_opt_dead_loop(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:dead_loop"); - changes += ssa_opt_dce(ctx); - dbg_scan_imm_dest(ctx->ir, "ssa:dce"); + SSA_RUN("ssa:var_to_param_forward", ssa_opt_var_to_param_forward(ctx)); + SSA_RUN("ssa:fold", ssa_opt_fold(ctx)); + SSA_RUN("ssa:load_cse", ssa_opt_load_cse(ctx)); + SSA_RUN("ssa:branch", ssa_opt_branch(ctx)); + SSA_RUN("ssa:cmp_eq_prop", ssa_opt_cmp_eq_prop(ctx)); + SSA_RUN("ssa:reassoc", ssa_opt_reassoc(ctx)); + SSA_RUN("ssa:strength", ssa_opt_strength(ctx)); + SSA_RUN("ssa:narrow", ssa_opt_narrow(ctx)); + SSA_RUN("ssa:gvn", ssa_opt_gvn(ctx)); + SSA_RUN("ssa:phi_simplify", ssa_opt_phi_simplify(ctx)); + SSA_RUN("ssa:dead_loop", ssa_opt_dead_loop(ctx)); + SSA_RUN("ssa:dce", ssa_opt_dce(ctx)); /* target-specific generators (registered by backend) */ if (target_gens && target_gen_count > 0) @@ -698,6 +759,7 @@ int tcc_ir_ssa_opt_run(IRSSAOptCtx *ctx) total += changes; } while (changes > 0 && iteration < max_iterations); +#undef SSA_RUN return total; } diff --git a/ir/opt/ssa_opt.h b/ir/opt/ssa_opt.h index 0234b27d..42e8fb6c 100644 --- a/ir/opt/ssa_opt.h +++ b/ir/opt/ssa_opt.h @@ -101,6 +101,9 @@ 
int ssa_opt_run_gens(IRSSAOptCtx *ctx, const IRSSAOptGen *gens, int count); IRSSAVregInfo *ssa_opt_vinfo(IRSSAOptCtx *ctx, int32_t vreg); void ssa_opt_add_use_instr(IRSSAVregInfo *vi, int instr_idx); void ssa_opt_add_use_phi(IRSSAVregInfo *vi, int block, int slot); +/* Append use-list entries for every vreg `q` (at index i) reads — same rules + * as the init-time scan (src1/src2, MLA accum, memory-write STORE dest). */ +void ssa_opt_scan_instr_uses(IRSSAOptCtx *ctx, int i, IRQuadCompact *q); void ssa_opt_remove_use_instr(IRSSAVregInfo *vi, int instr_idx); void ssa_opt_nop_instr(IRSSAOptCtx *ctx, int idx); int ssa_opt_replace_all_uses(IRSSAOptCtx *ctx, int32_t old_vr, int32_t new_vr); @@ -146,6 +149,19 @@ void ssa_drop_phi_edge(IRSSAOptCtx *ctx, int dead_pred_block, int target_block_i * if the chain doesn't resolve to a stack address. Multi-def TEMPs bail. */ int ssa_opt_resolve_lea_stackloc(IRSSAOptCtx *ctx, int32_t vr); +/* Like ssa_opt_resolve_lea_stackloc, but also reports the *identity* of the + * resolved address through *out_base_var: + * -1 -> a real direct stack slot (vreg_type == 0); the returned offset is + * authoritative and uniquely names the slot. + * >=0 -> the address is `&VAR` (a scalar local addressed via its VAR/PARAM + * spill encoding). At SSA time such locals have no assigned slot, so + * the returned offset is a placeholder (typically 0) SHARED by every + * distinct VAR — callers MUST disambiguate by *out_base_var, not by + * the offset alone, or they alias unrelated locals (ptr fuzz seed 67: + * &u2 and &u3 both -> offset 0). + * out_base_var may be NULL. It is set to -1 on an INT_MIN (unresolved) return. */ +int ssa_opt_resolve_lea_stackloc_ex(IRSSAOptCtx *ctx, int32_t vr, int32_t *out_base_var); + /* Resolve a vreg backward to its canonical (base_vr, offset) form. 
Chases * single-def ASSIGN copies and `T = base ADD #imm` chains until it lands * on a VAR/PARAM root (or a TEMP whose definition isn't a recognized copy @@ -163,6 +179,10 @@ int ssa_opt_resolve_temp_to_base_off(IRSSAOptCtx *ctx, int32_t vr, * dest is not TEMP-DEREF or the LEA chain does not resolve, or the index * is not a constant with scale 0. */ int ssa_opt_indirect_stack_offset(IRSSAOptCtx *ctx, const IRQuadCompact *q, int side); +/* Variant that also reports the resolved address identity via *out_base_var + * (see ssa_opt_resolve_lea_stackloc_ex for the -1 / >=0 contract). */ +int ssa_opt_indirect_stack_offset_ex(IRSSAOptCtx *ctx, const IRQuadCompact *q, int side, + int32_t *out_base_var); #define SSA_OPT_INDIRECT_DEST 0 /* STORE / STORE_INDEXED dest base */ #define SSA_OPT_INDIRECT_SRC1 1 /* LOAD / LOAD_INDEXED source base */ diff --git a/ir/opt/ssa_opt_branch.c b/ir/opt/ssa_opt_branch.c index 5e49fb5a..8ae3c9b0 100644 --- a/ir/opt/ssa_opt_branch.c +++ b/ir/opt/ssa_opt_branch.c @@ -105,7 +105,9 @@ void ssa_drop_phi_edge(IRSSAOptCtx *ctx, int dead_pred_block, static int ssa_block_for_instr(IRCFG *cfg, int instr_idx) { if (!cfg || !cfg->instr_to_block) return -1; - if (instr_idx < 0) return -1; + /* instr_to_block is sized to num_instrs at CFG-build time; instructions + * appended by later passes index past it, so bound-check both ends. */ + if (instr_idx < 0 || instr_idx >= cfg->num_instrs) return -1; return cfg->instr_to_block[instr_idx]; } diff --git a/ir/opt/ssa_opt_cprop.c b/ir/opt/ssa_opt_cprop.c index 68209f2c..dbdb42fe 100644 --- a/ir/opt/ssa_opt_cprop.c +++ b/ir/opt/ssa_opt_cprop.c @@ -60,6 +60,20 @@ static int ssa_gen_cprop_assign(IRSSAOptCtx *ctx, int idx) if (vi && vi->def_count > 1) return 0; + /* Do not propagate a copy whose dest feeds a phi operand. Such a copy + * `T_dest <- T_src` often resolves a phi (e.g. 
a loop back-edge value): + * folding T_dest away and naming T_src directly in the phi reintroduces the + * lost-copy problem at out-of-SSA phi resolution, since T_src stays live past + * the phi edge and its slot can be overwritten before the parallel copy runs + * (fuzz seed 2698: the loop-carried `cs` back-edge copy was dropped, yielding + * a wrong checksum). Leaving the copy in place keeps phi resolution correct; + * DCE still removes genuinely dead copies. */ + if (vi) { + for (int u = 0; u < vi->use_count; u++) + if (vi->uses[u].kind == SSA_USE_PHI) + return 0; + } + int replaced = ssa_opt_replace_all_uses(ctx, dest_vr, src_vr); return replaced > 0 ? 1 : 0; } @@ -227,8 +241,25 @@ static int ssa_gen_cprop_load_redundant(IRSSAOptCtx *ctx, int idx) if (irop_config[pq->op].has_dest && pq->op != TCCIR_OP_FUNCPARAMVAL && pq->op != TCCIR_OP_FUNCPARAMVOID) { IROperand pd = tcc_ir_op_get_dest(ir, pq); - if (irop_get_vreg(pd) == src_vr) + int32_t pd_vr = irop_get_vreg(pd); + if (pd_vr == src_vr) return 0; + /* A deref-style LOAD reads memory through a register pointer. Besides + * STOREs (handled above), a plain ALU/ASSIGN def of an address-taken + * VAR/PARAM also writes that memory — the value lives in the vreg's + * stack slot and the pointer may hold its address (fuzz ptr seed 6734: + * `p = &u; ... = *p; u = expr; ... = *p` — the second read must not + * reuse the first across u's update). */ + if (src.is_lval && !src.is_local && pd_vr >= 0 && + TCCIR_DECODE_VREG_TYPE(pd_vr) != TCCIR_VREG_TYPE_TEMP) { + IRLiveInterval *pdi = + (TCCIR_DECODE_VREG_TYPE(pd_vr) == TCCIR_VREG_TYPE_VAR || + TCCIR_DECODE_VREG_TYPE(pd_vr) == TCCIR_VREG_TYPE_PARAM) + ? tcc_ir_vreg_live_interval(ir, pd_vr) + : NULL; + if (!pdi || pdi->addrtaken) + return 0; + } } /* Match the prior LOAD: same op, same source flags+vreg, TEMP dest. 
*/ @@ -1125,6 +1156,20 @@ int ssa_opt_var_to_param_forward(IRSSAOptCtx *ctx) if (!touches) continue; + /* ARM barrel-shift fusion records a hidden shift on this use's src2 + * (ir->barrel_shifts[orig_index], set just before regalloc). Substituting + * the stored value here rewrites the operand fusion pinned — an immediate + * cannot be barrel-shifted, so codegen would silently drop the shift + * (volatile fuzz seed 16558: `(u6<<7)|x` folded to `u6|x`). One blocked + * use blocks the whole VAR: forwarding the others would NOP the def this + * use still reads. */ + if (ir->barrel_shifts && uq->orig_index >= 0 && + uq->orig_index <= ir->max_orig_index && + ir->barrel_shifts[uq->orig_index]) { + safe = 0; + break; + } + if (!v2v_dominates(cfg, def_blk, cfg->instr_to_block[j])) { safe = 0; break; @@ -1522,12 +1567,38 @@ static int ssa_var_const_fold_one(IRSSAOptCtx *ctx, int idx) return 0; } + /* The self-update always folds safely to the constant (Vx's value at `idx` + * is `prior_val` — the backward scan proved no write to Vx lies between). + * But the prior `Vx <- #const` def may still be read by an instruction + * *between* it and the self-update: e.g. + * si11 = -2992; // V2 <- #-2992 (prior_idx) + * si12 = si11 - si10; // V3 <- V2 SUB V1 (reads the prior def!) + * si11 = si11 & 0x7fff; // V2 <- V2 AND ... (idx, self-update) + * NOPing the prior def then leaves that intervening use reading an + * undefined Vx. Only drop the prior def when nothing in (prior_idx, idx) + * reads Vx. Stores/calls in that range already aborted the fold above, so + * every intervening read lives in a src1/src2 slot (incl. FUNCPARAMVAL, + * whose value is src1). 
*/ + int prior_used = 0; + for (int k = prior_idx + 1; k < idx; k++) { + IRQuadCompact *uq = &ir->compact_instructions[k]; + if (uq->op == TCCIR_OP_NOP) + continue; + IROperand us1 = tcc_ir_op_get_src1(ir, uq); + IROperand us2 = tcc_ir_op_get_src2(ir, uq); + if (irop_get_vreg(us1) == dest_vr || irop_get_vreg(us2) == dest_vr) { + prior_used = 1; + break; + } + } + IROperand imm = irop_make_imm32(0, (int32_t)result, dest.btype); q->op = TCCIR_OP_ASSIGN; tcc_ir_op_set_src1(ir, q, imm); tcc_ir_op_set_src2(ir, q, IROP_NONE); - ir->compact_instructions[prior_idx].op = TCCIR_OP_NOP; + if (!prior_used) + ir->compact_instructions[prior_idx].op = TCCIR_OP_NOP; return 1; } diff --git a/ir/opt/ssa_opt_dce.c b/ir/opt/ssa_opt_dce.c index a4d0f42c..eb260152 100644 --- a/ir/opt/ssa_opt_dce.c +++ b/ir/opt/ssa_opt_dce.c @@ -13,6 +13,8 @@ #include "ssa_opt.h" #include +extern int tcc_ir_opt_pass_disabled(const char *name); + static int dce_temp_worklist(IRSSAOptCtx *ctx) { int cap = ctx->vinfo_cap; @@ -604,6 +606,29 @@ static int dce_dead_stackloc_stores(IRSSAOptCtx *ctx) #undef SL_SET #undef SL_TEST +static int ssa_dce_block_in_backedge_region(IRCFG *cfg, int block) +{ + if (!cfg || block < 0 || block >= cfg->num_blocks) + return 0; + + IRBasicBlock *bb = &cfg->blocks[block]; + for (int h = 0; h < cfg->num_blocks; h++) { + IRBasicBlock *header = &cfg->blocks[h]; + for (int i = 0; i < header->num_preds; i++) { + int pred = header->preds[i]; + if (pred < 0 || pred >= cfg->num_blocks) + continue; + IRBasicBlock *latch = &cfg->blocks[pred]; + if (pred != h && latch->start_idx < header->start_idx) + continue; + if (bb->start_idx >= header->start_idx && + bb->start_idx <= latch->start_idx) + return 1; + } + } + return 0; +} + /* Aggressive dead phi cycle elimination. * * Standard DCE cannot break cycles of phi nodes and ASSIGN copies where each @@ -716,6 +741,7 @@ static int dce_dead_phi_cycles(IRSSAOptCtx *ctx) /* Phase 3: remove phi nodes whose dest TEMP is not live. 
*/ int changes = 0; for (int b = 0; b < cfg->num_blocks; b++) { + int in_backedge_region = ssa_dce_block_in_backedge_region(cfg, b); IRPhiNode **pp = &ssa->block_phis[b]; while (*pp) { IRPhiNode *phi = *pp; @@ -723,12 +749,31 @@ static int dce_dead_phi_cycles(IRSSAOptCtx *ctx) if (dv >= 0 && TCCIR_DECODE_VREG_TYPE(dv) == TCCIR_VREG_TYPE_TEMP) { int dp = TCCIR_DECODE_VREG_POSITION(dv); if (dp < cap && !BM_TEST(dp)) { + /* Phis in a natural back-edge region are not just value uses: phi + * resolution needs them to carry state through loop iterations. + * Removing such a phi can make out-of-SSA conflate a loop-carried + * value with its source even when the visible ASSIGN/phi graph + * looks dead (fp_round seed 18960). */ + if (in_backedge_region) { + pp = &phi->next; + continue; + } + if (getenv("TCC_DBG_PHI_CYCLES")) { + fprintf(stderr, "[phi_cycles] remove phi block=%d dest=T%d ops:", b, dp); + for (int pi = 0; pi < phi->num_operands; pi++) + fprintf(stderr, " %d", phi->operands[pi].vreg); + fprintf(stderr, "\n"); + } for (int pi = 0; pi < phi->num_operands; pi++) { IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, phi->operands[pi].vreg); if (vi && vi->use_count > 0) vi->use_count--; } *pp = phi->next; + /* Free the unlinked node — it is no longer reachable from block_phis, + * so tcc_ir_ssa_free would otherwise never reclaim it. */ + tcc_free(phi->operands); + tcc_free(phi); changes++; continue; } @@ -737,8 +782,33 @@ static int dce_dead_phi_cycles(IRSSAOptCtx *ctx) } } - if (changes) + if (changes) { + /* Rebuild the FULL use lists before cascading: the per-operand + * use_count-- above operates on counts that may already be stale + * (same desync family as ptr seed 7226 — count-only updates let a + * live use fall off the tracked list), so the worklist could delete + * a def still feeding a LIVE phi (fp_round seed 18960: a loop-carried + * copy's def died and out-of-SSA conflated it with its multi-def + * source). Mirrors the rebuild done by the ssa_opt_dce driver. 
*/ + for (int p = 0; p < ctx->vinfo_cap; p++) + ctx->vinfo[p].use_count = 0; + for (int i = 0; i < ir->next_instruction_index; i++) { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) + continue; + ssa_opt_scan_instr_uses(ctx, i, q); + } + for (int b = 0; b < cfg->num_blocks; b++) { + for (IRPhiNode *phi = ssa->block_phis[b]; phi; phi = phi->next) { + for (int pi = 0; pi < phi->num_operands; pi++) { + IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, phi->operands[pi].vreg); + if (vi) + ssa_opt_add_use_phi(vi, b, pi); + } + } + } changes += dce_temp_worklist(ctx); + } #undef BM_SET #undef BM_TEST @@ -1115,53 +1185,37 @@ int ssa_opt_dce(IRSSAOptCtx *ctx) changes += dce_dead_overwrite_stores(ctx); changes += dce_dead_stackloc_stores(ctx); if (changes) { - /* Repair stale TEMP use counts: some passes NOP instructions - * without fully updating the use-def chains. Rebuild accurate - * counts in O(n) so the final temp worklist can cascade. */ + /* Repair stale TEMP use chains: some passes NOP or rewrite + * instructions without fully updating the use-def chains. Rebuild + * the FULL use lists in O(n), not just the counts — truncating + * use_count while keeping the old uses[] entries desynchronizes the + * two, so the surviving prefix can hold a stale entry while a live + * use falls off the end. A later replace_all_uses then walks the + * wrong list, leaves the live use un-rewritten, and this DCE deletes + * a def that is still referenced (ptr fuzz seed 7226: *p9's pointer + * temp lost its deref use and the deref read an undefined vreg). 
*/ for (int p = 0; p < ctx->vinfo_cap; p++) ctx->vinfo[p].use_count = 0; for (int i = 0; i < ctx->ir->next_instruction_index; i++) { IRQuadCompact *q = &ctx->ir->compact_instructions[i]; if (q->op == TCCIR_OP_NOP) continue; - if (irop_config[q->op].has_src1) { - IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, - irop_get_vreg(tcc_ir_op_get_src1(ctx->ir, q))); - if (vi) vi->use_count++; - } - if (irop_config[q->op].has_src2) { - IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, - irop_get_vreg(tcc_ir_op_get_src2(ctx->ir, q))); - if (vi) vi->use_count++; - } - if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || - q->op == TCCIR_OP_STORE_POSTINC) { - IROperand d = tcc_ir_op_get_dest(ctx->ir, q); - /* STORE with non-lval VREG dest is a value def, not a memory - * write — dest is not a use. See ssa_opt_scan_instr_uses. */ - if (q->op != TCCIR_OP_STORE || d.is_lval) { - IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, irop_get_vreg(d)); - if (vi) vi->use_count++; - } - } - if (q->op == TCCIR_OP_MLA) { - IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, - irop_get_vreg(tcc_ir_op_get_accum(ctx->ir, q))); - if (vi) vi->use_count++; - } + ssa_opt_scan_instr_uses(ctx, i, q); } - /* Count phi operand uses */ + /* Rebuild phi operand uses */ for (int b = 0; b < ctx->cfg->num_blocks; b++) { for (IRPhiNode *phi = ctx->ssa->block_phis[b]; phi; phi = phi->next) { for (int pi = 0; pi < phi->num_operands; pi++) { IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, phi->operands[pi].vreg); - if (vi) vi->use_count++; + if (vi) + ssa_opt_add_use_phi(vi, b, pi); } } } changes += dce_temp_worklist(ctx); } - changes += dce_dead_phi_cycles(ctx); + if (!tcc_ir_opt_pass_disabled("ssa:dce:phi_cycles")) + changes += dce_dead_phi_cycles(ctx); } return changes; diff --git a/ir/opt/ssa_opt_dead_loop.c b/ir/opt/ssa_opt_dead_loop.c index b3f919ed..e9f2a176 100644 --- a/ir/opt/ssa_opt_dead_loop.c +++ b/ir/opt/ssa_opt_dead_loop.c @@ -83,10 +83,70 @@ static int loop_max_idx(IRLoop *loop) return m; } +/* The forward-jump body-extension 
heuristic in tcc_ir_detect_loops also follows + * the loop's own exit branch (the header's conditional `jumpif exit`): + * structurally that exit is a forward jump past the back-edge, so body_instrs[] + * (and hence loop_max_idx) can reach the exit target and the post-loop + * instructions beyond it. The dead-loop transforms must NOT treat those as loop + * body — otherwise try_kill_loop_body believes a post-loop use of the induction + * variable is in-loop, kills the loop, and deletes the trailing compare, leaving + * a flag-less conditional branch (a miscompile). + * + * Clamp the body upper bound to just before the forward exit target. A natural + * loop never exits into the middle of its own body, so every real body + * instruction — including a body reached by a forward jump past the back-edge — + * sits strictly before the exit target. Clamping hides no real body work; it + * only drops the spuriously-included post-loop tail. When the header doesn't + * open with a CMP+JUMPIF (no identifiable forward exit) the bound is left as-is. 
*/ +static int dead_loop_body_hi(TCCIRState *ir, IRLoop *loop) +{ + int hi = loop_max_idx(loop); + + int cmp_idx = -1; + for (int j = loop->header_idx; j <= hi && j < ir->next_instruction_index; j++) { + int op = ir->compact_instructions[j].op; + if (op == TCCIR_OP_NOP) + continue; + if (op == TCCIR_OP_CMP) { + cmp_idx = j; + break; + } + break; /* header doesn't open with a compare — leave the bound as-is */ + } + if (cmp_idx < 0) + return hi; + + int jpf_idx = cmp_idx + 1; + while (jpf_idx <= hi && ir->compact_instructions[jpf_idx].op == TCCIR_OP_NOP) + jpf_idx++; + if (jpf_idx > hi || ir->compact_instructions[jpf_idx].op != TCCIR_OP_JUMPIF) + return hi; + + IROperand exit_dest = tcc_ir_op_get_dest(ir, &ir->compact_instructions[jpf_idx]); + int exit_target = (int)irop_get_imm64_ex(ir, exit_dest); + + /* A natural loop never exits into the middle of its own body, so the body is + * exactly [header_idx, exit_target): exit_target-1 is the authoritative upper + * bound. loop_max_idx() can OVER-count (a spuriously-included post-loop tail) + * OR UNDER-count: when the body sits past a split/rotated back-edge the loop + * detector's end_idx stops at the back-edge, leaving the real body (the + * straight-line region between the back-edge and the exit target) outside + * loop_max_idx. Under-counting was a wrong-code bug: loop_body_has_side_effects + * and rewrite_loop_exit_phis' in-loop-use guard then missed the body's CALLs and + * in-loop phi uses, so a loop-carried header phi got folded to its latch constant + * and corrupted the first-iteration read (random-C O1/O2 wrong-code, seeds + * 51/52/132/281). Take exit_target-1 as the bound in both directions. 
*/ + if (exit_target > loop->header_idx) + hi = exit_target - 1; + if (hi >= ir->next_instruction_index) + hi = ir->next_instruction_index - 1; + return hi; +} + static int loop_body_has_side_effects(IRSSAOptCtx *ctx, IRLoop *loop) { TCCIRState *ir = ctx->ir; - int hi = loop_max_idx(loop); + int hi = dead_loop_body_hi(ir, loop); for (int idx = loop->start_idx; idx <= hi && idx < ir->next_instruction_index; idx++) { IRQuadCompact *q = &ir->compact_instructions[idx]; if (q->op == TCCIR_OP_NOP) @@ -150,7 +210,7 @@ typedef struct LoopEntryInfo { static int analyze_loop_entry(IRSSAOptCtx *ctx, IRLoop *loop, LoopEntryInfo *out) { TCCIRState *ir = ctx->ir; - int hi = loop_max_idx(loop); + int hi = dead_loop_body_hi(ir, loop); memset(out, 0, sizeof(*out)); /* Walk the header forward to find the controlling CMP. */ @@ -306,7 +366,7 @@ static int rewrite_loop_exit_phis(IRSSAOptCtx *ctx, IRLoop *loop) if (header_block < 0) return 0; int latch_block = cfg->instr_to_block[loop->end_idx]; - int hi = loop_max_idx(loop); + int hi = dead_loop_body_hi(ir, loop); int changes = 0; @@ -417,7 +477,7 @@ static int try_kill_loop_body(IRSSAOptCtx *ctx, IRLoop *loop) IRSSAState *ssa = ctx->ssa; IRCFG *cfg = ctx->cfg; - int hi = loop_max_idx(loop); + int hi = dead_loop_body_hi(ir, loop); /* Re-locate the header CMP+JUMPIF; the IR may have been modified above. */ int cmp_idx = -1; @@ -554,7 +614,7 @@ static int rewrite_loop_exit_phis_guarded(IRSSAOptCtx *ctx, IRLoop *loop, LoopEn if (!ssa || !ssa->block_phis || !cfg) return 0; - int hi = loop_max_idx(loop); + int hi = dead_loop_body_hi(ir, loop); /* Collect qualifying value phis. 
*/ enum { MAX_CANDS = 4 }; diff --git a/ir/opt/ssa_opt_fold.c b/ir/opt/ssa_opt_fold.c index 7353e2b8..7df8ec17 100644 --- a/ir/opt/ssa_opt_fold.c +++ b/ir/opt/ssa_opt_fold.c @@ -23,6 +23,13 @@ * x - x, x ^ x → 0 * ============================================================================ */ +static int has_barrel_shift_annotation(TCCIRState *ir, const IRQuadCompact *q) +{ + return ir->barrel_shifts && q->orig_index >= 0 && + q->orig_index <= ir->max_orig_index && + ir->barrel_shifts[q->orig_index] != 0; +} + /* Resolve a vreg operand back to its constant defining ASSIGN, if any. * In SSA a TEMP is single-def, so following its def to an ASSIGN #imm gives * the value the operand will carry at runtime. Returns 1 and sets *out_val @@ -69,6 +76,9 @@ static int fold_binary(IRSSAOptCtx *ctx, int idx) IROperand src2 = tcc_ir_op_get_src2(ir, q); IROperand dest = tcc_ir_op_get_dest(ir, q); + if (has_barrel_shift_annotation(ir, q)) + return 0; + int32_t dest_vr = irop_get_vreg(dest); if (dest_vr < 0 || TCCIR_DECODE_VREG_TYPE(dest_vr) != TCCIR_VREG_TYPE_TEMP) return 0; @@ -110,7 +120,75 @@ static int fold_binary(IRSSAOptCtx *ctx, int idx) /* Both operands immediate: full constant fold */ if (src1_is_imm && src2_is_imm) { + /* An IMM32 operand of a 64-bit op is a sign-extended 64-bit constant + * (irop_get_int64 semantics); evaluating it with 32-bit arithmetic + * loses the high word (fuzz longlong seed 3161: `#imm SHR #32` folded + * to 0 instead of the sign-bits 0xFFFFFFFF). 
*/ + int is_64 = irop_is_64bit(dest); + if (!is_64 && (irop_is_64bit(src1) || irop_is_64bit(src2))) + return 0; int64_t result; + if (is_64) { + int64_t v1 = (int64_t)val1; + int64_t v2 = (int64_t)val2; + switch (q->op) { + case TCCIR_OP_ADD: result = (int64_t)((uint64_t)v1 + (uint64_t)v2); break; + case TCCIR_OP_SUB: result = (int64_t)((uint64_t)v1 - (uint64_t)v2); break; + case TCCIR_OP_MUL: result = (int64_t)((uint64_t)v1 * (uint64_t)v2); break; + case TCCIR_OP_AND: result = v1 & v2; break; + case TCCIR_OP_OR: result = v1 | v2; break; + case TCCIR_OP_XOR: result = v1 ^ v2; break; + case TCCIR_OP_SHL: + if ((uint64_t)v2 >= 64) result = 0; + else result = (int64_t)((uint64_t)v1 << v2); + break; + case TCCIR_OP_SHR: + if ((uint64_t)v2 >= 64) result = 0; + else result = (int64_t)((uint64_t)v1 >> v2); + break; + case TCCIR_OP_SAR: + if ((uint64_t)v2 >= 64) result = v1 >> 63; + else result = v1 >> v2; + break; + case TCCIR_OP_DIV: + if (v2 == 0) return 0; + if (v2 == -1 && v1 == INT64_MIN) return 0; + result = v1 / v2; + break; + case TCCIR_OP_UDIV: + if (v2 == 0) return 0; + result = (int64_t)((uint64_t)v1 / (uint64_t)v2); + break; + case TCCIR_OP_IMOD: + if (v2 == 0) return 0; + if (v2 == -1 && v1 == INT64_MIN) return 0; + result = v1 % v2; + break; + case TCCIR_OP_UMOD: + if (v2 == 0) return 0; + result = (int64_t)((uint64_t)v1 % (uint64_t)v2); + break; + default: + /* ROR has no 64-bit form */ + return 0; + } + + IROperand imm; + if (result == (int64_t)(int32_t)result) + imm = irop_make_imm32(0, (int32_t)result, dest.btype); + else + imm = irop_make_i64(0, tcc_ir_pool_add_i64(ir, result), dest.btype); + q->op = TCCIR_OP_ASSIGN; + tcc_ir_op_set_src1(ir, q, imm); + tcc_ir_op_set_src2(ir, q, IROP_NONE); + + IRSSAVregInfo *vi; + vi = ssa_opt_vinfo(ctx, irop_get_vreg(src1)); + if (vi) ssa_opt_remove_use_instr(vi, idx); + vi = ssa_opt_vinfo(ctx, irop_get_vreg(src2)); + if (vi) ssa_opt_remove_use_instr(vi, idx); + return 1; + } switch (q->op) { case TCCIR_OP_ADD: result 
= (int64_t)((uint64_t)(uint32_t)val1 + (uint64_t)(uint32_t)val2); break; case TCCIR_OP_SUB: result = (int64_t)((uint64_t)(uint32_t)val1 - (uint64_t)(uint32_t)val2); break; diff --git a/ir/opt/ssa_opt_gvn.c b/ir/opt/ssa_opt_gvn.c index adbbdefa..71bfc3c9 100644 --- a/ir/opt/ssa_opt_gvn.c +++ b/ir/opt/ssa_opt_gvn.c @@ -246,6 +246,16 @@ static int gvn_process_block(IRSSAOptCtx *ctx, IRCFG *cfg, GVNEntry **table, int if (dest_vr < 0 || TCCIR_DECODE_VREG_TYPE(dest_vr) != TCCIR_VREG_TYPE_TEMP) continue; + /* Do not value-number 64-bit results. Replacing a 64-bit computation with + * an ASSIGN copy of a congruent earlier one is value-correct, but the extra + * register-pair copy is mishandled downstream — the copied high word is + * dropped, so a later `>> 32` reads 0 (longlong seed 686: loop-unroll makes + * the 5 copies of `q11 = q12 | const` congruent; GVN turns them into copies + * of one T86 and the SHR#32 that extracts q11's high word then yields 0). + * 64-bit CSE is rare; decline it rather than emit a truncating copy. */ + if (irop_is_64bit(dest)) + continue; + IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, dest_vr); if (vi && vi->def_count > 1) continue; diff --git a/ir/opt/ssa_opt_load_cse.c b/ir/opt/ssa_opt_load_cse.c index 2d2f8cf7..5d2f9f46 100644 --- a/ir/opt/ssa_opt_load_cse.c +++ b/ir/opt/ssa_opt_load_cse.c @@ -54,6 +54,12 @@ typedef struct { int btype; int32_t stored_vr; /* TEMP vreg, or -1 if immediate */ IROperand stored_imm; /* valid when stored_vr == -1 */ + /* Identity of the stored-to location: -1 for a real direct stack slot (the + * offset uniquely names it), or the VAR/PARAM base vreg for a `&VAR` address + * whose offset is a placeholder shared by every distinct local. A load only + * forwards from this entry when its own resolved base matches (ptr fuzz seed + * 67: `&u2` and `&u3` both resolve to offset 0 but must not alias). 
*/ + int32_t base_var; } SStoreEntry; typedef struct { @@ -173,13 +179,15 @@ static void sstore_invalidate_overlap(GLoadState *st, int offset, int btype) } } -static void sstore_track_vr(GLoadState *st, int offset, int btype, int32_t stored_vr) +static void sstore_track_vr(GLoadState *st, int offset, int btype, int32_t stored_vr, + int32_t base_var) { sstore_invalidate_overlap(st, offset, btype); int k = sstore_find(st, offset); if (k >= 0) { st->sstores[k].btype = btype; st->sstores[k].stored_vr = stored_vr; + st->sstores[k].base_var = base_var; return; } if (st->scount >= SSTORE_MAX) @@ -188,9 +196,11 @@ static void sstore_track_vr(GLoadState *st, int offset, int btype, int32_t store e->stack_offset = offset; e->btype = btype; e->stored_vr = stored_vr; + e->base_var = base_var; } -static void sstore_track_imm(GLoadState *st, int offset, int btype, IROperand imm) +static void sstore_track_imm(GLoadState *st, int offset, int btype, IROperand imm, + int32_t base_var) { sstore_invalidate_overlap(st, offset, btype); int k = sstore_find(st, offset); @@ -198,6 +208,7 @@ static void sstore_track_imm(GLoadState *st, int offset, int btype, IROperand im st->sstores[k].btype = btype; st->sstores[k].stored_vr = -1; st->sstores[k].stored_imm = imm; + st->sstores[k].base_var = base_var; return; } if (st->scount >= SSTORE_MAX) @@ -207,6 +218,7 @@ static void sstore_track_imm(GLoadState *st, int offset, int btype, IROperand im e->btype = btype; e->stored_vr = -1; e->stored_imm = imm; + e->base_var = base_var; } static void sstore_remove_vr(GLoadState *st, int32_t vr) @@ -398,6 +410,24 @@ static void iload_remove_vr(GLoadState *st, int32_t vr) } } +/* A direct def of an address-taken VAR/PARAM (`V <-- T SUB #imm`, plain ALU + * or ASSIGN — not a STORE op) still writes V's stack slot, memory that the + * TEMP-pointer-keyed trackers (iloads, tvstores) may name through a `&V` + * pointer. 
Keying by vreg ID can't see that aliasing, so drop both trackers + * (fuzz ptr seed 6734: `p = &u; ..= *p; u = expr; ..= *p` — the second read + * CSE'd to the first across u's update). */ +static void ptr_state_kill_for_addrtaken_def(TCCIRState *ir, GLoadState *st, int32_t dvr) +{ + int type = TCCIR_DECODE_VREG_TYPE(dvr); + if (type != TCCIR_VREG_TYPE_VAR && type != TCCIR_VREG_TYPE_PARAM) + return; + IRLiveInterval *vi = tcc_ir_vreg_live_interval(ir, dvr); + if (vi && !vi->addrtaken) + return; + st->ilcount = 0; + st->tvcount = 0; +} + /* Kill iload entries that may alias a store at byte range [store_lo, store_hi) * through base store_base_vr. Entries with a different base_vr are killed * conservatively (different TEMP vregs may still alias the same memory). */ @@ -653,23 +683,35 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init if (src.is_lval && !src.is_sym && !irop_is_immediate(src)) { int load_off = INT_MIN; int load_btype = irop_get_btype(src); + int32_t load_base = -1; if (src.tag == IROP_TAG_STACKOFF) { int32_t svr = irop_get_vreg(src); if (svr < 0 || TCCIR_DECODE_VREG_TYPE(svr) != TCCIR_VREG_TYPE_VAR) load_off = irop_get_stack_offset(src); } else if (src.tag == IROP_TAG_VREG && !src.is_local) { int32_t pvr = irop_get_vreg(src); - load_off = resolve_lea_stackloc(ctx, pvr); + load_off = ssa_opt_resolve_lea_stackloc_ex(ctx, pvr, &load_base); } if (load_off != INT_MIN) { int sk = sstore_find(st, load_off); - if (sk >= 0 && st->sstores[sk].btype == load_btype) { + if (sk >= 0 && st->sstores[sk].btype == load_btype && + st->sstores[sk].base_var == load_base) { SStoreEntry *se = &st->sstores[sk]; if (se->stored_vr < 0) { + /* The deref source is replaced by the forwarded immediate, + * so the pointer vreg is no longer referenced here — drop + * its use record, like every sibling forwarding path. 
A + * stale entry corrupts the pointer's use list (ptr fuzz + * seed 7226: a later swap-remove + count-only rebuild left + * the wrong entry, a live deref use vanished, and cprop/DCE + * deleted the pointer's def while a deref still read it). */ + if (src.tag == IROP_TAG_VREG) { + IRSSAVregInfo *pvi = ssa_opt_vinfo(ctx, irop_get_vreg(src)); + if (pvi) + ssa_opt_remove_use_instr(pvi, i); + } tcc_ir_set_src1(ir, i, se->stored_imm); changes++; - /* Refresh dest after rewrite (no-op for STORE; just use - * existing local to keep flow consistent). */ } } } @@ -679,16 +721,54 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init /* Track StackLoc stores for stack forwarding. Record the stored * slot width so narrower subfield loads do not reuse wider values. */ if (dest.tag == IROP_TAG_STACKOFF) { + /* A real stack memory write may alias any TVStore pointer: a + * tvstore is only tracked when its pointer did NOT resolve to a + * stack slot, so nothing proves it doesn't point right here (ptr + * fuzz seed 8507: `*T = const` with T = Addr[StackLoc[-32]]+4 + * survived the direct store `StackLoc[-28] <- u2` to the same + * address and forwarded the stale constant into a later deref). */ + if (dest.is_lval || q->op == TCCIR_OP_STORE_INDEXED) + st->tvcount = 0; IROperand src = tcc_ir_op_get_src1(ir, q); int32_t svr = irop_get_vreg(src); /* Direct stack stores are encoded as StackLoc lvalues. Non-lvalue * STACKOFF operands are stack addresses, not memory writes. */ if (!ctx->no_stack_fwd && dest.is_local && dest.is_lval && !dest.is_llocal) { int store_btype = irop_get_btype(dest); + /* irop_get_vreg(dest) is -1 for a real stack slot (offset is the + * identity) or the VAR vreg for a named local addressed by its slot + * encoding (offset is a placeholder; the vreg is the identity). 
*/ + int32_t dest_base = irop_get_vreg(dest); if (svr >= 0 && TCCIR_DECODE_VREG_TYPE(svr) == TCCIR_VREG_TYPE_TEMP) - sstore_track_vr(st, irop_get_stack_offset(dest), store_btype, svr); + sstore_track_vr(st, irop_get_stack_offset(dest), store_btype, svr, dest_base); else if (irop_is_immediate(src)) - sstore_track_imm(st, irop_get_stack_offset(dest), store_btype, src); + sstore_track_imm(st, irop_get_stack_offset(dest), store_btype, src, dest_base); + else { + int off = irop_get_stack_offset(dest); + sstore_invalidate_overlap(st, off, store_btype); + sstore_remove_offset(st, off); + } + } else if (q->op == TCCIR_OP_STORE_INDEXED) { + /* Indexed write through a stack base address (Addr[StackLoc[B]] + + * idx*scale). The base-offset-only removal in the plain branch below + * dropped just the B slot, leaving the sibling slots forwardable even + * though a runtime index can land on any of them (fuzz seed 2657: + * `arr[i]=v` with runtime i, then a fully-unrolled `for k arr[k]` whose + * reads wrongly forwarded the initializer values for k != B). With a + * constant index invalidate just the exact slot; with a runtime index + * conservatively drop all stack-store and indexed-load forwarding. */ + IROperand idx = tcc_ir_op_get_src2(ir, q); + IROperand sc = tcc_ir_op_get_scale(ir, q); + if (irop_is_immediate(idx) && irop_is_immediate(sc)) { + int off = irop_get_stack_offset(dest) + + (int)irop_get_imm32(idx) * (1 << irop_get_imm32(sc)); + sstore_invalidate_overlap(st, off, irop_get_btype(dest)); + sstore_remove_offset(st, off); + st->ilcount = 0; + } else { + st->scount = 0; + st->ilcount = 0; + } } else { /* A non-direct STACKOFF write may expose the address. 
*/ int off = irop_get_stack_offset(dest); @@ -709,16 +789,17 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init ((q->op == TCCIR_OP_STORE && dest.is_lval) || q->op == TCCIR_OP_STORE_INDEXED)); if (store_dest_is_temp_indir) { - int eff_off = ssa_opt_indirect_stack_offset(ctx, q, SSA_OPT_INDIRECT_DEST); + int32_t store_base = -1; + int eff_off = ssa_opt_indirect_stack_offset_ex(ctx, q, SSA_OPT_INDIRECT_DEST, &store_base); if (eff_off != INT_MIN) { if (!ctx->no_stack_fwd) { IROperand src = tcc_ir_op_get_src1(ir, q); int store_btype = irop_get_btype(dest); int32_t svr = irop_get_vreg(src); if (svr >= 0 && TCCIR_DECODE_VREG_TYPE(svr) == TCCIR_VREG_TYPE_TEMP) - sstore_track_vr(st, eff_off, store_btype, svr); + sstore_track_vr(st, eff_off, store_btype, svr, store_base); else if (irop_is_immediate(src)) - sstore_track_imm(st, eff_off, store_btype, src); + sstore_track_imm(st, eff_off, store_btype, src, store_base); else sstore_remove_offset(st, eff_off); } @@ -817,12 +898,18 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init gstore_remove_vr(st, dvr); tvstore_remove_vr(st, dvr); iload_remove_vr(st, dvr); + ptr_state_kill_for_addrtaken_def(ir, st, dvr); } continue; } } if (dest.is_sym && dest.is_lval) { + /* Same aliasing gap as stack stores: an unresolved TVStore pointer + * may name this very global (`&sym + off` LEAs that never became + * SymRef operands are exactly what tvstores track), so a direct + * sym store must drop them. 
*/ + st->tvcount = 0; IRPoolSymref *sref = irop_get_symref_ex(ir, dest); if (sref && sref->sym) { for (int k = 0; k < st->count; k++) { @@ -896,6 +983,7 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init gstore_remove_vr(st, qdvr); tvstore_remove_vr(st, qdvr); iload_remove_vr(st, qdvr); + ptr_state_kill_for_addrtaken_def(ir, st, qdvr); } } @@ -957,10 +1045,12 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init * tracked stack store at that offset. ssa_opt_indirect_stack_offset * already enforces scale==0 and constant idx. */ if (!ctx->no_stack_fwd) { - int eff_off = ssa_opt_indirect_stack_offset(ctx, q, SSA_OPT_INDIRECT_SRC1); + int32_t load_base = -1; + int eff_off = ssa_opt_indirect_stack_offset_ex(ctx, q, SSA_OPT_INDIRECT_SRC1, &load_base); if (eff_off != INT_MIN) { int sk = sstore_find(st, eff_off); - if (sk >= 0 && st->sstores[sk].btype == il_btype) { + if (sk >= 0 && st->sstores[sk].btype == il_btype && + st->sstores[sk].base_var == load_base) { SStoreEntry *se = &st->sstores[sk]; IROperand new_src; if (se->stored_vr >= 0) { @@ -1113,6 +1203,7 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init /* Stack store-load forwarding */ if (src1.is_lval && !src1.is_sym) { int stack_off = INT_MIN; + int32_t load_base = -1; /* Direct StackLoc load: T <-- StackLoc[N] [LOAD]. 
* Skip if the operand carries a VAR vreg — that's a load from a @@ -1127,7 +1218,7 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init /* LEA+DEREF load: T <-- *Addr[StackLoc[N]] [LOAD] */ if (stack_off == INT_MIN) { int32_t ptr_vr = irop_get_vreg(src1); - stack_off = resolve_lea_stackloc(ctx, ptr_vr); + stack_off = ssa_opt_resolve_lea_stackloc_ex(ctx, ptr_vr, &load_base); } if (stack_off != INT_MIN) { @@ -1136,6 +1227,11 @@ static int gload_process_block(IRSSAOptCtx *ctx, GLoadState *st_init, int b_init SStoreEntry *se = &st->sstores[sk]; if (se->btype != dest_btype) continue; + /* Only forward when the store and this load name the same location: + * for `&VAR` addresses the offset is a shared placeholder, so the + * canonical base must match (ptr fuzz seed 67). */ + if (se->base_var != load_base) + continue; IROperand new_src; if (se->stored_vr >= 0) { new_src = irop_make_vreg(se->stored_vr, dest_btype); diff --git a/ir/opt/ssa_opt_phi.c b/ir/opt/ssa_opt_phi.c index 01b7c1c3..4c42d8b8 100644 --- a/ir/opt/ssa_opt_phi.c +++ b/ir/opt/ssa_opt_phi.c @@ -58,8 +58,23 @@ int ssa_opt_phi_simplify(IRSSAOptCtx *ctx) continue; } - /* Replace all uses of phi->dest_vreg with unique */ + /* Replace all uses of phi->dest_vreg with unique. The replacement + * can bail and rewrite NOTHING when a use must keep dest's exact vreg + * identity — e.g. an ARM barrel-shift src2 whose implicit shift is + * keyed on the operand's vreg (ssa_opt_use_is_barrel_shift_src2). + * Dropping the phi while such uses remain leaves them referencing an + * undefined value: the def vanishes but the use does not. (fuzz seed + * 19826: a loop-invariant local read after the loop as `x >> n` then + * read 0, because its loop-closing phi was simplified away while the + * barrel-shifted use kept the phi-dest vreg.) Only drop the phi once + * dest_vreg is genuinely use-free; otherwise keep it so phi resolution + * still materializes it. 
*/ ssa_opt_replace_all_uses(ctx, phi->dest_vreg, unique); + IRSSAVregInfo *dvi = ssa_opt_vinfo(ctx, phi->dest_vreg); + if (dvi && dvi->use_count > 0) { + pp = &(*pp)->next; + continue; + } /* Remove phi from the list */ *pp = phi->next; diff --git a/ir/opt/ssa_opt_reassoc.c b/ir/opt/ssa_opt_reassoc.c index 746a7f4e..08a88754 100644 --- a/ir/opt/ssa_opt_reassoc.c +++ b/ir/opt/ssa_opt_reassoc.c @@ -33,6 +33,13 @@ * so we don't increase register pressure. * ============================================================================ */ +static int has_barrel_shift_annotation(TCCIRState *ir, const IRQuadCompact *q) +{ + return ir->barrel_shifts && q->orig_index >= 0 && + q->orig_index <= ir->max_orig_index && + ir->barrel_shifts[q->orig_index] != 0; +} + static int reassoc_binary(IRSSAOptCtx *ctx, int idx) { TCCIRState *ir = ctx->ir; @@ -46,6 +53,13 @@ static int reassoc_binary(IRSSAOptCtx *ctx, int idx) if (src2.tag != IROP_TAG_IMM32 || src2.is_lval) return 0; + /* The ARM barrel-shift fusion pass records a hidden shift on an ALU op's + * src2 in ir->barrel_shifts[orig_index]. Later SSA folds can still make + * that visible src2 look like a plain immediate, but reassociating through + * it would combine constants as if the shift did not exist. 
*/ + if (has_barrel_shift_annotation(ir, q)) + return 0; + /* src1 must be a single-use TEMP vreg */ int32_t src1_vr = irop_get_vreg(src1); if (src1_vr < 0 || TCCIR_DECODE_VREG_TYPE(src1_vr) != TCCIR_VREG_TYPE_TEMP) @@ -58,6 +72,8 @@ static int reassoc_binary(IRSSAOptCtx *ctx, int idx) return 0; IRQuadCompact *inner = &ir->compact_instructions[vi->def_instr]; + if (has_barrel_shift_annotation(ir, inner)) + return 0; /* Inner op must also have an immediate in src2 */ IROperand inner_src1 = tcc_ir_op_get_src1(ir, inner); @@ -195,6 +211,10 @@ static int reassoc_add_cancel_const(IRSSAOptCtx *ctx, int idx) IRQuadCompact *d1 = &ir->compact_instructions[vi1->def_instr]; IRQuadCompact *d2 = &ir->compact_instructions[vi2->def_instr]; + if (has_barrel_shift_annotation(ir, q) || + has_barrel_shift_annotation(ir, d1) || + has_barrel_shift_annotation(ir, d2)) + return 0; /* Match (a OP1 c) and (a OP2 c) where OP1/OP2 are {ADD, SUB} and the * constants cancel (same value with opposite signs in the combined sum). */ diff --git a/ir/opt/ssa_opt_sccp.c b/ir/opt/ssa_opt_sccp.c index 83e5f434..d2ef2de8 100644 --- a/ir/opt/ssa_opt_sccp.c +++ b/ir/opt/ssa_opt_sccp.c @@ -429,6 +429,89 @@ static int sccp_no_aliasing_between(SCCPState *s, int store_idx, int load_idx, return 1; } +/* Recover the base stack offset of an indexed/postinc store's destination + * array, whether the base address is a direct STACKOFF operand + * (Addr[StackLoc[off]], emitted for `arr[i] = v` where `arr` is a local array) + * or a TEMP that LEA-resolves to a stack slot. Returns INT_MIN when the base + * cannot be pinned to a local stack address. Unlike + * sccp_store_indexed_base_off() this also accepts the direct-STACKOFF base + * (vreg == -1) so the entry-block alias check below can bound an indexed + * write whose index is not a known constant. 
*/ +static int sccp_indexed_store_base_off(IRSSAOptCtx *ctx, IRQuadCompact *q) +{ + if (q->op != TCCIR_OP_STORE_INDEXED && q->op != TCCIR_OP_STORE_POSTINC) + return INT_MIN; + TCCIRState *ir = ctx->ir; + IROperand base = tcc_ir_op_get_dest(ir, q); + if (base.tag == IROP_TAG_STACKOFF && base.is_local && irop_get_vreg(base) == -1) + return irop_get_stack_offset(base); + if (base.tag == IROP_TAG_VREG && !base.is_local) { + int32_t bvr = irop_get_vreg(base); + if (bvr >= 0 && TCCIR_DECODE_VREG_TYPE(bvr) == TCCIR_VREG_TYPE_TEMP) + return ssa_opt_resolve_lea_stackloc(ctx, bvr); + } + return INT_MIN; +} + +/* Entry-block initializers are usually allowed to forward broadly, but a later + * write whose stack byte range resolves exactly still clobbers that value. + * Keep this narrower than sccp_no_aliasing_between(): do not treat calls or + * unresolved pointer stores as barriers here, preserving the older permissive + * behavior for common aggregate-init shapes. */ +static int sccp_resolved_stack_write_between(SCCPState *s, int store_idx, int load_idx, + int soff, int load_btype) +{ + TCCIRState *ir = s->ctx->ir; + int load_size = sccp_btype_bytes(load_btype); + int load_lo = soff; + int load_hi = soff + load_size; + for (int i = store_idx + 1; i < load_idx; i++) { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op != TCCIR_OP_STORE && q->op != TCCIR_OP_STORE_INDEXED && + q->op != TCCIR_OP_STORE_POSTINC) + continue; + int store_btype = 0; + int target = sccp_store_target_off(s->ctx, q, &store_btype); + if (target == INT_MIN) { + /* Unresolved concrete offset. A STORE_INDEXED / STORE_POSTINC into a + * stack array still clobbers our load when the array's extent covers the + * load slot, even though the index is not a known constant during this + * scan. 
The entry-block exemption must NOT skip such a write: seed 3691 + * had a conditional `arr[i] = v` whose index was still a TEMP at SCCP + * time, so sccp_store_target_off() returned INT_MIN and the array-init + * LOAD wrongly folded back to the initializer. Mirror the indexed-base + * extent check sccp_no_aliasing_between() applies for the non-entry path. */ + if (q->op == TCCIR_OP_STORE_INDEXED || q->op == TCCIR_OP_STORE_POSTINC) { + const int LCS_INDEXED_MAX_ARRAY = 64; + int base_off = sccp_indexed_store_base_off(s->ctx, q); + if (base_off == INT_MIN) + return 1; /* indexed write to an unknown base — may alias the load */ + int extent_lo = base_off; + int extent_hi = base_off + LCS_INDEXED_MAX_ARRAY; + if (extent_hi > load_lo && load_hi > extent_lo) + return 1; /* the array's plausible extent covers the load slot */ + } + /* A plain STORE through a pointer that doesn't resolve to a concrete + * stack slot (e.g. the pointer lives in a named VAR) can write any + * address-taken frame slot — including our load's. The entry-block + * exemption must not skip it (ptr fuzz seed 58108: a conditional + * `*p10 = v` between arr8's initializer and an arr8[5] load was + * ignored, folding the load back to the initializer). VAR-slot + * writes and address-materialisation pseudo-stores stay permissive, + * as do calls (handled by the caller's dominator-path checks). */ + else if (q->op == TCCIR_OP_STORE && sccp_store_may_escape(s->ctx, q)) { + return 1; + } + continue; + } + int store_lo = target; + int store_hi = target + sccp_btype_bytes(store_btype); + if (store_hi > load_lo && load_hi > store_lo) + return 1; + } + return 0; +} + /* Back-edge-aware clobber check. sccp_no_aliasing_between only scans the * linear IR range between a dominating store and the load, on the assumption * that every path from store to load lies within that range. 
That assumption @@ -625,15 +708,12 @@ static int sccp_resolve_stack_load(SCCPState *s, int soff, int load_btype, break; } } - /* Only run the cross-block alias check when the matched STORE is NOT - * in the entry basic block. Entry-block stores are direct array - * initializers that the broader pipeline has always treated as - * dominating subsequent code; tightening that here regresses common - * vector/struct-init patterns (e.g. scal-to-vec1) without catching - * any real aliasing bug. Mid-function stores — including LCS's - * residual STOREs that replace a folded loop's memory writes — are - * the ones that need the alias check, because intervening loop - * bodies can contain STORE_INDEXED writes through the same array. */ + /* Mid-function stores — including LCS's residual STOREs that replace a + * folded loop's memory writes — need the full alias check, because + * intervening blocks can contain unresolved pointer writes. Entry-block + * stores stay more permissive for aggregate-init patterns, but a later + * STORE_INDEXED/direct STORE that resolves to the same concrete stack + * bytes still invalidates the initializer. */ /* A loop between the (dominating) store and the load whose body writes * the slot makes the loaded value loop-carried, not the stored constant. * The linear alias scan below is skipped for entry-block stores, so this @@ -647,9 +727,14 @@ static int sccp_resolve_stack_load(SCCPState *s, int soff, int load_btype, } int entry_block = (cfg->num_blocks > 0) ? 
0 : -1; int store_block = cfg->instr_to_block[matched_idx]; - int needs_alias_check = (matched_idx >= 0 && store_block != entry_block); - if (needs_alias_check && - !sccp_no_aliasing_between(s, matched_idx, instr_idx, soff, load_btype)) { + int aliases_between = 0; + if (matched_idx >= 0) { + if (store_block == entry_block) + aliases_between = sccp_resolved_stack_write_between(s, matched_idx, instr_idx, soff, load_btype); + else + aliases_between = !sccp_no_aliasing_between(s, matched_idx, instr_idx, soff, load_btype); + } + if (aliases_between) { /* Aliasing write in between — restore state and treat as unknown. */ *out = saved_out; if (dep_pos) *dep_pos = saved_dep; @@ -1027,6 +1112,51 @@ static void sccp_visit_phi(SCCPState *s, IRPhiNode *phi, int block) } } +/* Conservative fixpoint repair for the optimistic-propagation gap documented at + * the re-sweep loop: a phi (typically a loop-header phi in an un-rotated loop) + * can settle at CONST while one of its operands — arriving on an EXECUTABLE + * edge — is still TOP because its defining value was never lowered and the + * worklist never re-propagated it. sccp_visit_phi SKIPS TOP operands, so even + * the re-sweep never widens such a phi. At a true fixpoint no reachable value + * stays TOP, so a TOP source on an executable edge is an inconsistency: trust + * nothing and force the phi to BOTTOM rather than keep the partial constant + * (which would fold the loop-carried value to its latch constant — a + * miscompile, e.g. 990527-1's `for(...){j++; g(j); j=9;}` summing 9*10 instead + * of 1+8*10). Monotone (only descends cells), so convergence is preserved. + * Returns the count of phis forced to BOTTOM. 
*/ +static int sccp_force_stuck_phis_bottom(SCCPState *s) +{ + IRSSAState *ssa = s->ctx->ssa; + if (!ssa || !ssa->block_phis) + return 0; + int forced = 0; + for (int blk = 0; blk < s->num_blocks; blk++) { + if (!s->block_reachable[blk]) + continue; + for (IRPhiNode *phi = ssa->block_phis[blk]; phi; phi = phi->next) { + SCCPCell *dest = sccp_cell(s, phi->dest_vreg); + if (!dest || dest->state != SCCP_CONST) + continue; + for (int i = 0; i < phi->num_operands; i++) { + int pred = phi->operands[i].pred_block; + if (pred < 0 || pred >= s->num_blocks) + continue; + if (!s->edge_exec[pred * s->num_blocks + blk]) + continue; + SCCPCell *src = sccp_cell(s, phi->operands[i].vreg); + if (src && src->state == SCCP_TOP) { + if (sccp_set_bottom(dest)) { + sccp_add_ssa(s, TCCIR_DECODE_VREG_POSITION(phi->dest_vreg)); + forced++; + } + break; + } + } + } + } + return forced; +} + static void sccp_visit_instr(SCCPState *s, int idx) { TCCIRState *ir = s->ctx->ir; @@ -1060,6 +1190,20 @@ static void sccp_visit_instr(SCCPState *s, int idx) goto handle_control_flow; } + /* A barrel-shift-fused ALU op (opt_fusion) carries a hidden shift applied to + * one operand, recorded in ir->barrel_shifts[] and invisible in the IR + * operands. Lattice-evaluating it as a plain ALU op would compute the wrong + * constant (e.g. `x & (y<<7)` folded as `x & y`), so force it to BOTTOM — the + * same guard GVN already uses (ssa_opt_gvn.c). Random-C O1 wrong-code, + * seed 215. 
*/ + if (ir->barrel_shifts && q->orig_index >= 0 && + q->orig_index <= ir->max_orig_index && + ir->barrel_shifts[q->orig_index]) { + if (sccp_set_bottom(dest_cell)) + sccp_add_ssa(s, TCCIR_DECODE_VREG_POSITION(dest_vr)); + goto handle_control_flow; + } + int is_64 = (dest.btype == IROP_BTYPE_INT64); /* ASSIGN: propagate source value */ @@ -1222,6 +1366,17 @@ static void sccp_visit_instr(SCCPState *s, int idx) break; } } + /* Degenerate conditional branch: the taken target IS the fall-through + * block (a JUMPIF to the next instruction), so there is no successor + * distinct from target_block and the loop above leaves fall_block = -1. + * Both branch outcomes go to the same single successor — point the + * fall-through there too, otherwise resolving the branch "not taken" + * would add an edge to block -1 and leave the real successor (and any + * definition it carries into a downstream phi) wrongly unreachable. + * DCE collapsing the only instruction between a JUMPIF and its target + * produces exactly this shape (seed 1454). */ + if (fall_block < 0) + fall_block = target_block; int resolved = 0; if (ci >= 0) { @@ -1713,6 +1868,10 @@ int ssa_opt_sccp(IRSSAOptCtx *ctx) for (int i = bb->start_idx; i < bb->end_idx; i++) sccp_visit_instr(&s, i); } + /* Repair optimistic-fold gaps: any phi left CONST with a TOP operand on an + * executable edge is widened to BOTTOM, re-seeding the worklists so its + * dependents re-evaluate before we accept the fixpoint. */ + sccp_force_stuck_phis_bottom(&s); if (s.cfg_wl_count == 0 && s.ssa_wl_count == 0) break; } diff --git a/ir/opt_branch.c b/ir/opt_branch.c index d15db5a0..0db7f29f 100644 --- a/ir/opt_branch.c +++ b/ir/opt_branch.c @@ -44,12 +44,35 @@ static int vrp_get_slot(int vr_type, int pos) return -1; } +/* VRP models 32-bit values as sign-extended int32 (the IMM32 operand + * encoding, which the range table and its ADD/SUB arithmetic use). 
A 32-bit + * unsigned constant can instead arrive as a pool-stored I64 holding the + * ZERO-extended value (e.g. #3435266601, printed as #-859700695), and mixing + * the two encodings in one int64 comparison flips unsigned compares (ptr + * fuzz seed 35289: `T = 0 or both < 0 as int64), so the uint32 ordering is monotone. */ static int vrp_fold_cmp(int64_t rmin, int64_t rmax, int64_t cmp_val, int tok) { + /* Enforce the precondition above instead of trusting every caller: a + * mixed-sign range covers both halves of the uint32 space, so endpoint + * checks say nothing about the values in between. */ + if ((tok == 0x92 || tok == 0x93 || tok == 0x96 || tok == 0x97) && + (rmin < 0) != (rmax < 0)) + return -1; int res_min = evaluate_compare_condition(rmin, cmp_val, tok); int res_max = evaluate_compare_condition(rmax, cmp_val, tok); if (res_min < 0 || res_max < 0 || res_min != res_max) @@ -490,20 +513,27 @@ int tcc_ir_opt_vrp(TCCIRState *ir) { int src_slot = vrp_get_slot(TCCIR_DECODE_VREG_TYPE(src1_vr), TCCIR_DECODE_VREG_POSITION(src1_vr)); int dst_slot = vrp_get_slot(TCCIR_DECODE_VREG_TYPE(dest_vr), TCCIR_DECODE_VREG_POSITION(dest_vr)); - if (src_slot >= 0 && ranges[src_slot].valid && dst_slot >= 0) + int64_t imm; + if (src_slot >= 0 && ranges[src_slot].valid && dst_slot >= 0 && + vrp_read_const32(ir, src2, &imm)) { - int64_t imm = irop_get_imm64_ex(ir, src2); int64_t new_min = (q->op == TCCIR_OP_ADD) ? ranges[src_slot].min_val + imm : ranges[src_slot].min_val - imm; int64_t new_max = (q->op == TCCIR_OP_ADD) ? 
ranges[src_slot].max_val + imm : ranges[src_slot].max_val - imm; - /* Clamp to int32 range to stay within 32-bit value semantics */ - if (new_min < (int64_t)INT32_MIN) - new_min = INT32_MIN; - if (new_max > (int64_t)INT32_MAX) - new_max = INT32_MAX; - ranges[dst_slot].valid = 1; - ranges[dst_slot].min_val = new_min; - ranges[dst_slot].max_val = new_max; - ranges_dirty = 1; + /* A result outside int32 wraps in 32-bit arithmetic and the wrapped + * value set is not an interval in this domain, so drop the range + * rather than clamp (a clamped endpoint asserts a value the program + * never actually takes). */ + if (new_min < (int64_t)INT32_MIN || new_max > (int64_t)INT32_MAX) + { + ranges[dst_slot].valid = 0; + } + else + { + ranges[dst_slot].valid = 1; + ranges[dst_slot].min_val = new_min; + ranges[dst_slot].max_val = new_max; + ranges_dirty = 1; + } } else if (dst_slot >= 0) { @@ -525,6 +555,39 @@ int tcc_ir_opt_vrp(TCCIRState *ir) { int32_t s1_vr = irop_get_vreg(src1); int32_t d_vr = irop_get_vreg(dest); + + /* Seed a singleton range from a plain immediate assignment: T = #imm + * gives the destination the range [imm, imm]. Without this, the very + * first range in an "assign a constant, then compare it" chain is never + * established -- every other range source (fall-through constraints from + * a prior CMP, ADD/SUB propagation, and vreg-to-vreg copy propagation + * below) can only forward a range that already exists, none can create + * one from a bare immediate -- so `T = #5; CMP T,#20; JUMPIF LT` never + * folds. See docs/bugs.md #6. Constants are normalized to the pass's + * sign-extended-int32 domain by vrp_read_const32 (64-bit-typed values + * are rejected); a non-vreg or lval/sym destination is not a plain + * value definition and is left to the generic invalidation. 
*/ + if (irop_is_immediate(src1) && d_vr >= 0 && !dest.is_lval && !dest.is_sym) + { + int d_slot = vrp_get_slot(TCCIR_DECODE_VREG_TYPE(d_vr), TCCIR_DECODE_VREG_POSITION(d_vr)); + if (d_slot >= 0) + { + int64_t imm; + if (vrp_read_const32(ir, src1, &imm)) + { + ranges[d_slot].valid = 1; + ranges[d_slot].min_val = imm; + ranges[d_slot].max_val = imm; + ranges_dirty = 1; + } + else + { + ranges[d_slot].valid = 0; + } + continue; + } + } + int src_type = (s1_vr >= 0) ? TCCIR_DECODE_VREG_TYPE(s1_vr) : -1; int src_forwards_value = s1_vr >= 0 && @@ -557,15 +620,17 @@ int tcc_ir_opt_vrp(TCCIRState *ir) if (src1_vr >= 0) { int src_slot = vrp_get_slot(TCCIR_DECODE_VREG_TYPE(src1_vr), TCCIR_DECODE_VREG_POSITION(src1_vr)); - int64_t cmp_val = irop_get_imm64_ex(ir, src2); + int64_t cmp_val = 0; + int cmp_val_ok = vrp_read_const32(ir, src2, &cmp_val); IROperand cond_op = tcc_ir_op_get_src1(ir, jump_q); int tok = (int)irop_get_imm64_ex(ir, cond_op); IROperand jmp_dest = tcc_ir_op_get_dest(ir, jump_q); /* Tautology fold: unsigned compare against zero is always-true * (>=U 0) or always-false (= 0 && ranges[src_slot].valid) + if (src_slot >= 0 && ranges[src_slot].valid && cmp_val_ok) { int64_t rmin = ranges[src_slot].min_val; int64_t rmax = ranges[src_slot].max_val; @@ -656,7 +721,7 @@ int tcc_ir_opt_vrp(TCCIRState *ir) } /* Set pending fall-through constraint: NOT(cond) holds after JUMPIF not-taken */ - if (src_slot >= 0 && i + 2 < n) + if (src_slot >= 0 && i + 2 < n && cmp_val_ok) { int64_t new_min = INT32_MIN; int64_t new_max = INT32_MAX; @@ -746,7 +811,8 @@ int tcc_ir_opt_vrp(TCCIRState *ir) IROperand bs = tcc_ir_op_get_src1(ir, bq); if ((bq->op == TCCIR_OP_ASSIGN || bq->op == TCCIR_OP_LOAD) && irop_is_immediate(bs)) { - if (irop_get_imm64_ex(ir, bs) == new_min) + int64_t bs_val; + if (!vrp_read_const32(ir, bs, &bs_val) || bs_val == new_min) bp_safe = 0; continue; } @@ -857,9 +923,10 @@ int tcc_ir_opt_vrp(TCCIRState *ir) * the constrained PARAM; any other def shape is unknown. 
* Skip when the CMP dereferences its source (is_lval) — the * constraint tracks the scalar value, not the pointed-to. */ + int64_t sf_cmp_val; if (!have_range && !src1.is_lval && eq_scope_src_slot >= 0 && - i < eq_scope_end && cmp_slot >= 0) { - int64_t sf_cmp_val = irop_get_imm64_ex(ir, src2); + i < eq_scope_end && cmp_slot >= 0 && + vrp_read_const32(ir, src2, &sf_cmp_val)) { IROperand sf_cond_op = tcc_ir_op_get_src1(ir, jump_q); int sf_tok = (int)irop_get_imm64_ex(ir, sf_cond_op); int sf_unified = -2; @@ -875,7 +942,7 @@ int tcc_ir_opt_vrp(TCCIRState *ir) IROperand bs = tcc_ir_op_get_src1(ir, bq); if ((bq->op == TCCIR_OP_ASSIGN || bq->op == TCCIR_OP_LOAD) && irop_is_immediate(bs)) { - def_val = irop_get_imm64_ex(ir, bs); + if (!vrp_read_const32(ir, bs, &def_val)) { sf_safe = 0; continue; } } else if (bq->op == TCCIR_OP_ASSIGN || bq->op == TCCIR_OP_LOAD) { int32_t bsv = irop_get_vreg(bs); if (bsv >= 0 && !bs.is_lval) { @@ -899,9 +966,9 @@ int tcc_ir_opt_vrp(TCCIRState *ir) have_range = 1; } } - if (have_range) + int64_t cmp_val; + if (have_range && vrp_read_const32(ir, src2, &cmp_val)) { - int64_t cmp_val = irop_get_imm64_ex(ir, src2); int64_t rmin = ranges[cmp_slot].min_val; int64_t rmax = ranges[cmp_slot].max_val; IROperand set_src1_op = tcc_ir_op_get_src1(ir, jump_q); @@ -1441,6 +1508,7 @@ int tcc_ir_opt_nonneg_branch_fold(TCCIRState *ir) int tcc_ir_opt_branch_folding(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("branch_fold")) return 0; if (ir->next_instruction_index < 2) return 0; IROptCtx ctx; diff --git a/ir/opt_constfold.c b/ir/opt_constfold.c index 242d71fd..cf1a1223 100644 --- a/ir/opt_constfold.c +++ b/ir/opt_constfold.c @@ -580,7 +580,16 @@ int tcc_ir_opt_self_copy_elim(TCCIRState *ir) !ir_opt_get_call_param_operand(ir, i, 1, &p1)) continue; - if (!ir_opt_pure_expr_equal(ir, p0, i, p1, i, 0)) + /* Resolve each param's source at its own marshalling site, not at the call + * index. 
If the source temp is redefined between param0 and param1, using + * the call index as the use-site for both collapses them to the same (last) + * reaching definition and the self-copy fold fires incorrectly. */ + int p0_idx = ir_opt_get_call_param_index(ir, i, 0); + int p1_idx = ir_opt_get_call_param_index(ir, i, 1); + if (p0_idx < 0 || p1_idx < 0) + continue; + + if (!ir_opt_pure_expr_equal(ir, p0, p0_idx, p1, p1_idx, 0)) continue; /* Self-copy: NOP the param marshalling and the call itself. @@ -1634,7 +1643,7 @@ int tcc_ir_simulate_switch_func_ex(const TCCFuncSwitchSnapshot *snap, int64_t ar case TCCIR_OP_ADD: case TCCIR_OP_SUB: { - int64_t l, r1; + int64_t l = 0, r1 = 0; int rl = switch_sim_read_src(&env, o, 1, &l); int rr = switch_sim_read_src(&env, o, 2, &r1); if (rl == 0 || rr == 0) @@ -1945,7 +1954,7 @@ static int rebuild_sim_env(const TCCFuncSwitchSnapshot *snap, int64_t arg_value, case TCCIR_OP_ADD: case TCCIR_OP_SUB: { - int64_t l, r1; + int64_t l = 0, r1 = 0; int rl = switch_sim_read_src(env, o, 1, &l); int rr = switch_sim_read_src(env, o, 2, &r1); if (rl == 0 || rr == 0) return 0; diff --git a/ir/opt_constprop.c b/ir/opt_constprop.c index dea399e7..9e4bae37 100644 --- a/ir/opt_constprop.c +++ b/ir/opt_constprop.c @@ -46,6 +46,113 @@ static int nan_compare_branch_result(int cond_token) } } +static int cmp_operand_is_unsigned_int(IROperand op) +{ + int btype = irop_get_btype(op); + return op.is_unsigned && + (btype == IROP_BTYPE_INT8 || btype == IROP_BTYPE_INT16 || + btype == IROP_BTYPE_INT32 || btype == IROP_BTYPE_INT64); +} + +static int cmp_operands_unsigned_width(IROperand src1, IROperand src2) +{ + return (irop_get_btype(src1) == IROP_BTYPE_INT64 || + irop_get_btype(src2) == IROP_BTYPE_INT64) + ? 
64 + : 32; +} + +static int unsigned_cond_for_cmp_operands(int cond, IROperand src1, IROperand src2) +{ + if (!cmp_operand_is_unsigned_int(src1) && !cmp_operand_is_unsigned_int(src2)) + return cond; + + switch (cond) + { + case TOK_LT: + return TOK_ULT; + case TOK_GE: + return TOK_UGE; + case TOK_LE: + return TOK_ULE; + case TOK_GT: + return TOK_UGT; + default: + return cond; + } +} + +static int evaluate_compare_condition_cmp_operands(int64_t val1, int64_t val2, int cond, + IROperand src1, IROperand src2) +{ + cond = unsigned_cond_for_cmp_operands(cond, src1, src2); + if (cmp_operands_unsigned_width(src1, src2) != 64) + { + int32_t s1 = (int32_t)(uint32_t)val1; + int32_t s2 = (int32_t)(uint32_t)val2; + switch (cond) + { + case TOK_EQ: + return (uint32_t)val1 == (uint32_t)val2; + case TOK_NE: + return (uint32_t)val1 != (uint32_t)val2; + case TOK_LT: + return s1 < s2; + case TOK_GE: + return s1 >= s2; + case TOK_LE: + return s1 <= s2; + case TOK_GT: + return s1 > s2; + default: + break; + } + } + switch (cond) + { + case TOK_ULT: + case TOK_UGE: + case TOK_ULE: + case TOK_UGT: + { + if (cmp_operands_unsigned_width(src1, src2) == 64) + return evaluate_compare_condition(val1, val2, cond); + uint32_t u1 = (uint32_t)val1; + uint32_t u2 = (uint32_t)val2; + switch (cond) + { + case TOK_ULT: + return u1 < u2; + case TOK_UGE: + return u1 >= u2; + case TOK_ULE: + return u1 <= u2; + case TOK_UGT: + return u1 > u2; + default: + break; + } + } + default: + return evaluate_compare_condition(val1, val2, cond); + } +} + +static int64_t ir_opt_fit_const_to_operand(int64_t val, IROperand op) +{ + switch (irop_get_btype(op)) + { + case IROP_BTYPE_INT8: + return op.is_unsigned ? (int64_t)(uint8_t)val : (int64_t)(int8_t)val; + case IROP_BTYPE_INT16: + return op.is_unsigned ? (int64_t)(uint16_t)val : (int64_t)(int16_t)val; + case IROP_BTYPE_INT32: + return op.is_unsigned ? 
(int64_t)(uint32_t)val : (int64_t)(int32_t)val; + default: + return val; + } +} + /* Refresh stale `interval->addrtaken` flags. The flag is set by the * frontend when source code takes a variable's address, but earlier * optimizer passes may have eliminated the producing LEA (e.g. a dead @@ -238,9 +345,22 @@ static int refresh_stale_var_addrtaken(TCCIRState *ir) return cleared; } +/* MLA carries a 4th (accumulator) operand at pool[operand_base+3] that is a + * real USE of its vreg but is invisible to the has_src1/has_src2 operand + * config. Every use-scan that decides whether a def is dead must include + * it, or a value consumed only as an MLA accumulator is treated as unread + * and its def deleted (ptr seed 6869). Returns -1 when there is none. */ +static int32_t ir_opt_mla_accum_vreg(const TCCIRState *ir, const IRQuadCompact *q) +{ + if (q->op != TCCIR_OP_MLA) + return -1; + return irop_get_vreg(tcc_ir_op_get_accum(ir, q)); +} + static int tcc_ir_opt_const_var_prop__timed(TCCIRState *ir); int tcc_ir_opt_const_var_prop(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("const_var_prop")) return 0; tcc_pass_timing_init(); if (!tcc_pass_timing_on) return tcc_ir_opt_const_var_prop__timed(ir); unsigned long _t = tcc_pass_clk_us(); @@ -248,6 +368,28 @@ int tcc_ir_opt_const_var_prop(TCCIRState *ir) tcc_pass_timing_add("const_var_prop", tcc_pass_clk_us() - _t); return _r; } + +static int ir_has_variadic_stack_arg_call(TCCIRState *ir) +{ + int n = ir ? 
ir->next_instruction_index : 0; + for (int i = 0; i < n; i++) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op != TCCIR_OP_FUNCCALLVAL && q->op != TCCIR_OP_FUNCCALLVOID) + continue; + + Sym *callee = irop_get_sym_ex(ir, tcc_ir_op_get_src1(ir, q)); + if (!callee || !callee->type.ref || callee->type.ref->f.func_type != FUNC_ELLIPSIS) + continue; + + IROperand meta = tcc_ir_op_get_src2(ir, q); + int argc = TCCIR_DECODE_CALL_ARGC((uint32_t)irop_get_imm64_ex(ir, meta)); + if (argc > 4) + return 1; + } + return 0; +} + static int tcc_ir_opt_const_var_prop__timed(TCCIRState *ir) { int n = ir->next_instruction_index; @@ -264,6 +406,14 @@ static int tcc_ir_opt_const_var_prop__timed(TCCIRState *ir) * where `dead` got eliminated as unread. */ refresh_stale_var_addrtaken(ir); + /* ARM variadic calls with anonymous arguments beyond r0-r3 use the caller's + * outgoing stack area. varargs seed 31282 exposed a backend/register- + * allocation miscompile only after this whole-function constant propagator + * aggressively simplified such call regions; keep the IR shape conservative + * until that lower-level ABI bug is fixed directly. */ + if (ir_has_variadic_stack_arg_call(ir)) + return 0; + /* Phase 1: Find constant VAR vregs (assigned exactly once with immediate * or symref). For symrefs we also remember is_lval/is_local/is_const so * the rebuilt operand at the use site preserves the original semantics. 
*/ @@ -395,6 +545,13 @@ static int tcc_ir_opt_const_var_prop__timed(TCCIRState *ir) if (var_info[pos].use_count < 255) var_info[pos].use_count++; } + int32_t acc_vr = ir_opt_mla_accum_vreg(ir, q); + if (acc_vr >= 0 && TCCIR_DECODE_VREG_TYPE(acc_vr) == TCCIR_VREG_TYPE_VAR) + { + int pos = TCCIR_DECODE_VREG_POSITION(acc_vr); + if (pos <= max_var_pos && var_info[pos].use_count < 255) + var_info[pos].use_count++; + } } /* Mark multiply-defined vars as non-constant */ @@ -536,6 +693,13 @@ static int tcc_ir_opt_const_var_prop__timed(TCCIRState *ir) has_use[pos / 8] |= (1 << (pos % 8)); } } + int32_t acc_vr = ir_opt_mla_accum_vreg(ir, q); + if (acc_vr >= 0 && TCCIR_DECODE_VREG_TYPE(acc_vr) == TCCIR_VREG_TYPE_VAR) + { + int pos = TCCIR_DECODE_VREG_POSITION(acc_vr); + if (pos <= max_var_pos) + has_use[pos / 8] |= (1 << (pos % 8)); + } } /* NOP dead ASSIGN instructions for constant VARs with no remaining uses */ @@ -1092,18 +1256,24 @@ int tcc_ir_opt_symref_const_prop(TCCIRState *ir) changes++; } - /* Record new ASSIGN(symref) definitions for downstream substitution. */ - if (q->op == TCCIR_OP_ASSIGN && irop_config[q->op].has_dest) + /* Record new ASSIGN(symref) definitions for downstream substitution, and + * invalidate any tracked tmp redefined by a write that does NOT record a + * fresh copy. Both cases share the dest-decode prologue, so they live in + * one branch: an ASSIGN whose source is not a non-lval symref must still + * fall through to invalidation (it redefines the tmp), which an + * `if/else if` split would have skipped. 
*/ + if (irop_config[q->op].has_dest) { IROperand dest = tcc_ir_op_get_dest(ir, q); int32_t dvr = irop_get_vreg(dest); if (TCCIR_DECODE_VREG_TYPE(dvr) == TCCIR_VREG_TYPE_TEMP) { - IROperand src1 = tcc_ir_op_get_src1(ir, q); - if (src1.is_sym && !src1.is_lval) + int pos = TCCIR_DECODE_VREG_POSITION(dvr); + int recorded = 0; + if (q->op == TCCIR_OP_ASSIGN) { - int pos = TCCIR_DECODE_VREG_POSITION(dvr); - if (pos <= max_tmp_pos) + IROperand src1 = tcc_ir_op_get_src1(ir, q); + if (src1.is_sym && !src1.is_lval && pos <= max_tmp_pos) { map[pos].gen = current_gen; map[pos].pool_idx = (uint32_t)src1.u.pool_idx; @@ -1111,19 +1281,11 @@ int tcc_ir_opt_symref_const_prop(TCCIRState *ir) map[pos].is_local = src1.is_local; map[pos].is_const = src1.is_const; map[pos].is_unsigned = src1.is_unsigned; + recorded = 1; } } - } - } - /* Any other write that targets a tracked tmp invalidates it. */ - else if (irop_config[q->op].has_dest) - { - IROperand dest = tcc_ir_op_get_dest(ir, q); - int32_t dvr = irop_get_vreg(dest); - if (TCCIR_DECODE_VREG_TYPE(dvr) == TCCIR_VREG_TYPE_TEMP) - { - int pos = TCCIR_DECODE_VREG_POSITION(dvr); - if (pos <= max_tmp_pos && map[pos].gen == current_gen) + /* Not a fresh copy record → this write kills any tracked symref. 
*/ + if (!recorded && pos <= max_tmp_pos && map[pos].gen == current_gen) map[pos].gen = 0; } } @@ -1345,7 +1507,8 @@ static int ir_opt_vreg_use_count(TCCIRState *ir, int32_t vreg) if (q->op == TCCIR_OP_NOP) continue; if (irop_get_vreg(tcc_ir_op_get_src1(ir, q)) == vreg || - irop_get_vreg(tcc_ir_op_get_src2(ir, q)) == vreg) + irop_get_vreg(tcc_ir_op_get_src2(ir, q)) == vreg || + ir_opt_mla_accum_vreg(ir, q) == vreg) count++; } return count; @@ -1473,6 +1636,7 @@ static int eval_cmp_operand_const(TCCIRState *ir, IROperand op, int use_idx, uin int64_t target_off = irop_get_stack_offset(op); int op_btype = irop_get_btype(op); + int32_t target_vr = irop_get_vreg(op); for (int j = use_idx - 1; j >= 0; j--) { @@ -1497,6 +1661,8 @@ static int eval_cmp_operand_const(TCCIRState *ir, IROperand op, int use_idx, uin int64_t soff = irop_get_stack_offset(sdest); if (soff != target_off) continue; + if (target_vr >= 0 && irop_get_vreg(sdest) != target_vr) + return 0; if (irop_get_btype(sdest) != op_btype) return 0; @@ -1514,6 +1680,7 @@ static int eval_cmp_operand_const(TCCIRState *ir, IROperand op, int use_idx, uin static int tcc_ir_opt_const_prop__timed(TCCIRState *ir); int tcc_ir_opt_const_prop(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("const_prop")) return 0; tcc_pass_timing_init(); if (!tcc_pass_timing_on) return tcc_ir_opt_const_prop__timed(ir); unsigned long _t = tcc_pass_clk_us(); @@ -1913,6 +2080,13 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) if (pos <= max_var_pos && var_info[pos].use_count < 255) var_info[pos].use_count++; } + int32_t acc_vr = ir_opt_mla_accum_vreg(ir, uq); + if (acc_vr >= 0 && TCCIR_DECODE_VREG_TYPE(acc_vr) == TCCIR_VREG_TYPE_VAR) + { + int pos = TCCIR_DECODE_VREG_POSITION(acc_vr); + if (pos <= max_var_pos && var_info[pos].use_count < 255) + var_info[pos].use_count++; + } } /* Second pass: propagate constants and apply algebraic simplifications */ @@ -1979,6 +2153,27 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) * 
multiple uses — keep the VAR alive so a single load suffices. */ if (var_info[pos].use_count > 1 && VAR_CONST_NEEDS_POOL_LOAD(val)) continue; + /* CMP computes src1 - src2; moving a constant into src1 inverts the + * subtraction and reverses ordered conditions (LT/GT/LE/GE) read by + * the following JUMPIF/SETIF. Only propagate when src2 is also a + * compile-time constant, so the whole CMP folds. (See the matching + * guard in tcc_ir_opt_const_prop_tmp__timed.) */ + if (q->op == TCCIR_OP_CMP) + { + IROperand cmp_s2 = tcc_ir_op_get_src2(ir, q); + int s2_const = irop_is_immediate(cmp_s2); + if (!s2_const) + { + int32_t s2_vr = irop_get_vreg(cmp_s2); + if (s2_vr >= 0 && TCCIR_DECODE_VREG_TYPE(s2_vr) == TCCIR_VREG_TYPE_VAR) + { + int sp = TCCIR_DECODE_VREG_POSITION(s2_vr); + s2_const = (sp <= max_var_pos && var_info[sp].is_constant); + } + } + if (!s2_const) + continue; + } IROperand new_src1; int btype = irop_get_btype(src1); if (val == (int32_t)val) @@ -2134,23 +2329,35 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) result = (val1 != 0) || (val2 != 0) ? 
1 : 0; break; case TCCIR_OP_IMOD: - if (val2 != 0) + if (val2 == 0) { - result = val1 % val2; + can_fold = 0; /* Division by zero - don't fold */ + } + else if (val2 == -1 && + ((btype == IROP_BTYPE_INT64 && val1 == INT64_MIN) || + (btype != IROP_BTYPE_INT64 && (int32_t)val1 == INT32_MIN))) + { + can_fold = 0; /* INT_MIN % -1 overflows in two's complement - bail */ } else { - can_fold = 0; /* Division by zero - don't fold */ + result = val1 % val2; } break; case TCCIR_OP_DIV: - if (val2 != 0) + if (val2 == 0) { - result = val1 / val2; + can_fold = 0; /* Division by zero - don't fold */ + } + else if (val2 == -1 && + ((btype == IROP_BTYPE_INT64 && val1 == INT64_MIN) || + (btype != IROP_BTYPE_INT64 && (int32_t)val1 == INT32_MIN))) + { + can_fold = 0; /* INT_MIN / -1 overflows in two's complement - bail */ } else { - can_fold = 0; /* Division by zero - don't fold */ + result = val1 / val2; } break; case TCCIR_OP_UDIV: @@ -2423,10 +2630,12 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) } } - /* Byte-cast folding: SHL #N → SHR #N → AND #mask. - * TCC emits (byte)x as SHL #24, SHR #24 (shift up then unsigned shift down). - * Fold to AND #0xFF which the backend can emit as UXTB or UBFX. - * Also fold SHL #16, SHR #16 → AND #0xFFFF (halfword cast). */ + /* Bitfield-extract folding: SHL #N → SHR #M → (x >> (M-N)) & mask. + * TCC emits (byte)x as SHL #24, SHR #24 (equal shifts), but bitfield reads + * use unequal amounts such as SHL #18, SHR #25. When 0 < N <= M < 32, + * (x << N) >> M is equivalent to (x >> (M-N)) & ((1 << (32-M)) - 1). + * The resulting SHR+AND pair is then eligible for the UBFX fusion below. + * Signed extracts (SAR) are not handled here; they are left for known-bits. 
*/ for (i = 0; i < n - 1; i++) { IRQuadCompact *shl_q = &ir->compact_instructions[i]; @@ -2439,32 +2648,52 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) continue; int64_t shl_amt = irop_get_imm64_ex(ir, shl_src2); int64_t shr_amt = irop_get_imm64_ex(ir, shr_src2); - if (shl_amt != shr_amt || shl_amt <= 0 || shl_amt >= 32) + if (shl_amt <= 0 || shl_amt >= 32 || shr_amt <= 0 || shr_amt >= 32 || shl_amt > shr_amt) continue; /* Verify the SHR reads the SHL's dest */ IROperand shl_dest = tcc_ir_op_get_dest(ir, shl_q); IROperand shr_src1 = tcc_ir_op_get_src1(ir, shr_q); if (irop_get_vreg(shl_dest) != irop_get_vreg(shr_src1)) continue; - /* Skip 64-bit types: the mask computation assumes 32-bit width. - * For INT64, SHL #16 → SHR #16 masks 48 bits, not 16. Also check dest - * btypes since src1 btype may have been weakened during forwarding. */ + /* The transformation rewrites the SHL instruction itself. It is only safe + * if the SHL result is used exclusively by the SHR; other consumers (e.g. + * a rotate idiom that also shifts the value left) would see the wrong + * value after the SHL is turned into a SHR (gcc.c-torture/20180112-1). */ + if (!tcc_ir_vreg_has_single_use(ir, irop_get_vreg(shl_dest), i)) + continue; + /* Skip 64-bit types: the mask computation assumes 32-bit width. */ IROperand shl_orig_src1_chk = tcc_ir_op_get_src1(ir, shl_q); IROperand shr_dest_chk = tcc_ir_op_get_dest(ir, shr_q); if (shl_orig_src1_chk.btype == IROP_BTYPE_INT64 || shl_orig_src1_chk.btype == IROP_BTYPE_FLOAT64 || shl_dest.btype == IROP_BTYPE_INT64 || shl_dest.btype == IROP_BTYPE_FLOAT64 || shr_dest_chk.btype == IROP_BTYPE_INT64 || shr_dest_chk.btype == IROP_BTYPE_FLOAT64) continue; - /* SHL #N then SHR #N = AND with mask of (32-N) low bits */ - uint32_t mask = (shl_amt == 32) ? 
0 : ((1u << (32 - shl_amt)) - 1); - /* Replace SHL with AND, NOP the SHR */ IROperand shl_orig_src1 = tcc_ir_op_get_src1(ir, shl_q); IROperand shr_dest = tcc_ir_op_get_dest(ir, shr_q); - shr_q->op = TCCIR_OP_AND; - tcc_ir_set_dest(ir, i + 1, shr_dest); - tcc_ir_set_src1(ir, i + 1, shl_orig_src1); - tcc_ir_set_src2(ir, i + 1, irop_make_imm32(-1, (int32_t)mask, IROP_BTYPE_INT32)); - shl_q->op = TCCIR_OP_NOP; + if (shl_amt == shr_amt) + { + /* SHL #N then SHR #N = AND with mask of (32-N) low bits */ + uint32_t mask = (shl_amt == 32) ? 0 : ((1u << (32 - shl_amt)) - 1); + shr_q->op = TCCIR_OP_AND; + tcc_ir_set_dest(ir, i + 1, shr_dest); + tcc_ir_set_src1(ir, i + 1, shl_orig_src1); + tcc_ir_set_src2(ir, i + 1, irop_make_imm32(-1, (int32_t)mask, IROP_BTYPE_INT32)); + shl_q->op = TCCIR_OP_NOP; + } + else + { + /* SHL #N then SHR #M = (x >> (M-N)) & ((1 << (32-M)) - 1) */ + uint32_t rshift = (uint32_t)(shr_amt - shl_amt); + uint32_t mask = (1u << (32 - shr_amt)) - 1; + shl_q->op = TCCIR_OP_SHR; + tcc_ir_set_dest(ir, i, shl_dest); + tcc_ir_set_src1(ir, i, shl_orig_src1); + tcc_ir_set_src2(ir, i, irop_make_imm32(-1, (int32_t)rshift, IROP_BTYPE_INT32)); + shr_q->op = TCCIR_OP_AND; + tcc_ir_set_dest(ir, i + 1, shr_dest); + tcc_ir_set_src1(ir, i + 1, shl_dest); + tcc_ir_set_src2(ir, i + 1, irop_make_imm32(-1, (int32_t)mask, IROP_BTYPE_INT32)); + } changes++; } @@ -2486,6 +2715,12 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) IROperand xor2_src1 = tcc_ir_op_get_src1(ir, xor2_q); if (irop_get_vreg(xor1_dest) != irop_get_vreg(xor2_src1)) continue; + /* The rewrite deletes the first XOR. That is only safe when the + * intermediate value feeds this second XOR alone; otherwise sibling + * consumers would observe the pre-cancel value, and non-SSA updates like + * `V = V ^ C; T = V ^ C` would lose the updated V. 
*/ + if (!tcc_ir_vreg_has_single_use(ir, irop_get_vreg(xor1_dest), i)) + continue; LOG_IR_GEN("OPTIMIZE: XOR cancel (x ^ %lld) ^ %lld = x at i=%d,%d", (long long)irop_get_imm64_ex(ir, xor1_src2), (long long)irop_get_imm64_ex(ir, xor2_src2), i, i + 1); IROperand xor1_src1 = tcc_ir_op_get_src1(ir, xor1_q); @@ -2936,44 +3171,9 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) IROperand setif_src1 = tcc_ir_op_get_src1(ir, setif_q); cond = (int)irop_get_imm64_ex(ir, setif_src1); /* Condition code stored as immediate (TCC token) */ - /* Evaluate the comparison based on TCC token values */ - result = 0; - switch (cond) - { - case 0x94: /* TOK_EQ */ - result = (val1 == val2) ? 1 : 0; - break; - case 0x95: /* TOK_NE */ - result = (val1 != val2) ? 1 : 0; - break; - case 0x9c: /* TOK_LT */ - result = (val1 < val2) ? 1 : 0; - break; - case 0x9d: /* TOK_GE */ - result = (val1 >= val2) ? 1 : 0; - break; - case 0x9e: /* TOK_LE */ - result = (val1 <= val2) ? 1 : 0; - break; - case 0x9f: /* TOK_GT */ - result = (val1 > val2) ? 1 : 0; - break; - case 0x92: /* TOK_ULT (unsigned <) */ - result = ((uint64_t)val1 < (uint64_t)val2) ? 1 : 0; - break; - case 0x93: /* TOK_UGE (unsigned >=) */ - result = ((uint64_t)val1 >= (uint64_t)val2) ? 1 : 0; - break; - case 0x96: /* TOK_ULE (unsigned <=) */ - result = ((uint64_t)val1 <= (uint64_t)val2) ? 1 : 0; - break; - case 0x97: /* TOK_UGT (unsigned >) */ - result = ((uint64_t)val1 > (uint64_t)val2) ? 
1 : 0; - break; - default: - /* Unknown condition, don't fold */ + result = evaluate_compare_condition_cmp_operands(val1, val2, cond, src1, src2); + if (result < 0) continue; - } LOG_IR_GEN("OPTIMIZE: Fold CMP+SETIF const (%lld cmp %lld, cond=0x%x) = %d at i=%d", (long long)val1, (long long)val2, cond, result, i); @@ -3032,6 +3232,11 @@ static int tcc_ir_opt_const_prop__timed(TCCIRState *ir) break; } } + if (ir_opt_mla_accum_vreg(ir, jq) == vr) + { + still_used = 1; + break; + } } if (!still_used) @@ -3127,6 +3332,7 @@ typedef struct static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir); int tcc_ir_opt_value_tracking(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("value_tracking")) return 0; tcc_pass_timing_init(); if (!tcc_pass_timing_on) return tcc_ir_opt_value_tracking__timed(ir); unsigned long _t = tcc_pass_clk_us(); @@ -3148,6 +3354,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) * Merges 3 separate O(n) scans into 1. */ uint8_t *is_merge = tcc_mallocz((n + 7) / 8); int *pred_count = tcc_mallocz(n * sizeof(int)); + int has_control_flow = 0; for (int i = 0; i < n; i++) { @@ -3178,6 +3385,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) /* Build pred_count and is_merge */ if (q->op == TCCIR_OP_JUMP || q->op == TCCIR_OP_JUMPIF) { + has_control_flow = 1; IROperand dest = tcc_ir_op_get_dest(ir, q); int target = (int)dest.u.imm32; if (target >= 0 && target < n) @@ -3191,6 +3399,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) /* SWITCH_TABLE: all case targets are merge points */ if (q->op == TCCIR_OP_SWITCH_TABLE) { + has_control_flow = 1; IROperand src2 = tcc_ir_op_get_src2(ir, q); int table_id = (int)irop_get_imm64_ex(ir, src2); if (table_id >= 0 && table_id < ir->num_switch_tables) @@ -3219,6 +3428,8 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) { pred_count[i + 1]++; } + if (q->op == TCCIR_OP_IJUMP) + has_control_flow = 1; } /* Mark instructions with multiple predecessors as merge points 
*/ for (int i = 0; i < n; i++) @@ -3228,6 +3439,21 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) } tcc_free(pred_count); + uint8_t *var_def_count = tcc_mallocz(max_vreg + 1); + for (int i = 0; i < n; i++) + { + IRQuadCompact *dq = &ir->compact_instructions[i]; + if (dq->op == TCCIR_OP_NOP || !irop_config[dq->op].has_dest) + continue; + IROperand ddest = tcc_ir_op_get_dest(ir, dq); + int32_t dvr = irop_get_vreg(ddest); + if (dvr < 0 || TCCIR_DECODE_VREG_TYPE(dvr) != TCCIR_VREG_TYPE_VAR) + continue; + int dpos = TCCIR_DECODE_VREG_POSITION(dvr); + if (dpos >= 0 && dpos <= max_vreg && var_def_count[dpos] < 2) + var_def_count[dpos]++; + } + /* Detect VLA — SHL folding is unsafe in functions with VLA because * it can disrupt VLA stack save/restore patterns in nested scopes. */ int has_vla = 0; @@ -3570,6 +3796,10 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) { VT_INVALIDATE(state, dest_pos); } + else if (has_control_flow && var_def_count[dest_pos] > 1) + { + VT_INVALIDATE(state, dest_pos); + } else { /* Previous unread constant def is dead — NOP it */ @@ -3690,6 +3920,15 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) { if (src1_pos >= 0 && src1_pos <= max_vreg) VT_CLEAR_DEF(state, src1_pos); + /* The surviving MLA still reads its accumulator — mark that def + * live too, or a later redef of the same VAR would NOP it. 
*/ + int32_t live_acc_vr = irop_get_vreg(accum); + if (live_acc_vr >= 0 && TCCIR_DECODE_VREG_TYPE(live_acc_vr) == TCCIR_VREG_TYPE_VAR) + { + int live_acc_pos = TCCIR_DECODE_VREG_POSITION(live_acc_vr); + if (live_acc_pos <= max_vreg) + VT_CLEAR_DEF(state, live_acc_pos); + } if (dest_pos >= 0 && dest_pos <= max_vreg) VT_INVALIDATE(state, dest_pos); continue; @@ -3736,6 +3975,14 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) /* src1 is read but not folded — mark its def as live */ if (src1_pos >= 0 && src1_pos <= max_vreg) VT_CLEAR_DEF(state, src1_pos); + /* Same for the accumulator read of a surviving MLA. */ + int32_t live_acc_vr = irop_get_vreg(accum); + if (live_acc_vr >= 0 && TCCIR_DECODE_VREG_TYPE(live_acc_vr) == TCCIR_VREG_TYPE_VAR) + { + int live_acc_pos = TCCIR_DECODE_VREG_POSITION(live_acc_vr); + if (live_acc_pos <= max_vreg) + VT_CLEAR_DEF(state, live_acc_pos); + } /* Destination no longer has known constant value */ if (dest_pos >= 0 && dest_pos <= max_vreg) VT_INVALIDATE(state, dest_pos); @@ -3987,7 +4234,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) if (jump_q->op == TCCIR_OP_JUMPIF) { int32_t src1_vr = irop_get_vreg(src1); - int src1_pos = (src1_vr >= 0 && TCCIR_DECODE_VREG_TYPE(src1_vr) == TCCIR_VREG_TYPE_VAR) + int src1_pos = (!src1.is_lval && src1_vr >= 0 && TCCIR_DECODE_VREG_TYPE(src1_vr) == TCCIR_VREG_TYPE_VAR) ? 
TCCIR_DECODE_VREG_POSITION(src1_vr) : -1; @@ -4003,7 +4250,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) IROperand cond = tcc_ir_op_get_src1(ir, jump_q); int tok = (int)irop_get_imm64_ex(ir, cond); - int result = evaluate_compare_condition(val1, val2, tok); + int result = evaluate_compare_condition_cmp_operands(val1, val2, tok, src1, src2); if (result >= 0) { @@ -4031,7 +4278,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) else if (jump_q->op == TCCIR_OP_SETIF) { int32_t src1_vr = irop_get_vreg(src1); - int src1_pos = (src1_vr >= 0 && TCCIR_DECODE_VREG_TYPE(src1_vr) == TCCIR_VREG_TYPE_VAR) + int src1_pos = (!src1.is_lval && src1_vr >= 0 && TCCIR_DECODE_VREG_TYPE(src1_vr) == TCCIR_VREG_TYPE_VAR) ? TCCIR_DECODE_VREG_POSITION(src1_vr) : -1; @@ -4045,7 +4292,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) IROperand setif_src1 = tcc_ir_op_get_src1(ir, jump_q); int cond = (int)irop_get_imm64_ex(ir, setif_src1); - int result = evaluate_compare_condition(val1, val2, cond); + int result = evaluate_compare_condition_cmp_operands(val1, val2, cond, src1, src2); if (result >= 0) { @@ -4089,6 +4336,16 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) if (s2_pos >= 0 && s2_pos <= max_vreg) VT_CLEAR_DEF(state, s2_pos); } + /* An MLA that reaches here (src2 not immediate, so Pattern 2 didn't + * consume it) still reads its accumulator; without this a later redef + * of the same VAR NOPs the def it reads (struct_byval seed 9494). 
*/ + int32_t acc_vr = ir_opt_mla_accum_vreg(ir, q); + if (acc_vr >= 0 && TCCIR_DECODE_VREG_TYPE(acc_vr) == TCCIR_VREG_TYPE_VAR) + { + int acc_pos = TCCIR_DECODE_VREG_POSITION(acc_vr); + if (acc_pos >= 0 && acc_pos <= max_vreg) + VT_CLEAR_DEF(state, acc_pos); + } } /* Constant-fold __aeabi_lcmp/__aeabi_ulcmp calls when both arguments are @@ -4278,6 +4535,17 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) LOG_IR_GEN("VALUE_TRACK: %s(%lld, %lld) = %lld at i=%d -> folded", fname, (long long)val0, (long long)val1, (long long)result, i); changes++; + /* This CALL now defines the dest VAR with the folded quotient. + * The `continue` skips the general VAR-def state-invalidation at + * the loop tail, so the value-tracking map would still hold the + * VAR's STALE pre-call constant and forward it to a later use + * (combo_num seed 58: after loop-unroll collapses the prefix to a + * single block, q10's pre-division init `(u5<<32)|u6` leaked past + * this folded __aeabi_uldivmod into `q10 ^ q10>>32`). Invalidate + * the dest here so it is not forwarded; the rewritten `V <- #q` + * assignment still carries the correct value for later passes. */ + if (dest_pos >= 0 && dest_pos <= max_vreg) + VT_INVALIDATE(state, dest_pos); continue; } } @@ -4632,6 +4900,14 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) tcc_ir_set_src2(ir, i, irop_make_imm32(-1, (int32_t)(val1 & 63), IROP_BTYPE_INT32)); LOG_IR_GEN("VALUE_TRACK: %s(vreg, %lld) at i=%d -> lowered to IR shift", fname, (long long)val1, i); changes++; + /* This CALL now redefines the dest VAR with a runtime (non-constant) + * shift result. Without invalidating here, `state` would still hold + * the VAR's STALE pre-call constant and forward it to a later read + * in this same forward scan (longlong seed 2057: q13's pre-shift init + * `(u10<<32)|u11` leaked past this lowered __aeabi_llsl into + * `q13 ^ q13>>32`). Mirrors the uldivmod fix above. 
*/ + if (dest_pos >= 0 && dest_pos <= max_vreg) + VT_INVALIDATE(state, dest_pos); continue; } } @@ -5262,6 +5538,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) tcc_free(lea_var_map); tcc_free(lea_map); tcc_free(state); + tcc_free(var_def_count); tcc_free(is_merge); /* Run DCE to remove code after eliminated branches */ @@ -5306,6 +5583,7 @@ static int tcc_ir_opt_value_tracking__timed(TCCIRState *ir) static int tcc_ir_opt_const_prop_tmp__timed(TCCIRState *ir); int tcc_ir_opt_const_prop_tmp(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("const_prop_tmp")) return 0; tcc_pass_timing_init(); if (!tcc_pass_timing_on) return tcc_ir_opt_const_prop_tmp__timed(ir); unsigned long _t = tcc_pass_clk_us(); @@ -5486,9 +5764,46 @@ static int tcc_ir_opt_const_prop_tmp__timed(TCCIRState *ir) } } if (do_prop) + { + /* CMP computes src1 - src2 and sets flags read by a following + * JUMPIF/SETIF whose condition token was emitted for this operand + * order. Replacing src1 with a constant inverts the subtraction + * (const - src2 instead of src1 - src2), reversing every signed/ + * unsigned ordered condition (LT/GT/LE/GE). Only propagate when + * src2 is also a known constant so the whole CMP folds to a + * compile-time value (where operand order is irrelevant). EQ/NE + * are order-independent, but the following consumer's condition + * token is not inspected here, so apply the rule uniformly. 
*/ + if (q->op == TCCIR_OP_CMP) + { + IROperand cmp_s2 = tcc_ir_op_get_src2(ir, q); + int s2_const = irop_is_immediate(cmp_s2); + if (!s2_const) + { + int32_t s2_vr = irop_get_vreg(cmp_s2); + if (s2_vr >= 0) + { + if (TCCIR_DECODE_VREG_TYPE(s2_vr) == TCCIR_VREG_TYPE_TEMP) + { + int p = TCCIR_DECODE_VREG_POSITION(s2_vr); + s2_const = (p <= max_tmp_pos && tmp_info[p].gen == current_gen); + } + else if (max_var_pos >= 0 && TCCIR_DECODE_VREG_TYPE(s2_vr) == TCCIR_VREG_TYPE_VAR) + { + int p = TCCIR_DECODE_VREG_POSITION(s2_vr); + s2_const = (p <= max_var_pos && var_info[p].gen == current_gen); + } + } + } + if (!s2_const) + do_prop = 0; + } + } + if (do_prop) { int btype = irop_get_btype(src1); IROperand new_src1; + prop_val = ir_opt_fit_const_to_operand(prop_val, src1); if (prop_val == (int32_t)prop_val) { new_src1 = irop_make_imm32(-1, (int32_t)prop_val, btype); @@ -5537,7 +5852,7 @@ static int tcc_ir_opt_const_prop_tmp__timed(TCCIRState *ir) { LOG_IR_GEN("OPTIMIZE: const propagate vreg %d = %lld to src2 at i=%d", src2_vr, (long long)prop_val, i); int btype = irop_get_btype(src2); - int64_t val = prop_val; + int64_t val = ir_opt_fit_const_to_operand(prop_val, src2); /* When propagating a narrow constant into a wider bitwise op, * widen it to INT64 with zero-extension so the code generator * doesn't sign-extend the immediate into the upper register. 
*/ @@ -5742,7 +6057,7 @@ static int tcc_ir_opt_const_prop_tmp__timed(TCCIRState *ir) int64_t cv2 = irop_get_imm64_ex(ir, cs2); IROperand setif_src1 = tcc_ir_op_get_src1(ir, next_q); int cond = (int)irop_get_imm64_ex(ir, setif_src1); - int result = evaluate_compare_condition(cv1, cv2, cond); + int result = evaluate_compare_condition_cmp_operands(cv1, cv2, cond, cs1, cs2); if (result >= 0) { q->op = TCCIR_OP_NOP; @@ -5870,18 +6185,34 @@ static int tcc_ir_opt_const_prop_tmp__timed(TCCIRState *ir) continue; } - /* Track TMP <- constant assignments (re-fetch src1 since fold may have changed it) */ + /* Track TMP <- constant assignments (re-fetch src1 since fold may have + * changed it). A TEMP redefined with a non-constant value must drop its + * entry: TEMPs are single-def by construction, but loop unrolling renames + * at most UNROLL_MAX_RENAME body-local temps per copy, so leftover temps + * are multi-def straight-line code and a stale constant from one copy + * would leak into the next (volatile fuzz seed 8310). STORE/STORE_INDEXED + * lvalue dests (deref addresses) and FUNCPARAM dests (the passed value) + * are uses, not defs, and leave the entry alone; STORE_POSTINC updates + * its address register, so it falls through to the invalidation. 
*/ IROperand dest = tcc_ir_op_get_dest(ir, q); int32_t dest_vr = irop_get_vreg(dest); if (irop_config[q->op].has_dest && TCCIR_DECODE_VREG_TYPE(dest_vr) == TCCIR_VREG_TYPE_TEMP && - (q->op == TCCIR_OP_ASSIGN || q->op == TCCIR_OP_CVT_FTOF)) + q->op != TCCIR_OP_FUNCPARAMVAL && q->op != TCCIR_OP_FUNCPARAMVOID && + !((q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED) && dest.is_lval)) { const int pos = TCCIR_DECODE_VREG_POSITION(dest_vr); - IROperand cur_src1 = tcc_ir_op_get_src1(ir, q); - if (pos <= max_tmp_pos && irop_is_immediate(cur_src1)) + if (pos <= max_tmp_pos) { - tmp_info[pos].gen = current_gen; - tmp_info[pos].value = irop_get_imm64_ex(ir, cur_src1); + IROperand cur_src1 = tcc_ir_op_get_src1(ir, q); + if ((q->op == TCCIR_OP_ASSIGN || q->op == TCCIR_OP_CVT_FTOF) && irop_is_immediate(cur_src1)) + { + tmp_info[pos].gen = current_gen; + tmp_info[pos].value = ir_opt_fit_const_to_operand(irop_get_imm64_ex(ir, cur_src1), dest); + } + else + { + tmp_info[pos].gen = 0; + } } } @@ -5901,7 +6232,7 @@ static int tcc_ir_opt_const_prop_tmp__timed(TCCIRState *ir) if (irop_is_immediate(cur_src1) && !cur_src1.is_sym) { var_info[pos].gen = current_gen; - var_info[pos].value = irop_get_imm64_ex(ir, cur_src1); + var_info[pos].value = ir_opt_fit_const_to_operand(irop_get_imm64_ex(ir, cur_src1), dest); } else { @@ -6200,6 +6531,16 @@ int tcc_ir_opt_cmp_expr_fold(TCCIRState *ir) * const-var-prop may leave behind `CMP symref(X), symref(X)` that the * vreg-based path below would skip because vr1 == vr2 == -1. */ is_equal = ir_opt_nonvreg_expr_equal(ir, src1, src2); + /* Two integer immediates compare equal by value (e.g. `CMP #7, #7`). + * Scoped to the CMP-operand site (mirroring the asymmetric branch's + * manual check) rather than broadening the shared + * `ir_opt_nonvreg_expr_equal` helper, which would perturb its ADD/SUB + * base-equality callers. Floats excluded (NaN != NaN). 
*/ + if (!is_equal && irop_is_immediate(src1) && irop_is_immediate(src2) && + !src1.is_sym && !src2.is_sym && + irop_get_btype(src1) != IROP_BTYPE_FLOAT32 && irop_get_btype(src1) != IROP_BTYPE_FLOAT64 && + irop_get_btype(src2) != IROP_BTYPE_FLOAT32 && irop_get_btype(src2) != IROP_BTYPE_FLOAT64) + is_equal = irop_get_imm64_ex(ir, src1) == irop_get_imm64_ex(ir, src2); /* Fallback for symref-vs-symref: the strict check requires every flag * to match, but the two operands at a CMP can carry different * unsigned/is_lval encodings from how the frontend lowered each side @@ -6290,7 +6631,7 @@ int tcc_ir_opt_cmp_expr_fold(TCCIRState *ir) } else { - if (vr1 < 0 || vr2 < 0 || vr1 == vr2) + if (vr1 < 0 || vr2 < 0) continue; /* Operand value-identity requires matching lval-ness: `*(p)` (a load @@ -6300,15 +6641,33 @@ int tcc_ir_opt_cmp_expr_fold(TCCIRState *ir) if (src1.is_lval != src2.is_lval) continue; - /* Both operands must have a single reaching definition */ - def1 = tcc_ir_find_defining_instruction(ir, vr1, i); - def2 = tcc_ir_find_defining_instruction(ir, vr2, i); - if (def1 < 0 || def2 < 0 || def1 == def2) - continue; + if (vr1 == vr2) + { + /* x OP x: a value compared against itself. CMP is an integer compare + * (floats lower to FCMP), so a plain register value is always + * determinate — evaluate_compare_condition(0,0,tok) gives the result. + * Require matching width and signedness: `CMP x:I8, x:I32` compares a + * truncation against the full value and is NOT always equal. A + * dereference *(V) OP *(V) could read a volatile location twice, so + * only fold the non-lval (register-value) form. 
*/ + if (src1.is_lval || + irop_get_btype(src1) != irop_get_btype(src2) || + src1.is_unsigned != src2.is_unsigned) + continue; + is_equal = 1; + } + else + { + /* Both operands must have a single reaching definition */ + def1 = tcc_ir_find_defining_instruction(ir, vr1, i); + def2 = tcc_ir_find_defining_instruction(ir, vr2, i); + if (def1 < 0 || def2 < 0 || def1 == def2) + continue; - /* Try standard def equality (works for single-def vregs) */ - if (DC_IS_SINGLE_DEF(dc, dc_stride, vr1) && DC_IS_SINGLE_DEF(dc, dc_stride, vr2)) - is_equal = ir_opt_pure_def_equal(ir, def1, def2, 0); + /* Try standard def equality (works for single-def vregs) */ + if (DC_IS_SINGLE_DEF(dc, dc_stride, vr1) && DC_IS_SINGLE_DEF(dc, dc_stride, vr2)) + is_equal = ir_opt_pure_def_equal(ir, def1, def2, 0); + } } /* Pattern match: both defs are ADD/SUB with the same immediate, and @@ -6330,7 +6689,12 @@ int tcc_ir_opt_cmp_expr_fold(TCCIRState *ir) int32_t bvr1 = irop_get_vreg(base1); int32_t bvr2 = irop_get_vreg(base2); - if (bvr1 >= 0 && bvr2 >= 0) + /* A dereferenced base `*(V)` (is_lval) and a plain address base `V` + * are different values even when V resolves to the same definition. + * Without this, `*(p) + K` (loaded value + K) is equated with + * `p + K` (an address), mis-folding `(c->field0 + K) > c->fieldK` + * (K == field offset) to a constant. */ + if (base1.is_lval == base2.is_lval && bvr1 >= 0 && bvr2 >= 0) { /* Same base vreg → equal */ if (bvr1 == bvr2) @@ -6570,6 +6934,14 @@ int tcc_ir_opt_cmp_const_offset_fold(TCCIRState *ir) int def_a = tcc_ir_find_defining_instruction(ir, a, i); if (def_a < 0) continue; + /* `a = b +/- K` must hold at the CMP. tcc_ir_find_defining_instruction + * is a linear backward scan blind to a back-edge redefinition of a + * multi-def vreg: a loop-carried `a` reset inside the loop reaches the + * CMP again with a different value, so the offset from the preceding def + * is invalid on the back-edge path. 
Only trust it when `a` is single-def + * (mirrors the guard in ir_opt_eval_const_u64). */ + if (!tcc_ir_vreg_has_single_def(ir, a)) + continue; IRQuadCompact *dq = &ir->compact_instructions[def_a]; if (dq->op != TCCIR_OP_ADD && dq->op != TCCIR_OP_SUB) continue; @@ -6577,11 +6949,18 @@ int tcc_ir_opt_cmp_const_offset_fold(TCCIRState *ir) IROperand ds1 = tcc_ir_op_get_src1(ir, dq); IROperand ds2 = tcc_ir_op_get_src2(ir, dq); + /* The CMP operand standing in for `b`. The ADD base must match it in + * lval-ness too: `*(V)` (loaded value) and `V` (address) share a vreg + * but are different values, so `a = *(V) + K` does not make `a == V + K` + * provable from `b == V`. */ + IROperand b_op = swap ? src1 : src2; + /* Match `a = b + K` (or `a = K + b`, commutative ADD). */ int64_t k = 0; - if (irop_get_vreg(ds1) == b && irop_is_immediate(ds2)) + if (irop_get_vreg(ds1) == b && ds1.is_lval == b_op.is_lval && irop_is_immediate(ds2)) k = irop_get_imm64_ex(ir, ds2); - else if (dq->op == TCCIR_OP_ADD && irop_get_vreg(ds2) == b && irop_is_immediate(ds1)) + else if (dq->op == TCCIR_OP_ADD && irop_get_vreg(ds2) == b && ds2.is_lval == b_op.is_lval && + irop_is_immediate(ds1)) k = irop_get_imm64_ex(ir, ds1); else continue; @@ -6595,7 +6974,16 @@ int tcc_ir_opt_cmp_const_offset_fold(TCCIRState *ir) if (k > (int64_t)INT32_MAX || k < (int64_t)INT32_MIN) continue; - /* B must hold the same value at the CMP as at def_a. */ + /* B must hold the same value at the CMP as at def_a. Reject multi-def + * for the same back-edge reason as `a`: the linear def lookups below + * cannot see a loop redefinition of a multi-def `b` reaching the CMP with + * a value different from the one at def_a, which would break the delta. + * Unlike `a` (guaranteed >=1 def since def_a was just found), `b` may + * legitimately have zero defs (e.g. an incoming parameter) — that is + * exactly as safe as single-def, so use the multi-def check, not + * single-def, to avoid rejecting the common zero-def case. 
*/ + if (tcc_ir_vreg_has_multi_def(ir, b)) + continue; int b_def_at_use = tcc_ir_find_defining_instruction(ir, b, i); int b_def_at_def = tcc_ir_find_defining_instruction(ir, b, def_a); if (b_def_at_use != b_def_at_def) @@ -6871,7 +7259,12 @@ static int ir_has_backward_control_flow(TCCIRState *ir) * Returns 1 and writes *out_off on success, 0 otherwise. * * Conservative: stops at any other def of V or at any jump_target between - * the def and `at_idx` (don't cross BB boundaries / merge points). */ + * the def and `at_idx` (don't cross BB boundaries / merge points). A vreg + * operand read at an instruction that is ITSELF a jump target is never + * resolved: its value depends on which edge entered (docs/bugs.md #2 — a + * loop-exit `CMP ptr,end` at a back-edge target resolved ptr through the + * preheader init only, missing the in-loop `ptr += stride` redefinition, + * and the fold deleted the loop's only exit test). */ static int ir_resolve_stack_addr_value(TCCIRState *ir, IROperand op, int at_idx, int *out_off) { StackAddrValue value; @@ -6977,31 +7370,42 @@ static int ir_resolve_stack_addr_value_ex(TCCIRState *ir, IROperand op, int at_i if (sav_vreg_has_no_def(vr)) return 0; - int saw_merge_at = at_idx; + /* A vreg read at a merge point (jump target) has an edge-dependent value: + * a def found by the linear backward walk holds only for the fall-through + * path, not for the jumped-in edge(s). This check also covers recursive + * calls: resolving a def instruction's own operands uses at_idx = def_j, + * so a def sitting at a merge point refuses to resolve its inputs. */ + if (at_idx >= 0 && at_idx < ir->next_instruction_index && + ir->compact_instructions[at_idx].is_jump_target) + return 0; + for (int j = at_idx - 1; j >= 0; j--) { IRQuadCompact *q = &ir->compact_instructions[j]; - if (q->op == TCCIR_OP_NOP) - continue; - /* Conservative: stop crossing merges (instruction with is_jump_target set). 
- * We allow the very first step (j == at_idx-1) to look back across our - * own CMP/BB head, but no further. */ - if (q->is_jump_target && j != saw_merge_at - 1) - return 0; - saw_merge_at = j; - if (!irop_config[q->op].has_dest) - continue; - IROperand dest = tcc_ir_op_get_dest(ir, q); - if (irop_get_vreg(dest) != vr) - continue; - /* STORE-style ops carry an address-of-write in dest, not a def. */ - if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || - q->op == TCCIR_OP_STORE_POSTINC) - continue; - /* FUNCPARAMVAL dest carries the param value (a use, not a def). */ - if (q->op == TCCIR_OP_FUNCPARAMVAL) + /* Determine whether this instruction is a real def of vr. STORE-style + * ops carry an address-of-write in dest (a use, not a def); FUNCPARAMVAL + * dest carries the param value (also a use). */ + int is_def_of_vr = 0; + if (q->op != TCCIR_OP_NOP && irop_config[q->op].has_dest && sav_is_def_op(q->op)) + { + IROperand dest = tcc_ir_op_get_dest(ir, q); + if (irop_get_vreg(dest) == vr) + is_def_of_vr = 1; + } + + /* Never cross a merge point (instruction with is_jump_target set, NOPs + * included — a NOPed jump target still merges control flow): a value + * flowing in over the jumped-in edge may differ from the fall-through + * value. The found def itself being a jump target is fine — its RESULT + * dominates the straight-line range down to at_idx (no entries between); + * its own operands are guarded by the at_idx check in the recursion. 
*/ + if (!is_def_of_vr) + { + if (q->is_jump_target) + return 0; continue; + } if (q->op == TCCIR_OP_ASSIGN) { @@ -7307,22 +7711,13 @@ int tcc_ir_opt_single_value_tmp(TCCIRState *ir) } if (changes) { - for (int i = 0; i < n; i++) { - IRQuadCompact *q = &ir->compact_instructions[i]; - if (q->op == TCCIR_OP_NOP || !irop_config[q->op].has_dest) - continue; - if (q->op != TCCIR_OP_ASSIGN && q->op != TCCIR_OP_LOAD) - continue; - IROperand d = tcc_ir_op_get_dest(ir, q); - int32_t dvr = irop_get_vreg(d); - if (dvr < 0 || TCCIR_DECODE_VREG_TYPE(dvr) != TCCIR_VREG_TYPE_TEMP) - continue; - int pos = TCCIR_DECODE_VREG_POSITION(dvr); - if (pos < count && state[pos] == 1) { - q->op = TCCIR_OP_NOP; - changes++; - } - } + /* Let DCE reclaim the now-dead constant defs. Do NOT NOP them directly by + * state[pos] == 1: a single-value temp may still have uses OTHER than the + * RETURNVALUE we just folded (e.g. `OR T, #const` in a bitfield store), + * because Phase 2 only propagates into RETURNVALUE operands. Blindly + * removing such a def leaves a dangling use → a use-before-def miscompile. + * DCE removes a def only when it has no remaining uses, which is exactly + * the condition we need. */ changes += tcc_ir_opt_dce(ir); } diff --git a/ir/opt_copyprop.c b/ir/opt_copyprop.c index 91af8d17..091ea779 100644 --- a/ir/opt_copyprop.c +++ b/ir/opt_copyprop.c @@ -219,6 +219,13 @@ static int tcc_ir_opt_copy_prop__timed(TCCIRState *ir) LOG_COPY_PROP("Propagate src1 TMP:%d -> vreg:%d (lval=%d) at i=%d", pos, TCCIR_DECODE_VREG_POSITION(copy_info[pos].source_vr), src1.is_lval, i); tcc_ir_set_src1(ir, i, replacement); + /* Keep the local in sync so the copy-recording step below sees the + * propagated source, not the stale original. Otherwise an + * ASSIGN T2<-T1 rewritten to T2<-V0 is still recorded as T2<-V0's + * source = T1, leaving a T1 use that only collapses on a second pass + * (non-convergence). 
*/ + src1 = replacement; + src1_vr = irop_get_vreg(replacement); changes++; } else @@ -256,6 +263,8 @@ static int tcc_ir_opt_copy_prop__timed(TCCIRState *ir) LOG_COPY_PROP("Propagate src2 TMP:%d -> vreg:%d (lval=%d) at i=%d", pos, TCCIR_DECODE_VREG_POSITION(copy_info[pos].source_vr), src2.is_lval, i); tcc_ir_set_src2(ir, i, replacement); + src2 = replacement; + src2_vr = irop_get_vreg(replacement); changes++; } } @@ -381,7 +390,7 @@ static int tcc_ir_opt_copy_prop__timed(TCCIRState *ir) (db != IROP_BTYPE_INT64 && db != IROP_BTYPE_FLOAT32 && db != IROP_BTYPE_FLOAT64 && sb != IROP_BTYPE_INT64 && sb != IROP_BTYPE_FLOAT32 && sb != IROP_BTYPE_FLOAT64 && db != IROP_BTYPE_INT8 && db != IROP_BTYPE_INT16 && sb != IROP_BTYPE_INT8 && sb != IROP_BTYPE_INT16); - if (!src_is_const && src1_vr >= 0 && !src1.is_lval && btype_compat && + if (!src_is_const && src1_vr >= 0 && src1_vr != dest_vr && !src1.is_lval && btype_compat && (src_vreg_type == TCCIR_VREG_TYPE_VAR || src_vreg_type == TCCIR_VREG_TYPE_PARAM || src_vreg_type == TCCIR_VREG_TYPE_TEMP)) { @@ -1129,8 +1138,17 @@ int tcc_ir_opt_cse_param_add(TCCIRState *ir) int wt = TCCIR_DECODE_VREG_TYPE(wvr); if (wt == TCCIR_VREG_TYPE_VAR || wt == TCCIR_VREG_TYPE_PARAM) { + /* The same local can be CSE-keyed either by its raw VAR/PARAM + * vreg (register form) or by the STACKOFF synthetic key + * (0x70000000|pos, the memory form used when it's read as a + * stack lvalue). A register-form write changes the value a + * later stack-slot read of the same slot would observe, so it + * must invalidate BOTH keys — otherwise a `V - #c` computed + * after the write gets CSE'd to one computed before it, across + * the redefinition (int fuzz seed 41379). 
*/ + int32_t syn_key = (int32_t)(0x70000000 | ((uint32_t)wvr & 0x0FFFFFFF)); for (int e = 0; e < entry_count; e++) - if (entries[e].valid && entries[e].src_vr == wvr) + if (entries[e].valid && (entries[e].src_vr == wvr || entries[e].src_vr == syn_key)) entries[e].valid = 0; } } @@ -1140,9 +1158,12 @@ int tcc_ir_opt_cse_param_add(TCCIRState *ir) int32_t w_vr = irop_get_vreg(wd); if (tcc_ir_vreg_is_valid(ir, w_vr)) { + /* Symmetric to the register-form case above: a memory-form store + * to the slot must also kill any raw-vreg-keyed entry for the + * same local. */ int32_t syn_key = (int32_t)(0x70000000 | ((uint32_t)w_vr & 0x0FFFFFFF)); for (int e = 0; e < entry_count; e++) - if (entries[e].valid && entries[e].src_vr == syn_key) + if (entries[e].valid && (entries[e].src_vr == syn_key || entries[e].src_vr == w_vr)) entries[e].valid = 0; } } @@ -1561,14 +1582,25 @@ int tcc_ir_opt_local_alu_cse(TCCIRState *ir) kills = 1; if (dest_vr_kill >= 0) { - if (cache[c].s1_tag == IROP_TAG_VREG && cache[c].s1_vr == dest_vr_kill) + /* A cached source operand reads the just-redefined value if it carries + * dest_vr_kill's vreg — whether encoded as a plain VREG or as a + * STACKOFF-lval VAR read (a local variable read). The original guard + * only matched IROP_TAG_VREG, so a re-assignment of a local VAR + * (`lr = ...`, dest encoded STACKOFF-lval, dest_vr_kill = the VAR vreg) + * failed to invalidate a cached `pb XOR lr` keyed on the old lr, and the + * stale value was commutatively re-CSE'd into a later `lr XOR pb` + * (random-C O1 wrong-code, seeds 202/251). 
*/ + #define ALU_CSE_KILLS_VR(tg, lv, vr) \ + (((tg) == IROP_TAG_VREG || ((tg) == IROP_TAG_STACKOFF && (lv))) && (vr) == dest_vr_kill) + if (ALU_CSE_KILLS_VR(cache[c].s1_tag, cache[c].s1_lval, cache[c].s1_vr)) kills = 1; - else if (cache[c].s2_tag == IROP_TAG_VREG && cache[c].s2_vr == dest_vr_kill) + else if (ALU_CSE_KILLS_VR(cache[c].s2_tag, cache[c].s2_lval, cache[c].s2_vr)) kills = 1; - else if (cache[c].s3_tag == IROP_TAG_VREG && cache[c].s3_vr == dest_vr_kill) + else if (ALU_CSE_KILLS_VR(cache[c].s3_tag, cache[c].s3_lval, cache[c].s3_vr)) kills = 1; else if (cache[c].dest_vr == dest_vr_kill) kills = 1; /* this op redefines a previously-cached dest — drop entry */ + #undef ALU_CSE_KILLS_VR } if (!kills) cache[w++] = cache[c]; diff --git a/ir/opt_dce.c b/ir/opt_dce.c index c76b656b..e6b19435 100644 --- a/ir/opt_dce.c +++ b/ir/opt_dce.c @@ -455,6 +455,22 @@ static int ir_opt_direct_auto_vreg_store_is_local(IROperand op) return 0; } +static int ir_dce_addrof_var_pos(TCCIRState *ir, IRQuadCompact *q) +{ + if (q->op != TCCIR_OP_LEA && q->op != TCCIR_OP_ASSIGN) + return -1; + if (!irop_config[q->op].has_src1) + return -1; + + IROperand s = tcc_ir_op_get_src1(ir, q); + int32_t vr = irop_get_vreg(s); + if (vr < 0 || TCCIR_DECODE_VREG_TYPE(vr) != TCCIR_VREG_TYPE_VAR) + return -1; + if (q->op == TCCIR_OP_ASSIGN && !(s.is_local && !s.is_lval)) + return -1; + return TCCIR_DECODE_VREG_POSITION(vr); +} + static int ir_opt_op_is_essential(TCCIRState *ir, IRQuadCompact *q, int idx, const uint8_t *pure_call_ids, int pure_call_id_bytes) { @@ -3344,6 +3360,20 @@ static int tcc_ir_opt_dse__timed(TCCIRState *ir) var_used[pos / 8] |= (1 << (pos % 8)); } } + + /* MLA accumulator (4th operand) is a use not covered by src1/src2 + * (ptr seed 6869: a VAR read only as an MLA addend looked dead). 
*/ + if (q->op == TCCIR_OP_MLA) + { + const IROperand acc = tcc_ir_op_get_accum(ir, q); + int32_t vr = irop_get_vreg(acc); + if (vr >= 0 && TCCIR_DECODE_VREG_TYPE(vr) == TCCIR_VREG_TYPE_VAR) + { + int pos = TCCIR_DECODE_VREG_POSITION(vr); + if (pos <= max_var_pos) + var_used[pos / 8] |= (1 << (pos % 8)); + } + } } /* NOP ASSIGN/STORE to unused VARs (skip address-taken) */ @@ -3693,6 +3723,26 @@ static int tcc_ir_opt_dse__timed(TCCIRState *ir) } } + /* MLA accumulator (4th operand) is a use not surfaced by src1/src2 + * (struct_byval seed 11651): `T <-- Ta MLA Tb + Tacc***DEREF***` + * reads memory through an addr-prop TMP, and a non-deref accumulator + * lets the pointer value escape the tracker via the MLA dest. + * Conservatively mark the origin read either way. */ + if (q->op == TCCIR_OP_MLA) + { + IROperand s = tcc_ir_op_get_accum(ir, q); + int32_t vr = irop_get_vreg(s); + if (vr >= 0) + { + int origin = GET_ORIGIN(vr); + if (origin != -1) + { + LOG_IR_GEN("DSE-SL: Phase3 MARK READ origin=%d at i=%d op=%d accum", origin, i, q->op); + MARK_ORIGIN_READ(origin); + } + } + } + /* STORE dest: if dest is an addr-prop TMP (deref write), that's safe. * No marking needed — this is a write through the pointer. */ @@ -4332,18 +4382,10 @@ int tcc_ir_opt_dead_var_store_elim(TCCIRState *ir) continue; if (q->op == TCCIR_OP_SET_CHAIN || q->op == TCCIR_OP_INIT_CHAIN_SLOT) has_set_chain = 1; - /* Track LEA instructions that take the address of a VAR */ - if (q->op == TCCIR_OP_LEA) - { - IROperand src1 = tcc_ir_op_get_src1(ir, q); - int32_t vr = irop_get_vreg(src1); - if (vr >= 0 && TCCIR_DECODE_VREG_TYPE(vr) == TCCIR_VREG_TYPE_VAR) - { - int pos = TCCIR_DECODE_VREG_POSITION(vr); - if (pos <= max_var) - var_has_lea[pos / 8] |= (1 << (pos % 8)); - } - } + /* Track visible address-of instructions that take the address of a VAR. 
*/ + int addrof_pos = ir_dce_addrof_var_pos(ir, q); + if (addrof_pos >= 0 && addrof_pos <= max_var) + var_has_lea[addrof_pos / 8] |= (1 << (addrof_pos % 8)); if (irop_config[q->op].has_src1) { IROperand src1 = tcc_ir_op_get_src1(ir, q); @@ -5169,6 +5211,17 @@ static int tcc_ir_opt_redundant_var_assign__timed(TCCIRState *ir) for (int v = 0; v <= max_var; v++) pending[v] = -1; + uint8_t *var_addr_taken = tcc_mallocz((max_var + 8) / 8); + for (int i = 0; i < n; i++) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) + continue; + int pos = ir_dce_addrof_var_pos(ir, q); + if (pos >= 0 && pos <= max_var) + var_addr_taken[pos / 8] |= (1 << (pos % 8)); + } + int changes = 0; for (int i = 0; i < n; i++) { @@ -5230,6 +5283,21 @@ static int tcc_ir_opt_redundant_var_assign__timed(TCCIRState *ir) } } + /* MLA accumulator (4th operand) is a read not surfaced by src1/src2 + * (ptr 6869 family): a VAR read only as an MLA addend must clear its + * pending assign, or the assign gets NOP'd as "overwritten unread". 
*/ + if (q->op == TCCIR_OP_MLA) + { + IROperand acc = tcc_ir_op_get_accum(ir, q); + int32_t vr = irop_get_vreg(acc); + if (vr >= 0 && TCCIR_DECODE_VREG_TYPE(vr) == TCCIR_VREG_TYPE_VAR) + { + int pos = TCCIR_DECODE_VREG_POSITION(vr); + if (pos <= max_var) + pending[pos] = -1; + } + } + /* STORE dest is a pointer USE — if it's a VAR, count as read */ if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED) { @@ -5254,6 +5322,11 @@ static int tcc_ir_opt_redundant_var_assign__timed(TCCIRState *ir) int pos = TCCIR_DECODE_VREG_POSITION(vr); if (pos <= max_var) { + if (var_addr_taken[pos / 8] & (1 << (pos % 8))) + { + pending[pos] = -1; + continue; + } if (pending[pos] >= 0) { /* Previous assign to this VAR is dead — overwritten before read */ @@ -5268,6 +5341,7 @@ static int tcc_ir_opt_redundant_var_assign__timed(TCCIRState *ir) LOG_IR_GEN("=== REDUNDANT VAR ASSIGN: eliminated %d dead assigns ===", changes); + tcc_free(var_addr_taken); tcc_free(pending); tcc_free(is_target); return changes; diff --git a/ir/opt_dead_lea_store.c b/ir/opt_dead_lea_store.c index 6e5eccdb..8aedb1e1 100644 --- a/ir/opt_dead_lea_store.c +++ b/ir/opt_dead_lea_store.c @@ -396,8 +396,10 @@ int tcc_ir_opt_dead_lea_store_elim(TCCIRState *ir) } /* Walk operands; record reads of known slots and bail on any non-tame - * use of a known-address vreg. */ - for (int k = 0; k < 3; k++) + * use of a known-address vreg. k==3 is MLA's accumulator (4th operand): + * `T <-- Ta MLA Tb + Tacc***DEREF***` reads the slot through Tacc, a use + * src1/src2 never surface (struct_byval seed 11651). 
*/ + for (int k = 0; k < 4; k++) { IROperand op; int has; @@ -405,8 +407,10 @@ int tcc_ir_opt_dead_lea_store_elim(TCCIRState *ir) if (has) op = tcc_ir_op_get_dest(ir, q); } else if (k == 1) { has = irop_config[q->op].has_src1; if (has) op = tcc_ir_op_get_src1(ir, q); } - else { has = irop_config[q->op].has_src2; + else if (k == 2) { has = irop_config[q->op].has_src2; if (has) op = tcc_ir_op_get_src2(ir, q); } + else { has = (q->op == TCCIR_OP_MLA); + if (has) op = tcc_ir_op_get_accum(ir, q); } if (!has) continue; /* Lval reference: it's a read of the slot. We treat any lval-src use @@ -526,6 +530,69 @@ int tcc_ir_opt_dead_lea_store_elim(TCCIRState *ir) if (dest.is_complex) dest_w *= 2; int store_off = slot_off; + + /* Write-after-write: if a later store in the same straight-line run fully + * overwrites this store's byte range with no read of those bytes in + * between, S1's value is never observed — eliminate it even though the slot + * is read further on (that read sees the overwriting store's value). + * Restricting to a straight-line run (break at any control-flow op or jump + * target) keeps the proof sound: the covering store unconditionally runs + * after S1 before any branch could route to a read. Intermediate stores + * never *read* R1 (their value operands were escape-checked in Pass 2), so + * they cannot keep S1 alive — only a recorded read can. 
*/ + int waw_dead = 0; + for (int j = i + 1; j < n; j++) + { + IRQuadCompact *qj = &ir->compact_instructions[j]; + if (qj->op == TCCIR_OP_NOP) + continue; + if (qj->is_jump_target) + break; /* control-flow merge — straight-line run ends */ + if (qj->op == TCCIR_OP_JUMP || qj->op == TCCIR_OP_JUMPIF || + qj->op == TCCIR_OP_IJUMP || qj->op == TCCIR_OP_SWITCH_TABLE || + qj->op == TCCIR_OP_RETURNVALUE || qj->op == TCCIR_OP_RETURNVOID || + qj->op == TCCIR_OP_FUNCCALLVAL || qj->op == TCCIR_OP_FUNCCALLVOID) + break; /* leaves the straight-line run */ + if (qj->op != TCCIR_OP_STORE) + continue; + IROperand d2 = tcc_ir_op_get_dest(ir, qj); + if (!RESOLVE_LVAL_SLOT(d2)) + continue; /* writes a non-tracked location (no escapes survived Pass 2) */ + int off2 = slot_off; + int w2 = ir_opt_store_btype_size_bytes(irop_get_btype(d2)); + if (w2 <= 0) + w2 = irop_is_64bit(d2) ? 8 : 4; + if (d2.is_complex) + w2 *= 2; + if (off2 <= store_off && store_off + dest_w <= off2 + w2) + { + /* Full cover: S1 is dead unless its bytes are read before j. 
*/ + int read_between = 0; + for (int r = 0; r < reads_n; r++) + if (store_off < reads[r].off + reads[r].width && + reads[r].off < store_off + dest_w && + reads[r].pos > i && reads[r].pos < j) + { + read_between = 1; + break; + } + if (!read_between) + waw_dead = 1; + break; + } + if (store_off < off2 + w2 && off2 < store_off + dest_w) + break; /* partial overlap — cannot prove S1 fully dead */ + /* disjoint slot — keep scanning for a covering store */ + } + if (waw_dead) + { + LOG_IR_GEN("DEAD LEA-STORE (WAW): nop STORE to StackLoc[%d] at i=%d w=%d", + store_off, i, dest_w); + q->op = TCCIR_OP_NOP; + changes++; + continue; + } + int alive = 0; for (int r = 0; r < reads_n; r++) { diff --git a/ir/opt_du.c b/ir/opt_du.c index a8fe3411..d1682b23 100644 --- a/ir/opt_du.c +++ b/ir/opt_du.c @@ -116,6 +116,15 @@ void ir_opt_du_build_mode(TCCIRState *ir, IROptDU *du, uint8_t mode) if (idx >= 0 && du->use[idx] < 2) du->use[idx]++; } + /* MLA/MLS have a 4th accumulator operand that is a USE of its vreg. + * Missing it makes call-result elimination think the result is dead + * when it is only consumed as an MLA accumulator (seed 4274). */ + if (q->op == TCCIR_OP_MLA) + { + int idx = ir_opt_du_idx(du, irop_get_vreg(tcc_ir_op_get_accum(ir, q))); + if (idx >= 0 && du->use[idx] < 2) + du->use[idx]++; + } } } diff --git a/ir/opt_fusion.c b/ir/opt_fusion.c index 1bd0dab8..66cf07cf 100644 --- a/ir/opt_fusion.c +++ b/ir/opt_fusion.c @@ -1425,6 +1425,15 @@ void tcc_ir_barrel_shift_fusion(TCCIRState *ir) if (amount < 0 || amount > 31) continue; + /* A zero-amount right shift/rotate is an identity in the IR (x >> 0 == x), + * but ARM's barrel shifter encodes an immediate field of 0 for LSR/ASR as + * shift-by-32 (yielding 0 / sign-extend) and for ROR as RRX — NOT the + * shift-by-0 we mean. Only LSL #0 (stype 1) is a true no-op operand, so + * refuse to fuse `x SHR/SAR/ROR #0`; leave the standalone shift for the + * backend's shift-by-0 identity fold (arm-thumb-gen.c) to lower as MOV. 
*/ + if (amount == 0 && stype != 1) + continue; + IROperand shift_src1 = tcc_ir_op_get_src1(ir, sq); if (!irop_has_vreg(shift_src1)) continue; @@ -1433,6 +1442,8 @@ void tcc_ir_barrel_shift_fusion(TCCIRState *ir) IROperand other = (attempt == 0) ? tcc_ir_op_get_src1(ir, q) : tcc_ir_op_get_src2(ir, q); + if (!irop_has_vreg(other)) + continue; if (irop_has_vreg(other) && irop_get_vreg(other) == shift_src_vr) continue; @@ -2166,14 +2177,7 @@ int tcc_ir_opt_lea_cse(TCCIRState *ir) * i1: T2 = T1 ADD #K * i2: ... T2***DEREF*** ... * - * Pattern C — ADD Addr[StackLoc] + #K + consumer-with-deref (combined-form - * variant of B; the frontend emits this single ADD when materializing - * &local[const_idx] without a separate LEA op, e.g. via nested-function - * inlining): - * i0: T = ADD Addr[StackLoc[-N]], #K - * i1: ... T***DEREF*** ... - * - * Pattern D — ASSIGN Addr[StackLoc] + consumer-with-deref (semantically + * Pattern C — ASSIGN Addr[StackLoc] + consumer-with-deref (semantically * identical to pattern A; the frontend emits ASSIGN instead of LEA when * the address materialization is part of a copy chain, again common in * nested-function inlining): @@ -2212,24 +2216,21 @@ int tcc_ir_opt_lea_fold(TCCIRState *ir) { IRQuadCompact *lea_q = &ir->compact_instructions[i]; - /* Three entry shapes are handled: + /* Two entry shapes are handled: * - LEA Addr[StackLoc[X]] -> T (classic LEA form) * - ASSIGN Addr[StackLoc[X]] -> T (semantically identical to LEA; * emitted by the frontend when materializing &local for nested-function * inlining or other capture-via-address patterns) - * - ADD Addr[StackLoc[X]], #K -> T (combined LEA+offset form) - * The ADD form already folds the constant offset, so the optional - * ADD-interposer search below is skipped. */ - int is_add_form = 0; - int32_t add_form_imm = 0; + * + * The combined ADD Addr[StackLoc[X]], #K form is deliberately not an entry + * root here. 
Folding it to a direct StackLoc access can remove the only + * address-valued operation tying a constant subslot access to the enclosing + * aggregate; later stack-slot passes then miss aliases through other + * Addr[StackLoc] indexed accesses. Keep that form explicit unless it is an + * interposer after a real LEA/ASSIGN root, where the root still carries the + * address-taken information for the aggregate. */ if (lea_q->op == TCCIR_OP_ADD) - { - IROperand s2 = tcc_ir_op_get_src2(ir, lea_q); - if (irop_get_tag(s2) != IROP_TAG_IMM32) - continue; - add_form_imm = (int32_t)s2.u.imm32; - is_add_form = 1; - } + continue; else if (lea_q->op == TCCIR_OP_ASSIGN) { /* ASSIGN must have no src2 (or NONE) to be a pure copy of src1. */ @@ -2360,12 +2361,11 @@ int tcc_ir_opt_lea_fold(TCCIRState *ir) /* Optional ADD #K interposer: a single intermediate ADD that consumes * the LEA result and adds a constant, whose own result has exactly one - * use (the eventual deref consumer). Skipped for ADD-form starts — - * the constant offset is already in add_form_imm. */ + * use (the eventual deref consumer). */ int add_idx = -1; - int32_t add_offset = is_add_form ? add_form_imm : 0; + int32_t add_offset = 0; IRQuadCompact *add_q = &ir->compact_instructions[cur_idx]; - if (!is_add_form && add_q->op == TCCIR_OP_ADD) + if (add_q->op == TCCIR_OP_ADD) { IROperand a1 = tcc_ir_op_get_src1(ir, add_q); IROperand a2 = tcc_ir_op_get_src2(ir, add_q); @@ -2491,11 +2491,10 @@ int tcc_ir_opt_lea_fold(TCCIRState *ir) * stack-store-load forwarding and DSE on aggregate field writes. */ { IRQuadCompact *cq = &ir->compact_instructions[cur_idx]; - int is_store_idx = (cq->op == TCCIR_OP_STORE_INDEXED); int is_load_idx = (cq->op == TCCIR_OP_LOAD_INDEXED); - if (is_store_idx || is_load_idx) + if (is_load_idx) { - IROperand base = is_store_idx ? 
tcc_ir_op_get_dest(ir, cq) : tcc_ir_op_get_src1(ir, cq); + IROperand base = tcc_ir_op_get_src1(ir, cq); if (irop_has_vreg(base) && irop_get_vreg(base) == deref_vr) { IROperand idx = tcc_ir_op_get_src2(ir, cq); @@ -2504,8 +2503,7 @@ int tcc_ir_opt_lea_fold(TCCIRState *ir) scale.u.imm32 == 0) { int folded_off = base_offset + add_offset + (int32_t)idx.u.imm32; - IROperand width_op = is_store_idx ? tcc_ir_op_get_src1(ir, cq) - : tcc_ir_op_get_dest(ir, cq); + IROperand width_op = tcc_ir_op_get_dest(ir, cq); if (width_op.btype != IROP_BTYPE_STRUCT) { IROperand stack_op = irop_make_stackoff(-1, folded_off, /*is_lval*/ 1, /*is_llocal*/ 0, @@ -2514,31 +2512,19 @@ int tcc_ir_opt_lea_fold(TCCIRState *ir) stack_op.is_unsigned = width_op.is_unsigned; stack_op.is_static = lea_src.is_static; - if (is_store_idx) - { - IROperand val = tcc_ir_op_get_src1(ir, cq); - cq->op = TCCIR_OP_STORE; - tcc_ir_set_dest(ir, cur_idx, stack_op); - tcc_ir_set_src1(ir, cur_idx, val); - tcc_ir_set_src2(ir, cur_idx, IROP_NONE); - } - else - { - IROperand orig_dest = tcc_ir_op_get_dest(ir, cq); - cq->op = TCCIR_OP_LOAD; - tcc_ir_set_dest(ir, cur_idx, orig_dest); - tcc_ir_set_src1(ir, cur_idx, stack_op); - tcc_ir_set_src2(ir, cur_idx, IROP_NONE); - } + IROperand orig_dest = tcc_ir_op_get_dest(ir, cq); + cq->op = TCCIR_OP_LOAD; + tcc_ir_set_dest(ir, cur_idx, orig_dest); + tcc_ir_set_src1(ir, cur_idx, stack_op); + tcc_ir_set_src2(ir, cur_idx, IROP_NONE); lea_q->op = TCCIR_OP_NOP; if (add_idx >= 0) ir->compact_instructions[add_idx].op = TCCIR_OP_NOP; changes++; - LOG_IR_GEN("LEA FOLD INDEXED: LEA@%d%s -> %s_INDEXED@%d -> %s (offset=%d+%d+%d=%d)", - i, (add_idx >= 0 ? " + ADD" : ""), is_store_idx ? "STORE" : "LOAD", cur_idx, - is_store_idx ? "STORE" : "LOAD", base_offset, add_offset, + LOG_IR_GEN("LEA FOLD INDEXED: LEA@%d%s -> LOAD_INDEXED@%d -> LOAD (offset=%d+%d+%d=%d)", + i, (add_idx >= 0 ? 
" + ADD" : ""), cur_idx, base_offset, add_offset, (int32_t)idx.u.imm32, folded_off); continue; } @@ -2550,6 +2536,12 @@ int tcc_ir_opt_lea_fold(TCCIRState *ir) int which = 0; if (!find_deref_use_operand(ir, cur_idx, deref_vr, &which)) continue; + /* Keep stores through the address temp explicit. A direct StackLoc store + * followed by direct StackLoc loads lets later scalar stack-slot passes + * reason about one field while missing other aliases through the aggregate + * address. Read-side folds are still safe and keep the common load win. */ + if (which == 3) + continue; IRQuadCompact *cons_q = &ir->compact_instructions[cur_idx]; @@ -2992,4 +2984,3 @@ int tcc_ir_opt_assign_fuse(TCCIRState *ir) int tcc_ir_opt_postinc_fusion_ex(IROptCtx *ctx) { return tcc_ir_opt_postinc_fusion(ctx->ir); } int tcc_ir_opt_assign_fuse_ex(IROptCtx *ctx) { return tcc_ir_opt_assign_fuse(ctx->ir); } - diff --git a/ir/opt_gens_branch.c b/ir/opt_gens_branch.c index 94894622..3fa9aa4e 100644 --- a/ir/opt_gens_branch.c +++ b/ir/opt_gens_branch.c @@ -21,6 +21,51 @@ #include "opt_utils.h" #include "opt_gens_branch.h" +static int ir_branch_cmp_width(IROperand src1, IROperand src2) +{ + return (irop_get_btype(src1) == IROP_BTYPE_INT64 || + irop_get_btype(src2) == IROP_BTYPE_INT64) + ? 
64 + : 32; +} + +static int ir_branch_eval_const_cmp(int64_t val1, int64_t val2, int cond, + IROperand src1, IROperand src2) +{ + if (ir_branch_cmp_width(src1, src2) != 64) + { + uint32_t u1 = (uint32_t)val1; + uint32_t u2 = (uint32_t)val2; + int32_t s1 = (int32_t)u1; + int32_t s2 = (int32_t)u2; + switch (cond) + { + case TOK_EQ: + return u1 == u2; + case TOK_NE: + return u1 != u2; + case TOK_LT: + return s1 < s2; + case TOK_GE: + return s1 >= s2; + case TOK_LE: + return s1 <= s2; + case TOK_GT: + return s1 > s2; + case TOK_ULT: + return u1 < u2; + case TOK_UGE: + return u1 >= u2; + case TOK_ULE: + return u1 <= u2; + case TOK_UGT: + return u1 > u2; + default: + break; + } + } + return evaluate_compare_condition(val1, val2, cond); +} static int ir_gen_branch_fold_test_zero(IROptCtx *ctx, int i) { @@ -118,7 +163,7 @@ static int ir_gen_branch_fold_cmp(IROptCtx *ctx, int i) IROperand cond = tcc_ir_op_get_src1(ir, jump_q); int tok = (int)irop_get_imm64_ex(ir, cond); - int result = evaluate_compare_condition(val1, val2, tok); + int result = ir_branch_eval_const_cmp(val1, val2, tok, src1, src2); if (result < 0) return 0; diff --git a/ir/opt_gens_call_result.c b/ir/opt_gens_call_result.c index 1cf54bf6..2c415e94 100644 --- a/ir/opt_gens_call_result.c +++ b/ir/opt_gens_call_result.c @@ -61,6 +61,13 @@ static int ir_gen_dead_call_result(IROptCtx *ctx, int i) if (irop_get_vreg(po) == dest_vr) return 0; } + /* MLA has a 4th accumulator operand that the three-slot scan above + * misses; a call result consumed only as an accumulator is not dead. 
*/ + if (p->op == TCCIR_OP_MLA) { + IROperand accum = tcc_ir_op_get_accum(ir, p); + if (irop_get_vreg(accum) == dest_vr) + return 0; + } } } diff --git a/ir/opt_gens_fusion.c b/ir/opt_gens_fusion.c index 7c5ab68b..17178031 100644 --- a/ir/opt_gens_fusion.c +++ b/ir/opt_gens_fusion.c @@ -223,6 +223,14 @@ static int ir_gen_mla_fusion(IROptCtx *ctx, int i) return 0; } + /* The MLA lands at the MUL's position, hoisting the ADD's accumulator + * read up to it. A memory-read accumulator (fused lvalue load) must not + * skip stores between the MUL and the ADD (mirror of the SSA-side + * mul-operand sink guard; volatile fuzz seed 5053 family). */ + if (ir_xform_operand_reads_memory(accum_op) && + (q->is_jump_target || !ir_xform_range_preserves_memory(ir, mul_idx, i))) + return 0; + IROperand final_dest = add_dest; int store_idx = -1; if (long_mla && irop_has_vreg(add_dest) && ir_opt_du_uses(du, irop_get_vreg(add_dest)) == 1) { @@ -263,23 +271,26 @@ static int ir_gen_mla_fusion(IROptCtx *ctx, int i) } mul_q->op = TCCIR_OP_MLA; - int mul_dest_idx = mul_q->operand_base; - if (mul_dest_idx >= 0 && mul_dest_idx < ir->iroperand_pool_count) - ir->iroperand_pool[mul_dest_idx] = final_dest; - int accum_idx = mul_q->operand_base + 3; - while (ir->iroperand_pool_count <= accum_idx) - tcc_ir_pool_add(ir, IROP_NONE); - if (accum_idx < ir->iroperand_pool_capacity) { - ir->iroperand_pool[accum_idx] = accum_op; - q->op = TCCIR_OP_NOP; - if (store_idx >= 0) - ir->compact_instructions[store_idx].op = TCCIR_OP_NOP; - return 1; + /* The MLA has four operands (dest, src1, src2, accum) but the original MUL + * only allocated three slots. Growing the block in-place at operand_base+3 + * can overwrite operands of instructions whose operand blocks were allocated + * between the MUL and the ADD, so move the whole operand block to a fresh + * 4-slot region at the end of the pool. 
*/ + { + int new_base = ir->iroperand_pool_count; + tcc_ir_pool_ensure(ir, 4); + tcc_ir_pool_add(ir, final_dest); + tcc_ir_pool_add(ir, tcc_ir_op_get_src1(ir, mul_q)); + tcc_ir_pool_add(ir, tcc_ir_op_get_src2(ir, mul_q)); + tcc_ir_pool_add(ir, accum_op); + mul_q->operand_base = new_base; } - mul_q->op = old_mul_op; - return 0; + q->op = TCCIR_OP_NOP; + if (store_idx >= 0) + ir->compact_instructions[store_idx].op = TCCIR_OP_NOP; + return 1; } static int ir_gen_indexed_memory_fusion(IROptCtx *ctx, int i) diff --git a/ir/opt_knownbits.c b/ir/opt_knownbits.c index 967e7d52..2fec84ff 100644 --- a/ir/opt_knownbits.c +++ b/ir/opt_knownbits.c @@ -33,6 +33,7 @@ #include "ir.h" #include "opt.h" +#include "opt_alias.h" #include "opt_engine.h" #include "opt_utils.h" @@ -174,6 +175,23 @@ static int kb_lval_stack_off(const TCCIRState *ir, IROperand op, return 0; } +/* Resolve a base pointer operand (e.g. STORE_INDEXED base) to a concrete + * stack-frame offset when it is a direct Addr[StackLoc] or a single-def + * TEMP/VAR holding such an address. 
*/ +static int kb_base_stack_off(const TCCIRState *ir, IROperand base, + const TmpKB *tmp_kb, int max_tmp_pos, + const VregAddrKB *var_addr, int max_var_pos, + int current_gen, int32_t *out_off) +{ + if (kb_is_direct_stackoff(base, 0)) + { + *out_off = (int32_t)irop_get_imm64_ex(ir, base); + return 1; + } + return vreg_addr_lookup(irop_get_vreg(base), tmp_kb, max_tmp_pos, var_addr, + max_var_pos, current_gen, out_off); +} + static int kb_value_is_stack_addr(const TCCIRState *ir, IROperand op, const TmpKB *tmp_kb, int max_tmp_pos, const VregAddrKB *var_addr, int max_var_pos, @@ -365,7 +383,13 @@ static int kb_operand_const_u64(const TCCIRState *ir, const IROperand *op, if (btype == IROP_BTYPE_FLOAT32 || btype == IROP_BTYPE_FLOAT64 || btype == IROP_BTYPE_STRUCT) return 0; - *out = kb_apply_const_width((uint64_t)irop_get_imm64_ex(ir, *op), btype, op->is_unsigned); + /* An immediate already stores its actual signed/unsigned VALUE in u.imm32 + * (a signed char -56 holds -56; an unsigned char 208 holds 208). Applying + * sub-word width extension would re-interpret the low byte as a bit pattern + * and sign-extend it — corrupting an `unsigned char` 208 (0xd0) to -48 when + * the immediate's is_unsigned flag was dropped upstream (combo seed 1053). + * Read immediates raw; only memory loads model sub-word extension. */ + *out = (uint64_t)irop_get_imm64_ex(ir, *op); return 1; } @@ -424,7 +448,7 @@ static IROperand kb_make_const_operand(TCCIRState *ir, uint64_t val, int btype) return irop_make_i64(-1, pool_idx, btype); } -static int kb_const_compute(TccIrOp op, int dest_btype, +static int kb_const_compute(TccIrOp op, int dest_btype, int src1_btype, uint64_t a, uint64_t b, uint64_t *out) { int width = (dest_btype == IROP_BTYPE_INT64) ? 64 : 32; @@ -434,9 +458,24 @@ static int kb_const_compute(TccIrOp op, int dest_btype, { case TCCIR_OP_ASSIGN: case TCCIR_OP_LOAD: - case TCCIR_OP_ZEXT: *out = a; break; + case TCCIR_OP_ZEXT: + { + /* Zero-extend from the SOURCE width. 
kb_operand_const_u64 sign-extends a + * signed source to 64 bits, so a verbatim copy would poison the high half + * (e.g. ZEXT(#-326:I32) must give 0x00000000FFFFFEBA, not ...FFFFFEBA). */ + uint64_t src_mask; + switch (src1_btype) + { + case IROP_BTYPE_INT8: src_mask = 0xFFULL; break; + case IROP_BTYPE_INT16: src_mask = 0xFFFFULL; break; + case IROP_BTYPE_INT32: src_mask = 0xFFFFFFFFULL; break; + default: src_mask = ~0ULL; break; + } + *out = a & src_mask; + break; + } case TCCIR_OP_ADD: *out = a + b; break; @@ -460,7 +499,9 @@ static int kb_const_compute(TccIrOp op, int dest_btype, case TCCIR_OP_SHR: if (b >= (uint64_t)width) return 0; - *out = a >> b; + /* Logical shift: mask the source to the operation width first so the + * sign-extended high bits (for a 32-bit op) are not shifted in. */ + *out = (a & mask) >> b; break; case TCCIR_OP_SAR: if (b >= (uint64_t)width) @@ -721,6 +762,7 @@ static int kb_compute(TccIrOp op, uint32_t a_kz, uint32_t a_ko, static int tcc_ir_opt_known_bits__timed(TCCIRState *ir); int tcc_ir_opt_known_bits(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("known_bits")) return 0; tcc_pass_timing_init(); if (!tcc_pass_timing_on) return tcc_ir_opt_known_bits__timed(ir); unsigned long _t = tcc_pass_clk_us(); @@ -848,6 +890,23 @@ static int tcc_ir_opt_known_bits__timed(TCCIRState *ir) if (have_off) { + /* A narrow store at stack_off also overwrites bytes belonging to any + * OTHER tracked slot whose range overlaps [stack_off, stack_off+width) + * — e.g. a sub-word bitfield write (INT16 at offset N) clobbers the + * high half of the enclosing word slot at N-2. stack_kb_set only + * touches the exact-offset slot, so without invalidating the + * overlapping aliases a later wide load of one of them would fold to a + * stale value (the bitfield write silently lost). Mirrors the overlap + * invalidation already done by the STORE_INDEXED and wide-store paths. 
*/ + int width = ir_opt_store_btype_size_bytes(dest_btype); + if (width <= 0) + width = 4; + for (int s = 0; s < n_stack_slots; s++) + if (stack_slots[s].off != stack_off && + stack_slots[s].off < stack_off + width && + stack_slots[s].off + 4 > stack_off) + stack_slots[s].gen = 0; + uint32_t kz, ko; if (kb_operand(ir, src1, tmp_kb, max_tmp_pos, current_gen, var_addr, max_var_pos, @@ -873,6 +932,43 @@ static int tcc_ir_opt_known_bits__timed(TCCIRState *ir) goto post_op; } + /* STORE_INDEXED / STORE_POSTINC: *(base + (idx << scale)) = src. + * If the base resolves to a known stack address and the index/scale are + * constant, invalidate the touched slot(s). Otherwise be conservative: + * a variable-indexed or unknown-base indexed store may alias any slot. + * Without this, kb can fold a later direct StackLoc load to a stale value + * because it never saw the indexed write clobber the slot. */ + if (op == TCCIR_OP_STORE_INDEXED || op == TCCIR_OP_STORE_POSTINC) + { + IROperand base = tcc_ir_op_get_dest(ir, q); + IROperand idx = tcc_ir_op_get_src2(ir, q); + IROperand sc = tcc_ir_op_get_scale(ir, q); + int32_t base_off; + + if (kb_base_stack_off(ir, base, tmp_kb, max_tmp_pos, var_addr, + max_var_pos, current_gen, &base_off) && + irop_is_immediate(idx) && !idx.is_sym && + irop_is_immediate(sc) && !sc.is_sym) + { + int shift = (int)irop_get_imm64_ex(ir, sc) & 3; + int32_t off = base_off + ((int32_t)irop_get_imm64_ex(ir, idx) << shift); + IROperand val = tcc_ir_op_get_src1(ir, q); + int width = ir_opt_store_btype_size_bytes(irop_get_btype(val)); + if (width <= 0) + width = 4; + for (int s = 0; s < n_stack_slots; s++) + if (stack_slots[s].off < off + width && + stack_slots[s].off + 4 > off) + stack_slots[s].gen = 0; + } + else + { + stack_kb_invalidate_all(stack_slots, n_stack_slots); + } + stack_dirty_since_split = 1; + goto post_op; + } + /* CALL: stack locals only become externally mutable after their address * escapes. 
Indirect control flow and asm remain fully conservative. */ if (op == TCCIR_OP_FUNCCALLVOID || op == TCCIR_OP_FUNCCALLVAL) @@ -907,9 +1003,6 @@ static int tcc_ir_opt_known_bits__timed(TCCIRState *ir) stack_dirty_since_split = 1; } - if (op == TCCIR_OP_JUMPIF) - stack_dirty_since_split = 0; - /* TEST_ZERO + JUMPIF EQ/NE folding using known-bits. When kb proves * src1 has any known-one bit (ko != 0), the value is provably non-zero * and the EQ branch is dead / NE branch unconditional. branch_folding @@ -1224,7 +1317,7 @@ static int tcc_ir_opt_known_bits__timed(TCCIRState *ir) var_addr, max_var_pos, stack_slots, n_stack_slots, &cv2); if (h1 && (!irop_config[op].has_src2 || h2) && - kb_const_compute(op, dest_btype, cv1, cv2, &cres)) + kb_const_compute(op, dest_btype, s1_btype, cv1, cv2, &cres)) { IROperand imm = kb_make_const_operand(ir, cres, dest_btype); imm.is_unsigned = dest.is_unsigned; @@ -1241,6 +1334,24 @@ static int tcc_ir_opt_known_bits__timed(TCCIRState *ir) if (low_mask && ((uint32_t)cres & low_mask) == low_mask) suppress_rewrite = 1; } + /* ASSIGN is already the canonical constant form for plain immediates, + * while ASSIGN with a load-shaped source must preserve that operand's + * dereference tags. Record the known bits, but do not rewrite it in + * this fast path. */ + if (op == TCCIR_OP_ASSIGN) + suppress_rewrite = 1; + /* If a source is an lvalue, the fully-known result depends on a memory + * read. Keep the instruction shape so later codegen still performs + * that read; only record the known-bits fact for local consumers. + * LOAD is the exception: its src1 is the address being loaded, and + * folding a load from a known stack slot into an immediate ASSIGN is + * exactly what this pass is supposed to do. Use irop_op_is_lval so + * that a missing src2 (IROP_NONE, whose packed vr field has all bits + * set) does not accidentally look like an lvalue. 
*/ + if (op != TCCIR_OP_LOAD && + ((irop_config[op].has_src1 && irop_op_is_lval(s1)) || + (irop_config[op].has_src2 && irop_op_is_lval(s2)))) + suppress_rewrite = 1; if (!already_folded && !suppress_rewrite) { q->op = TCCIR_OP_ASSIGN; @@ -1551,12 +1662,12 @@ recheck_wide:; LOG_IR_GEN( "OPTIMIZE: knownbits fold TMP:%d = #%d at i=%d (kz=%08x ko=%08x)", dpos, val, i, dkz, dko); + changes++; tmp_kb[dpos].gen = current_gen; tmp_kb[dpos].kz = ~(uint32_t)val; tmp_kb[dpos].ko = (uint32_t)val; tmp_kb[dpos].const_val = (uint32_t)val; tmp_kb[dpos].has_const = 1; - changes++; continue; } tmp_kb[dpos].gen = current_gen; diff --git a/ir/opt_loop.c b/ir/opt_loop.c index bd97c8a2..abf6d811 100644 --- a/ir/opt_loop.c +++ b/ir/opt_loop.c @@ -352,6 +352,20 @@ int tcc_ir_opt_loop_bound_remat(TCCIRState *ir) if (irop_get_tag(src) != IROP_TAG_STACKOFF) continue; + /* Only rematerialize an address-of-stack computation (`Addr[StackLoc]`, + * is_lval=0) — the SP-relative *end pointer* this pass targets. A + * value LOAD from a stack slot (is_lval=1) is NOT an end pointer: it + * reads memory whose content can differ from a fresh anonymous-slot + * load. In particular a value-load of a named local VAR (is_local=1, + * carrying a live VAR vreg) is a register/SSA value with no guaranteed + * physical home at that offset; rematerializing it as a raw + * `StackLoc[off]` load (vreg=-1) reads uninitialized stack. (fuzz + * seed 6214: pre-loop `u8 <= ~cs` test read `u8` from an unwritten + * StackLoc[0].) Recomputing a stack ADDRESS, by contrast, is always + * sound, so keep those. 
*/ + if (src.is_lval) + continue; + int32_t stack_off = (int32_t)irop_get_imm64_ex(ir, src); int is_param = src.is_param; int is_lval = src.is_lval; @@ -923,6 +937,17 @@ int tcc_ir_opt_decrement_to_zero(TCCIRState *ir) IRQuadCompact *q = &ir->compact_instructions[i]; if (q->op != TCCIR_OP_CMP) continue; + /* A bottom-tested / rotated loop with no pre-test guard distinct from + * its own back-edge CMP/JUMPIF exposes the back-edge test itself inside + * this header-window scan. Never accept it as the "pre-test guard": + * the apply step below rewrites be_cmp_idx/be_jmpif_idx (CMP #0, != 0) + * and then unconditionally NOPs hdr_cmp_idx/hdr_jmpif_idx. If those + * coincide, step 5 would delete the loop's only remaining back-edge + * test, degenerating the loop to a single iteration. Skipping it here + * leaves hdr_cmp_idx == -1, so the transform bails at the guard below. + * See docs/bugs.md #12. */ + if (i == be_cmp_idx) + continue; IROperand s1 = tcc_ir_op_get_src1(ir, q); if (irop_get_vreg(s1) != iv_vr) continue; @@ -933,6 +958,8 @@ int tcc_ir_opt_decrement_to_zero(TCCIRState *ir) jq_idx++; if (jq_idx < n && ir->compact_instructions[jq_idx].op == TCCIR_OP_JUMPIF) { + if (jq_idx == be_jmpif_idx) + continue; /* same guard-coincidence hazard as above */ hdr_cmp_idx = i; hdr_jmpif_idx = jq_idx; break; diff --git a/ir/opt_loop_const_sim.c b/ir/opt_loop_const_sim.c index 337209c5..5c0b0457 100644 --- a/ir/opt_loop_const_sim.c +++ b/ir/opt_loop_const_sim.c @@ -31,6 +31,7 @@ #include "ir.h" #include "opt.h" #include "opt_engine.h" +#include "opt_alias.h" #include "opt_loop_const_sim.h" #include "opt_loop_utils.h" #include "opt_utils.h" @@ -49,6 +50,8 @@ typedef struct LcsSlot int known; int64_t value; int btype; /* IROP_BTYPE_INT32 / INT64 / FLOAT32 / FLOAT64 */ + int is_unsigned; /* sign of a narrow (INT8/INT16) value — needed so the + residual is zero- vs sign-extended correctly */ int is_addr; /* value is a stack offset (Addr[StackLoc[value]]) */ } LcsSlot; @@ -67,6 
+70,7 @@ typedef struct LcsMemSlot int32_t offset; /* stack offset (negative = local) */ int64_t value; int btype; + int is_unsigned; /* sign of a narrow store — see LcsSlot.is_unsigned */ int known; /* current value is known */ int written; /* sim wrote to this slot at least once */ int64_t initial_value; /* value before the loop (if initial_known) */ @@ -106,6 +110,7 @@ static LcsMemSlot *lcs_mem_get(LcsState *st, int32_t offset) s->offset = offset; s->value = 0; s->btype = IROP_BTYPE_INT32; + s->is_unsigned = 0; s->known = 0; s->written = 0; s->initial_value = 0; @@ -113,6 +118,31 @@ static LcsMemSlot *lcs_mem_get(LcsState *st, int32_t offset) return s; } +/* A store of `width` bytes at `offset` also clobbers any OTHER tracked slot + * whose byte range overlaps it. Slots are keyed by exact offset with no + * width awareness, so a packed-bitfield byte store at word_off+3 must mark + * the word's slot unknown (and vice versa) — otherwise the simulator folds + * a later RMW from the stale full-word value (bitfield seed 11840: byte-3 + * b3 store ignored, the collapsed loop store wiped it back to 0). 
*/ +static void lcs_mem_clobber_overlaps(LcsState *st, int32_t offset, int width, + const LcsMemSlot *keep) +{ + for (int i = 0; i < st->n_mem; i++) + { + LcsMemSlot *m = &st->mem[i]; + if (m == keep) + continue; + int mw = ir_opt_store_btype_size_bytes(m->btype); + if (mw <= 0) + mw = 4; + if (m->offset < offset + width && m->offset + mw > offset) + { + m->known = 0; + m->initial_known = 0; + } + } +} + /* Resolve an operand to a stack offset when it is either: * - a literal stack-address operand: Addr[StackLoc[off]] (LEA-style source) * - a TEMP/VAR whose simulator slot is marked is_addr @@ -326,6 +356,7 @@ static int lcs_write_operand(LcsState *st, IROperand op, int64_t value, int btyp st->vars[pos].known = 1; st->vars[pos].value = value; st->vars[pos].btype = btype; + st->vars[pos].is_unsigned = op.is_unsigned; st->vars[pos].is_addr = 0; return 1; } @@ -336,6 +367,7 @@ static int lcs_write_operand(LcsState *st, IROperand op, int64_t value, int btyp st->tmps[pos].known = 1; st->tmps[pos].value = value; st->tmps[pos].btype = btype; + st->tmps[pos].is_unsigned = op.is_unsigned; st->tmps[pos].is_addr = 0; return 1; } @@ -365,6 +397,7 @@ static int lcs_write_addr_operand(LcsState *st, IROperand op, int32_t stack_offs slot->known = 1; slot->value = stack_offset; slot->btype = IROP_BTYPE_INT32; + slot->is_unsigned = 0; slot->is_addr = 1; return 1; } @@ -495,6 +528,40 @@ static int lcs_eval_softcall(int kind, int is_double, LcsState *st, return 1; } +/* Evaluate a comparison whose operands are soft-float bit patterns (set by a + * cfcmp / cdcmp flag-setter). b1/b2 are the raw 32- or 64-bit FP bits; tok is + * the same relational token evaluate_compare_condition uses. Returns 1 + * (taken), 0 (not taken), or -1 (unsupported token -> caller bails). + * Unordered (NaN) operands make every relation false except "!=", matching C + * and the ARM flag semantics the lowered branch tests. 
*/ +static int lcs_evaluate_fp_compare(int64_t b1, int64_t b2, int tok, int is_double) +{ + double a, b; + if (is_double) + { + union { double d; uint64_t u; } x, y; + x.u = (uint64_t)b1; y.u = (uint64_t)b2; + a = x.d; b = y.d; + } + else + { + union { float f; uint32_t u; } x, y; + x.u = (uint32_t)b1; y.u = (uint32_t)b2; + a = (double)x.f; b = (double)y.f; + } + int unordered = (a != a) || (b != b); + switch (tok) + { + case 0x94: /* TOK_EQ */ return !unordered && (a == b); + case 0x95: /* TOK_NE */ return unordered || (a != b); + case 0x9c: /* TOK_LT */ return !unordered && (a < b); + case 0x9d: /* TOK_GE */ return !unordered && (a >= b); + case 0x9e: /* TOK_LE */ return !unordered && (a <= b); + case 0x9f: /* TOK_GT */ return !unordered && (a > b); + default: return -1; /* unsigned/unknown token: bail */ + } +} + static LcsStep lcs_exec(TCCIRState *ir, LcsState *st, IRQuadCompact *q, int pc, int start_idx, int end_idx, int cmp_idx, int jmpif_idx, int exit_target) @@ -549,6 +616,7 @@ static LcsStep lcs_exec(TCCIRState *ir, LcsState *st, IRQuadCompact *q, int pc, st->vars[dpos].known = 1; st->vars[dpos].value = store_val; st->vars[dpos].btype = dbt; + st->vars[dpos].is_unsigned = dest.is_unsigned; st->vars[dpos].is_addr = 0; recorded_in_var = 1; } @@ -595,8 +663,15 @@ static LcsStep lcs_exec(TCCIRState *ir, LcsState *st, IRQuadCompact *q, int pc, if (!ms) { r.action = 0; return r; } ms->value = store_val; ms->btype = dbt; + ms->is_unsigned = dest.is_unsigned; ms->known = 1; ms->written = 1; + { + int sw = ir_opt_store_btype_size_bytes(dbt); + if (sw <= 0) + sw = 4; + lcs_mem_clobber_overlaps(st, off, sw, ms); + } return r; } @@ -762,7 +837,13 @@ static LcsStep lcs_exec(TCCIRState *ir, LcsState *st, IRQuadCompact *q, int pc, return r; } int tok = (int)irop_get_imm64_ex(ir, src1); - int taken = evaluate_compare_condition(st->cmp_v1, st->cmp_v2, tok); + /* A compare flagged by a soft-float helper (cfcmp / cdcmp) holds raw FP + * bit patterns in cmp_v1/cmp_v2; 
evaluating them as integers is wrong for + * any operand whose sign bit is set (a negative float bit pattern reads as + * a huge unsigned int). Reinterpret and compare as float/double. */ + int taken = st->cmp_is_fp + ? lcs_evaluate_fp_compare(st->cmp_v1, st->cmp_v2, tok, st->cmp_is_double) + : evaluate_compare_condition(st->cmp_v1, st->cmp_v2, tok); if (taken < 0) { r.action = 0; @@ -1145,6 +1226,23 @@ static void lcs_init_var_state(TCCIRState *ir, int start_idx, LcsState *st) q->op == TCCIR_OP_RETURNVALUE || q->op == TCCIR_OP_RETURNVOID || q->op == TCCIR_OP_TRAP) continue; + /* A pre-loop write through a computed/indexed address (STORE_INDEXED, + * STORE_POSTINC) or a bulk copy (BLOCK_COPY) can land on ANY stack slot: + * the direct- and known-address STORE seeding below cannot resolve its + * target offset, so without this an overwritten slot would keep the stale + * value of an EARLIER direct store. Conservatively demote every tracked + * memory slot to flow-unsafe so the simulator never trusts a stale initial + * value. (agg_deep seed 47: `st12.f2 = st12.f0 ^ *p` lowers to a + * `STORE_INDEXED #4` off `&st12`, overwriting the slot the loop body then + * copies into `st12.f0`; missing that store folded the copy to f2's stale + * initializer constant.) The base/pointer vreg is still demoted by the + * generic dest handling below, so we do not skip the rest of the loop. 
*/ + if (q->op == TCCIR_OP_STORE_INDEXED || q->op == TCCIR_OP_STORE_POSTINC || + q->op == TCCIR_OP_BLOCK_COPY) + { + for (int m = 0; m < st->n_mem; m++) + mem_flow_unsafe[m] = 1; + } if (!irop_config[q->op].has_dest) continue; IROperand d = tcc_ir_op_get_dest(ir, q); if (d.is_llocal || d.is_sym) continue; @@ -1160,6 +1258,14 @@ static void lcs_init_var_state(TCCIRState *ir, int start_idx, LcsState *st) LcsMemSlot *ms = lcs_mem_get(st, off); if (!ms) continue; int mem_idx = (int)(ms - st->mem); + /* A pre-loop store also clobbers overlapping slots tracked at OTHER + * offsets (packed sub-word accesses of the same word). */ + { + int sw = ir_opt_store_btype_size_bytes(irop_get_btype(d)); + if (sw <= 0) + sw = 4; + lcs_mem_clobber_overlaps(st, off, sw, ms); + } if (mem_flow_unsafe[mem_idx]) continue; if (irop_is_immediate(s1)) @@ -1210,6 +1316,62 @@ static void lcs_init_var_state(TCCIRState *ir, int start_idx, LcsState *st) } continue; } + /* Indirect STORE through a known stack-address temp/var: + * T <- Addr[StackLoc[off]] ; T***DEREF*** <- value + * The body simulator resolves exactly this form (see the TCCIR_OP_STORE + * case in lcs_step), so the pre-loop scan must too: otherwise a pre-loop + * write through an address alias is dropped, leaving the slot's initial + * value stale and mis-seeding the simulation (bitfield seed 5 -- a packed + * RMW of b1 via Addr[bf], then a loop RMW of b2 in the same word; the + * missed b1 store made the residual store clobber b1 back to 0). 
*/ + if (q->op == TCCIR_OP_STORE && d.is_lval) + { + int32_t avr = irop_get_vreg(d); + if (avr >= 0) + { + int atype = TCCIR_DECODE_VREG_TYPE(avr); + int apos = TCCIR_DECODE_VREG_POSITION(avr); + const LcsSlot *aslot = NULL; + if (atype == TCCIR_VREG_TYPE_VAR && apos < st->n_vars) + aslot = &st->vars[apos]; + else if (atype == TCCIR_VREG_TYPE_TEMP && apos < st->n_tmps) + aslot = &st->tmps[apos]; + if (aslot && aslot->known && aslot->is_addr) + { + int32_t off = (int32_t)aslot->value; + LcsMemSlot *ms = lcs_mem_get(st, off); + if (ms) + { + int mem_idx = (int)(ms - st->mem); + { + int sw = ir_opt_store_btype_size_bytes(irop_get_btype(d)); + if (sw <= 0) + sw = 4; + lcs_mem_clobber_overlaps(st, off, sw, ms); + } + if (!mem_flow_unsafe[mem_idx]) + { + IROperand s1 = tcc_ir_op_get_src1(ir, q); + if (irop_is_immediate(s1)) + { + ms->value = irop_get_imm64_ex(ir, s1); + ms->btype = irop_get_btype(d); + ms->known = 1; + ms->initial_value = ms->value; + ms->initial_known = 1; + mem_has_def[mem_idx] = 1; + } + else + { + ms->known = 0; + ms->initial_known = 0; + } + } + } + continue; + } + } + } if (d.is_local && !d.is_lval) continue; int32_t vr = irop_get_vreg(d); if (vr < 0) continue; @@ -1259,6 +1421,47 @@ static void lcs_init_var_state(TCCIRState *ir, int start_idx, LcsState *st) *has_def = 0; } } + else if (q->op == TCCIR_OP_ADD || q->op == TCCIR_OP_SUB) + { + /* Address arithmetic: `T = <stack-addr> +/- immediate` produces + * another stack address. lcs_step models this (see the ADD/SUB case), + * so the pre-loop scan must too — otherwise a later indirect store + * through the result (`T = &arr + 4; *T = v`) can't resolve its target + * slot and leaves that slot's stale initializer in the memory map + * (combo_num seed 872: `arr12[u11&7] = ...` lowers to + * `T = Addr[StackLoc] ADD #4; *T = <value>`, and missing it let the + * unrolled/simulated loop read arr12[1]'s .data initializer instead). 
*/ + IROperand s1 = tcc_ir_op_get_src1(ir, q); + IROperand s2 = tcc_ir_op_get_src2(ir, q); + int32_t base_off; + if (lcs_resolve_stack_addr(st, s1, &base_off) && (!s1.is_lval || s1.is_local) && + irop_is_immediate(s2) && !s2.is_sym) + { + int64_t imm = irop_get_imm64_ex(ir, s2); + slot->known = 1; + slot->value = (q->op == TCCIR_OP_ADD) ? (base_off + imm) : (base_off - imm); + slot->btype = IROP_BTYPE_INT32; + slot->is_addr = 1; + *has_def = 1; + } + else if (q->op == TCCIR_OP_ADD && + lcs_resolve_stack_addr(st, s2, &base_off) && (!s2.is_lval || s2.is_local) && + irop_is_immediate(s1) && !s1.is_sym) + { + int64_t imm = irop_get_imm64_ex(ir, s1); + slot->known = 1; + slot->value = base_off + imm; + slot->btype = IROP_BTYPE_INT32; + slot->is_addr = 1; + *has_def = 1; + } + else + { + /* addr - addr, addr +/- runtime, etc.: not a resolvable address. */ + slot->known = 0; + *has_def = 0; + } + } else { /* Any other op writing this slot: we don't model — demote. */ @@ -1343,6 +1546,16 @@ static int lcs_var_used_after(TCCIRState *ir, int var_pos, int from_idx) { int n = ir->next_instruction_index; int32_t target_vr = TCCIR_ENCODE_VREG(TCCIR_VREG_TYPE_VAR, var_pos); + /* This is a linear scan over instruction *indices*, which only reflects + * control flow while the path stays straight-line. A redefinition therefore + * kills the loop's value only in the straight-line prefix from the loop exit: + * once we pass any branch, a later redefinition may sit in a sibling + * (not-taken) branch while the real use is reached via another path. That is + * exactly fuzz seed 8985 — the loop is in an `if` branch, the value is read + * after the merge, and the `else` branch redefines the same VAR at a lower + * index than that read. Honouring the kill there wrongly dropped the loop's + * residual store, leaving the variable at its pre-loop value. 
*/ + int saw_branch = 0; for (int i = from_idx; i < n; i++) { IRQuadCompact *q = &ir->compact_instructions[i]; @@ -1357,12 +1570,16 @@ static int lcs_var_used_after(TCCIRState *ir, int var_pos, int from_idx) IROperand s = tcc_ir_op_get_src2(ir, q); if (irop_get_vreg(s) == target_vr) return 1; } - /* A redefinition kills any need to preserve the loop's value */ - if (irop_config[q->op].has_dest) + /* A redefinition kills the loop's value only when it is unconditionally + * reached from the loop exit (no branch in between). */ + if (!saw_branch && irop_config[q->op].has_dest) { IROperand d = tcc_ir_op_get_dest(ir, q); if (!d.is_lval && irop_get_vreg(d) == target_vr) return 0; } + if (q->op == TCCIR_OP_JUMP || q->op == TCCIR_OP_JUMPIF || q->op == TCCIR_OP_IJUMP || + q->op == TCCIR_OP_SWITCH_TABLE) + saw_branch = 1; } return 0; } @@ -1565,7 +1782,28 @@ static int lcs_try_fold(TCCIRState *ir, IRLoop *loop) if (have_iv_trip && exit_target > eff_end + 1) { if (exit_target - eff_start > 512) return 0; + int orig_end = eff_end; eff_end = exit_target - 1; + /* The extension assumes [orig_end+1 .. eff_end] is rotated loop body, + * reachable only through the loop's own control flow. If an instruction + * OUTSIDE the loop jumps INTO this absorbed region, it is not loop body + * at all but a separate block that merely sits between the back-edge and + * the exit target — e.g. the ELSE arm of a guard whose THEN arm holds the + * loop: the guard's false-branch JUMP lands on the else block, which lies + * before the join. Folding it into the loop would NOP the else block and + * misroute the guard jump to the exit, dropping the else body entirely + * (longlong seed 2426). The caller's ext_entry check only covered the + * pre-extension range, so re-check the newly-absorbed tail here. 
*/ + int nn = ir->next_instruction_index; + for (int j = 0; j < nn; j++) + { + if (j >= eff_start && j <= eff_end) continue; + IRQuadCompact *jq = &ir->compact_instructions[j]; + if (jq->op != TCCIR_OP_JUMP && jq->op != TCCIR_OP_JUMPIF) continue; + int jt = (int)irop_get_imm64_ex(ir, tcc_ir_op_get_dest(ir, jq)); + if (jt > orig_end && jt <= eff_end) + return 0; + } } if (!have_iv_trip) @@ -1652,6 +1890,7 @@ static int lcs_try_fold(TCCIRState *ir, IRLoop *loop) st.vars[pos].known = 1; st.vars[pos].value = iv->init_val; st.vars[pos].btype = IROP_BTYPE_INT32; + st.vars[pos].is_unsigned = 0; st.vars[pos].is_addr = 0; } } @@ -1819,6 +2058,12 @@ static int lcs_try_fold(TCCIRState *ir, IRLoop *loop) int btype = st.vars[p].btype ? st.vars[p].btype : IROP_BTYPE_INT32; int32_t vr = TCCIR_ENCODE_VREG(TCCIR_VREG_TYPE_VAR, p); IROperand d = irop_make_vreg(vr, btype); + /* Preserve the sign of a narrow (INT8/INT16) VAR. st.vars[p].value holds + * the un-narrowed simulated value; downstream const-prop narrows it to the + * residual's width via ir_opt_fit_const_to_operand, which sign- vs + * zero-extends based on is_unsigned. Dropping this flag would sign-extend + * an unsigned char (e.g. 254 -> -2) and miscompile. */ + d.is_unsigned = st.vars[p].is_unsigned; int64_t val = st.vars[p].value; IROperand s; if (btype == IROP_BTYPE_FLOAT64) @@ -1857,6 +2102,7 @@ static int lcs_try_fold(TCCIRState *ir, IRLoop *loop) int btype = ms->btype ? 
ms->btype : IROP_BTYPE_INT32; IROperand d = irop_make_stackoff(-1, ms->offset, /*is_lval*/ 1, /*is_llocal*/ 0, /*is_param*/ 0, btype); + d.is_unsigned = ms->is_unsigned; int64_t val = ms->value; IROperand s; if (btype == IROP_BTYPE_FLOAT64) @@ -1972,6 +2218,39 @@ int tcc_ir_opt_loop_const_sim(TCCIRState *ir) if (loop->depth > 1) continue; /* Skip very large loop ranges to keep cost bounded */ if (loop->end_idx - loop->start_idx > 256) continue; + + /* Skip loops that have external entries into the body (not to the + * header) — same guard as try_unroll_loop_ex/opt_loop.c. + * tcc_ir_detect_loops flags ANY JUMP/JUMPIF whose numeric target is + * lower than its own index as a loop back edge, with no dominance + * check. A switch's case-body-before-dispatch layout (the dispatch + * jumps forward in control flow to a case handler that was laid out + * earlier in instruction order) satisfies that test without being a + * loop at all: the dispatch's own entry jump lands inside the "body" + * but not at the "header", which a real loop never does. Simulating + * such a false loop executes switch-case code as if it were a + * repeating body, corrupting the result (seed 589, switch profile). 
*/ + int ext_entry = 0; + for (int j = 0; j < ir->next_instruction_index && !ext_entry; j++) + { + if (j >= loop->start_idx && j <= loop->end_idx) + continue; /* skip instructions inside the loop itself */ + IRQuadCompact *jq = &ir->compact_instructions[j]; + if (jq->op == TCCIR_OP_JUMP || jq->op == TCCIR_OP_JUMPIF) + { + IROperand jdest = tcc_ir_op_get_dest(ir, jq); + int jtarget = (int)irop_get_imm64_ex(ir, jdest); + if (jtarget > loop->start_idx && jtarget <= loop->end_idx) + { + LOG_IR_GEN("[LOOP-CONST-SIM] loop header=%d: external entry from [%d] to [%d], skipping", + loop->header_idx, j, jtarget); + ext_entry = 1; + } + } + } + if (ext_entry) + continue; + changes += lcs_try_fold(ir, loop); } diff --git a/ir/opt_loop_utils.c b/ir/opt_loop_utils.c index 29637364..834f3754 100644 --- a/ir/opt_loop_utils.c +++ b/ir/opt_loop_utils.c @@ -804,11 +804,29 @@ int insert_instr_at(TCCIRState *ir, int pos, TccIrOp op, IROperand dest, IROpera } } } + for (int ti = 0; ti < ir->num_switch_tables; ti++) + { + TCCIRSwitchTable *table = &ir->switch_tables[ti]; + if (table->default_target >= pos) + table->default_target++; + for (int tj = 0; tj < table->num_entries; tj++) + { + if (table->targets[tj] >= pos) + table->targets[tj]++; + } + } /* Create the new instruction using operand pool */ IRQuadCompact *new_q = &ir->compact_instructions[pos]; new_q->op = op; - new_q->orig_index = pos; + /* Assign a fresh unique orig_index — never re-use the compact position + * `pos`, which both collides with an existing instruction's key and is not + * reflected in ir->max_orig_index. Side tables keyed by orig_index and + * sized max_orig_index+1 (ir->barrel_shifts[], shift64_dead_half[], + * bfi_params[], the codegen orig->code map) would otherwise be + * under-allocated and over-read in codegen. Bumping max_orig_index keeps + * them sized to cover every live orig_index. 
*/ + new_q->orig_index = ++ir->max_orig_index; new_q->is_jump_target = 0; /* shifted instructions carry their flag; new slot has none */ new_q->no_unroll = 0; new_q->line_num = 0; @@ -830,37 +848,122 @@ int insert_instr_at(TCCIRState *ir, int pos, TccIrOp op, IROperand dest, IROpera * only rewrites use_idx in place to ASSIGN dest, shared_ptr. */ -/* True if vreg `v` is the DIV-pointer `ud_vr` itself, or is defined inside the - * loop by an ADD/SUB/LEA that has `ud_vr` as one operand — i.e. `v = ud_vr + - * offset`, a field address derived from the strength-reduction pointer. A - * struct-field load `arr[i].f` lowers to `t = (base + iv*stride); a = t + foff; - * LOAD [a]`, so the memory access dereferences `a` (= ud_vr + foff), NOT ud_vr - * directly. The direct is_lval scan therefore misses it; this follows one - * level of offset arithmetic so such DIVs are correctly treated as feeding a - * memory access. */ -static int sr_vreg_is_ud_or_offset(TCCIRState *ir, IRLoop *loop, int32_t v, int32_t ud_vr) +/* Escape analysis for the derived-IV address value (docs/bugs.md #2). + * + * Taint-tracks every vreg that may (transitively) carry the DIV's computed + * address within [lo..hi] and verifies the value never leaves the plain + * register domain: the ONLY allowed consumers are ASSIGN / ADD / SUB copies + * and arithmetic (which propagate the taint to their dest) and CMP. Any + * other use disqualifies the DIV: + * - a dereference (any lval-marked operand holding a tainted vreg), + * - STORE / STORE_INDEXED / STORE_POSTINC / LOAD / LOAD_INDEXED touching + * a tainted vreg in any slot (address OR stored value), + * - FUNCPARAMVAL (the address escapes into a call), + * - anything else (RETURNVALUE, IJUMP, MLA, ...). + * + * This replaces the earlier one-level `ud_vr + offset` scan, which missed + * multi-hop flows (va-arg-24: the ADD's dest reached the loop store through + * a chain the scan could not correlate). 
Flow-insensitive: a stale taint + * after redefinition only over-approximates, i.e. skips more DIVs — safe. + * Returns 1 when the value provably stays in registers, 0 otherwise + * (including scan-capacity overflow). */ +#define SR_TAINT_MAX 64 +static int sr_div_value_stays_in_regs(TCCIRState *ir, int lo, int hi, int32_t seed_vr) { - if (v < 0) - return 0; - if (v == ud_vr) - return 1; - int lo = loop->start_idx >= 0 ? loop->start_idx : 0; - int hi = loop->end_idx < ir->next_instruction_index ? loop->end_idx : ir->next_instruction_index - 1; - for (int i = lo; i <= hi; i++) + int32_t taint[SR_TAINT_MAX]; + int nt = 0; + taint[nt++] = seed_vr; + + int changed = 1; + while (changed) { - IRQuadCompact *q = &ir->compact_instructions[i]; - if (q->op != TCCIR_OP_ADD && q->op != TCCIR_OP_SUB && q->op != TCCIR_OP_LEA) - continue; - if (!irop_config[q->op].has_dest) - continue; - if (irop_get_vreg(tcc_ir_op_get_dest(ir, q)) != v) - continue; - int32_t a = irop_config[q->op].has_src1 ? irop_get_vreg(tcc_ir_op_get_src1(ir, q)) : -1; - int32_t b = irop_config[q->op].has_src2 ? irop_get_vreg(tcc_ir_op_get_src2(ir, q)) : -1; - if (a == ud_vr || b == ud_vr) - return 1; + changed = 0; + for (int j = lo; j <= hi; j++) + { + IRQuadCompact *q = &ir->compact_instructions[j]; + if (q->op == TCCIR_OP_NOP) + continue; + + /* Gather all operand slots this instruction READS. For STORE-style + * ops the dest slot holds the write address — a read of the pointer. + * MLA's accumulator lives at pool slot +3, invisible to src1/src2 + * accessors (the ptr-6869 blind spot) — gather it explicitly. 
*/ + IROperand reads[4]; + int nreads = 0; + int dest_is_read = (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || + q->op == TCCIR_OP_STORE_POSTINC || q->op == TCCIR_OP_FUNCPARAMVAL); + if (irop_config[q->op].has_src1) + reads[nreads++] = tcc_ir_op_get_src1(ir, q); + if (irop_config[q->op].has_src2) + reads[nreads++] = tcc_ir_op_get_src2(ir, q); + if (dest_is_read && irop_config[q->op].has_dest) + reads[nreads++] = tcc_ir_op_get_dest(ir, q); + if (q->op == TCCIR_OP_MLA) + reads[nreads++] = tcc_ir_op_get_accum(ir, q); + + int reads_taint = 0; + for (int r = 0; r < nreads; r++) + { + int32_t rv = irop_get_vreg(reads[r]); + if (rv < 0) + continue; + int t = 0; + for (int k = 0; k < nt; k++) + { + if (taint[k] == rv) + { + t = 1; + break; + } + } + if (!t) + continue; + /* An lval-marked read is a memory dereference of the tainted value — + * EXCEPT the IR's plain "fetch variable" form: a VAR-typed vreg with + * is_lval+is_local reads the variable's own value, not memory through + * it (see opt_dead_vla.c CLASSIFY / opt_loop_const_sim.c notes). */ + if (reads[r].is_lval && + !(TCCIR_DECODE_VREG_TYPE(rv) == TCCIR_VREG_TYPE_VAR && reads[r].is_local)) + return 0; + reads_taint = 1; + } + if (!reads_taint) + continue; + + /* Tainted value consumed here — allow only plain ALU/copy/compare. 
*/ + if (q->op != TCCIR_OP_ASSIGN && q->op != TCCIR_OP_ADD && q->op != TCCIR_OP_SUB && + q->op != TCCIR_OP_CMP) + return 0; + + if (irop_config[q->op].has_dest) + { + IROperand d = tcc_ir_op_get_dest(ir, q); + if (d.is_lval) + return 0; /* store through an lval dest — memory write */ + int32_t dv = irop_get_vreg(d); + if (dv >= 0) + { + int already = 0; + for (int k = 0; k < nt; k++) + { + if (taint[k] == dv) + { + already = 1; + break; + } + } + if (!already) + { + if (nt >= SR_TAINT_MAX) + return 0; /* capacity — be conservative */ + taint[nt++] = dv; + changed = 1; + } + } + } + } } - return 0; + return 1; } int transform_derived_iv(TCCIRState *ir, IRLoop *loop, InductionVar *iv, DerivedIV *div, int *out_ptr_vreg, @@ -875,78 +978,36 @@ int transform_derived_iv(TCCIRState *ir, IRLoop *loop, InductionVar *iv, Derived if (out_stride_pos) *out_stride_pos = -1; - /* DIAGNOSTIC: temporarily disable all derived-IV strength reduction to test - * whether it is the source of the linker heap corruption. REMOVE after test. - * (Plain early return rather than a `(void*)1` sentinel: the sentinel made - * GCC's VRP assume out_ptr_vreg == (void*)1 past the check, so the later - * `*out_ptr_vreg = ...` writes tripped -Werror=array-bounds.) */ - return 0; + /* Kill-switch for bisection: TCC_DISABLE_PASS=derived_iv (docs/bugs.md #2). */ + if (tcc_ir_opt_pass_disabled("derived_iv")) + return 0; - /* Shared-pointer fast path: rewrite the use site to ASSIGN of the existing - * primary's strength-reduced pointer. No insertions — just rewrites. - * Returns 1 to signal success without triggering the caller's index-shift - * bookkeeping (no instructions inserted). */ - if (shared_ptr_vreg >= 0) + /* Never rewrite a derived IV whose use site is itself an indexed memory + * access (STORE_INDEXED / LOAD_INDEXED). 
The escape scan below also + * rejects these, but keep the explicit guard as a backstop for direct + * callers: the backend already forms efficient indexed addressing for + * array element accesses, so nothing is lost by skipping. */ + if (div->use_idx >= 0 && div->use_idx < ir->next_instruction_index) { - if (div->use_idx < 0 || div->use_idx >= ir->next_instruction_index) + int uop = ir->compact_instructions[div->use_idx].op; + if (uop == TCCIR_OP_STORE_INDEXED || uop == TCCIR_OP_LOAD_INDEXED) return 0; - IRQuadCompact *use_q = &ir->compact_instructions[div->use_idx]; - IROperand ptr_op = irop_make_vreg(shared_ptr_vreg, IROP_BTYPE_INT32); - IROperand null_op = {0}; - - /* INDEXED-DIV use site: rewrite LOAD_INDEXED→LOAD or STORE_INDEXED→STORE - * pointing at the shared primary's pointer. The trailing index/scale - * slots are left orphaned in the pool (harmless — plain LOAD/STORE never - * reads them). */ - if (use_q->op == TCCIR_OP_LOAD_INDEXED) - { - IROperand ptr_lval = ptr_op; - ptr_lval.is_lval = 1; - use_q->op = TCCIR_OP_LOAD; - tcc_ir_op_set_src1(ir, use_q, ptr_lval); - LOG_IV_SR("IV_SR: shared INDEXED-DIV at idx=%d rewritten to LOAD <- TMP%d", div->use_idx, - TCCIR_DECODE_VREG_POSITION(shared_ptr_vreg)); - if (out_ptr_vreg) - *out_ptr_vreg = shared_ptr_vreg; - return 1; - } - if (use_q->op == TCCIR_OP_STORE_INDEXED) - { - IROperand ptr_lval = ptr_op; - ptr_lval.is_lval = 1; - use_q->op = TCCIR_OP_STORE; - tcc_ir_op_set_dest(ir, use_q, ptr_lval); - LOG_IV_SR("IV_SR: shared INDEXED-DIV at idx=%d rewritten to STORE -> TMP%d", div->use_idx, - TCCIR_DECODE_VREG_POSITION(shared_ptr_vreg)); - if (out_ptr_vreg) - *out_ptr_vreg = shared_ptr_vreg; - return 1; - } - - use_q->op = TCCIR_OP_ASSIGN; - tcc_ir_op_set_src1(ir, use_q, ptr_op); - tcc_ir_op_set_src2(ir, use_q, null_op); - /* If this DIV had a separate SHL/MUL feeding into it (shl_idx >= 0), - * NOP it — its result is now dead because the consuming ADD just became - * an ASSIGN. 
Leaving a dead SHL/MUL in place would let later passes - * (e.g. local_alu_cse) treat its output as a live equivalent expression - * and CSE other matching ADDs into stale values, miscompiling the loop. - * For MLA-fused DIVs (shl_idx == -1) there is no separate instruction. */ - if (div->shl_idx >= 0 && div->shl_idx < ir->next_instruction_index) - { - IRQuadCompact *shl_q = &ir->compact_instructions[div->shl_idx]; - shl_q->op = TCCIR_OP_NOP; - } - /* Note: MLA's accum operand at +3 is now orphaned in the pool, harmless. */ - if (out_ptr_vreg) - *out_ptr_vreg = shared_ptr_vreg; - LOG_IV_SR("IV_SR: shared-DIV at idx=%d rewritten to ASSIGN <- TMP%d (NOPed shl_idx=%d)", div->use_idx, - TCCIR_DECODE_VREG_POSITION(shared_ptr_vreg), div->shl_idx); - return 1; } - /* Bail out for a derived IV whose computed address feeds a MEMORY ACCESS - * (a load or store through that address). + /* Shared-pointer rewrites (share_with groups reusing a primary's pointer) + * are NOT supported: the rewrite ran no escape analysis and could not + * prove the shared use executes before the primary's `ptr += stride` bump + * within an iteration — reading a post-increment pointer value for a + * pre-increment address (docs/bugs.md #2). The caller's one-transform- + * per-invocation policy makes this path unreachable anyway (a duplicate is + * only visited when its primary FAILED); duplicates are instead re-detected + * as independent primaries by the driver's re-detection loop and validated + * on their own. */ + if (shared_ptr_vreg >= 0) + return 0; + + /* Bail out for a derived IV whose computed address value can reach a MEMORY + * ACCESS or otherwise escape the register domain inside the loop. 
* * For such a DIV (address temp = base + iv*stride, then `*addr` is read or * written), rewriting the address computation to the strength-reduced pointer @@ -961,8 +1022,14 @@ int transform_derived_iv(TCCIRState *ir, IRLoop *loop, InductionVar *iv, Derived * Skipping these keeps strength reduction correct; the backend already forms * efficient indexed (LDR/STR rN,[rb,rm,LSL#k]) and post-increment addressing * for array element accesses, so little is lost. A genuine non-memory - * derived IV (address used only in further pointer arithmetic) is still - * reduced. */ + * derived IV (address used only in further register arithmetic/compares) is + * still reduced. + * + * The scan must cover the FULL loop body: for an unrotated top-tested loop + * the body proper is a detached range AFTER the back-edge ([start..end] only + * covers test+latch), reached via a forward jump — exactly how va-arg-24's + * store escaped the earlier [start_idx..end_idx] scan. body_instrs[] holds + * the extended contiguous range computed by tcc_ir_detect_loops. */ if (div->use_idx >= 0 && div->use_idx < ir->next_instruction_index) { int uop = ir->compact_instructions[div->use_idx].op; @@ -971,29 +1038,23 @@ int transform_derived_iv(TCCIRState *ir, IRLoop *loop, InductionVar *iv, Derived { IROperand ud = tcc_ir_op_get_dest(ir, &ir->compact_instructions[div->use_idx]); int32_t ud_vr = irop_get_vreg(ud); - int lo = loop->start_idx >= 0 ? loop->start_idx : 0; - int hi = loop->end_idx < ir->next_instruction_index ? loop->end_idx : ir->next_instruction_index - 1; if (ud_vr >= 0) { - for (int si = lo; si <= hi && !feeds_mem; si++) + int lo = loop->start_idx >= 0 ? loop->start_idx : 0; + int hi = loop->end_idx; + if (loop->num_body_instrs > 0) { - IRQuadCompact *sq = &ir->compact_instructions[si]; - /* STORE-like: the address is the (lval) destination. The base may be - * ud_vr itself or `ud_vr + field_offset` (see sr_vreg_is_ud_or_offset). 
*/ - if ((sq->op == TCCIR_OP_STORE || sq->op == TCCIR_OP_STORE_INDEXED || sq->op == TCCIR_OP_STORE_POSTINC)) - { - IROperand sd = tcc_ir_op_get_dest(ir, sq); - if (sd.is_lval && sr_vreg_is_ud_or_offset(ir, loop, irop_get_vreg(sd), ud_vr)) - feeds_mem = 1; - } - /* LOAD-like / any deref: the address is an lval source operand. */ - if (!feeds_mem && irop_config[sq->op].has_src1) - { - IROperand s1 = tcc_ir_op_get_src1(ir, sq); - if (s1.is_lval && sr_vreg_is_ud_or_offset(ir, loop, irop_get_vreg(s1), ud_vr)) - feeds_mem = 1; - } + int b_first = loop->body_instrs[0]; + int b_last = loop->body_instrs[loop->num_body_instrs - 1]; + if (b_first >= 0 && b_first < lo) + lo = b_first; + if (b_last > hi) + hi = b_last; } + if (hi >= ir->next_instruction_index) + hi = ir->next_instruction_index - 1; + feeds_mem = !sr_div_value_stays_in_regs(ir, lo, hi, ud_vr); + LOG_IV_SR("IV_SR: escape scan [%d..%d] for DIV at use_idx=%d: feeds_mem=%d", lo, hi, div->use_idx, feeds_mem); } } if (feeds_mem) @@ -1833,12 +1894,11 @@ int iv_strength_reduction_core(TCCIRState *ir, IRLoops *loops) LOG_IV_SR("IV_SR: Found %d DIV(s) in loop %d", num_divs, li); /* Deduplicate DIVs that compute identical (iv, stride, base) recurrences. - * Without this, N identical MLAs (e.g. arr[i].a, arr[i].b, arr[i].c each - * computing the same &arr[i]) would each get its own strength-reduced - * pointer, requiring N pointer bumps in the latch — strictly worse than - * the original. Mark each duplicate's share_with field with the index of - * the earliest equivalent DIV, so the transform can rewrite them to - * ASSIGN dest, primary_ptr instead of allocating fresh pointers. 
*/ + * A duplicate (share_with >= 0) is only attempted when its primary FAILED + * to transform, and transform_derived_iv refuses shared rewrites outright + * (no escape analysis ran for the duplicate's use site — docs/bugs.md #2), + * so marking a duplicate effectively defers it: the driver's re-detection + * loop revisits it as an independent primary with exact indices. */ for (int dj = 1; dj < num_divs; dj++) { for (int dk = 0; dk < dj; dk++) @@ -2239,8 +2299,17 @@ int collect_body_instructions(TCCIRState *ir, IRLoop *loop, int iv_vreg, int cmp /* Scan only [start_idx..end_idx]. The forward-jump extension in the loop * detector can pull in post-loop instructions (e.g. the exit target), which * must NOT be treated as body. The merge pass already ensures end_idx - * covers all body instructions from overlapping loops. */ - for (int i = loop->start_idx; i <= loop->end_idx && count < max_body; i++) + * covers all body instructions from overlapping loops. + * + * Scan the FULL range — do NOT stop at max_body. Stopping early would + * silently TRUNCATE the body: try_unroll_loop_ex would then NOP the whole + * [start..end] region and re-emit only the collected prefix × trip_count, + * dropping every instruction past the cap (including an inner loop's control + * flow, which lives in the tail). That miscompiles — random-C seed 18 has a + * 203-instruction body whose first 32 collectable insns are straight-line, so + * the truncated prefix passed the JUMPIF/call rejection below and unrolled an + * incomplete body. An over-cap body is rejected outright (see below). */ + for (int i = loop->start_idx; i <= loop->end_idx; i++) { IRQuadCompact *q = &ir->compact_instructions[i]; @@ -2304,6 +2373,14 @@ int collect_body_instructions(TCCIRState *ir, IRLoop *loop, int iv_vreg, int cmp return -1; } + /* Body has more real instructions than we can buffer / safely unroll. + * Reject instead of truncating: a truncated body unrolls to wrong code. 
*/ + if (count >= max_body) + { + LOG_LOOP_OPT("collect_body: REJECTED body exceeds max_body=%d", max_body); + return -1; + } + LOG_LOOP_OPT("collect_body: body[%d] = instr [%d] op=%d", count, i, q->op); body_indices[count++] = i; } @@ -2593,6 +2670,21 @@ int try_eliminate_loop_symbolic(TCCIRState *ir, IRLoop *loop) (guard_cmp >= 0 && guard_jmpif >= 0 && num_acc_used == 1 && !counter_used_after && ivs[single_acc_idx].init_val == 0); + /* The fallback closed form below writes UNCONDITIONAL final IV values + * (counter = limit; acc = limit*step). Those are only correct when the loop + * provably executes at least once. But the limit here is SYMBOLIC (constant + * limits go through try_eliminate_loop), so a top-tested `while`/`for` with + * limit <= init runs ZERO times and every IV must keep its init value — e.g. + * `i=0; while(istart_idx; i <= loop->end_idx; i++) ir->compact_instructions[i].op = TCCIR_OP_NOP; @@ -2952,16 +3044,38 @@ int try_unroll_loop_ex(TCCIRState *ir, IRLoop *loop, IRLoops *loops, int loop_id * can include post-loop instructions that must not be touched. */ int loop_end = loop->end_idx; + /* The loop is removed by NOPing the whole region (including its exit + * JUMPIF) and writing the unrolled body in place; control then leaves the + * region by fall-through to loop_end+1. That only reaches the loop's exit + * target when exit_target IS the physical successor. For a loop nested in a + * branch (or an inner loop whose exit is the outer latch) the exit target + * sits past intervening code, and the original exit was taken ONLY via the + * now-NOP'd JUMPIF — never by fall-through. Detect that and reserve a slot + * for an explicit exit JUMP (mirrors need_exit_jump in try_rotate_loop). 
*/ + int need_exit_jump = 0; + { + int n2 = ir->next_instruction_index; + int ft = loop_end + 1; + while (ft < n2 && ir->compact_instructions[ft].op == TCCIR_OP_NOP) + ft++; + int et = exit_target; + while (et < n2 && ir->compact_instructions[et].op == TCCIR_OP_NOP) + et++; + if (ft != et) + need_exit_jump = 1; + } + /* The unrolled body needs trip_count*body_count slots plus 1 optional slot - * for the IV final value (if used after the loop). When the original loop - * region is too small, insert NOPs immediately after loop_end and extend - * loop_end to cover them. insert_instr_at shifts subsequent instructions - * and patches all jump targets that point at or past the insertion site. - * Indices inside [start_idx..loop_end] (body_indices, cmp_idx, jmpif_idx, - * iv->def_idx, iv->init_idx) are unchanged; exit_target sits after the loop - * and must be shifted manually. */ + * for the IV final value (if used after the loop) and 1 more for the exit + * JUMP (if needed). When the original loop region is too small, insert NOPs + * immediately after loop_end and extend loop_end to cover them. + * insert_instr_at shifts subsequent instructions and patches all jump + * targets that point at or past the insertion site. Indices inside + * [start_idx..loop_end] (body_indices, cmp_idx, jmpif_idx, iv->def_idx, + * iv->init_idx) are unchanged; exit_target sits after the loop and must be + * shifted manually. */ int avail_slots = loop_end - loop->start_idx + 1; - int needed_slots = total_insns + 1; /* +1 reserved for IV final assignment */ + int needed_slots = total_insns + 1 + need_exit_jump; /* +1 IV final, +1 exit JUMP */ /* Only grow the IR (and ripple-update sibling loop records) when this is * the sole loop being processed. 
In multi-loop functions the cross-loop * book-keeping is fragile — even with body_instrs/start/end fix-up some @@ -3250,12 +3364,33 @@ int try_unroll_loop_ex(TCCIRState *ir, IRLoop *loop, IRLoops *loops, int loop_id if (ir->compact_instructions[i].op == TCCIR_OP_NOP) { write_instr_at_nop(ir, i, TCCIR_OP_ASSIGN, iv_dest, iv_val_op, (IROperand){0}); + write_pos = i + 1; break; } } } } + /* Emit the loop's exit branch when fall-through does not reach exit_target. + * The original exit JUMPIF was NOP'd with the rest of the loop; without this + * the unrolled body falls through into whatever code physically follows the + * loop (e.g. the else block of an enclosing if, or an outer loop's body). + * Must come after the IV-final assignment so that value is still computed. */ + if (need_exit_jump) + { + for (int i = write_pos; i <= loop_end; i++) + { + if (ir->compact_instructions[i].op == TCCIR_OP_NOP) + { + IROperand exit_dest = irop_make_imm32(-1, exit_target, IROP_BTYPE_INT32); + write_instr_at_nop(ir, i, TCCIR_OP_JUMP, exit_dest, (IROperand){0}, (IROperand){0}); + if (exit_target >= 0 && exit_target < ir->next_instruction_index) + ir->compact_instructions[exit_target].is_jump_target = 1; + break; + } + } + } + ret = 1; unroll_cleanup: @@ -3338,6 +3473,30 @@ int try_rotate_loop(TCCIRState *ir, IRLoop *loop) return 0; } + /* Reject rotating a loop that is nested inside an ALREADY-ROTATED loop. + * Rotating both an outer loop and an inner loop nested within it produces a + * doubly-rotated nested shape that a later pass miscompiles (random-C O2 + * wrong-code, Finding #15 follow-up, seed 49: nested csmix accumulators). + * Rotating EITHER loop alone is correct, so decline the inner one once the + * enclosing loop has been rotated. A rotated (bottom-tested) loop's back-edge + * is a conditional JUMPIF that branches backward to the loop body; an + * un-rotated (top-tested) loop's back-edge is an unconditional JUMP to the + * header. 
So look for a backward-branching JUMPIF that strictly encloses + * [hi, backedge_idx] — that is an enclosing rotated loop. */ + for (int i = 0; i < n; i++) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op != TCCIR_OP_JUMPIF) + continue; + IROperand jd = tcc_ir_op_get_dest(ir, q); + int jt = (int)irop_get_imm64_ex(ir, jd); + if (jt >= 0 && jt < hi && i > backedge_idx) + { + LOG_LOOP_OPT("Rotation: reject — nested inside already-rotated loop [%d..%d]", jt, i); + return 0; + } + } + /* --- Step 3: Identify latch region [latch_start .. latch_end] --- */ int latch_start = hi + 3; int latch_end = backedge_idx - 1; /* exclude back-edge JUMP */ @@ -3562,6 +3721,114 @@ int try_rotate_loop(TCCIRState *ir, IRLoop *loop) if (body_count > 128) return 0; + /* A VAR lval marked local is the IR's ordinary "read/write this local + * variable" spelling. Other lval operands dereference an address carried in + * a vreg (for example T123***DEREF***). Keep those loops top-tested: the + * rotated bottom-tested shape exposes the deref to later forwarding/threading + * passes in forms they do not fully model yet. 
*/ +#define ROT_LVAL_IS_INDIRECT(op_) \ + ((op_).is_lval && irop_get_vreg(op_) >= 0 && \ + !(TCCIR_DECODE_VREG_TYPE(irop_get_vreg(op_)) == TCCIR_VREG_TYPE_VAR && (op_).is_local)) + + { + int32_t iv_vr = irop_get_vreg(tcc_ir_op_get_src1(ir, cmp_q)); + int32_t seen_reads[32]; + int nseen_reads = 0; + int32_t carried_defs[8]; + int ncarried_defs = 0; + +#define ROT_NOTE_READ(op_) \ + do { \ + int32_t _vr = irop_get_vreg(op_); \ + if (_vr >= 0 && _vr != iv_vr && TCCIR_DECODE_VREG_TYPE(_vr) == TCCIR_VREG_TYPE_VAR) { \ + int _seen = 0; \ + for (int _k = 0; _k < nseen_reads; _k++) \ + if (seen_reads[_k] == _vr) { \ + _seen = 1; \ + break; \ + } \ + if (!_seen && nseen_reads < (int)(sizeof(seen_reads) / sizeof(seen_reads[0]))) \ + seen_reads[nseen_reads++] = _vr; \ + } \ + } while (0) + +#define ROT_NOTE_DEF(op_) \ + do { \ + int32_t _vr = irop_get_vreg(op_); \ + if (_vr >= 0 && _vr != iv_vr && TCCIR_DECODE_VREG_TYPE(_vr) == TCCIR_VREG_TYPE_VAR) { \ + int _read = 0; \ + for (int _k = 0; _k < nseen_reads; _k++) \ + if (seen_reads[_k] == _vr) { \ + _read = 1; \ + break; \ + } \ + if (_read) { \ + int _carried = 0; \ + for (int _k = 0; _k < ncarried_defs; _k++) \ + if (carried_defs[_k] == _vr) { \ + _carried = 1; \ + break; \ + } \ + if (!_carried && ncarried_defs < (int)(sizeof(carried_defs) / sizeof(carried_defs[0]))) \ + carried_defs[ncarried_defs++] = _vr; \ + } \ + } \ + } while (0) + + for (int i = body_start; i <= body_end; i++) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + int op = q->op; + if (irop_config[op].has_src1) + ROT_NOTE_READ(tcc_ir_op_get_src1(ir, q)); + if (irop_config[op].has_src2) + ROT_NOTE_READ(tcc_ir_op_get_src2(ir, q)); + if (op == TCCIR_OP_MLA) + ROT_NOTE_READ(tcc_ir_op_get_accum(ir, q)); + if (irop_config[op].has_dest) + ROT_NOTE_DEF(tcc_ir_op_get_dest(ir, q)); + } + if (ncarried_defs > 1) + { + LOG_LOOP_OPT("Rotation: reject — body carries %d non-IV VARs", ncarried_defs); + return 0; + } + +#undef ROT_NOTE_READ +#undef ROT_NOTE_DEF + } 
+ + /* Calls inside the rotated body make the carried live ranges cross a + * different control-flow shape after rotation. Later forwarding/coalescing + * can then observe the preheader/body copies as interchangeable when the + * call-clobbered value is not. Keep call-containing loops in their original + * top-tested form; simple call-free counted loops still rotate. */ + for (int i = body_start; i <= body_end; i++) + { + int op = ir->compact_instructions[i].op; + IRQuadCompact *q = &ir->compact_instructions[i]; + if (op == TCCIR_OP_FUNCCALLVAL || op == TCCIR_OP_FUNCCALLVOID) + { + LOG_LOOP_OPT("Rotation: reject — body has call at %d", i); + return 0; + } + if (op == TCCIR_OP_LOAD_INDEXED || op == TCCIR_OP_STORE_INDEXED) + { + LOG_LOOP_OPT("Rotation: reject — body has indexed memory op at %d", i); + return 0; + } + if ((irop_config[op].has_src1 && ROT_LVAL_IS_INDIRECT(tcc_ir_op_get_src1(ir, q))) || + (irop_config[op].has_src2 && ROT_LVAL_IS_INDIRECT(tcc_ir_op_get_src2(ir, q))) || + (op == TCCIR_OP_MLA && ROT_LVAL_IS_INDIRECT(tcc_ir_op_get_accum(ir, q))) || + ((op == TCCIR_OP_STORE || op == TCCIR_OP_STORE_POSTINC) && + irop_config[op].has_dest && ROT_LVAL_IS_INDIRECT(tcc_ir_op_get_dest(ir, q)))) + { + LOG_LOOP_OPT("Rotation: reject — body has indirect lvalue operand at %d", i); + return 0; + } + } +#undef ROT_LVAL_IS_INDIRECT + /* --- Step 4a2: Reject if body has a fall-through exit --- */ /* When body_end_is_implicit, the body may end with trailing NOPs (from * eliminated fall-through jumps) after a JUMPIF. 
In the original layout, @@ -4001,4 +4268,3 @@ int loop_size_cmp(const void *a, const void *b) int sb = lb->end_idx - lb->start_idx; return sa - sb; } - diff --git a/ir/opt_memory.c b/ir/opt_memory.c index 6113fdc8..9b24b344 100644 --- a/ir/opt_memory.c +++ b/ir/opt_memory.c @@ -211,6 +211,7 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) int64_t offset; IROperand value; int btype; + int idx; } estores[MAX_ENTRY_STORES]; int estore_count = 0; @@ -276,12 +277,14 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) { estores[found].value = imm; estores[found].btype = IROP_BTYPE_INT32; + estores[found].idx = i; } else { estores[estore_count].offset = off; estores[estore_count].value = imm; estores[estore_count].btype = IROP_BTYPE_INT32; + estores[estore_count].idx = i; estore_count++; } } @@ -342,12 +345,14 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) { estores[found].value = src1; estores[found].btype = irop_get_btype(dest); + estores[found].idx = i; } else if (estore_count < MAX_ENTRY_STORES) { estores[estore_count].offset = off; estores[estore_count].value = src1; estores[estore_count].btype = irop_get_btype(dest); + estores[estore_count].idx = i; estore_count++; } } @@ -392,15 +397,48 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) if (eq->op != TCCIR_OP_STORE && eq->op != TCCIR_OP_STORE_INDEXED && eq->op != TCCIR_OP_STORE_POSTINC) continue; IROperand sd = tcc_ir_op_get_dest(ir, eq); - if (!sd.is_local || !sd.is_lval || sd.is_llocal) - continue; - if (irop_get_tag(sd) != IROP_TAG_STACKOFF) + int64_t soff = 0; + int have_soff = 0; + if (eq->op == TCCIR_OP_STORE) + { + if (sd.is_local && sd.is_lval && !sd.is_llocal && irop_get_tag(sd) == IROP_TAG_STACKOFF) + { + soff = irop_get_stack_offset(sd); + have_soff = 1; + } + } + else if (eq->op == TCCIR_OP_STORE_INDEXED) + { + /* disp_fusion rewrites stores like `st.field = x` into a + * STORE_INDEXED whose base is a non-lval Addr[StackLoc[base]] and + * whose index is the field byte offset. 
It still overwrites the + * concrete stack slot, so stale entry initializers for that field + * must be invalidated just like direct STORE StackLoc writes. */ + if (sd.is_local && !sd.is_lval && !sd.is_llocal && irop_get_tag(sd) == IROP_TAG_STACKOFF) + { + IROperand idx = tcc_ir_op_get_src2(ir, eq); + IROperand scale_op = tcc_ir_op_get_scale(ir, eq); + if (irop_is_immediate(idx) && !idx.is_sym && irop_is_immediate(scale_op)) + { + soff = irop_get_stack_offset(sd) + (irop_get_imm64_ex(ir, idx) << irop_get_imm64_ex(ir, scale_op)); + have_soff = 1; + } + } + } + if (!have_soff) continue; - int64_t soff = irop_get_stack_offset(sd); for (int k = 0; k < estore_count; k++) { if (estores[k].offset == soff) { + /* Once an offset is overwritten after the entry BB, its entry-BB + * value must not be forwarded: any later load of that offset — + * including a loop-interior deref reached via the back-edge — may + * observe the overwritten value, not the entry-BB initializer. + * Loads with runtime indices read memory directly; the entry-BB + * store instruction itself is preserved by the redundant-store + * elimination pass, which separately flushes on runtime + * LOAD_INDEXED. */ LOG_IR_GEN("ENTRY_STORE_PROP: invalidated off=%lld (rewritten at i=%d)", (long long)soff, j); estores[k].offset = 0x7FFFFFFFLL; } @@ -476,6 +514,20 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) SimpleLeaEntry *lea_map = tcc_mallocz(sizeof(SimpleLeaEntry) * (max_tmp + 1)); SimpleLeaEntry *var_lea_map = tcc_mallocz(sizeof(SimpleLeaEntry) * (max_var + 1)); + /* Separate map for TEMPs that hold `array_base + RUNTIME_index` pointers. Kept + * OUT of lea_map (whose exact offsets the forwarder relies on) so it cannot + * perturb constant-offset resolution — it is consumed only by the dedicated + * entry-BB runtime-store invalidation below (seed 294). 
*/ + int64_t *rt_base = tcc_mallocz(sizeof(int64_t) * (max_tmp + 1)); + uint8_t *rt_valid = tcc_mallocz(max_tmp + 1); + /* VAR analogue of rt_base: a `&arr[RUNTIME_index]` pointer materialised into + * a VAR local (e.g. `unsigned *p = &arr9[u6&7];`). The runtime store through + * such a VAR pointer must invalidate the whole array's entry initializers, + * but the TEMP-only rt_base map loses the base when the address lands in / + * is copied through a VAR. Tracking it here lets Phase 2.6 fire (ptr fuzz + * seed 3343: `*p11` with p11=&arr9[u6&7] left arr9[2]'s init forwardable). */ + int64_t *var_rt_base = tcc_mallocz(sizeof(int64_t) * (max_var + 1)); + uint8_t *var_rt_valid = tcc_mallocz(max_var + 1); for (int i = 0; i < n; i++) { @@ -498,6 +550,19 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) lea_map[p].valid = 1; } } + /* Same address landing directly in a VAR (e.g. an alias pointer + * `unsigned *q = &local;`): record it so a later store through q (or a + * TEMP copied from q) invalidates the matching entry-store (fuzz ptr + * seeds 206/368/394 — symmetric with the VAR-dest ADD case below). */ + else if (vr >= 0 && TCCIR_DECODE_VREG_TYPE(vr) == TCCIR_VREG_TYPE_VAR) + { + int p = TCCIR_DECODE_VREG_POSITION(vr); + if (p <= max_var) + { + var_lea_map[p].offset = irop_get_stack_offset(src1); + var_lea_map[p].valid = 1; + } + } } } @@ -518,6 +583,12 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) var_lea_map[dp].offset = lea_map[sp].offset; var_lea_map[dp].valid = 1; } + /* Carry a runtime array base into the VAR alias pointer too. */ + else if (sp <= max_tmp && rt_valid[sp] && dp <= max_var) + { + var_rt_base[dp] = rt_base[sp]; + var_rt_valid[dp] = 1; + } } } @@ -538,6 +609,34 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) lea_map[dp].offset = var_lea_map[sp].offset; lea_map[dp].valid = 1; } + /* A TEMP copied from a VAR runtime array pointer (`T = p11`) carries the + * runtime base, so a store `*T = ...` invalidates the array (seed 3343). 
*/ + else if (sp <= max_var && var_rt_valid[sp] && dp <= max_tmp) + { + rt_base[dp] = var_rt_base[sp]; + rt_valid[dp] = 1; + } + } + /* ASSIGN: TEMP <-- TEMP → a plain pointer copy carries the resolved + * stack offset (agg_deep seed 12085: `T12 = Addr[StackLoc[-100]] + 48; + * T15 = T12; *T15 = x` — without this, the store through T15 never + * invalidates the BLOCK_COPY initializer at offset -52 and Phase 3 + * forwards the stale constant). */ + else if (d_vr >= 0 && TCCIR_DECODE_VREG_TYPE(d_vr) == TCCIR_VREG_TYPE_TEMP && s1_vr >= 0 && + TCCIR_DECODE_VREG_TYPE(s1_vr) == TCCIR_VREG_TYPE_TEMP && !s1.is_lval) + { + int sp = TCCIR_DECODE_VREG_POSITION(s1_vr); + int dp = TCCIR_DECODE_VREG_POSITION(d_vr); + if (sp <= max_tmp && lea_map[sp].valid && dp <= max_tmp) + { + lea_map[dp].offset = lea_map[sp].offset; + lea_map[dp].valid = 1; + } + else if (sp <= max_tmp && rt_valid[sp] && dp <= max_tmp) + { + rt_base[dp] = rt_base[sp]; + rt_valid[dp] = 1; + } } } @@ -563,6 +662,19 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) lea_map[dp].offset = lea_map[sp].offset + irop_get_imm64_ex(ir, s2); lea_map[dp].valid = 1; } + else if (sp <= max_tmp && rt_valid[sp]) + { + /* `T = + const`: adding an immediate + * (column / field displacement) to a runtime-indexed array base + * keeps the result a runtime pointer into the SAME array — carry + * the base forward so a store through it still invalidates the + * array's entry initializers. Without this, Phase 2.6 loses the + * base at `T44 = T43 + #8` (T43 = &m + (row<<4)) and the stale + * 2-D-array initializer is forwarded past the loop store + * (agg_deep seed 781). 
*/ + rt_base[dp] = rt_base[sp]; + rt_valid[dp] = 1; + } } else if (s1.is_local && !s1.is_lval && irop_get_tag(s1) == IROP_TAG_STACKOFF && irop_is_immediate(s2) && !s2.is_sym) @@ -570,28 +682,96 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) lea_map[dp].offset = irop_get_stack_offset(s1) + irop_get_imm64_ex(ir, s2); lea_map[dp].valid = 1; } + else if (!irop_is_immediate(s2)) + { + /* base + RUNTIME index → record the array base (separate map). */ + int64_t base; + int have = 0; + if (s1_vr >= 0 && TCCIR_DECODE_VREG_TYPE(s1_vr) == TCCIR_VREG_TYPE_TEMP) + { + int sp = TCCIR_DECODE_VREG_POSITION(s1_vr); + if (sp <= max_tmp && lea_map[sp].valid) { base = lea_map[sp].offset; have = 1; } + else if (sp <= max_tmp && rt_valid[sp]) { base = rt_base[sp]; have = 1; } + } + else if (s1.is_local && !s1.is_lval && irop_get_tag(s1) == IROP_TAG_STACKOFF) + { + base = irop_get_stack_offset(s1); + have = 1; + } + if (have) { rt_base[dp] = base; rt_valid[dp] = 1; } + } + } + } + /* VAR-dest of `Addr[StackLoc] + const` or `LEA_temp + const` (an alias + * pointer materialized into a VAR, e.g. `V = &arr[1]`): record the + * constant offset so a store through it (directly, or via a TEMP copied + * from it) invalidates the matching entry-store instead of forwarding the + * stale initializer (fuzz ptr seeds 206/368/394). 
*/ + else if (d_vr >= 0 && TCCIR_DECODE_VREG_TYPE(d_vr) == TCCIR_VREG_TYPE_VAR) + { + int dp = TCCIR_DECODE_VREG_POSITION(d_vr); + if (dp <= max_var) + { + IROperand s1 = tcc_ir_op_get_src1(ir, q); + IROperand s2 = tcc_ir_op_get_src2(ir, q); + int32_t s1_vr = irop_get_vreg(s1); + if (s1_vr >= 0 && TCCIR_DECODE_VREG_TYPE(s1_vr) == TCCIR_VREG_TYPE_TEMP && + irop_is_immediate(s2) && !s2.is_sym) + { + int sp = TCCIR_DECODE_VREG_POSITION(s1_vr); + if (sp <= max_tmp && lea_map[sp].valid) + { + var_lea_map[dp].offset = lea_map[sp].offset + irop_get_imm64_ex(ir, s2); + var_lea_map[dp].valid = 1; + } + else if (sp <= max_tmp && rt_valid[sp]) + { + /* VAR analogue of the TEMP case above: `V = + const` stays a runtime pointer into the same array. */ + var_rt_base[dp] = rt_base[sp]; + var_rt_valid[dp] = 1; + } + } + else if (s1.is_local && !s1.is_lval && irop_get_tag(s1) == IROP_TAG_STACKOFF && + irop_is_immediate(s2) && !s2.is_sym) + { + var_lea_map[dp].offset = irop_get_stack_offset(s1) + irop_get_imm64_ex(ir, s2); + var_lea_map[dp].valid = 1; + } + /* `V = base + RUNTIME index` (an alias pointer into a stack array + * stored in a VAR). Record the array base so a later store through + * V — or a TEMP copied from V — is recognised as a runtime array + * write and invalidates the array's entry initializers (seed 3343). */ + else if (!irop_is_immediate(s2)) + { + int64_t base; + int have = 0; + if (s1_vr >= 0 && TCCIR_DECODE_VREG_TYPE(s1_vr) == TCCIR_VREG_TYPE_TEMP) + { + int sp = TCCIR_DECODE_VREG_POSITION(s1_vr); + if (sp <= max_tmp && lea_map[sp].valid) { base = lea_map[sp].offset; have = 1; } + else if (sp <= max_tmp && rt_valid[sp]) { base = rt_base[sp]; have = 1; } + } + else if (s1.is_local && !s1.is_lval && irop_get_tag(s1) == IROP_TAG_STACKOFF) + { + base = irop_get_stack_offset(s1); + have = 1; + } + if (have) { var_rt_base[dp] = base; var_rt_valid[dp] = 1; } + } } } } } - /* Phase 2.5: Invalidate entries for pointer stores through LEA-resolved TEMPs. 
- * Phase 1.5 only catches direct StackLoc stores; stores like T***DEREF*** <-- #0 - * where T resolves to a known stack offset via the LEA map are missed. After - * inlining, struct field writes go through pointer dereferences, so this is - * needed to prevent forwarding a stale entry-BB value past an overwrite. */ + /* Phase 2.5: Invalidate entries for later pointer stores through + * LEA-resolved TEMPs. Phase 1.5 only catches direct StackLoc stores; stores + * like T***DEREF*** <-- #0 where T resolves to a known stack offset via the + * LEA map are missed. These writes can still be in the entry BB after the + * direct initializer stores, so scan the whole function and use estores[].idx + * to reject only writes that happen after the collected entry value. */ { - int entry_bb_end = 0; for (int j = 0; j < n; j++) - { - IRQuadCompact *eq = &ir->compact_instructions[j]; - if (eq->is_jump_target || eq->op == TCCIR_OP_JUMP || eq->op == TCCIR_OP_JUMPIF) - { - entry_bb_end = j; - break; - } - } - for (int j = entry_bb_end; j < n; j++) { IRQuadCompact *eq = &ir->compact_instructions[j]; if (eq->op != TCCIR_OP_STORE && eq->op != TCCIR_OP_STORE_INDEXED && eq->op != TCCIR_OP_STORE_POSTINC) @@ -630,11 +810,13 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) IROperand s2 = tcc_ir_op_get_src2(ir, eq); if (!irop_is_immediate(s2)) continue; - soff += irop_get_imm64_ex(ir, s2); + IROperand scale_op = ir->iroperand_pool[eq->operand_base + 3]; + int scale = (int)irop_get_imm64_ex(ir, scale_op); + soff += (irop_get_imm64_ex(ir, s2) << scale); } for (int k = 0; k < estore_count; k++) { - if (estores[k].offset == soff) + if (j > estores[k].idx && estores[k].offset == soff) { LOG_IR_GEN("ENTRY_STORE_PROP: invalidated off=%lld (ptr store via LEA at i=%d)", (long long)soff, j); estores[k].offset = 0x7FFFFFFFLL; @@ -651,6 +833,10 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) { tcc_free(lea_map); tcc_free(var_lea_map); + tcc_free(rt_base); + tcc_free(rt_valid); + 
tcc_free(var_rt_base); + tcc_free(var_rt_valid); return 0; } @@ -747,6 +933,100 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) estore_count = v2; } + /* Phase 2.6: RUNTIME-indexed array stores anywhere in the function. + * `arr[i] = x` with i not constant lowers to a STORE through a pointer + * `arr_base + (i<compact_instructions[j]; + if (eq->op != TCCIR_OP_STORE && eq->op != TCCIR_OP_STORE_INDEXED && eq->op != TCCIR_OP_STORE_POSTINC) + continue; + IROperand sd = tcc_ir_op_get_dest(ir, eq); + int64_t base = 0x7FFFFFFFLL; + int32_t dv = irop_get_vreg(sd); + if (eq->op == TCCIR_OP_STORE_INDEXED) + { + /* A STORE_INDEXED writes at base + (index << scale). It aliases an + * array element at an unknown (runtime) offset when EITHER the index is + * runtime OR the base pointer is itself a runtime array pointer + * (`arr + (i<= 0 && TCCIR_DECODE_VREG_TYPE(dv) == TCCIR_VREG_TYPE_TEMP) + { + int dp = TCCIR_DECODE_VREG_POSITION(dv); + if (dp <= max_tmp && lea_map[dp].valid) + { + if (imm_index) + continue; /* constant base + constant index — Phase 2.5 handles it */ + base = lea_map[dp].offset; + } + else if (dp <= max_tmp && rt_valid[dp]) + base = rt_base[dp]; /* runtime base: address is runtime even if index is immediate */ + } + } + else /* plain STORE / STORE_POSTINC through a TEMP / VAR deref */ + { + if (sd.is_local) continue; /* direct stores handled by Phase 1 */ + if (dv < 0) continue; + int dp = TCCIR_DECODE_VREG_POSITION(dv); + if (TCCIR_DECODE_VREG_TYPE(dv) == TCCIR_VREG_TYPE_TEMP) + { + if (dp <= max_tmp && rt_valid[dp]) base = rt_base[dp]; + } + else if (TCCIR_DECODE_VREG_TYPE(dv) == TCCIR_VREG_TYPE_VAR) + { + /* `*p = ...` directly through a VAR runtime array pointer. 
*/ + if (dp <= max_var && var_rt_valid[dp]) base = var_rt_base[dp]; + } + } + if (base == 0x7FFFFFFFLL) + continue; + for (int k = 0; k < estore_count; k++) + if (estores[k].offset != 0x7FFFFFFFLL && estores[k].offset >= base) + { + LOG_IR_GEN("ENTRY_STORE_PROP: invalidated off=%lld (runtime store at i=%d, base=%lld)", + (long long)estores[k].offset, j, (long long)base); + estores[k].offset = 0x7FFFFFFFLL; + any_inval = 1; + } + } + if (any_inval) + { + int v2 = 0; + for (int k = 0; k < estore_count; k++) + if (estores[k].offset != 0x7FFFFFFFLL) + estores[v2++] = estores[k]; + estore_count = v2; + } + } + /* Phase 3: Forward entry-BB stores into deref operands. * For each instruction, check src1 and src2 for T***DEREF*** where T * is in the LEA map and the resolved offset matches an entry-BB store. */ @@ -786,6 +1066,11 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) { if (estores[k].offset != resolved_offset) continue; + /* The collected store must precede the load it is forwarded into. + * Last-write-wins keeps the newest entry-BB store, so a load earlier + * than that store would otherwise see the wrong value. */ + if (i <= estores[k].idx) + continue; /* Match! Replace deref with the stored value. * Reuse the original stored operand directly to preserve @@ -829,7 +1114,9 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) continue; int64_t base_off = lea_map[bp].offset; - int64_t eff_off = base_off + irop_get_imm64_ex(ir, li_src2); + IROperand scale_op = ir->iroperand_pool[q->operand_base + 3]; + int scale = (int)irop_get_imm64_ex(ir, scale_op); + int64_t eff_off = base_off + (irop_get_imm64_ex(ir, li_src2) << scale); /* If the LEA base's address was taken, the struct it points to could * have been modified by a function call. Skip forwarding. */ @@ -853,6 +1140,9 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) continue; if (estores[k].btype != irop_get_btype(li_src1)) continue; + /* The collected store must precede the load it is forwarded into. 
*/ + if (i <= estores[k].idx) + continue; q->op = TCCIR_OP_ASSIGN; { @@ -884,6 +1174,10 @@ int tcc_ir_opt_entry_store_prop(TCCIRState *ir) tcc_free(lea_map); tcc_free(var_lea_map); + tcc_free(rt_base); + tcc_free(rt_valid); + tcc_free(var_rt_base); + tcc_free(var_rt_valid); return changes; } @@ -1073,6 +1367,7 @@ static int sl_fwd_narrow_demand_only(TCCIRState *ir, int32_t target_vr, int star static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir); int tcc_ir_opt_sl_forward(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("sl_forward")) return 0; tcc_pass_timing_init(); if (!tcc_pass_timing_on) return tcc_ir_opt_sl_forward__timed(ir); unsigned long _t = tcc_pass_clk_us(); @@ -2564,7 +2859,9 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) { if (!lo_e->valid || lo_e->local_sym != addr_sym || lo_e->local_offset != lo_offset) continue; - if (lo_e->store_btype != IROP_BTYPE_INT64 || !irop_is_immediate(lo_e->stored_value)) + int lo_tag = irop_get_tag(lo_e->stored_value); + if (lo_e->store_btype != IROP_BTYPE_INT64 || !irop_is_immediate(lo_e->stored_value) || + (lo_tag != IROP_TAG_I64 && lo_tag != IROP_TAG_F64)) continue; int64_t full64 = irop_get_imm64_ex(ir, lo_e->stored_value); int32_t upper = (int32_t)(uint32_t)(full64 >> 32); @@ -2731,7 +3028,11 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) } if (stale) continue; - uint32_t full = (uint32_t)prev_e->stored_value.u.imm32; + /* Read via irop_get_imm64_ex: for I64/F64-tagged immediates + * u.imm32 holds a POOL INDEX, not the value (an unsigned 32-bit + * constant > INT32_MAX is I64-encoded — bitfield seed 12264 + * forwarded pool index 0 as the byte value). */ + uint32_t full = (uint32_t)irop_get_imm64_ex(ir, prev_e->stored_value); uint32_t bit_shift = (uint32_t)delta * 8; uint32_t byte_mask = (load_bytes == 1) ? 
0xFFu : 0xFFFFu; int32_t narrow = (int32_t)((full >> bit_shift) & byte_mask); @@ -2742,8 +3043,10 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) narrow = narrow >> shift; } LOG_SL_FWD("LOAD@i=%d FORWARD-SUBBYTE: store@i=%d delta=%d entry_bytes=%d " - "load_bytes=%d full=0x%x narrow=%d", - i, prev_e->instruction_idx, delta, entry_bytes, load_bytes, full, narrow); + "load_bytes=%d full=0x%x narrow=%d sv_tag=%d sv_islval=%d sv_islocal=%d", + i, prev_e->instruction_idx, delta, entry_bytes, load_bytes, full, narrow, + (int)irop_get_tag(prev_e->stored_value), (int)prev_e->stored_value.is_lval, + (int)prev_e->stored_value.is_local); if (q->op != TCCIR_OP_FUNCPARAMVAL) q->op = TCCIR_OP_ASSIGN; int pool_off = q->operand_base + irop_config[q->op].has_dest; @@ -2792,7 +3095,15 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) int bp = TCCIR_DECODE_VREG_POSITION(base_vr); if (bp <= max_tmp && lea_map[bp].valid) { - int64_t eff_off = lea_map[bp].offset + irop_get_imm64_ex(ir, li_src2); + /* LOAD_INDEXED address = base + (index << scale). + * The scale (operand_base+3) is 0 for disp_fusion-created ops + * (byte offset index) and >0 for indexed_memory_fusion-created + * ops (element index). Without applying the scale, the effective + * offset is wrong, causing store-load forwarding to miss matches + * or — worse — to match an unrelated slot. */ + IROperand scale_op = ir->iroperand_pool[q->operand_base + 3]; + int scale = (int)irop_get_imm64_ex(ir, scale_op); + int64_t eff_off = lea_map[bp].offset + (irop_get_imm64_ex(ir, li_src2) << scale); const Sym *eff_sym = lea_map[bp].sym; uint32_t lih = ((uintptr_t)eff_sym * 31 + (uint32_t)eff_off * 17) % 128; StoreEntry *lie; @@ -3213,6 +3524,44 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) store_btype != IROP_BTYPE_FLOAT32 && store_btype != IROP_BTYPE_FLOAT64) store_btype = IROP_BTYPE_INT32; + /* Overlap invalidation — mirror the plain-STORE path. 
A narrow + * indexed store overlaps any WIDER entry at a lower offset (a + * packed-bitfield byte write must kill the enclosing word's + * tracked constant, or a later word RMW load forwards the stale + * init and the rebuilt store wipes this byte — bitfield seed + * 11840); a wide store overlaps narrower entries above it. + * Conservative: invalidate (no cross-merge on this path). */ + { + int si_bytes = ir_opt_store_btype_size_bytes(store_btype); + if (si_bytes <= 0) + si_bytes = 4; + for (int delta = 1; delta <= 7; delta++) + { + int64_t lo_off = si_off - delta; + uint32_t loh = ((uintptr_t)si_sym * 31 + (uint32_t)lo_off * 17) % 128; + for (StoreEntry *sie = hash_table[loh]; sie != NULL; sie = sie->next) + { + if (!sie->valid || sie->local_sym != si_sym || sie->local_offset != lo_off) + continue; + int eb = ir_opt_store_btype_size_bytes(sie->store_btype); + if (eb <= 0) + eb = 4; + if (eb > delta) + sie->valid = 0; + } + } + for (int fwd = 1; fwd < si_bytes; fwd++) + { + int64_t hi_off = si_off + fwd; + uint32_t hih = ((uintptr_t)si_sym * 31 + (uint32_t)hi_off * 17) % 128; + for (StoreEntry *sie = hash_table[hih]; sie != NULL; sie = sie->next) + { + if (sie->valid && sie->local_sym == si_sym && sie->local_offset == hi_off) + sie->valid = 0; + } + } + } + /* Record the store. */ StoreEntry *sne = &entries[entry_count++]; sne->valid = 1; @@ -3258,6 +3607,7 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) IROperand stsrc1 = tcc_ir_op_get_src1(ir, q); if (stsrc1.is_local && stsrc1.is_lval) { + int32_t s_vr = irop_get_vreg(stsrc1); const Sym *s_sym; int64_t s_offset; if (irop_get_tag(stsrc1) == IROP_TAG_SYMREF) @@ -3272,7 +3622,13 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) s_offset = irop_get_imm64_ex(ir, stsrc1); } - int32_t s_vr = irop_get_vreg(stsrc1); + /* VAR-backed locals use stack offsets that can collide with anonymous + * StackLoc offsets in the hash table. 
Use a distinct sentinel sym per + * VAR (matching the LOAD side and STORE tracking side) so a VAR source + * never alias-matches an anonymous StackLoc store at the same offset. */ + if (s_vr >= 0 && TCCIR_DECODE_VREG_TYPE(s_vr) == TCCIR_VREG_TYPE_VAR) + s_sym = (const Sym *)(uintptr_t)(1 + (unsigned)TCCIR_DECODE_VREG_POSITION(s_vr)); + int src_addrtaken = 0; if (s_vr >= 0) { @@ -3539,18 +3895,24 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) } if (entry_bytes <= delta) continue; - /* Try to merge the narrow store's bytes into the wider entry. */ + /* Try to merge the narrow store's bytes into the wider entry. + * Values are read via irop_get_imm64_ex and the merged operand is + * rebuilt as a plain IMM32: I64/F64-tagged immediates keep a POOL + * INDEX in u.imm32, so touching that field raw would merge into / + * corrupt the index instead of the value (bitfield seed 12264). */ int ce_is_imm = irop_is_immediate(ce->stored_value); if (new_src_is_imm && ce_is_imm && new_bytes > 0 && entry_bytes <= 4 && delta + new_bytes <= entry_bytes) { - int32_t old_v = ce->stored_value.u.imm32; - int32_t new_v = new_src1.u.imm32; + int32_t old_v = (int32_t)irop_get_imm64_ex(ir, ce->stored_value); + int32_t new_v = (int32_t)irop_get_imm64_ex(ir, new_src1); uint32_t byte_mask = (new_bytes == 4) ? 0xFFFFFFFFu : ((1u << (new_bytes * 8)) - 1); uint32_t pos_mask = byte_mask << (delta * 8); uint32_t value_in_pos = ((uint32_t)new_v & byte_mask) << (delta * 8); int32_t merged = (int32_t)(((uint32_t)old_v & ~pos_mask) | value_in_pos); - ce->stored_value.u.imm32 = merged; + ce->stored_value = irop_make_imm32(-1, merged, irop_get_btype(ce->stored_value) == IROP_BTYPE_INT64 + ? IROP_BTYPE_INT32 + : irop_get_btype(ce->stored_value)); ce->instruction_idx = i; LOG_SL_FWD("STORE@i=%d CROSS-MERGE into store@i=? 
at off=%lld delta=%d: " "new_bytes=%d entry_bytes=%d old_v=%d new_v=%d merged=%d", @@ -3601,6 +3963,22 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) new_entry->instruction_idx = i; new_entry->store_dest_vr = addr_vr; new_entry->store_btype = dest.btype; + /* Resolve stored value through forwarded-temp tracking before deriving + * access width from the source vreg. The source TEMP's live interval can + * be stale/over-wide after earlier folds, while the forwarded value has + * the actual width being stored. */ + { + IROperand sv = new_entry->stored_value; + int32_t sv_vr = irop_get_vreg(sv); + if (sv_vr >= 0 && TCCIR_DECODE_VREG_TYPE(sv_vr) == TCCIR_VREG_TYPE_TEMP && !sv.is_lval) + { + int sv_pos = TCCIR_DECODE_VREG_POSITION(sv_vr); + if (sv_pos <= max_tmp && fwd_tmp_valid[sv_pos]) + { + new_entry->stored_value = fwd_tmp_val[sv_pos]; + } + } + } /* A 64-bit value can only be stored to a >=8-byte location (narrowing it * to a smaller slot requires an explicit cast, which makes the stored * value narrow first), so a STORE of a 64-bit value really writes 8 bytes. @@ -3613,12 +3991,29 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) * rejects a 64-bit-store -> 32-bit-read forward. Restricted to 64-bit so * genuine narrowing byte/short stores are left untouched. Mirrors the * STORE_INDEXED paths, which derive the access width from src1. */ - if (dest.btype != IROP_BTYPE_INT64 && dest.btype != IROP_BTYPE_FLOAT64) + /* For stores resolved through the LEA map, the destination is a real + * dereferenced lvalue (`*p`) and its btype is the memory access width. + * Source operands on these pointer stores can carry stale/wider metadata + * after forwarding, so do not let them turn a 32-bit deref store into an + * 8-byte tracked store. 
*/ + if (!addr_via_pointer && dest.btype != IROP_BTYPE_INT64 && dest.btype != IROP_BTYPE_FLOAT64) { int sv_is_64 = 0, sv_is_double = 0; int sv_tag = irop_get_tag(new_entry->stored_value); if (sv_tag == IROP_TAG_I64) - sv_is_64 = 1; + { + /* An I64-tagged immediate is also how an unsigned 32-bit constant + * (> INT32_MAX) gets encoded; such a store to a 32-bit field still + * writes only 4 bytes. Only treat it as a genuine 8-byte store when + * the value truly needs 64 bits — i.e. the upper word is neither a + * sign- nor a zero-extension of the low word. Without this, an + * `unsigned f = bigconst;` field store looked like a 64-bit store and + * a later cross-offset upper-half forward (FORWARD-HI) read its bogus + * zero upper half into the next field (fuzz seed 3210). */ + int64_t v64 = irop_get_imm64_ex(ir, new_entry->stored_value); + if (v64 != (int64_t)(int32_t)v64 && v64 != (int64_t)(uint32_t)v64) + sv_is_64 = 1; + } else if (sv_tag == IROP_TAG_F64) sv_is_64 = sv_is_double = 1; else @@ -3643,24 +4038,6 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) LOG_SL_FWD("STORE@i=%d TRACK: sym=%p off=%lld btype=%d addrtaken=%d via_ptr=%d", i, (const void *)addr_sym, (long long)addr_offset, (int)dest.btype, addr_addrtaken, addr_via_pointer); - /* Resolve stored value through forwarded-temp tracking: - * If src1 is a TEMP that was assigned a value by earlier forwarding - * (e.g. T2 <-- #7), use that value directly. This enables transitive - * forwarding: STORE loc1 <-- #7; LOAD T2 <-- loc1 (forwarded to #7); - * STORE loc2 <-- T2 → stored_value becomes #7 instead of T2. 
*/ - { - IROperand sv = new_entry->stored_value; - int32_t sv_vr = irop_get_vreg(sv); - if (sv_vr >= 0 && TCCIR_DECODE_VREG_TYPE(sv_vr) == TCCIR_VREG_TYPE_TEMP && !sv.is_lval) - { - int sv_pos = TCCIR_DECODE_VREG_POSITION(sv_vr); - if (sv_pos <= max_tmp && fwd_tmp_valid[sv_pos]) - { - new_entry->stored_value = fwd_tmp_val[sv_pos]; - } - } - } - /* LEA-through / local-lval forwarding: if the stored value reads from * a memory location with a tracked constant, forward the constant. * Path 1: T***DEREF*** where T is in the LEA map → resolve to StackLoc. @@ -3687,6 +4064,7 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) } if (!sv_resolved && sv.is_lval && sv.is_local && !sv.is_llocal) { + int32_t sv_vr = irop_get_vreg(sv); int sv_tag = irop_get_tag(sv); if (sv_tag == IROP_TAG_STACKOFF) { @@ -3700,6 +4078,8 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) resolved_off = sr ? sr->addend : 0; sv_resolved = 1; } + if (sv_resolved && sv_vr >= 0 && TCCIR_DECODE_VREG_TYPE(sv_vr) == TCCIR_VREG_TYPE_VAR) + resolved_sym = (const Sym *)(uintptr_t)(1 + (unsigned)TCCIR_DECODE_VREG_POSITION(sv_vr)); } if (sv_resolved) { @@ -3909,7 +4289,23 @@ static int tcc_ir_opt_sl_forward__timed(TCCIRState *ir) * For each forwarded store, scan all remaining (non-NOP) instructions to * check if any src operand still references the same local offset. * Only anonymous stores (vreg < 0) are candidates — already filtered above. */ - for (int fi = 0; fi < fwd_store_count; fi++) + int skip_fwd_store_dse = 0; + for (int sj = 0; sj < n; sj++) + { + IRQuadCompact *sq = &ir->compact_instructions[sj]; + if (sq->op != TCCIR_OP_STORE_INDEXED && sq->op != TCCIR_OP_LOAD_INDEXED) + continue; + IROperand idx = tcc_ir_op_get_src2(ir, sq); + /* The read scan below tracks fixed local offsets. Runtime indexed stack + * array accesses can still depend on forwarded stores even when no exact + * offset operand remains, so keep the stores in those functions. 
*/ + if (!irop_is_immediate(idx) || idx.is_sym) + { + skip_fwd_store_dse = 1; + break; + } + } + for (int fi = 0; !skip_fwd_store_dse && fi < fwd_store_count; fi++) { int store_idx = fwd_stores[fi].store_idx; int64_t off = fwd_stores[fi].offset; @@ -4106,8 +4502,13 @@ static void rse_build_def_map(TCCIRState *ir) max_pos = p; } } - rse_def_map_size = max_pos + 1; + /* Release any map left over from an earlier build before overwriting the + * pointer. tcc_ir_opt_const_memcpy_to_dest rebuilds the map after every + * successful rewrite, so without this the previous allocation would leak + * (tcc_free(NULL) is a no-op on the first/clean call). */ + tcc_free(rse_def_map); rse_def_map = NULL; + rse_def_map_size = max_pos + 1; if (rse_def_map_size <= 0) return; rse_def_map = (int *)tcc_malloc(sizeof(int) * rse_def_map_size); @@ -4213,6 +4614,72 @@ static int rse_resolve_temp_addr(TCCIRState *ir, int32_t vr, return rse_resolve_temp_addr_impl(ir, vr, out_sym, out_off, 4); } +/* Resolve a TEMP address operand that points into an array at a RUNTIME index, + * i.e. its def chain ends in `base + `. Returns 1 and sets + * out_sym/out_off to the array BASE (the exact element is unknown) when the + * address is `base + runtime_offset`; 0 otherwise. + * + * The exact-offset resolver rse_resolve_temp_addr() bails the moment it meets a + * non-immediate ADD addend, so a plain DEREF read of `arr + i` (a TEMP holding + * `Addr[StackLoc] ADD `) looks like "no read" to redundant-store + * elimination. A later store to a constant element of the same array would + * then wrongly kill the array's initializer store, even though the runtime + * DEREF may have read that element first (fuzz seed 6447). Detecting the base + * lets the caller flush the whole array range, exactly like the runtime + * LOAD_INDEXED handling below. 
*/ +static int rse_resolve_runtime_base(TCCIRState *ir, int32_t vr, + const Sym **out_sym, int64_t *out_off, int depth) +{ + if (depth <= 0) + return 0; + if (vr < 0 || TCCIR_DECODE_VREG_TYPE(vr) != TCCIR_VREG_TYPE_TEMP) + return 0; + int pos = TCCIR_DECODE_VREG_POSITION(vr); + if (!rse_def_map || pos >= rse_def_map_size) + return 0; + int def_idx = rse_def_map[pos]; + if (def_idx < 0) /* -1 (none) or RSE_DEF_MULTI */ + return 0; + IRQuadCompact *dq = &ir->compact_instructions[def_idx]; + + /* ASSIGN/LEA copy of another address TEMP — follow the chain. */ + if (dq->op == TCCIR_OP_ASSIGN || dq->op == TCCIR_OP_LEA) + { + IROperand s1 = tcc_ir_op_get_src1(ir, dq); + if (!s1.is_lval && irop_get_tag(s1) == IROP_TAG_VREG) + return rse_resolve_runtime_base(ir, irop_get_vreg(s1), out_sym, out_off, depth - 1); + return 0; + } + + if (dq->op != TCCIR_OP_ADD) + return 0; + IROperand s2 = tcc_ir_op_get_src2(ir, dq); + if (irop_is_immediate(s2)) + return 0; /* constant addend — the exact resolver already handles this */ + IROperand s1 = tcc_ir_op_get_src1(ir, dq); + + /* src1 is the array base address: &global+addend, Addr[StackLoc], or a TEMP + * that resolves to one via the exact resolver. */ + if (s1.is_sym && !s1.is_lval) + { + IRPoolSymref *sr = irop_get_symref_ex(ir, s1); + if (!sr || !sr->sym) + return 0; + *out_sym = sr->sym; + *out_off = (int64_t)sr->addend; + return 1; + } + if (s1.is_local && !s1.is_lval && !s1.is_llocal && irop_get_tag(s1) == IROP_TAG_STACKOFF) + { + *out_sym = NULL; + *out_off = irop_get_stack_offset(s1); + return 1; + } + if (!s1.is_lval && irop_get_tag(s1) == IROP_TAG_VREG) + return rse_resolve_temp_addr(ir, irop_get_vreg(s1), out_sym, out_off); + return 0; +} + /* A resolved (sym, off) access of `width` bytes is "key-safe" only when it * stays within `sym`'s own storage. 
Global base-sharing emits stores like * `T = &g0; STORE_INDEXED T, #off` where off reaches a *different* global g1 @@ -4471,6 +4938,173 @@ int tcc_ir_opt_store_redundant(TCCIRState *ir) RSE_EVICT_FOR_SRC(tcc_ir_op_get_src1(ir, q)); if (irop_config[q->op].has_src2) RSE_EVICT_FOR_SRC(tcc_ir_op_get_src2(ir, q)); + /* MLA's accumulator (4th operand) is a read not surfaced by src1/src2 + * (bitfield seed 17717: `T <-- Ta MLA Tb + T3***DEREF***` read a packed + * field's init store, which then looked overwritten-without-read). */ + if (q->op == TCCIR_OP_MLA) + RSE_EVICT_FOR_SRC(tcc_ir_op_get_accum(ir, q)); + + /* A plain DEREF read through a TEMP holding `array_base + RUNTIME_index` + * (e.g. `T = &arr[0] + (i<<2); x = *T`) reads an unknown element of that + * array. The exact-offset eviction above misses it (rse_resolve_temp_addr + * bails on the non-constant addend), so flush every tracked store in the + * array's range — mirroring the runtime LOAD_INDEXED handler below (fuzz + * seed 6447). 
*/ +#define RSE_FLUSH_RUNTIME_DEREF(SRC_OP) \ + do \ + { \ + IROperand _src = (SRC_OP); \ + if (_src.is_lval && irop_get_tag(_src) == IROP_TAG_VREG) \ + { \ + const Sym *_bsym; \ + int64_t _boff; \ + if (rse_resolve_runtime_base(ir, irop_get_vreg(_src), &_bsym, &_boff, 4)) \ + { \ + for (int _k = 0; _k < active_count;) \ + { \ + if (active[_k].sym == _bsym && active[_k].offset >= _boff && (active[_k].offset - _boff) < 1024) \ + active[_k] = active[--active_count]; \ + else \ + _k++; \ + } \ + } \ + } \ + } while (0) + if (irop_config[q->op].has_src1) + RSE_FLUSH_RUNTIME_DEREF(tcc_ir_op_get_src1(ir, q)); + if (irop_config[q->op].has_src2) + RSE_FLUSH_RUNTIME_DEREF(tcc_ir_op_get_src2(ir, q)); + if (q->op == TCCIR_OP_MLA) + RSE_FLUSH_RUNTIME_DEREF(tcc_ir_op_get_accum(ir, q)); +#undef RSE_FLUSH_RUNTIME_DEREF + + /* A DEREF read whose pointer resolves to NEITHER an exact (sym,off) + * [RSE_EVICT_FOR_SRC] NOR an array base+runtime-index [RSE_FLUSH_RUNTIME_DEREF] + * may read ANY tracked slot. The canonical miss is a read through a + * VAR-materialized pointer — `V = &arr[k]; T = V; x = *T` — where the + * single-def chain runs through a VAR, so rse_resolve_temp_addr bails on the + * non-TEMP link and rse_resolve_runtime_base finds no runtime addend. Such a + * read was silently treated as "no read", letting a later store to the same + * slot wrongly eliminate the value this read still needs (fuzz ptr seed 323). + * Be conservative: flush all pending stores (same as the CALL flush). 
*/ +#define RSE_FLUSH_UNRESOLVED_DEREF(SRC_OP) \ + do \ + { \ + IROperand _src = (SRC_OP); \ + if (_src.is_lval && irop_get_tag(_src) == IROP_TAG_VREG) \ + { \ + const Sym *_us; \ + int64_t _uo; \ + int32_t _uv = irop_get_vreg(_src); \ + if (!rse_resolve_temp_addr(ir, _uv, &_us, &_uo) && \ + !rse_resolve_runtime_base(ir, _uv, &_us, &_uo, 4)) \ + active_count = 0; \ + } \ + } while (0) + if (irop_config[q->op].has_src1) + RSE_FLUSH_UNRESOLVED_DEREF(tcc_ir_op_get_src1(ir, q)); + if (irop_config[q->op].has_src2) + RSE_FLUSH_UNRESOLVED_DEREF(tcc_ir_op_get_src2(ir, q)); + if (q->op == TCCIR_OP_MLA) + RSE_FLUSH_UNRESOLVED_DEREF(tcc_ir_op_get_accum(ir, q)); +#undef RSE_FLUSH_UNRESOLVED_DEREF + + /* LOAD_INDEXED with a runtime index reads an unknown element of its + * base array. Any active store whose offset falls within the array's + * range might be read before being overwritten, so flush those entries + * to prevent incorrect redundant-store elimination. (Constant-index + * LOAD_INDEXED is handled by the exact-match RSE_EVICT_FOR_SRC above.) */ + if (q->op == TCCIR_OP_LOAD_INDEXED) + { + IROperand li_src2 = tcc_ir_op_get_src2(ir, q); + if (!irop_is_immediate(li_src2)) + { + IROperand li_s1 = tcc_ir_op_get_src1(ir, q); + int64_t li_base = 0; + int got_base = 0; + if (li_s1.is_local && irop_get_tag(li_s1) == IROP_TAG_STACKOFF) + { + li_base = irop_get_stack_offset(li_s1); + got_base = 1; + } + else + { + const Sym *_sym; + if (rse_resolve_temp_addr(ir, irop_get_vreg(li_s1), &_sym, &li_base)) + got_base = 1; + } + if (got_base) + { + for (int k = 0; k < active_count;) + { + if (active[k].sym == NULL && active[k].offset >= li_base && + (active[k].offset - li_base) < 1024) + active[k] = active[--active_count]; + else + k++; + } + } + } + else + { + /* Constant-index LOAD_INDEXED reads base + (index << scale). The + * generic RSE_EVICT_FOR_SRC above only evicts the base offset (element + * 0), so a read of a non-zero element (e.g. 
arr[2]) would otherwise + * fail to keep its producing store alive — letting a later store to the + * same slot wrongly kill it (fuzz seed 2874). Evict the exact slot. */ + IROperand li_s1 = tcc_ir_op_get_src1(ir, q); + IROperand li_sc = tcc_ir_op_get_scale(ir, q); + int64_t li_base = 0; + const Sym *li_sym = NULL; + int got_base = 0; + if (li_s1.is_local && irop_get_tag(li_s1) == IROP_TAG_STACKOFF) + { + li_base = irop_get_stack_offset(li_s1); + got_base = 1; + } + else if (rse_resolve_temp_addr(ir, irop_get_vreg(li_s1), &li_sym, &li_base)) + { + got_base = 1; + } + if (got_base) + { + int64_t sc = irop_is_immediate(li_sc) ? irop_get_imm64_ex(ir, li_sc) : 0; + int64_t eoff = li_base + (irop_get_imm64_ex(ir, li_src2) << sc); + for (int k = 0; k < active_count;) + { + if (active[k].sym == li_sym && active[k].offset == eoff) + active[k] = active[--active_count]; + else + k++; + } + } + else if (irop_get_tag(li_s1) == IROP_TAG_VREG) + { + /* A constant index does NOT imply a constant address: the base can be + * a runtime array pointer `arr + (row << k)` for which the exact + * resolver above bailed on the non-constant addend. Such a load reads + * an unknown element of that array, so flush the whole array range — + * mirroring the runtime-index branch (agg_deep seed 36641: a + * `m[row][C]` store was wrongly killed by a later `m[C2][C]` store to + * the same slot because the intervening `m[row2][C]` read carried a + * runtime base with a constant column index). 
*/ + const Sym *rb_sym; + int64_t rb_off; + if (rse_resolve_runtime_base(ir, irop_get_vreg(li_s1), &rb_sym, &rb_off, 4)) + { + for (int k = 0; k < active_count;) + { + if (active[k].sym == rb_sym && active[k].offset >= rb_off && + (active[k].offset - rb_off) < 1024) + active[k] = active[--active_count]; + else + k++; + } + } + } + } + } + #undef RSE_EVICT_FOR_SRC /* STORE / STORE_INDEXED to a local non-addr-taken address, or to a @@ -5172,8 +5806,18 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) * (b) an operand carrying a vreg whose vreg_slot[] is set. * * For each such address-of-local use, classify the containing op. - * Mark the relevant slot(s) non-tame if the use isn't recognized. */ - for (int k = 0; k < 3; k++) + * Mark the relevant slot(s) non-tame if the use isn't recognized. + * + * k==3 is the MLA accumulator (pool[base+3]). It must be scanned here + * for the same reason the live-collection loop below scans it: when a + * slot-pointer vreg is dereferenced *only* as an MLA addend + * (`T <- Addr[StackLoc[X]]; MLA ... + T***DEREF***`), missing it here + * leaves the slot looking tame with no recorded read, so its defining + * store is wrongly eliminated. Critically, the live-collection loop + * only records that deref precisely when `dls_precise_ok`; when the + * function has an indexed/postinc op (or a back-edge) that path is + * gated off, and this poison is the *only* thing that keeps the store. 
*/ + for (int k = 0; k < 4; k++) { IROperand op; int has; @@ -5189,12 +5833,18 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) if (has) op = tcc_ir_op_get_src1(ir, q); } - else + else if (k == 2) { has = irop_config[q->op].has_src2; if (has) op = tcc_ir_op_get_src2(ir, q); } + else + { + has = (q->op == TCCIR_OP_MLA); + if (has) + op = tcc_ir_op_get_accum(ir, q); + } if (!has) continue; @@ -5692,10 +6342,14 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) if (dest.is_complex) width *= 2; /* Position-aware liveness: only reads at positions AFTER this STORE - * make it live. Earlier reads were satisfied by an earlier definition. */ + * make it live. Earlier reads were satisfied by an earlier definition. + * Only sound in forward-only control flow: with a back edge, a read at + * an earlier position can execute AFTER this store (loop-carried value, + * float fuzz seed 6632: in-loop `st.f0 = ...` read at the loop top), so + * any overlapping read keeps the store. */ int alive = 0; for (int k = 0; k < live_count; k++) - if (live[k].pos > i && + if ((dls_has_backedge || live[k].pos > i) && off < live[k].off + live[k].width && off + width > live[k].off) { alive = 1; @@ -5763,8 +6417,14 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) * - no live[] read AFTER this op intersects S. * The store's exact offset within S is unknown (a moving pointer walking * an array), so we use whole-slot liveness: any later read landing in - * [S, tame_slot_end[idx(S)]) keeps the store. */ - if (!has_unknown_deref) + * [S, tame_slot_end[idx(S)]) keeps the store. + * + * Indexed/postinc loads are NOT recorded in live[] (their runtime offset is + * a bare vreg this pass does not classify). When any such load exists we + * must skip this elimination entirely, otherwise a STORE_INDEXED to a slot + * can be removed even though a later LOAD_INDEXED reads the same bytes + * (packed struct/array stride: bug_bitfield_packed10). 
*/ + if (!has_unknown_deref && !dls_has_indexed) for (int i = 0; i < n; i++) { IRQuadCompact *q = &ir->compact_instructions[i]; @@ -5787,10 +6447,12 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) /* No non-tame deref through a different slot may reach into S's bytes. */ if (DLS_NONTAME_RANGE_OVERLAPS(slot, slot_end - slot)) continue; - /* Whole-slot liveness: any later read in [slot, slot_end)? */ + /* Whole-slot liveness: any later read in [slot, slot_end)? (Position + * filter is void under back edges — see the direct-StackLoc loop.) */ int alive = 0; for (int k = 0; k < live_count; k++) - if (live[k].pos > i && live[k].off < slot_end && live[k].off + live[k].width > slot) + if ((dls_has_backedge || live[k].pos > i) && + live[k].off < slot_end && live[k].off + live[k].width > slot) { alive = 1; break; @@ -5828,7 +6490,8 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) continue; int alive = 0; for (int k = 0; k < live_count; k++) - if (live[k].pos > i && base < live[k].off + live[k].width && base + width > live[k].off) + if ((dls_has_backedge || live[k].pos > i) && + base < live[k].off + live[k].width && base + width > live[k].off) { alive = 1; break; @@ -5869,10 +6532,11 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) if (sz <= 0) continue; /* Position-aware: a read AT or BEFORE this call was satisfied by an - * earlier write; only later reads keep the call alive. */ + * earlier write; only later reads keep the call alive. (Position + * filter is void under back edges — see the direct-StackLoc loop.) */ int alive = 0; for (int k = 0; k < live_count; k++) - if (live[k].pos > i && + if ((dls_has_backedge || live[k].pos > i) && base < live[k].off + live[k].width && base + sz > live[k].off) { alive = 1; @@ -5938,10 +6602,11 @@ int tcc_ir_opt_dead_local_slot_elim(TCCIRState *ir) * read AFTER this call? (Earlier reads were satisfied upstream.) * Wide live[] entries (e.g. 
memcpy-source bounded reads of N bytes) * may start below `slot` and extend across it — check that the - * range's end exceeds `slot`, not just its base. */ + * range's end exceeds `slot`, not just its base. (Position filter + * is void under back edges — see the direct-StackLoc loop.) */ int alive = 0; for (int k = 0; k < live_count; k++) - if (live[k].pos > i && live[k].off + live[k].width > slot) + if ((dls_has_backedge || live[k].pos > i) && live[k].off + live[k].width > slot) { alive = 1; break; @@ -6486,6 +7151,7 @@ int tcc_ir_opt_addrof_var_fwd(TCCIRState *ir) static int tcc_ir_opt_global_sl_fwd__timed(TCCIRState *ir); int tcc_ir_opt_global_sl_fwd(TCCIRState *ir) { + if (tcc_ir_opt_pass_disabled("global_sl_fwd")) return 0; tcc_pass_timing_init(); if (!tcc_pass_timing_on) return tcc_ir_opt_global_sl_fwd__timed(ir); unsigned long _t = tcc_pass_clk_us(); @@ -10374,8 +11040,32 @@ int tcc_ir_opt_ptr_load_cse(TCCIRState *ir) } } } - q->op = TCCIR_OP_NOP; - changes++; + /* The redirection above stops at the basic-block boundary (it breaks + * at a jump target / branch) and never rewrites an MLA accumulator, + * so a use of dest_vr in a LATER block — e.g. a deref-STORE address + * `T***DEREF*** <- v` reached through a branch — is NOT redirected. + * NOPing the copy there leaves that store reading an undefined + * address (fuzz ptr seed 291). Only drop the copy when no use of + * dest_vr survives; DCE removes it later if it becomes truly dead. 
*/ + { + int dv_live = 0; + for (int m = i + 1; m < n && !dv_live; m++) + { + IRQuadCompact *mq = &ir->compact_instructions[m]; + if (mq->op == TCCIR_OP_NOP) + continue; + if ((irop_config[mq->op].has_src1 && irop_get_vreg(tcc_ir_op_get_src1(ir, mq)) == dest_vr) || + (irop_config[mq->op].has_src2 && irop_get_vreg(tcc_ir_op_get_src2(ir, mq)) == dest_vr) || + (irop_config[mq->op].has_dest && irop_get_vreg(tcc_ir_op_get_dest(ir, mq)) == dest_vr) || + (mq->op == TCCIR_OP_MLA && irop_get_vreg(tcc_ir_op_get_accum(ir, mq)) == dest_vr)) + dv_live = 1; + } + if (!dv_live) + { + q->op = TCCIR_OP_NOP; + changes++; + } + } goto plcse_next; } } @@ -10395,6 +11085,14 @@ int tcc_ir_opt_ptr_load_cse(TCCIRState *ir) { IROperand dest = tcc_ir_op_get_dest(ir, q); int32_t dest_vr = irop_get_vreg(dest); + if (dest_vr >= 0 && !dest.is_lval && + TCCIR_DECODE_VREG_TYPE(dest_vr) == TCCIR_VREG_TYPE_VAR) { + IRLiveInterval *li = tcc_ir_vreg_live_interval(ir, dest_vr); + if (li && li->addrtaken) { + cache_count = 0; + continue; + } + } if (dest_vr >= 0 && !dest.is_lval) { int w = 0; diff --git a/ir/opt_neg_chain.c b/ir/opt_neg_chain.c index b003be5b..191573ad 100644 --- a/ir/opt_neg_chain.c +++ b/ir/opt_neg_chain.c @@ -154,7 +154,11 @@ int tcc_ir_opt_neg_chain_cse(TCCIRState *ir) { IROperand src1 = tcc_ir_op_get_src1(ir, q); int32_t src_vr = irop_get_vreg(src1); - if (!src1.is_lval && src_vr >= 0 && TCCIR_DECODE_VREG_TYPE(src_vr) == TCCIR_VREG_TYPE_TEMP) + /* Width must match for the copy to be value-preserving — an ASSIGN that + * narrows/widens (e.g. T_b:I8 <- T_a:I32) does not carry T_a's full value, + * so it must anchor to itself rather than join T_a's canonical chain. 
*/ + if (!src1.is_lval && src_vr >= 0 && TCCIR_DECODE_VREG_TYPE(src_vr) == TCCIR_VREG_TYPE_TEMP && + irop_get_btype(dest) == irop_get_btype(src1)) { int src_pos = TCCIR_DECODE_VREG_POSITION(src_vr); if (src_pos <= max_tmp && canon[src_pos].valid) @@ -173,8 +177,19 @@ int tcc_ir_opt_neg_chain_cse(TCCIRState *ir) { IROperand src1 = tcc_ir_op_get_src1(ir, q); IROperand src2 = tcc_ir_op_get_src2(ir, q); - /* Match the negation idiom: T_b = #0 SUB T_a. */ - if (irop_is_immediate(src1) && irop_get_imm64_ex(ir, src1) == 0) + int dest_btype = irop_get_btype(dest); + int src_btype = irop_get_btype(src2); + /* Match the negation idiom: T_b = #0 SUB T_a. + * + * Width must match — a width-changing negation (e.g. T_b:I8 = -T_a:I32) + * truncates, so it is NOT value-preserving and must NOT join T_a's + * canonical chain. Were it recorded as "T_b = -base" against the wider + * base, a later same-width negation could be folded straight back to the + * wide base, dropping the truncation and miscompiling. When the widths + * differ the dest anchors to itself (base = dest, sign = +) via the + * defaults above, keeping first_pos/first_neg homogeneous per base. */ + if (irop_is_immediate(src1) && irop_get_imm64_ex(ir, src1) == 0 && + dest_btype == src_btype) { int32_t src_vr = irop_get_vreg(src2); if (!src2.is_lval && src_vr >= 0 && TCCIR_DECODE_VREG_TYPE(src_vr) == TCCIR_VREG_TYPE_TEMP) @@ -191,27 +206,20 @@ int tcc_ir_opt_neg_chain_cse(TCCIRState *ir) sign = 1; } - /* Width must match — otherwise an ASSIGN of a different-width TEMP - * could drop or extend bits the SUB wouldn't have. */ - int dest_btype = irop_get_btype(dest); - int src_btype = irop_get_btype(src2); - if (dest_btype == src_btype) + int base_pos = TCCIR_DECODE_VREG_POSITION(base_vr); + int32_t existing = (sign == 1) ? first_neg[base_pos] : first_pos[base_pos]; + if (existing >= 0 && existing != dest_vr) { - int base_pos = TCCIR_DECODE_VREG_POSITION(base_vr); - int32_t existing = (sign == 1) ? 
first_neg[base_pos] : first_pos[base_pos]; - if (existing >= 0 && existing != dest_vr) - { - IROperand new_src = irop_make_vreg(existing, dest_btype); - q->op = TCCIR_OP_ASSIGN; - tcc_ir_set_src1(ir, i, new_src); - tcc_ir_set_src2(ir, i, IROP_NONE); - LOG_NEG_CHAIN("@%d: T%d = -T%d folded to T%d = T%d (base=T%d sign=%d)", - i, dest_pos, TCCIR_DECODE_VREG_POSITION(src_vr), - dest_pos, TCCIR_DECODE_VREG_POSITION(existing), - base_pos, sign); - changes++; - did_replace = 1; - } + IROperand new_src = irop_make_vreg(existing, dest_btype); + q->op = TCCIR_OP_ASSIGN; + tcc_ir_set_src1(ir, i, new_src); + tcc_ir_set_src2(ir, i, IROP_NONE); + LOG_NEG_CHAIN("@%d: T%d = -T%d folded to T%d = T%d (base=T%d sign=%d)", + i, dest_pos, TCCIR_DECODE_VREG_POSITION(src_vr), + dest_pos, TCCIR_DECODE_VREG_POSITION(existing), + base_pos, sign); + changes++; + did_replace = 1; } } } diff --git a/ir/opt_pack64.c b/ir/opt_pack64.c index 80b6be20..67741ff5 100644 --- a/ir/opt_pack64.c +++ b/ir/opt_pack64.c @@ -146,6 +146,17 @@ int tcc_ir_opt_pack64_from_stack_stores(TCCIRState *ir) * a vreg / sym). Bail if the operand has any kind of indirection. */ if (src.is_llocal || src.is_sym) continue; + /* CRITICAL: a STACKOFF operand is only a *direct* stack-slot read when it + * has no associated vreg (vreg_type == 0, i.e. irop_get_vreg == -1). A VAR + * or PARAM referenced through its potential spill encoding also has + * tag==STACKOFF/is_local/is_lval, but its offset is mere "where it would + * spill" metadata — the value is actually read from the vreg, not that slot + * (see the IROP_TAG_STACKOFF note in tccir_operand.h). Matching STOREs by + * that phantom offset can grab an unrelated variable's stores when the slot + * was reused (longlong fuzz seed 7: a u64 local whose spill home aliased an + * array's live slot got folded to PACK64 of the array's elements). 
*/ + if (irop_get_vreg(src) != -1) + continue; int64_t addr_lo = irop_get_imm64_ex(ir, src); int64_t addr_hi = addr_lo + 4; @@ -179,7 +190,7 @@ int tcc_ir_opt_pack64_from_stack_stores(TCCIRState *ir) { IROperand jdst = tcc_ir_op_get_dest(ir, jq); if (jq->op == TCCIR_OP_STORE && jdst.tag == IROP_TAG_STACKOFF && jdst.is_local && jdst.is_lval && - !jdst.is_llocal && !jdst.is_sym && irop_get_btype(jdst) == IROP_BTYPE_INT32) + !jdst.is_llocal && !jdst.is_sym && irop_get_vreg(jdst) == -1 && irop_get_btype(jdst) == IROP_BTYPE_INT32) { int64_t joff = irop_get_imm64_ex(ir, jdst); IROperand jsrc = tcc_ir_op_get_src1(ir, jq); @@ -1019,7 +1030,12 @@ int tcc_ir_opt_shl32_or_chain(TCCIRState *ir) int64_t imm = irop_get_imm64_ex(ir, q_src2); if (q->op == TCCIR_OP_SHL && imm == 32) is_shl32 = 1; - else if (q->op == TCCIR_OP_AND && (uint64_t)imm == 0xFFFFFFFFULL) + else if (q->op == TCCIR_OP_AND && (uint32_t)imm == 0xFFFFFFFFu) + /* Compare the low 32 bits only: irop_get_imm64_ex sign-extends a + 32-bit immediate, so the natural 0xFFFFFFFF low-word mask arrives + here as int64_t -1 (0xFFFF...FFFF), which would never equal a + 0x00000000FFFFFFFF test. A full 64-bit IROP_TAG_I64 constant of + 0xFFFFFFFF (not sign-extended) also matches, as intended. 
*/ is_and_low = 1; else continue; diff --git a/ir/opt_pipeline.c b/ir/opt_pipeline.c index 8ba084e6..fff80abc 100644 --- a/ir/opt_pipeline.c +++ b/ir/opt_pipeline.c @@ -10,14 +10,16 @@ #define USING_GLOBALS +#include + #include "ir.h" #include "opt_pipeline.h" #include "opt.h" +#include "opt_utils.h" #include "opt_gens_fusion.h" #include "opt_gens_bool.h" #include "opt_gens_call_result.h" #include "opt_gens_branch.h" -#include "opt_utils.h" #include "opt_xform.h" #define FLAG(f) (uint16_t)offsetof(TCCState, f) @@ -107,6 +109,12 @@ void dbg_scan_imm_dest(TCCIRState *ir, const char *pass) } } +/* Every pass this loop runs is made observable via tcc_ir_dump_after_pass(), + * the same -dump-ir-passes= hook ir/regalloc.c's RUN_SSA wires up for + * the SSA driver — otherwise group-registered passes (including compound + * cascade wrappers like "esp_cleanup"/"kb_cascade" that have no other call + * site) are invisible to the golden-IR snapshot harness. No-op outside + * CONFIG_TCC_DEBUG builds. */ int tcc_ir_opt_run_group(IROptCtx *ctx, const IRPassGroup *group) { int total_changes = 0; @@ -125,6 +133,8 @@ int tcc_ir_opt_run_group(IROptCtx *ctx, const IRPassGroup *group) const IROptPass *trigger = &group->passes[group->trigger_idx]; if (trigger->flag_offset && !*((unsigned char *)tcc_state + trigger->flag_offset)) break; + if (tcc_ir_opt_pass_disabled(trigger->name)) + break; if (tcc_pass_timing_on > 0) { unsigned long _rt = tcc_pass_clk_us(); pipeline_ensure_requirements(ctx, trigger->requires); @@ -137,7 +147,15 @@ int tcc_ir_opt_run_group(IROptCtx *ctx, const IRPassGroup *group) tcc_pass_timing_add(trigger->name ? 
trigger->name : "P:trigger", tcc_pass_clk_us() - _tt); dbg_scan_imm_dest(ctx->ir, trigger->name); dbg_scan_overlap(ctx->ir, trigger->name); + tcc_ir_dump_after_pass(ctx->ir, trigger->name); pipeline_trace_pass(group, trigger, iter, tch); + /* Exiting on an idle trigger can stall a cascade: non-trigger passes + * from the previous round may have created new work the trigger would + * only find next round. Groups sidestep this with internal fixpoint + * wrappers (kb_cascade, branch_cleanup). The general alternative — + * re-iterate while round_changes > 0 and use the trigger only as a + * first-round gate — changes semantics for every triggered group and + * needs a full fuzz-sweep validation before switching. */ if (tch <= 0) break; round_changes += tch; @@ -152,6 +170,8 @@ int tcc_ir_opt_run_group(IROptCtx *ctx, const IRPassGroup *group) continue; if (pass->flag_offset && !*((unsigned char *)tcc_state + pass->flag_offset)) continue; + if (tcc_ir_opt_pass_disabled(pass->name)) + continue; if (tcc_pass_timing_on > 0) { unsigned long _rt = tcc_pass_clk_us(); @@ -166,6 +186,7 @@ int tcc_ir_opt_run_group(IROptCtx *ctx, const IRPassGroup *group) tcc_pass_timing_add(pass->name ? pass->name : "P:pass", tcc_pass_clk_us() - _pt); dbg_scan_imm_dest(ctx->ir, pass->name); dbg_scan_overlap(ctx->ir, pass->name); + tcc_ir_dump_after_pass(ctx->ir, pass->name); if (changes > 0) { round_changes += changes; pipeline_apply_invalidations(ctx, pass->invalidates); @@ -180,7 +201,13 @@ int tcc_ir_opt_run_group(IROptCtx *ctx, const IRPassGroup *group) tcc_ir_opt_ctx_invalidate(ctx); } - if (round_changes == 0 && group->trigger_idx < 0) + /* Fixpoint termination: stop once a round produces no changes. 
The old + `&& group->trigger_idx < 0` clause was redundant (docs/bugs.md #4): a + trigger-bearing group that reaches here already had tch > 0 (otherwise + it broke at the `tch <= 0` check above), so round_changes >= tch > 0 and + this condition is never true for it anyway -- its termination is driven + solely by the trigger. Dropping the clause changes no behavior. */ + if (round_changes == 0) break; } diff --git a/ir/opt_promote.c b/ir/opt_promote.c index ea15b61b..2abda47a 100644 --- a/ir/opt_promote.c +++ b/ir/opt_promote.c @@ -337,21 +337,14 @@ int tcc_ir_opt_var_tmp_fwd(TCCIRState *ir) if (TCCIR_DECODE_VREG_TYPE(src_vr) != TCCIR_VREG_TYPE_TEMP) continue; - /* DEREF source guard: forwarding V → *T duplicates the load at every - * use site. Only beneficial when V has exactly one use (which we're - * about to rewrite), making V dead and DCE'ing the STORE. Multiple - * uses → substituting one reintroduces the load there without removing - * the STORE the other uses still need. Pattern from inlined check1: - * V <- *T [STORE] \ - * CMP got, V -- if both rewritten, V dies. But if - * PARAM3 V (outside BB) / PARAM3 stays, the substitution at CMP - * adds a redundant ldr without payoff. */ - if (src1.is_lval && var_use_count) - { - int dpos = TCCIR_DECODE_VREG_POSITION(dest_vr); - if (dpos >= 0 && dpos < max_var_for_use && var_use_count[dpos] > 1) - continue; - } + /* DEREF source guard: forwarding V ← *T turns a VAR load into a raw + * memory dereference at every use site. Even with a single use this is + * unsafe: downstream passes treat a direct StackLoc/address deref as an + * unaliased load and may fold it to the initializer, ignoring loop-carried + * or indexed writes that alias the same slot (seed 588). Only forward + * non-lval (register-held) TEMP sources. */ + if (src1.is_lval) + continue; /* Don't forward TEMPs that hold a computed stack/symbol ADDRESS (from * LEA / Addr[...]). 
Even when V is single-use, removing the VAR that @@ -365,6 +358,14 @@ int tcc_ir_opt_var_tmp_fwd(TCCIRState *ir) int t_def = tcc_ir_find_defining_instruction(ir, src_vr, i); if (t_def >= 0 && ir->compact_instructions[t_def].op == TCCIR_OP_LEA) continue; + /* Keep this forwarding local to the producer. Extending a TEMP across + * intervening stores can perturb the store-heavy csmix shape enough for + * later cleanup/codegen to miscompile seed 814. */ + int prev = i - 1; + while (prev >= 0 && ir->compact_instructions[prev].op == TCCIR_OP_NOP) + prev--; + if (t_def != prev) + continue; } int src_btype = irop_get_btype(src1); @@ -2009,6 +2010,24 @@ int tcc_ir_opt_post_ra_forward_diamond(TCCIRState *ir) if (!safe) continue; + /* Pin both sides of every eliminated no-op copy to their shared physical + * register. Without this, a later codegen scratch-conflict fixup + * (try_reassign_scratch_conflict) can independently move just the dest + * vreg's interval to a different register — the two vregs stop sharing a + * register even though the copy that would keep them in sync no longer + * exists in the IR, so the fall-through edge silently reads a register + * that was never written on that path. phi_pinned is the same guard + * ra_phi_copy_needed() sets for the identical post-RA-identity case. 
*/ + for (int j = 0; j < num_assigns; j++) { + IRQuadCompact *aq = &ir->compact_instructions[i + 1 + j]; + int32_t adst_vr = irop_get_vreg(tcc_ir_op_get_dest(ir, aq)); + int32_t asrc_vr = irop_get_vreg(tcc_ir_op_get_src1(ir, aq)); + IRLiveInterval *dli = tcc_ir_vreg_live_interval(ir, adst_vr); + IRLiveInterval *sli = tcc_ir_vreg_live_interval(ir, asrc_vr); + if (dli) dli->phi_pinned = 1; + if (sli) sli->phi_pinned = 1; + } + int inv_cond = invert_condition(cond); if (inv_cond < 0) continue; diff --git a/ir/opt_utils.c b/ir/opt_utils.c index dea8d07e..6dfa0572 100644 --- a/ir/opt_utils.c +++ b/ir/opt_utils.c @@ -10,6 +10,10 @@ #define USING_GLOBALS +#include +#include +#include + #include "ir.h" #include "opt_utils.h" @@ -17,6 +21,37 @@ static int ir_opt_pure_expr_equal_impl(TCCIRState *ir, IROperand a, int a_use_idx, IROperand b, int b_use_idx, int depth); +/* ============================================================================ + * Pass-disable helper (for debugging / bisection) + * ============================================================================ */ + +int tcc_ir_opt_pass_disabled(const char *name) +{ + static const char *disabled = NULL; + static int checked = 0; + if (!checked) { + checked = 1; + disabled = getenv("TCC_DISABLE_PASS"); + } + if (!disabled || !name) + return 0; + const char *p = disabled; + size_t nlen = strlen(name); + while (*p) { + while (*p == ',' || isspace((unsigned char)*p)) + p++; + if (!*p) + break; + const char *start = p; + while (*p && *p != ',' && !isspace((unsigned char)*p)) + p++; + size_t len = p - start; + if (len == nlen && strncmp(start, name, len) == 0) + return 1; + } + return 0; +} + /* ============================================================================ * Constant evaluators * ============================================================================ */ @@ -124,6 +159,14 @@ int ir_opt_eval_const_u64(TCCIRState *ir, IROperand op, int use_idx, uint64_t *o return 0; if (!ir_opt_eval_const_u64(ir, 
tcc_ir_op_get_src2(ir, q), def_idx, &v2, depth + 1)) return 0; + /* Determine the operand width so that shifts are evaluated at the + * correct precision. Without this, a 32-bit SHR of a sign-extended + * negative constant (e.g. -u4 stored as 0xFFFFFFFFxxxxxxxx) would be + * computed as a 64-bit shift, yielding a completely different result + * than the runtime 32-bit operation. */ + IROperand shift_src1 = tcc_ir_op_get_src1(ir, q); + int shift_btype = irop_get_btype(shift_src1); + int shift_is_64 = (shift_btype == IROP_BTYPE_INT64 || shift_btype == IROP_BTYPE_FLOAT64); switch (q->op) { case TCCIR_OP_ADD: @@ -148,10 +191,16 @@ int ir_opt_eval_const_u64(TCCIRState *ir, IROperand op, int use_idx, uint64_t *o *out = v1 << v2; break; case TCCIR_OP_SHR: - *out = v1 >> v2; + if (shift_is_64) + *out = v1 >> v2; + else + *out = (uint64_t)((uint32_t)v1 >> (v2 & 31)); break; case TCCIR_OP_SAR: - *out = (uint64_t)((int64_t)v1 >> v2); + if (shift_is_64) + *out = (uint64_t)((int64_t)v1 >> v2); + else + *out = (uint64_t)((int64_t)(int32_t)(uint32_t)v1 >> (v2 & 31)); break; case TCCIR_OP_ROR: { @@ -446,6 +495,23 @@ uint8_t *ir_opt_build_merge_bitmap(TCCIRState *ir, int n) is_merge[target / 8] |= (1 << (target % 8)); } } + else if (q->op == TCCIR_OP_SWITCH_TABLE) + { + IROperand src2 = tcc_ir_op_get_src2(ir, q); + int table_id = (int)irop_get_imm64_ex(ir, src2); + if (table_id >= 0 && table_id < ir->num_switch_tables) + { + TCCIRSwitchTable *table = &ir->switch_tables[table_id]; + for (int j = 0; j < table->num_entries; j++) + { + int target = table->targets[j]; + if (target >= 0 && target < n) + pred_count[target]++; + } + if (table->default_target >= 0 && table->default_target < n) + pred_count[table->default_target]++; + } + } /* NOP is NOT a terminator — it falls through. Counting its fall-through * edge is required so a merge whose preceding block ends in DCE-left NOP * padding is still detected (pred_count >= 2). 
Omitting it leaves stale @@ -481,6 +547,23 @@ void ir_opt_mark_block_starts(TCCIRState *ir, int *block_start_seen, int gen, in if (tgt >= 0 && tgt < n) block_start_seen[tgt] = gen; } + else if (q->op == TCCIR_OP_SWITCH_TABLE) + { + IROperand src2 = tcc_ir_op_get_src2(ir, q); + int table_id = (int)irop_get_imm64_ex(ir, src2); + if (table_id >= 0 && table_id < ir->num_switch_tables) + { + TCCIRSwitchTable *table = &ir->switch_tables[table_id]; + for (int j = 0; j < table->num_entries; j++) + { + int tgt = table->targets[j]; + if (tgt >= 0 && tgt < n) + block_start_seen[tgt] = gen; + } + if (table->default_target >= 0 && table->default_target < n) + block_start_seen[table->default_target] = gen; + } + } } } @@ -500,6 +583,23 @@ uint8_t *ir_opt_build_block_starts_bitmap(TCCIRState *ir, int n) if (i + 1 < n) bs[(i + 1) / 8] |= (1 << ((i + 1) % 8)); } + else if (q->op == TCCIR_OP_SWITCH_TABLE) + { + IROperand src2 = tcc_ir_op_get_src2(ir, q); + int table_id = (int)irop_get_imm64_ex(ir, src2); + if (table_id >= 0 && table_id < ir->num_switch_tables) + { + TCCIRSwitchTable *table = &ir->switch_tables[table_id]; + for (int j = 0; j < table->num_entries; j++) + { + int tgt = table->targets[j]; + if (tgt >= 0 && tgt < n) + bs[tgt / 8] |= (1 << (tgt % 8)); + } + if (table->default_target >= 0 && table->default_target < n) + bs[table->default_target / 8] |= (1 << (table->default_target % 8)); + } + } } return bs; } @@ -738,25 +838,86 @@ static int ir_opt_setif_cmp_operand_equal(TCCIRState *ir, IROperand a, IROperand return 0; } +/* Append to `ids` the vreg identities of `q`'s variable reads: lval sources + * whose vreg names a VAR/PARAM. Whether such a read is spill-encoded as a + * STACKOFF slot or a direct VREG lval, it observes the variable's *current* + * value — so a redefinition of that vreg between two compared sites changes + * what the read returns even when no explicit STORE op is involved. + * Returns the new element count. 
*/ +static int ir_opt_collect_var_read_ids(TCCIRState *ir, IRQuadCompact *q, int32_t *ids, int count) +{ + IROperand srcs[3]; + int nsrc = 0; + if (irop_config[q->op].has_src1) + srcs[nsrc++] = tcc_ir_op_get_src1(ir, q); + if (irop_config[q->op].has_src2) + srcs[nsrc++] = tcc_ir_op_get_src2(ir, q); + if (q->op == TCCIR_OP_MLA) + srcs[nsrc++] = tcc_ir_op_get_accum(ir, q); + for (int s = 0; s < nsrc; s++) + { + int32_t vr = irop_get_vreg(srcs[s]); + int type; + if (!srcs[s].is_lval || vr < 0) + continue; + type = TCCIR_DECODE_VREG_TYPE(vr); + if (type == TCCIR_VREG_TYPE_VAR || type == TCCIR_VREG_TYPE_PARAM) + ids[count++] = vr; + } + return count; +} + /* When a def reads memory (`Sym***DEREF***` or `T_vreg***DEREF***` source), the * value at that address must be the same at both `a_def_idx` and `b_def_idx` * for the defs to be value-equivalent. Conservatively require no aliasing * store, call, inline-asm, or branch target between the two defs. Pure ALU - * ops (and loads — they only read) are safe to skip. */ + * ops (and loads — they only read) are safe to skip — unless their *dest* + * writes memory (lval / stack-slot destination), or redefines a VAR/PARAM + * that one of the endpoint instructions reads (switch fuzz seed 8261: + * `T127 <- V6 AND #1; ...; V6 <- V5 XOR #k; T130 <- V6 AND #1` — the XOR is + * a plain vreg def, but the two AND sources are spill-encoded STACKOFF reads + * of V6, so their values differ). */ static int ir_opt_pure_def_memory_stable(TCCIRState *ir, int a_def_idx, int b_def_idx) { int lo = a_def_idx < b_def_idx ? a_def_idx : b_def_idx; int hi = a_def_idx < b_def_idx ? 
b_def_idx : a_def_idx; + int32_t read_ids[6]; + int nids = 0; + nids = ir_opt_collect_var_read_ids(ir, &ir->compact_instructions[a_def_idx], read_ids, nids); + nids = ir_opt_collect_var_read_ids(ir, &ir->compact_instructions[b_def_idx], read_ids, nids); for (int k = lo + 1; k < hi; k++) { - int kop = ir->compact_instructions[k].op; - if (kop == TCCIR_OP_STORE || kop == TCCIR_OP_STORE_INDEXED || - kop == TCCIR_OP_STORE_POSTINC || kop == TCCIR_OP_BLOCK_COPY || - kop == TCCIR_OP_FUNCCALLVOID || kop == TCCIR_OP_FUNCCALLVAL || - kop == TCCIR_OP_INLINE_ASM || kop == TCCIR_OP_VLA_ALLOC) + IRQuadCompact *kq = &ir->compact_instructions[k]; + int kop = kq->op; + if (kop == TCCIR_OP_FUNCCALLVOID || kop == TCCIR_OP_FUNCCALLVAL) + { + /* Pure helpers (isnan, __aeabi_f2d, ...) touch no memory: they may + * sit between two compared sites without invalidating stability + * (compare-fp-3's isunordered||!isunordered fold depends on this). + * Their result def is still subject to the dest checks below. */ + Sym *callee = irop_get_sym_ex(ir, tcc_ir_op_get_src1(ir, kq)); + const char *name = callee ? get_tok_str(callee->v, NULL) : NULL; + if (!ir_opt_is_pure_helper_name(name)) + return 0; + } + else if (kop == TCCIR_OP_STORE || kop == TCCIR_OP_STORE_INDEXED || + kop == TCCIR_OP_STORE_POSTINC || kop == TCCIR_OP_BLOCK_COPY || + kop == TCCIR_OP_INLINE_ASM || kop == TCCIR_OP_VLA_ALLOC) return 0; - if (ir->compact_instructions[k].is_jump_target) + if (kq->is_jump_target) return 0; + if (irop_config[kop].has_dest) + { + IROperand kd = tcc_ir_op_get_dest(ir, kq); + int32_t kd_vr = irop_get_vreg(kd); + /* A destination that itself names memory mutates it like a STORE. */ + if (kd.is_lval || irop_get_tag(kd) == IROP_TAG_STACKOFF) + return 0; + /* A plain redefinition of a VAR/PARAM the endpoints read. 
*/ + for (int s = 0; s < nids; s++) + if (kd_vr == read_ids[s]) + return 0; + } } return 1; } @@ -769,6 +930,8 @@ static int ir_opt_pure_def_has_memory_read(TCCIRState *ir, IRQuadCompact *q) return 1; if (irop_config[q->op].has_src2 && tcc_ir_op_get_src2(ir, q).is_lval) return 1; + if (q->op == TCCIR_OP_MLA && tcc_ir_op_get_accum(ir, q).is_lval) + return 1; return 0; } @@ -934,19 +1097,11 @@ int ir_opt_pure_def_equal(TCCIRState *ir, int a_def_idx, int b_def_idx, int dept if (cmp_a->op != TCCIR_OP_CMP || cmp_b->op != TCCIR_OP_CMP) return 0; - int lo = cmp_a_idx < cmp_b_idx ? cmp_a_idx : cmp_b_idx; - int hi = cmp_a_idx < cmp_b_idx ? cmp_b_idx : cmp_a_idx; - for (int k = lo + 1; k < hi; k++) - { - int kop = ir->compact_instructions[k].op; - if (kop == TCCIR_OP_STORE || kop == TCCIR_OP_STORE_INDEXED || - kop == TCCIR_OP_BLOCK_COPY || kop == TCCIR_OP_FUNCCALLVOID || - kop == TCCIR_OP_FUNCCALLVAL || kop == TCCIR_OP_INLINE_ASM || - kop == TCCIR_OP_VLA_ALLOC) - return 0; - if (ir->compact_instructions[k].is_jump_target) - return 0; - } + /* Memory (and any VAR/PARAM the two CMPs read) must be unchanged + * between the CMP sites — the operand comparison below treats + * structurally-identical slot reads as equal on that premise. */ + if (!ir_opt_pure_def_memory_stable(ir, cmp_a_idx, cmp_b_idx)) + return 0; IROperand a1 = tcc_ir_op_get_src1(ir, cmp_a); IROperand a2 = tcc_ir_op_get_src2(ir, cmp_a); @@ -983,7 +1138,27 @@ static int ir_opt_pure_expr_equal_impl(TCCIRState *ir, IROperand a, int a_use_id a_tag = irop_get_tag(a); b_tag = irop_get_tag(b); if (a_tag != IROP_TAG_VREG || b_tag != IROP_TAG_VREG) - return ir_opt_nonvreg_expr_equal(ir, a, b); + { + if (!ir_opt_nonvreg_expr_equal(ir, a, b)) + return 0; + /* Structurally-identical memory reads (spill-encoded VAR/PARAM slots, + * global lvals) only yield the same value when neither memory nor the + * named variable changed between the two use sites. 
*/ + if (a.is_lval && a_use_idx >= 0 && b_use_idx >= 0 && a_use_idx != b_use_idx && + !ir_opt_pure_def_memory_stable(ir, a_use_idx, b_use_idx)) + return 0; + return 1; + } + + /* A dereferenced operand `*(V)` (is_lval) and a plain address operand `V` + * (not is_lval) are different values — one loads from memory, the other is + * the address itself — even when V resolves to the same definition. Without + * this guard, `c->field0 + K` (value-of-load + K) is treated as equal to + * `&c->field0 + K` (== &c->fieldK, an address), which mis-folds comparisons + * like `(c->size + K) > c->size_allocated` to a constant when K is the + * byte offset between the two fields. */ + if (a.is_lval != b.is_lval) + return 0; a_vr = irop_get_vreg(a); b_vr = irop_get_vreg(b); @@ -1058,6 +1233,43 @@ int ir_opt_get_call_param_operand(TCCIRState *ir, int call_idx, int param_idx, I return 0; } +int ir_opt_get_call_param_index(TCCIRState *ir, int call_idx, int param_idx) +{ + IRQuadCompact *call_q; + IROperand call_src2; + int call_id; + + if (!ir || call_idx < 0 || call_idx >= ir->next_instruction_index) + return -1; + + call_q = &ir->compact_instructions[call_idx]; + if (call_q->op != TCCIR_OP_FUNCCALLVAL && call_q->op != TCCIR_OP_FUNCCALLVOID) + return -1; + + call_src2 = tcc_ir_op_get_src2(ir, call_q); + call_id = TCCIR_DECODE_CALL_ID((uint32_t)irop_get_imm64_ex(ir, call_src2)); + + for (int i = call_idx - 1; i >= 0; --i) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) + continue; + if (q->op != TCCIR_OP_FUNCPARAMVAL && q->op != TCCIR_OP_FUNCPARAMVOID) + continue; + + IROperand enc = tcc_ir_op_get_src2(ir, q); + uint32_t encoded = (uint32_t)irop_get_imm64_ex(ir, enc); + if (TCCIR_DECODE_CALL_ID(encoded) != call_id) + continue; + if (TCCIR_DECODE_PARAM_IDX(encoded) != param_idx) + continue; + + return i; + } + + return -1; +} + void ir_opt_nop_call_params(TCCIRState *ir, int call_idx) { IRQuadCompact *call_q; @@ -1223,6 +1435,8 @@ int 
change_callee_sym(TCCIRState *ir, int instr_idx, const char *new_name, int r CType ftype; ftype.t = VT_FUNC; ftype.ref = sym_push2(&global_stack, SYM_FIELD, ret_btype, 0); + if (!ftype.ref) + return 0; /* out of symbols — leave the callee unchanged rather than crash */ ftype.ref->f.func_call = FUNC_CDECL; ftype.ref->f.func_type = FUNC_OLD; @@ -1277,3 +1491,33 @@ int tcc_ir_vreg_has_single_def(TCCIRState *ir, int32_t vreg) } return def_count == 1; } + +/* True iff `vreg` is written by two or more instructions. Unlike + * tcc_ir_vreg_has_single_def, a vreg with ZERO defs (e.g. an incoming + * parameter never re-assigned in this function) counts as safe here: with + * no def anywhere, there is no instruction a back-edge could route through + * to change its value, so it is exactly as trustworthy as a genuine + * single-def vreg for reasoning that a linearly-scanned value stays + * constant between two program points. */ +int tcc_ir_vreg_has_multi_def(TCCIRState *ir, int32_t vreg) +{ + int def_count = 0; + int n = ir->next_instruction_index; + + for (int i = 0; i < n; ++i) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) + continue; + if (!irop_config[q->op].has_dest) + continue; + IROperand dest = tcc_ir_op_get_dest(ir, q); + if (irop_get_vreg(dest) == vreg) + { + def_count++; + if (def_count > 1) + return 1; + } + } + return 0; +} diff --git a/ir/opt_utils.h b/ir/opt_utils.h index 4628a838..01551653 100644 --- a/ir/opt_utils.h +++ b/ir/opt_utils.h @@ -30,6 +30,12 @@ int evaluate_compare_condition(int64_t val1, int64_t val2, int cond_token); int is_power_of_2(int64_t n); +/* ============================================================================ + * Pass-disable helper (for debugging / bisection) + * ============================================================================ */ + +int tcc_ir_opt_pass_disabled(const char *name); + /* ============================================================================ * Condition token 
helpers * ============================================================================ */ @@ -86,6 +92,12 @@ int ir_opt_pure_expr_equal(struct TCCIRState *ir, IROperand a, int a_use_idx, int ir_opt_get_call_param_operand(struct TCCIRState *ir, int call_idx, int param_idx, IROperand *out); +/* Instruction index of the FUNCPARAMVAL/FUNCPARAMVOID marshalling `param_idx` + * for the call at `call_idx`, or -1. Use this as the reaching-def use-site for + * a param's source: the call index is wrong because the source may be redefined + * between param marshalling and the call. */ +int ir_opt_get_call_param_index(struct TCCIRState *ir, int call_idx, + int param_idx); void ir_opt_nop_call_params(struct TCCIRState *ir, int call_idx); void ir_opt_nop_call_param(struct TCCIRState *ir, int call_idx, int param_idx); void ir_opt_change_call_argc(struct TCCIRState *ir, int call_idx, int argc); @@ -101,6 +113,7 @@ const char *ir_opt_get_constant_string_from_symref(struct TCCIRState *ir, IROperand op); int tcc_ir_vreg_has_single_def(struct TCCIRState *ir, int32_t vreg); +int tcc_ir_vreg_has_multi_def(struct TCCIRState *ir, int32_t vreg); /* ============================================================================ * Callee symbol replacement helpers diff --git a/ir/opt_xform.c b/ir/opt_xform.c index 2d6613ac..97a34400 100644 --- a/ir/opt_xform.c +++ b/ir/opt_xform.c @@ -25,6 +25,57 @@ int ir_xform_same_block(TCCIRState *ir, int from_idx, int to_idx) return 1; } +int ir_xform_range_preserves_memory(TCCIRState *ir, int lo, int hi) +{ + if (hi < lo) + return 0; + for (int k = lo + 1; k < hi; k++) { + const IRQuadCompact *q = &ir->compact_instructions[k]; + if (q->op == TCCIR_OP_NOP) + continue; + /* A jump target means another path enters the range; stores on that + * path execute between the operand's old and new read points. 
*/ + if (q->is_jump_target) + return 0; + switch (q->op) { + /* control flow — the range is not straight-line */ + case TCCIR_OP_JUMP: + case TCCIR_OP_JUMPIF: + case TCCIR_OP_IJUMP: + case TCCIR_OP_SWITCH_TABLE: + case TCCIR_OP_RETURNVOID: + case TCCIR_OP_RETURNVALUE: + /* memory writers / barriers */ + case TCCIR_OP_STORE: + case TCCIR_OP_STORE_INDEXED: + case TCCIR_OP_STORE_POSTINC: + case TCCIR_OP_BLOCK_COPY: + case TCCIR_OP_FUNCCALLVAL: + case TCCIR_OP_FUNCCALLVOID: + case TCCIR_OP_CALLARG_STACK: + case TCCIR_OP_INLINE_ASM: + case TCCIR_OP_ASM_INPUT: + case TCCIR_OP_ASM_OUTPUT: + case TCCIR_OP_VLA_ALLOC: + case TCCIR_OP_VLA_SP_SAVE: + case TCCIR_OP_VLA_SP_RESTORE: + case TCCIR_OP_SETJMP: + case TCCIR_OP_LONGJMP: + case TCCIR_OP_NL_SETJMP: + case TCCIR_OP_NL_LONGJMP: + case TCCIR_OP_BUILTIN_APPLY_ARGS: + case TCCIR_OP_BUILTIN_APPLY: + case TCCIR_OP_BUILTIN_RETURN: + case TCCIR_OP_SET_CHAIN: + case TCCIR_OP_INIT_CHAIN_SLOT: + return 0; + default: + break; + } + } + return 1; +} + /* In-place arithmetic fold: * T <-- V OP src (T is a single-use TEMP, OP is a simple arith op) * V <-- T [STORE] (immediately following, no other ops between) diff --git a/ir/opt_xform.h b/ir/opt_xform.h index fd30a13a..1a377d57 100644 --- a/ir/opt_xform.h +++ b/ir/opt_xform.h @@ -32,4 +32,20 @@ int tcc_ir_opt_store_inplace_arith(TCCIRState *ir); struct IROptCtx; int tcc_ir_opt_store_inplace_arith_ex(struct IROptCtx *ctx); +/* An operand with is_lval (or is_llocal) is a fused memory read — a stack + * slot, a deref through a pointer, or a global — evaluated when the + * instruction executes, not when the operand's vreg was defined. */ +static inline int ir_xform_operand_reads_memory(IROperand op) +{ + return op.is_lval || op.is_llocal; +} + +/* Moving an instruction's memory-read operand to a different program point + * changes which value the load observes if any store to that location can + * execute in between. 
Return 1 when every instruction strictly between lo + * and hi is straight-line (no control flow in or out, no jump targets) and + * cannot write memory, so a memory read may be moved between lo and hi + * safely. */ +int ir_xform_range_preserves_memory(TCCIRState *ir, int lo, int hi); + #endif /* TCC_IR_OPT_XFORM_H */ \ No newline at end of file diff --git a/ir/pool.c b/ir/pool.c index 948cb755..67312016 100644 --- a/ir/pool.c +++ b/ir/pool.c @@ -20,8 +20,14 @@ int tcc_ir_pool_add(TCCIRState *ir, IROperand irop) { if (ir->iroperand_pool_count >= ir->iroperand_pool_capacity) { - ir->iroperand_pool_capacity *= 2; - ir->iroperand_pool = (IROperand *)tcc_realloc(ir->iroperand_pool, + /* Guard against a zero (or negative) capacity: `0 * 2 == 0` would never + grow the pool, and the subsequent write would overflow a zero-size + buffer. Seed to 1 so the doubling below makes progress. */ + if (ir->iroperand_pool_capacity <= 0) + ir->iroperand_pool_capacity = 1; + else + ir->iroperand_pool_capacity *= 2; + ir->iroperand_pool = (IROperand *)tcc_realloc(ir->iroperand_pool, sizeof(IROperand) * ir->iroperand_pool_capacity); if (!ir->iroperand_pool) { @@ -58,6 +64,10 @@ void tcc_ir_pool_ensure(TCCIRState *ir, int n) int needed = ir->iroperand_pool_count + n; if (needed > ir->iroperand_pool_capacity) { + /* Guard against a zero (or negative) capacity: `0 * 2 == 0` forever, so + the doubling loop below would never terminate. Seed to 1 first. */ + if (ir->iroperand_pool_capacity <= 0) + ir->iroperand_pool_capacity = 1; while (ir->iroperand_pool_capacity < needed) ir->iroperand_pool_capacity *= 2; ir->iroperand_pool = (IROperand *)tcc_realloc(ir->iroperand_pool, diff --git a/ir/regalloc.c b/ir/regalloc.c index 27c22607..415afac1 100644 --- a/ir/regalloc.c +++ b/ir/regalloc.c @@ -29,6 +29,8 @@ #include "opt/ssa_opt.h" #include "licm.h" +extern int tcc_ir_opt_pass_disabled(const char *name); + #define RA_DBG(fmt, ...) 
LOG_LS(fmt, ##__VA_ARGS__) /* ============================================================================ @@ -46,6 +48,7 @@ typedef struct SSAInterval { uint8_t addrtaken : 1; uint8_t is_param : 1; uint8_t reg_shared : 1; /* cur shares hr with another active interval (return-block tail); skip expire-free and active push */ + uint8_t loop_phi_locked : 1; /* absorbed a loop-phi partner (carries a loop-carried value across the whole loop body); must not be evicted — spilling it mid-loop would not reload the partner's uses and corrupts the IV */ uint8_t reg_type; uint16_t use_count; int8_t precolored; @@ -86,9 +89,18 @@ static int *ra_build_call_prefix(TCCIRState *ir) int *prefix = tcc_malloc(sizeof(int) * (n + 1)); prefix[0] = 0; for (int i = 0; i < n; i++) { - TccIrOp op = ir->compact_instructions[i].op; + IRQuadCompact *q = &ir->compact_instructions[i]; + TccIrOp op = q->op; int is_call = (op == TCCIR_OP_FUNCCALLVOID || op == TCCIR_OP_FUNCCALLVAL || op == TCCIR_OP_BUILTIN_APPLY || ir_op_is_implicit_call_ra(op)); + /* A large BLOCK_COPY lowers to a memcpy() call in the backend, clobbering + * the caller-saved registers. The inline (small) lowering saves/restores + * everything it touches, so only the memcpy-sized copies count as calls. */ + if (!is_call && op == TCCIR_OP_BLOCK_COPY) { + int bc_size = (int)irop_get_imm64_ex(ir, tcc_ir_op_get_src2(ir, q)); + if (bc_size >= TCCIR_BLOCK_COPY_MEMCPY_MIN_BYTES) + is_call = 1; + } prefix[i + 1] = prefix[i] + is_call; } return prefix; @@ -105,6 +117,43 @@ static int ra_has_call_in_range(const int *prefix, int start, int end, int n) return (prefix[end] - prefix[start + 1]) != 0; } +/* Prefix sum of SWITCH_TABLE / SWITCH_LOAD dispatches. The Thumb lowering of + * both ops (tcc_gen_machine_switch_table_mop / _switch_load_mop in + * arm-thumb-gen.c) uses R_IP (R12) as a fixed scratch for the jump-table base + * and clobbers it. 
R12 is caller-saved, so a value that is merely live across + * the dispatch is not otherwise forced off it — see ra_has_switch_in_range. */ +static int *ra_build_switch_prefix(TCCIRState *ir) +{ + int n = ir->next_instruction_index; + if (n <= 0) + return NULL; + int *prefix = tcc_malloc(sizeof(int) * (n + 1)); + prefix[0] = 0; + for (int i = 0; i < n; i++) { + TccIrOp op = ir->compact_instructions[i].op; + int is_switch = (op == TCCIR_OP_SWITCH_TABLE || op == TCCIR_OP_SWITCH_LOAD); + prefix[i + 1] = prefix[i] + is_switch; + } + return prefix; +} + +/* True if a SWITCH_TABLE/SWITCH_LOAD dispatch sits at any position k with + * start < k <= end, i.e. the interval [start,end] is live across the dispatch. + * `end` is inclusive (unlike ra_has_call_in_range): a value whose only use is + * a *backward* switch target has its last use laid out before the dispatch in + * IR order, with its interval extended forward by the back-edge pass to exactly + * the dispatch position — so end == k must still count. Such a value would be + * read at a switch target *after* the R12 clobber, so it must avoid R12. */ +static int ra_has_switch_in_range(const int *prefix, int start, int end, int n) +{ + if (!prefix || n <= 0) + return 0; + if (start < -1) start = -1; + if (end > n - 1) end = n - 1; + if (end < start + 1) return 0; + return (prefix[end + 1] - prefix[start + 1]) != 0; +} + static const char *ra_vreg_type_char(int type) { switch (type) { @@ -290,6 +339,17 @@ static int ra_fold_const_branches(TCCIRState *ir) if (pop == TCCIR_OP_CMP) { cmp_idx = j; break; } /* Other flag-setting ops invalidate the CMP we'd want to read. */ if (pop == TCCIR_OP_TEST_ZERO || pop == TCCIR_OP_FCMP) break; + /* A call clobbers CPSR (AAPCS: flags are caller-saved), so a CMP before + * it cannot be the JUMPIF's flag source. Critically, the soft-float + * compare helpers (__aeabi_cfcmple / cdcmple, ...) 
are FUNCCALLVOID + * flag-setters: they ARE the branch's real flag source, and striding + * past them would mis-attribute the branch to an earlier integer CMP and + * wrongly NOP it (orphaning a SELECT that consumes it — fuzz seed 2049). */ + if (pop == TCCIR_OP_FUNCCALLVAL || pop == TCCIR_OP_FUNCCALLVOID) break; + /* A flag-consumer between the CMP and this JUMPIF means the CMP has + * another reader; folding the branch would still NOP the CMP and break + * that consumer, so bail. */ + if (pop == TCCIR_OP_SETIF || pop == TCCIR_OP_SELECT) break; /* BB boundary. */ if (pop == TCCIR_OP_JUMP || pop == TCCIR_OP_JUMPIF || pop == TCCIR_OP_IJUMP || pop == TCCIR_OP_SWITCH_TABLE || @@ -645,6 +705,10 @@ static void ra_build_intervals(TCCIRState *ir, IRCFG *cfg, IRSSAState *ssa, if (temp_count > max_vreg_pos) max_vreg_pos = temp_count; if (param_count > max_vreg_pos) max_vreg_pos = param_count; + /* SWITCH_TABLE/SWITCH_LOAD dispatch clobbers R_IP (R12); see + * ra_has_switch_in_range below. */ + int *switch_prefix = ra_build_switch_prefix(ir); + /* Allocate per-vreg start/end tracking indexed by encoded vreg. * Use flat arrays indexed by (type * max_pos + position). */ int table_size = 4 * max_vreg_pos; @@ -1273,6 +1337,7 @@ static void ra_build_intervals(TCCIRState *ir, IRCFG *cfg, IRSSAState *ssa, iv->co_member = 0; iv->is_param = (type == TCCIR_VREG_TYPE_PARAM); iv->reg_shared = 0; + iv->loop_phi_locked = 0; IRLiveInterval *li = tcc_ir_vreg_live_interval(ir, vreg); iv->addrtaken = li->addrtaken; @@ -1306,6 +1371,16 @@ static void ra_build_intervals(TCCIRState *ir, IRCFG *cfg, IRSSAState *ssa, } } + /* Switch crossing: a SWITCH_TABLE/SWITCH_LOAD dispatch clobbers R_IP + * (R12) as its jump-table scratch (tcc_gen_machine_switch_table_mop). + * A value live across the dispatch must therefore not occupy R12. R12 + * is caller-saved, so reuse crosses_call to force the value into a + * callee-saved register — exactly what the -O1 allocator already does. 
+ * (fuzz seed 102: at -O2 the loop-carried checksum `cs` was placed in + * R12 and clobbered by the switch dispatch, corrupting the result.) */ + if (!iv->crosses_call) + iv->crosses_call = ra_has_switch_in_range(switch_prefix, iv->start, iv->end, n); + /* Params: start at 0, precolor if in register. * Do NOT bump end past its last actual use — the pref_reg boundary * eviction (a->end == cur->start) relies on the param expiring at @@ -1351,6 +1426,7 @@ static void ra_build_intervals(TCCIRState *ir, IRCFG *cfg, IRSSAState *ssa, *out_count = wi; if (out_max_vreg_pos) *out_max_vreg_pos = max_vreg_pos; + if (switch_prefix) tcc_free(switch_prefix); #undef VREG_IDX } @@ -1787,6 +1863,22 @@ static int ra_safe_loop_phi_coalesce(TCCIRState *ir, SSAInterval *cur, SSAInterv IRQuadCompact *q = &ir->compact_instructions[j]; if (q->op == TCCIR_OP_NOP) continue; + /* cur must be defined only at def_pos. The override's correctness rests on + * "after def_pos the register holds cur's value and the back-edge copy is + * mov R,R"; a *second* def of cur before the back-edge breaks that — the + * register then carries an intermediate value while partner is still + * (textually) live, and coalescing conflates two distinct values. This + * happens when def_pos is a copy `cur <- partner` at the top of an OUTER + * loop body and cur is then re-assigned inside a nested (rotated) inner + * loop before the outer back-edge copy `partner <- cur` (longlong seed 218: + * g12-carried hash T160<-T161, re-defined inside the rotated g16 loop). The + * linear scan cannot model the inner back-edge, so reject conservatively. 
*/ + if (irop_config[q->op].has_dest) { + IROperand cd = tcc_ir_op_get_dest(ir, q); + if (irop_has_vreg(cd) && irop_get_vreg(cd) == cur_vreg) + return 0; + } + int uses_partner_as_src = 0; if (irop_config[q->op].has_src1) { IROperand s = tcc_ir_op_get_src1(ir, q); @@ -2054,13 +2146,6 @@ static void ra_linear_scan(TCCIRState *ir, SSAInterval *intervals, int count, uint64_t dirty_int = 0; uint64_t dirty_fp = 0; - /* DEBUG: trace the linear-scan allocation decisions for the 90_struct - * miscompile (why R8 gets assigned to the printf-arg LEA temp on device but - * spilled on QEMU). RA90 lines: per-interval state + int_free + branch taken. */ - int dbg90 = funcname && !strcmp((const char *)funcname, "test_init_struct_from_struct"); - if (dbg90) - fprintf(stderr, "RA90 start count=%d int_allowed=0x%x\n", count, (unsigned)int_allowed); - /* Active set sorted by end point */ SSAInterval **active = tcc_malloc(sizeof(SSAInterval *) * count); int active_count = 0; @@ -2084,11 +2169,6 @@ static void ra_linear_scan(TCCIRState *ir, SSAInterval *intervals, int count, for (int i = 0; i < count; i++) { SSAInterval *cur = &intervals[i]; - if (dbg90) - fprintf(stderr, "RA90 i=%d vr=0x%x [%u,%u] xcall=%d prec=%d rt=%d addr=%d coal=%d r0in=%d int_free=0x%x\n", i, - (unsigned)cur->vreg, cur->start, cur->end, cur->crosses_call, cur->precolored, cur->reg_type, - cur->addrtaken, cur->coalesce_to, cur->r0, (unsigned)int_free); - /* Graph coalescing: non-representative members are merged into their * representative's interval and inherit its register after the scan. Skip * them so they neither consume a register nor enter the active set. 
*/ @@ -2111,9 +2191,6 @@ static void ra_linear_scan(TCCIRState *ir, SSAInterval *intervals, int count, } else { int_free |= (1ull << a->r0); if (a->r1 >= 0) int_free |= (1ull << a->r1); - if (dbg90) - fprintf(stderr, "RA90 expire vr=0x%x end=%u < curstart=%u -> free R%d (int_free=0x%x)\n", - (unsigned)a->vreg, a->end, cur->start, a->r0, (unsigned)int_free); } } } else { @@ -2477,6 +2554,12 @@ static void ra_linear_scan(TCCIRState *ir, SSAInterval *intervals, int count, if (partner->end > cur->end) cur->end = partner->end; cur->r0 = reg; + /* cur now carries partner's loop-carried value over the extended + * range; the partner is gone from active, so cur is the sole + * holder of hr that the partner's remaining uses depend on. + * Evicting cur mid-loop would spill it without reloading those + * partner uses → IV corruption. Lock it against eviction. */ + cur->loop_phi_locked = 1; active[partner_active_idx] = active[--active_count]; } } @@ -2546,12 +2629,14 @@ static void ra_linear_scan(TCCIRState *ir, SSAInterval *intervals, int count, int conflict = 0; for (int p = (int)cur->start; p <= (int)cur->end && !conflict; p++) { IRQuadCompact *pq = &ir->compact_instructions[p]; - IROperand s1 = tcc_ir_op_get_src1(ir, pq); - IROperand s2 = tcc_ir_op_get_src2(ir, pq); - if (irop_has_vreg(s1) && !irop_is_immediate(s1) && - irop_get_vreg(s1) == a->vreg) { conflict = 1; break; } - if (irop_has_vreg(s2) && !irop_is_immediate(s2) && - irop_get_vreg(s2) == a->vreg) { conflict = 1; break; } + /* Any operand reference to the partner clobbers the share: + * cur's def at cur->start overwrites hr, so partner must not be + * needed anywhere in the range. Use ra_instr_touches_vreg so a + * STORE-class op's dest (its base *pointer*, which the store + * READS) and an MLA accumulator count — a naive src1/src2 scan + * missed a partner used as a store base and shared hr anyway, + * emitting `str rX, [rX]` (value written through itself). 
*/ + if (ra_instr_touches_vreg(ir, pq, a->vreg)) { conflict = 1; break; } } if (!conflict) { cur->r0 = hr; @@ -2584,10 +2669,6 @@ static void ra_linear_scan(TCCIRState *ir, SSAInterval *intervals, int count, } } - if (dbg90) - fprintf(stderr, "RA90 DECIDE vr=0x%x -> reg=%d (int_free=0x%x xcall=%d) %s\n", (unsigned)cur->vreg, reg, - (unsigned)int_free, cur->crosses_call, reg >= 0 ? "ASSIGN" : "SPILL"); - if (cur->reg_shared) { /* Return-block share: cur->r0 was set in the pref_reg path. * Don't touch int_free (partner still owns hr) and don't add cur @@ -2618,6 +2699,10 @@ static void ra_linear_scan(TCCIRState *ir, SSAInterval *intervals, int count, if (a->precolored >= 0) continue; if (a->reg_type != LS_REG_TYPE_INT) continue; if (a->end <= cur->end) continue; + /* Never evict a loop-phi-locked interval: it holds a loop-carried value + * (its absorbed partner's uses still read this register across the loop + * body) and spilling it here would not reload those uses. */ + if (a->loop_phi_locked) continue; if (a->use_count < victim_uses || (a->use_count == victim_uses && victim && a->end > victim->end)) { victim_uses = a->use_count; @@ -3364,8 +3449,20 @@ static void ra_resolve_phis(TCCIRState *ir, IRCFG *cfg, IRSSAState *ssa) * builder (it tries to extend phi-dest intervals as if the phi were * still semantically active, on top of the now-explicit defs). */ if (ra_phi_resolve_pre_ra_mode) { - for (int b = 0; b < nb; b++) + /* Free each block's phi list before detaching it — the explicit copies are + * now the source of truth, so these nodes are dead. Merely NULLing the + * heads (as before) orphaned every phi node + operand array: tcc_ir_ssa_free + * later sees an empty block_phis and frees nothing, leaking on every compile. 
*/ + for (int b = 0; b < nb; b++) { + IRPhiNode *phi = ssa->block_phis[b]; + while (phi) { + IRPhiNode *next = phi->next; + tcc_free(phi->operands); + tcc_free(phi); + phi = next; + } ssa->block_phis[b] = NULL; + } tcc_free(old_to_new); tcc_free(copies_per_block); tcc_free(copy_records); @@ -3464,6 +3561,7 @@ static void ra_build_live_regs_bitmap(TCCIRState *ir) if (lsi->end > max_end) max_end = lsi->end; } int sz = (int)max_end + 1; + if (sz < ir->next_instruction_index) sz = ir->next_instruction_index; if (sz > 0) { if (ir->ls.live_regs_by_instruction) tcc_free(ir->ls.live_regs_by_instruction); @@ -3488,6 +3586,7 @@ static void ra_build_live_regs_bitmap(TCCIRState *ir) for (int k = s; k <= e; k++) ir->ls.live_regs_by_instruction[k] |= mask; } + if (TCC_LOG_LS) { for (int k = 0; k < sz; k++) RA_DBG(" instr[%d] live=0x%x", k, ir->ls.live_regs_by_instruction[k]); @@ -3569,6 +3668,120 @@ static void ra_co_ops(TCCIRState *ir, IRQuadCompact *q, #define RA_BS_CLR(bs, i) ((bs)[(i) >> 6] &= ~(1ull << ((i) & 63))) #define RA_BS_TEST(bs, i) (((bs)[(i) >> 6] >> ((i) & 63)) & 1ull) +/* Refine live_regs_by_instruction (the interval-derived approximation the + * scratch-register picker consults) with ACCURATE per-instruction liveness from + * a real CFG backward dataflow. + * + * The interval bitmap models each value as one contiguous [start,end] range. + * For a loop-carried value (defined inside a rotated loop body and live across + * the back-edge into the next iteration) that single range does NOT span the + * loop-header prefix where the value is still live, so the bitmap under-reports + * the value's register as free there. The scratch picker then hands it out and + * clobbers the loop-carried value (random-C O2 wrong-code / HardFault once loop + * rotation is enabled — Finding #15 follow-up, seeds 244 et al). + * + * This dataflow (same ra_co_ops def/use model the graph-coalescer trusts) marks + * every register holding a genuinely-live, register-resident vreg. 
It is + * strictly conservative for the picker: it can only ADD live bits, never remove + * them, so it can never introduce a new clobber — it only prevents real ones. + * Bails (leaving the interval bitmap as-is) on functions with un-enumerated + * edges (IJUMP / SWITCH_TABLE), matching the coalescer's own guard. */ +static void ra_refine_live_regs_accurate(TCCIRState *ir) +{ + int n = ir->next_instruction_index; + if (n <= 0) return; + for (int i = 0; i < n; i++) { + int op = ir->compact_instructions[i].op; + if (op == TCCIR_OP_IJUMP || op == TCCIR_OP_SWITCH_TABLE || op == TCCIR_OP_SWITCH_LOAD) + return; + } + IRCFG *cfg = tcc_ir_cfg_build(ir); + if (!cfg) return; + tcc_ir_cfg_compute_dominators(cfg); + int nb = cfg->num_blocks; + if (nb <= 0) { tcc_ir_cfg_free(cfg); return; } + /* vreg index space */ + int maxpos = 1; + for (int j = 0; j < ir->ls.next_interval_index; j++) { + int p = TCCIR_DECODE_VREG_POSITION(ir->ls.intervals[j].vreg); + if (p + 1 > maxpos) maxpos = p + 1; + } + int tbl = 4 * maxpos; + int nw = (tbl + 63) / 64; + #define DVIDX(vr) ((TCCIR_DECODE_VREG_TYPE(vr) * maxpos) + TCCIR_DECODE_VREG_POSITION(vr)) + /* vreg -> physical regs */ + int8_t *vr0 = tcc_malloc(tbl); int8_t *vr1 = tcc_malloc(tbl); + for (int i = 0; i < tbl; i++) { vr0[i] = -1; vr1[i] = -1; } + for (int j = 0; j < ir->ls.next_interval_index; j++) { + LSLiveInterval *iv = &ir->ls.intervals[j]; + if (iv->stack_location != 0) continue; + int vi = DVIDX(iv->vreg); + if (vi < 0 || vi >= tbl) continue; + vr0[vi] = (int8_t)iv->r0; vr1[vi] = (int8_t)iv->r1; + } + uint64_t *useb = tcc_mallocz(sizeof(uint64_t)*(size_t)nb*nw); + uint64_t *defbk= tcc_mallocz(sizeof(uint64_t)*(size_t)nb*nw); + uint64_t *livein=tcc_mallocz(sizeof(uint64_t)*(size_t)nb*nw); + uint64_t *liveout=tcc_mallocz(sizeof(uint64_t)*(size_t)nb*nw); + for (int b = 0; b < nb; b++) { + uint64_t *ub = useb + (size_t)b*nw, *db = defbk + (size_t)b*nw; + int s = cfg->blocks[b].start_idx, e = cfg->blocks[b].end_idx; + for (int i = 
s; i < e && i < n; i++) { + int32_t def=-1, hd=0, uses[4], nu=0; + ra_co_ops(ir, &ir->compact_instructions[i], &def, &hd, uses, &nu); + for (int k=0;k=tbl)continue; if(!RA_BS_TEST(db,u)) RA_BS_SET(ub,u);} + if (hd && tcc_ir_vreg_is_valid(ir,def)){int d=DVIDX(def); if(d>=0&&drpo_count-1; ri>=0; ri--) { + int b = cfg->rpo_order ? cfg->rpo_order[ri] : ri; + if (b<0||b>=nb) continue; + uint64_t *lo=liveout+(size_t)b*nw,*li=livein+(size_t)b*nw,*ub=useb+(size_t)b*nw,*db=defbk+(size_t)b*nw; + for (int w=0;wblocks[b].num_succs;si++){int sb=cfg->blocks[b].succs[si]; if(sb<0||sb>=nb)continue; uint64_t*sli=livein+(size_t)sb*nw; for(int w=0;wcompact_instructions[bi]; + if (q->op != TCCIR_OP_JUMP && q->op != TCCIR_OP_JUMPIF) continue; + int t = (int)tcc_ir_op_get_dest(ir, q).u.imm32; + if (t < 0 || t >= bi) continue; /* not a back-edge */ + /* live-in at the loop header t: find the block starting at t. */ + int hb = -1; + for (int b = 0; b < nb; b++) if (cfg->blocks[b].start_idx == t) { hb = b; break; } + if (hb < 0) continue; + uint64_t *hli = livein + (size_t)hb*nw; + uint32_t mask = 0; + for (int vi=0; vi=0 && vr0[vi]<16) mask |= (1u<=0 && vr1[vi]<16) mask |= (1u<ls.live_regs_by_instruction) continue; + int e = bi; if (e >= ir->ls.live_regs_by_instruction_size) e = ir->ls.live_regs_by_instruction_size - 1; + for (int k = t; k <= e; k++) + ir->ls.live_regs_by_instruction[k] |= mask; + } + #undef DVIDX + tcc_free(vr0);tcc_free(vr1);tcc_free(useb);tcc_free(defbk);tcc_free(livein);tcc_free(liveout); + tcc_ir_cfg_free(cfg); +} + static void ra_coalesce_graph(TCCIRState *ir, SSAInterval *intervals, int count, int max_vreg_pos) { @@ -3709,6 +3922,29 @@ static void ra_coalesce_graph(TCCIRState *ir, SSAInterval *intervals, int count, } } + /* ---- Build per-block def bitmap to detect phi-related copies. ---- + * A copy dest that is defined in more than one block is a phi result + * (explicit copies inserted after SSA phi resolution). 
Coalescing such + * a dest with its source can overwrite the source's value on a sibling + * phi arm when the source is still live across the merge (seed 860). */ + uint64_t *def_blocks = tcc_mallocz(sizeof(uint64_t) * (size_t)nb * nw); + int *instr_block = tcc_malloc(sizeof(int) * n); + for (int i = 0; i < n; i++) instr_block[i] = -1; + for (int b = 0; b < nb; b++) { + int s = cfg->blocks[b].start_idx, e = cfg->blocks[b].end_idx; + for (int i = s; i < e && i < n; i++) { + instr_block[i] = b; + IRQuadCompact *q = &ir->compact_instructions[i]; + int32_t def = -1, hd = 0, uses[4], nu = 0; + ra_co_ops(ir, q, &def, &hd, uses, &nu); + if (hd && tcc_ir_vreg_is_valid(ir, def)) { + int d = VIDX(def); + if (d >= 0 && d < tbl) + RA_BS_SET(def_blocks + (size_t)b * nw, d); + } + } + } + /* ---- Collect copy edges + candidate set (Stage 4 prep). ---- */ /* Copy edge kinds: ASSIGN dst<-src; two-address dst<-src OP imm (ADD/SUB). */ int *cand_id = tcc_malloc(sizeof(int) * tbl); @@ -3742,15 +3978,55 @@ static void ra_coalesce_graph(TCCIRState *ir, SSAInterval *intervals, int count, int di = VIDX(dv), si = VIDX(sv); if (di < 0 || di >= tbl || si < 0 || si >= tbl) continue; if (iv_of[di] < 0 || iv_of[si] < 0) continue; /* both must have intervals */ + /* Reject unsafe phi-result copies: the dest is defined on multiple + * incoming edges. The dangerous case is when the source temp is itself + * a copy of a VAR that is live-out of the merge block; coalescing the + * phi result with that source (transitively with the VAR) lets a sibling + * phi arm overwrite the still-live VAR (seed 860). Latch-style loop + * phis, where the source is computed in the latch, are unaffected. */ + { + int def_bc = 0; + for (int b = 0; b < nb; b++) { + if (RA_BS_TEST(def_blocks + (size_t)b * nw, di)) { + def_bc++; + if (def_bc > 1) break; + } + } + if (def_bc > 1) { + /* Allow phi-copy coalescing only when the merge block is a loop header + * (one of its predecessors is a back edge, i.e. 
the merge block dominates + * that predecessor). Loop phis coalesce safely because the latch source + * is not live-out of the header. Conditional-merge phis can have a + * source equivalent to a variable live across the merge; coalescing them + * lets the sibling arm overwrite that variable (seed 860). */ + int bi = instr_block[i]; + int is_loop_header = 0; + if (bi >= 0 && bi < nb) { + for (int pi = 0; pi < cfg->blocks[bi].num_preds; pi++) { + int pb = cfg->blocks[bi].preds[pi]; + if (pb >= 0 && pb < nb && tcc_ir_cfg_dominates(cfg, bi, pb)) { + is_loop_header = 1; + break; + } + } + } + if (!is_loop_header) { + continue; + } + } + } ADD_CAND(di); ADD_CAND(si); if (ne >= ecap) { ecap *= 2; edge_d = tcc_realloc(edge_d, sizeof(int32_t)*ecap); edge_s = tcc_realloc(edge_s, sizeof(int32_t)*ecap); } edge_d[ne] = di; edge_s[ne] = si; ne++; } + tcc_free(def_blocks); + if (ncand < 2 || ne == 0) { tcc_free(iv_of); tcc_free(useb); tcc_free(defbk); tcc_free(livein); - tcc_free(liveout); tcc_free(cand_id); tcc_free(edge_d); tcc_free(edge_s); + tcc_free(liveout); tcc_free(instr_block); + tcc_free(cand_id); tcc_free(edge_d); tcc_free(edge_s); tcc_ir_cfg_free(cfg); return; } @@ -3950,6 +4226,7 @@ static void ra_coalesce_graph(TCCIRState *ir, SSAInterval *intervals, int count, #undef VIDX tcc_free(deg); tcc_free(live); tcc_free(iv_of); tcc_free(useb); tcc_free(defbk); tcc_free(livein); tcc_free(liveout); + tcc_free(instr_block); tcc_free(cand_id); tcc_free(cand_vidx); tcc_free(edge_d); tcc_free(edge_s); tcc_ir_cfg_free(cfg); } @@ -3960,6 +4237,153 @@ static void ra_coalesce_graph(TCCIRState *ir, SSAInterval *intervals, int count, void dbg_scan_imm_dest(TCCIRState *ir, const char *pass); void dbg_scan_overlap(TCCIRState *ir, const char *pass); +/* Promote multiply-block-defined TEMPs to fresh VARs so SSA construction places + * phis for them. The frontend emits a single TEMP written on BOTH arms of a + * branch-lowered ternary (`cond ? 
a : b` where an arm has a side effect / call, + * so it cannot lower to SELECT) — e.g. `T323 <- a` in one block and `T323 <- b` + * in another, then a merge-block use. That violates the SSA-by-construction + * assumption the renamer makes for TEMPs (it renames only VARs and leaves such a + * TEMP untouched), so the merge use resolves to ONE arm's definition + * unconditionally — random-C O1/O2 wrong-code, seeds 100/118 (the value reached a + * later inlined-csmix use as the else-arm value regardless of the condition). + * Converting the TEMP to a VAR routes it through the normal var→SSA promotion, + * which inserts the phi. VAR and TEMP operands share the IROP_TAG_VREG encoding + * and differ only in the type bits, so irop_set_vreg suffices; tcc_ir_vreg_alloc_var + * grows the live-interval array. Only fires for the rare multi-block-def TEMP. */ +static void ra_promote_multidef_temps_to_vars(TCCIRState *ir, IRCFG *cfg) +{ + int n = ir->next_instruction_index; + int ntmp = ir->next_temporary_variable; + if (n <= 0 || ntmp <= 0 || !cfg || cfg->num_blocks <= 1) + return; + + /* Skip functions that take label addresses (GCC labels-as-values, `&&label`): + * their exact machine-code layout is observable at runtime via the label-offset + * map, so the phi-resolution copies this promotion introduces would shift those + * offsets (96_nodata_wanted measures code size with `&&label` arithmetic). + * Such functions also have inlining disabled (tccgen gates auto-inline on + * !func_has_label_addr), so they never hit the inlined-ternary miscompile this + * promotion fixes — skipping them is free of correctness cost. */ + if (ir->func_has_label_addr) + return; + + /* Only run when SSA construction will actually proceed and rename the new VARs + * back into SSA temps. 
SSA construction BAILS on un-enumerable control flow + * (IJUMP / computed goto, SETJMP); if we promoted there, the converted VARs + * would be left as unpromoted stack slots and change codegen for the worse + * (96_nodata_wanted's `&&label` arithmetic). Mirror ssa_has_unsupported_ops. */ + for (int i = 0; i < n; i++) { + TccIrOp op = ir->compact_instructions[i].op; + if (op == TCCIR_OP_IJUMP || op == TCCIR_OP_SETJMP || op == TCCIR_OP_NL_SETJMP) + return; + } + + /* def_block[t] = the block of t's first def, or -2 = multi-block, -1 = none. */ + int *def_block = tcc_malloc(sizeof(int) * ntmp); + for (int t = 0; t < ntmp; t++) def_block[t] = -1; + + for (int i = 0; i < n; i++) { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP || !irop_config[q->op].has_dest) + continue; + if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || + q->op == TCCIR_OP_STORE_POSTINC || q->op == TCCIR_OP_FUNCPARAMVAL || + q->op == TCCIR_OP_FUNCPARAMVOID) + continue; + IROperand d = tcc_ir_op_get_dest(ir, q); + int32_t vr = irop_get_vreg(d); + if (vr < 0 || TCCIR_DECODE_VREG_TYPE(vr) != TCCIR_VREG_TYPE_TEMP) continue; + if (d.is_lval) continue; /* a deref store target, not a plain TEMP def */ + int t = TCCIR_DECODE_VREG_POSITION(vr); + if (t < 0 || t >= ntmp) continue; + int blk = cfg->instr_to_block[i]; + if (def_block[t] == -1) def_block[t] = blk; + else if (def_block[t] != blk) def_block[t] = -2; /* multi-block */ + } + + /* A multi-block-defined TEMP only needs a phi (and only then is its renaming + * actually wrong) when it has a USE in a block that does not itself define it — + * a value flowing across a merge. A TEMP whose uses are all in its own + * def-blocks reaches each use from the local def and is already correct; + * promoting it would insert needless phi-copies and grow code (96_nodata_wanted + * measures code size via `&&label` arithmetic and is sensitive to this). 
For + * each multi-block TEMP, mark its def-blocks and require a use elsewhere. */ + int32_t *temp_to_var = tcc_malloc(sizeof(int32_t) * ntmp); + for (int t = 0; t < ntmp; t++) temp_to_var[t] = -1; + uint8_t *needs_phi = tcc_mallocz(ntmp); + { + uint8_t *isdef = tcc_mallocz(cfg->num_blocks); + for (int t = 0; t < ntmp; t++) { + if (def_block[t] != -2) continue; + memset(isdef, 0, cfg->num_blocks); + /* collect def-blocks of t */ + for (int i = 0; i < n; i++) { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP || !irop_config[q->op].has_dest) continue; + if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || + q->op == TCCIR_OP_STORE_POSTINC || q->op == TCCIR_OP_FUNCPARAMVAL || + q->op == TCCIR_OP_FUNCPARAMVOID) continue; + IROperand d = tcc_ir_op_get_dest(ir, q); + int32_t vr = irop_get_vreg(d); + if (vr >= 0 && !d.is_lval && TCCIR_DECODE_VREG_TYPE(vr) == TCCIR_VREG_TYPE_TEMP && + TCCIR_DECODE_VREG_POSITION(vr) == t) + isdef[cfg->instr_to_block[i]] = 1; + } + /* a use in a non-def block ⇒ needs a phi */ + for (int i = 0; i < n && !needs_phi[t]; i++) { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) continue; + int blk = cfg->instr_to_block[i]; + if (isdef[blk]) continue; + int32_t uses[5]; int nu = 0; + if (irop_config[q->op].has_src1) uses[nu++] = irop_get_vreg(tcc_ir_op_get_src1(ir, q)); + if (irop_config[q->op].has_src2) uses[nu++] = irop_get_vreg(tcc_ir_op_get_src2(ir, q)); + if (q->op == TCCIR_OP_MLA) uses[nu++] = irop_get_vreg(tcc_ir_op_get_accum(ir, q)); + if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || q->op == TCCIR_OP_STORE_POSTINC) + uses[nu++] = irop_get_vreg(tcc_ir_op_get_dest(ir, q)); + for (int u = 0; u < nu; u++) + if (uses[u] >= 0 && TCCIR_DECODE_VREG_TYPE(uses[u]) == TCCIR_VREG_TYPE_TEMP && + TCCIR_DECODE_VREG_POSITION(uses[u]) == t) { needs_phi[t] = 1; break; } + } + } + tcc_free(isdef); + } + int any = 0; + for (int t = 0; t < ntmp; t++) { + if 
(needs_phi[t]) { temp_to_var[t] = tcc_ir_vreg_alloc_var(ir); any = 1; } + } + tcc_free(needs_phi); + if (!any) { tcc_free(def_block); tcc_free(temp_to_var); return; } + + /* Rewrite every operand referencing a promoted TEMP to its VAR (type bits only; + * is_local/is_lval/tag are preserved). */ + #define REMAP(getter, setter) \ + do { \ + IROperand o = getter(ir, q); \ + int32_t ovr = irop_get_vreg(o); \ + if (ovr >= 0 && TCCIR_DECODE_VREG_TYPE(ovr) == TCCIR_VREG_TYPE_TEMP) { \ + int op_t = TCCIR_DECODE_VREG_POSITION(ovr); \ + if (op_t >= 0 && op_t < ntmp && temp_to_var[op_t] >= 0) { \ + irop_set_vreg(&o, temp_to_var[op_t]); \ + setter(ir, q, o); \ + } \ + } \ + } while (0) + + for (int i = 0; i < n; i++) { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) continue; + if (irop_config[q->op].has_dest) REMAP(tcc_ir_op_get_dest, tcc_ir_op_set_dest); + if (irop_config[q->op].has_src1) REMAP(tcc_ir_op_get_src1, tcc_ir_op_set_src1); + if (irop_config[q->op].has_src2) REMAP(tcc_ir_op_get_src2, tcc_ir_op_set_src2); + if (q->op == TCCIR_OP_MLA) REMAP(tcc_ir_op_get_accum, tcc_ir_op_set_accum); + } + #undef REMAP + + tcc_free(def_block); + tcc_free(temp_to_var); +} + void tcc_ir_ssa_regalloc(TCCIRState *ir, const RegAllocTarget *target, int spill_base) { if (!ir || !target) return; @@ -3974,6 +4398,9 @@ void tcc_ir_ssa_regalloc(TCCIRState *ir, const RegAllocTarget *target, int spill tcc_ir_cfg_compute_dominators(cfg); tcc_ir_cfg_compute_dom_frontiers(cfg); + ra_promote_multidef_temps_to_vars(ir, cfg); + tcc_ir_dump_after_pass(ir, "ssa_promote"); + /* Construct SSA */ IRSSAState *ssa = tcc_ir_ssa_construct(ir, cfg); int had_promotable = (ssa != NULL); @@ -3986,6 +4413,7 @@ void tcc_ir_ssa_regalloc(TCCIRState *ir, const RegAllocTarget *target, int spill } else { tcc_ir_ssa_rename(ir, ssa); } + tcc_ir_dump_after_pass(ir, "ssa_rename"); dbg_scan_imm_dest(ir, "ssa_rename"); dbg_scan_overlap(ir, "ssa_rename"); /* SSA optimization passes. 
@@ -4002,24 +4430,34 @@ void tcc_ir_ssa_regalloc(TCCIRState *ir, const RegAllocTarget *target, int spill if (had_promotable) { tcc_ir_ssa_opt_run(&ssa_opt_ctx); } else { + /* Run a pass, then make it observable to -dump-ir-passes= + * golden snapshots (same names as the tcc_ir_ssa_opt_run driver). */ +#define RUN_SSA(name, call) \ + do \ + { \ + if (!tcc_ir_opt_pass_disabled(name)) \ + (call); \ + tcc_ir_dump_after_pass(ir, name); \ + } while (0) ssa_opt_ctx.no_stack_fwd = 0; - ssa_opt_var_const_fold(&ssa_opt_ctx); - ssa_opt_var_forward(&ssa_opt_ctx); - ssa_opt_sccp(&ssa_opt_ctx); - ssa_opt_load_cse(&ssa_opt_ctx); - ssa_opt_cprop(&ssa_opt_ctx); - ssa_opt_fold(&ssa_opt_ctx); - ssa_opt_branch(&ssa_opt_ctx); - ssa_opt_reassoc(&ssa_opt_ctx); - ssa_opt_strength(&ssa_opt_ctx); - ssa_opt_narrow(&ssa_opt_ctx); - ssa_opt_gvn(&ssa_opt_ctx); - ssa_opt_phi_simplify(&ssa_opt_ctx); - ssa_opt_dce(&ssa_opt_ctx); + RUN_SSA("ssa:var_const_fold", ssa_opt_var_const_fold(&ssa_opt_ctx)); + RUN_SSA("ssa:var_forward", ssa_opt_var_forward(&ssa_opt_ctx)); + RUN_SSA("ssa:sccp", ssa_opt_sccp(&ssa_opt_ctx)); + RUN_SSA("ssa:load_cse", ssa_opt_load_cse(&ssa_opt_ctx)); + RUN_SSA("ssa:cprop", ssa_opt_cprop(&ssa_opt_ctx)); + RUN_SSA("ssa:fold", ssa_opt_fold(&ssa_opt_ctx)); + RUN_SSA("ssa:branch", ssa_opt_branch(&ssa_opt_ctx)); + RUN_SSA("ssa:reassoc", ssa_opt_reassoc(&ssa_opt_ctx)); + RUN_SSA("ssa:strength", ssa_opt_strength(&ssa_opt_ctx)); + RUN_SSA("ssa:narrow", ssa_opt_narrow(&ssa_opt_ctx)); + RUN_SSA("ssa:gvn", ssa_opt_gvn(&ssa_opt_ctx)); + RUN_SSA("ssa:phi_simplify", ssa_opt_phi_simplify(&ssa_opt_ctx)); + RUN_SSA("ssa:dce", ssa_opt_dce(&ssa_opt_ctx)); /* Target-specific fusions (MLA, LOAD/STORE_INDEXED on ARM). These * don't need promotable vars or phi nodes — they pattern-match on * existing TEMP vregs. 
*/ tcc_ir_ssa_opt_run_target(&ssa_opt_ctx); +#undef RUN_SSA } } else { ssa_opt_cprop(&ssa_opt_ctx); @@ -4214,6 +4652,7 @@ void tcc_ir_ssa_regalloc(TCCIRState *ir, const RegAllocTarget *target, int spill * is cleared. We just need to build the live_regs bitmap from the * intervals the linear scan produced. */ ra_build_live_regs_bitmap(ir); + ra_refine_live_regs_accurate(ir); /* Cleanup */ tcc_free(intervals); @@ -4387,9 +4826,15 @@ int tcc_ir_move_coalescing(TCCIRState *ir) dst_iv->r0 = src_reg; for (int k = (int)dst_iv->start; k <= (int)dst_iv->end && k < tbl_size; ++k) { - ls->live_regs_by_instruction[k] &= ~(1u << old_reg); + /* old_reg's bit may be shared with another interval that coalesced + * onto it earlier (in-place two-address ops overlap on purpose) — + * only clear positions where no other claimant is still live. */ + if (!tcc_ls_reg_held_by_other(ls, old_reg, k, dst_iv)) + ls->live_regs_by_instruction[k] &= ~(1u << old_reg); ls->live_regs_by_instruction[k] |= (1u << src_reg); } + RA_DBG("move_coalesce fwd @%d: T%d R%d->R%d [%u,%u]", i, + (int)(dv & 0xffffff), old_reg, src_reg, dst_iv->start, dst_iv->end); coalesced++; continue; } @@ -4445,6 +4890,42 @@ try_reverse:; } if (conflict) goto rev_check_done; + /* Symmetric guard (dest side): after this copy src and dest share + * dest_reg holding the same value. If dest is given a NEW, independent + * value while src is still live, that write clobbers dest_reg and src's + * remaining uses read the wrong value. The loop-carried phi copy this + * pass targets has src dying at the copy (src_iv->end == i), so the range + * below is empty and legitimate coalescing is unaffected; the guard only + * fires when src OUTLIVES the copy and dest is re-defined underneath it + * (bitfield 40979: `u4 = u3` copy, then `u4 = const` clobbers the shared + * register while `u3` is still read). A redefinition at exactly src's + * last use that also reads src is the two-address read-before-write case + * and stays safe. 
*/ + for (int k = i + 1; k <= (int)src_iv->end && k < n; ++k) + { + IRQuadCompact *qk = &ir->compact_instructions[k]; + if (qk->op == TCCIR_OP_NOP) continue; + if (!irop_config[qk->op].has_dest) continue; + IROperand dk = tcc_ir_op_get_dest(ir, qk); + int is_mem_store = (qk->op == TCCIR_OP_STORE || qk->op == TCCIR_OP_STORE_INDEXED || + qk->op == TCCIR_OP_STORE_POSTINC) && dk.is_lval; + if (is_mem_store) continue; + if (irop_get_vreg(dk) != dv) continue; + if (k == (int)src_iv->end) { + int reads_src = 0; + if (irop_config[qk->op].has_src1 && + irop_get_vreg(tcc_ir_op_get_src1(ir, qk)) == sv) reads_src = 1; + if (!reads_src && irop_config[qk->op].has_src2 && + irop_get_vreg(tcc_ir_op_get_src2(ir, qk)) == sv) reads_src = 1; + if (!reads_src && qk->op == TCCIR_OP_MLA && + irop_get_vreg(tcc_ir_op_get_accum(ir, qk)) == sv) reads_src = 1; + if (reads_src) continue; + } + conflict = 1; + break; + } + if (conflict) goto rev_check_done; + /* Check dest not used between src's def and the ASSIGN. * src's def overwrites dest_reg; any intervening use of dest * would read the wrong value. */ @@ -4489,11 +4970,16 @@ try_reverse:; rev_check_done: if (conflict) continue; - /* Check dest_reg not occupied by other intervals during src's range */ + /* Check dest_reg not occupied by other intervals during src's range. + * Identity-based: earlier coalesces may have moved a third interval onto + * dest_reg inside dst_iv's range, so "position within dst_iv's range" is + * not proof the claim is dst_iv's own. 
*/ for (int k = (int)src_iv->start; k <= (int)src_iv->end && k < tbl_size; ++k) { if (ls->live_regs_by_instruction[k] & (1u << dest_reg)) { + if (tcc_ls_reg_held_by_other(ls, dest_reg, k, dst_iv)) + { conflict = 1; break; } /* dest_reg is live here — only OK if it's from dest_iv itself */ if (k < (int)dst_iv->start || k > (int)dst_iv->end) { conflict = 1; break; } @@ -4505,9 +4991,16 @@ try_reverse:; src_iv->r0 = dest_reg; for (int k = (int)src_iv->start; k <= (int)src_iv->end && k < tbl_size; ++k) { - ls->live_regs_by_instruction[k] &= ~(1u << old_reg); + /* old_reg's bit may be shared with another interval that coalesced + * onto it earlier — only clear positions with no other live claimant + * (volatile 36818: T175 leaving R5 wiped T212's in-place-XOR claim, + * and the phase-3 scratch fixup then put the outer loop counter there). */ + if (!tcc_ls_reg_held_by_other(ls, old_reg, k, src_iv)) + ls->live_regs_by_instruction[k] &= ~(1u << old_reg); ls->live_regs_by_instruction[k] |= (1u << dest_reg); } + RA_DBG("move_coalesce rev @%d: T%d R%d->R%d [%u,%u]", i, + (int)(sv & 0xffffff), old_reg, dest_reg, src_iv->start, src_iv->end); /* Record this src vreg as reverse-coalesced */ rev_done = tcc_realloc(rev_done, sizeof(uint32_t) * (rev_done_size + 1)); rev_done[rev_done_size++] = (uint32_t)sv; diff --git a/ir/ssa.c b/ir/ssa.c index 264ea011..d633016d 100644 --- a/ir/ssa.c +++ b/ir/ssa.c @@ -149,21 +149,49 @@ static void ssa_var_info_free(SSAVarInfo *info) tcc_free(info->var_btype); } -static uint8_t *ssa_build_promotable(const SSAVarInfo *info, int nb, int *out_count) +/* Decide whether a local VAR should be promoted to SSA (and get phi nodes). + * + * Single-block CFG: no back-edges, so any non-addrtaken VAR is safely + * promotable to a TEMP via straight-line renaming — no phi placement needed. + * Enabling this lets GVN / cprop / DCE see local-variable defs in leaf + * functions. 
+ * + * Multi-block CFG: a VAR defined in >=2 blocks (multi_block_def) needs phis and + * is promoted. A VAR defined in only ONE block ALSO needs a phi when that def + * does not dominate all later uses — i.e. its def-block has a non-empty + * dominance frontier. The classic case is a value defined only inside a loop + * and read again on the next iteration through the back-edge (the loop header + * is in the def-block's DF): without a phi it stays an unpromoted VAR with no + * loop-header definition, and the register allocator can hand it a register + * that is clobbered around the loop body (gcc-torture pr125291). A value + * defined on one arm of a branch and read after the merge is the same shape. + * Promoting it is always safe: the phi resolver drops undef (vreg<0) operands, + * so a path that leaves the var genuinely uninitialized is unchanged. */ +static int ssa_var_promotable(const SSAVarInfo *info, IRCFG *cfg, int nb, int v, + int single_block) +{ + if (bitset_test(info->addrtaken, v)) + return 0; + if (single_block || bitset_test(info->multi_block_def, v)) + return 1; + /* Single-block-def: promote iff a phi would actually be placed, i.e. some + * def-block has a non-empty dominance frontier. */ + const uint8_t *def_bits = &info->def_blocks[v * info->block_bitset_bytes]; + for (int b = 0; b < nb; b++) { + if (bitset_test(def_bits, b) && cfg->blocks[b].num_df > 0) + return 1; + } + return 0; +} + +static uint8_t *ssa_build_promotable(const SSAVarInfo *info, IRCFG *cfg, int nb, + int *out_count) { int num_vars = info->num_vars; - /* Single-block CFG: no back-edges, so any non-addrtaken VAR is safely - * promotable to a TEMP via straight-line renaming — no phi placement - * needed. Enabling this lets GVN / cprop / DCE see local-variable defs - * in leaf functions. Multi-block CFGs must keep the multi_block_def - * criterion: a VAR defined in only one block but used across a back-edge - * still needs a phi at the loop header. 
*/ int single_block = (nb <= 1); int count = 0; for (int v = 0; v < num_vars; v++) { - if (bitset_test(info->addrtaken, v)) - continue; - if (single_block || bitset_test(info->multi_block_def, v)) + if (ssa_var_promotable(info, cfg, nb, v, single_block)) count++; } *out_count = count; @@ -172,9 +200,7 @@ static uint8_t *ssa_build_promotable(const SSAVarInfo *info, int nb, int *out_co uint8_t *is_promotable = tcc_mallocz((num_vars + 7) / 8); for (int v = 0; v < num_vars; v++) { - if (bitset_test(info->addrtaken, v)) - continue; - if (single_block || bitset_test(info->multi_block_def, v)) + if (ssa_var_promotable(info, cfg, nb, v, single_block)) bitset_set(is_promotable, v); } return is_promotable; @@ -255,7 +281,7 @@ IRSSAState *tcc_ir_ssa_construct(TCCIRState *ir, IRCFG *cfg) ssa_scan_var_defs(ir, cfg, &info); int promotable_count; - uint8_t *is_promotable = ssa_build_promotable(&info, nb, &promotable_count); + uint8_t *is_promotable = ssa_build_promotable(&info, cfg, nb, &promotable_count); if (!is_promotable) { ssa_var_info_free(&info); return NULL; @@ -274,7 +300,11 @@ IRSSAState *tcc_ir_ssa_construct(TCCIRState *ir, IRCFG *cfg) int phi_counter = 0; for (int v = 0; v < num_vars; v++) { - if (!bitset_test(info.multi_block_def, v) || bitset_test(info.addrtaken, v)) + /* Place phis for every promoted var (is_promotable already excludes + * addrtaken). For single-block-def vars this now also covers the ones kept + * as VARs before — loop-carried / branch-merge-live values that need a phi. + * In a single-block CFG the def-block has an empty DF, so this places none. 
*/ + if (!bitset_test(is_promotable, v)) continue; uint8_t *def_bits = &info.def_blocks[v * bitset_bytes]; phi_counter = ssa_place_phis_for_var(ssa, ir, cfg, v, info.var_btype[v], def_bits, diff --git a/ir/stack.c b/ir/stack.c index ca96d102..482ebfc3 100644 --- a/ir/stack.c +++ b/ir/stack.c @@ -376,7 +376,7 @@ void tcc_ir_stack_reg_assign(TCCIRState *ir, int vreg, int offset, int r0, int r void tcc_ir_stack_reg_get(TCCIRState *ir, int vreg, int *r0, int *r1) { - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, vreg); + IRLiveInterval *interval = tcc_ir_try_get_live_interval(ir, vreg); if (!interval) { if (r0) diff --git a/lib/builtin.c b/lib/builtin.c index 1d4fff53..f48f4b5f 100644 --- a/lib/builtin.c +++ b/lib/builtin.c @@ -12,6 +12,16 @@ unsigned long __tcc_strlen(const char *s); char *__tcc_strcpy(char *d, const char *s); #endif +#if !defined(__arm__) +/* Host-fallback string helpers use word-at-a-time null-byte detection. + * The magic constants must match sizeof(unsigned long), otherwise on a 64-bit + * host only the low 32 bits of each word are checked and the scan overruns + * the string terminator. 
*/ +#define __TCC_WORD_ONES ((unsigned long)-1 / 0xFF) +#define __TCC_WORD_HIGHS (__TCC_WORD_ONES << 7) +#define __TCC_HAS_NULL_BYTE(w) (((w) - __TCC_WORD_ONES) & ~(w) & __TCC_WORD_HIGHS) +#endif + /* ---------------------------------------------- */ /* This file implements: * __builtin_ffs @@ -629,11 +639,11 @@ int __tcc_strcmp(const char *s1, const char *s2) a = w1[0]; b = w2[0]; /* Single branch: words differ OR null byte present */ - if (a != b || ((a - 0x01010101UL) & ~a & 0x80808080UL)) + if (a != b || __TCC_HAS_NULL_BYTE(a)) break; a = w1[1]; b = w2[1]; - if (a != b || ((a - 0x01010101UL) & ~a & 0x80808080UL)) + if (a != b || __TCC_HAS_NULL_BYTE(a)) { w1++; w2++; @@ -683,7 +693,7 @@ unsigned long __tcc_strlen(const char *s) for (;;) { w = *wp; - if ((w - 0x01010101UL) & ~w & 0x80808080UL) + if (__TCC_HAS_NULL_BYTE(w)) break; wp++; } @@ -863,11 +873,11 @@ char *__tcc_strcpy(char *d, const char *s) for (;;) { w0 = ws[0]; - if ((w0 - 0x01010101UL) & ~w0 & 0x80808080UL) + if (__TCC_HAS_NULL_BYTE(w0)) break; w1 = ws[1]; wd[0] = w0; - if ((w1 - 0x01010101UL) & ~w1 & 0x80808080UL) + if (__TCC_HAS_NULL_BYTE(w1)) { wd++; ws++; diff --git a/metrics/gate.py b/metrics/gate.py new file mode 100644 index 00000000..83128966 --- /dev/null +++ b/metrics/gate.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +"""Track-first-then-block regression gate over metrics.db. + +Compares a recorded run against its parent commit's run (same host) and +reports: + - correctness regressions: a divergent seed that appears now but did not + appear for the parent AND is not in the `accepted_divergence` allowlist. + - codesize regressions: the `` tcc/gcc instruction ratio grew by more + than --codesize-tolerance-pct. + +compile_time and perf are reported for visibility only -- they are noisy by +nature (hardware, scheduling) and are not part of the automated pass/fail +signal; judge them by eye on the dashboard during a migration step (see +docs/metrics_dashboard.md). 
+ +Modes: + default -- print the report, always exit 0 (safe to run before the + baseline is green; this is the "track" half of track-first). + --strict -- exit 1 if any correctness or codesize regression survives the + allowlist (the "block" half; flip on via METRICS_GATE_ENABLED + in .github/workflows/metrics.yml once the baseline is clean). + +Managing the allowlist (no raw SQL needed for the common case): + python3 metrics/gate.py --db metrics.db \ + --accept ptr:olevels:12345 --reason "pre-existing, see docs/bugs.md" +""" + +import argparse +import sqlite3 +import sys +import time + + +def warn(msg: str) -> None: + print(f"[gate] WARN: {msg}", file=sys.stderr, flush=True) + + +def resolve_run(conn: sqlite3.Connection, rev: str, host: str) -> sqlite3.Row: + """rev may be a full/short sha or 'HEAD'-resolved sha the caller already + turned into a real sha; we match by prefix so either works.""" + row = conn.execute( + "SELECT * FROM runs WHERE host=? AND commit_sha LIKE ? ORDER BY run_ts DESC LIMIT 1", + (host, rev + "%")).fetchone() + if row is None: + sys.exit(f"[gate] no recorded run for rev={rev!r} host={host!r} -- " + f"run metrics/record.py first") + return row + + +def accepted_seeds(conn, profile, oracle) -> set: + return {r[0] for r in conn.execute( + "SELECT seed FROM accepted_divergence WHERE profile=? AND oracle=? AND seed IS NOT NULL", + (profile, oracle))} + + +def accepted_baseline(conn, profile, oracle): + row = conn.execute( + "SELECT baseline FROM accepted_divergence WHERE profile=? AND oracle=? 
AND seed IS NULL", + (profile, oracle)).fetchone() + return row[0] if row else None + + +def check_correctness(conn, run_id, parent_id) -> list: + """Return a list of (profile, oracle, new_seeds) regressions.""" + regressions = [] + for profile, oracle, count in conn.execute( + "SELECT profile, oracle, divergent_count FROM correctness WHERE run_id=?", + (run_id,)): + cur_seeds = {r[0] for r in conn.execute( + "SELECT seed FROM correctness_seed WHERE run_id=? AND profile=? AND oracle=?", + (run_id, profile, oracle))} + if parent_id is not None: + parent_seeds = {r[0] for r in conn.execute( + "SELECT seed FROM correctness_seed WHERE run_id=? AND profile=? AND oracle=?", + (parent_id, profile, oracle))} + else: + warn(f"{profile}/{oracle}: no parent run recorded -- can't diff, " + f"treating all {count} seed(s) as pre-existing this time") + parent_seeds = cur_seeds # first-ever run: nothing "new" + + baseline = accepted_baseline(conn, profile, oracle) + allow = accepted_seeds(conn, profile, oracle) | parent_seeds + new_seeds = cur_seeds - allow + if new_seeds and baseline is not None and len(cur_seeds) <= baseline: + new_seeds = set() # covered by a count-based allowlist entry + if new_seeds: + regressions.append((profile, oracle, sorted(new_seeds))) + return regressions + + +def check_codesize(conn, run_id, parent_id, tolerance_pct: float): + """Return (cur_ratio, parent_ratio, pct_delta) if the total ratio grew by + more than tolerance_pct, else None.""" + cur = conn.execute( + "SELECT ratio FROM codesize_rollup WHERE run_id=? AND suite=''", + (run_id,)).fetchone() + if not cur or parent_id is None: + return None + parent = conn.execute( + "SELECT ratio FROM codesize_rollup WHERE run_id=? 
AND suite=''", + (parent_id,)).fetchone() + if not parent or parent[0] <= 0: + return None + pct = (cur[0] - parent[0]) / parent[0] * 100.0 + if pct > tolerance_pct: + return cur[0], parent[0], pct + return None + + +def print_visibility(conn, run_id, parent_id) -> None: + """compile_time / perf: informational only, never gates.""" + ct = conn.execute( + "SELECT seconds FROM compile_time WHERE run_id=? AND scope='codesize_corpus_o2'", + (run_id,)).fetchone() + if ct and parent_id is not None: + pct_row = conn.execute( + "SELECT seconds FROM compile_time WHERE run_id=? AND scope='codesize_corpus_o2'", + (parent_id,)).fetchone() + if pct_row and pct_row[0] > 0: + pct = (ct[0] - pct_row[0]) / pct_row[0] * 100.0 + print(f"[gate] compile time: {ct[0]:.1f}s ({pct:+.1f}% vs parent) -- informational") + for row in conn.execute( + "SELECT benchmark, compiler, opt_level, cycles_per_iter FROM perf WHERE run_id=?", + (run_id,)): + print(f"[gate] perf {row[0]} {row[1]}/{row[2]}: {row[3]:.0f} cycles/iter -- informational") + + +def do_accept(conn, spec: str, reason: str) -> None: + parts = spec.split(":") + if len(parts) != 3: + sys.exit("--accept expects PROFILE:ORACLE:SEED") + profile, oracle, seed = parts + conn.execute( + "INSERT OR REPLACE INTO accepted_divergence(profile,oracle,seed,baseline,reason,added_by,added_ts) " + "VALUES(?,?,?,NULL,?,?,?)", + (profile, oracle, int(seed), reason or "unspecified", "metrics_gate.py", int(time.time()))) + conn.commit() + print(f"[gate] accepted {profile}/{oracle} seed {seed}: {reason}") + + +def main(argv=None) -> int: + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--db", required=True) + p.add_argument("--rev", default="HEAD") + p.add_argument("--host") + p.add_argument("--strict", action="store_true", + help="exit 1 on any unaccepted regression (the 'block' switch)") + p.add_argument("--codesize-tolerance-pct", type=float, default=1.0) + 
p.add_argument("--accept", metavar="PROFILE:ORACLE:SEED", + help="add an allowlist entry and exit (no gate check)") + p.add_argument("--reason", default="", help="reason text for --accept") + args = p.parse_args(argv) + + conn = sqlite3.connect(args.db) + if args.accept: + do_accept(conn, args.accept, args.reason) + return 0 + + import socket + import subprocess + host = args.host or socket.gethostname() + rev = args.rev + if rev == "HEAD" or len(rev) < 40: + try: + rev = subprocess.run(["git", "rev-parse", rev], capture_output=True, + text=True, check=True).stdout.strip() + except subprocess.CalledProcessError: + pass # fall through to prefix match against whatever was passed + + conn.row_factory = sqlite3.Row + run = resolve_run(conn, rev, host) + parent = None + if run["parent_sha"]: + parent = conn.execute( + "SELECT run_id FROM runs WHERE commit_sha=? AND host=?", + (run["parent_sha"], host)).fetchone() + parent_id = parent[0] if parent else None + if parent_id is None: + warn(f"no recorded run for parent {(run['parent_sha'] or '?')[:12]} -- " + f"limited comparison this time") + + correctness_regressions = check_correctness(conn, run["run_id"], parent_id) + codesize_regression = check_codesize(conn, run["run_id"], parent_id, args.codesize_tolerance_pct) + print_visibility(conn, run["run_id"], parent_id) + + ok = True + if correctness_regressions: + ok = False + print(f"[gate] CORRECTNESS REGRESSION on {run['commit_sha'][:12]}:") + for profile, oracle, seeds in correctness_regressions: + print(f" {profile}/{oracle}: new divergent seed(s) {seeds}") + if codesize_regression: + ok = False + cur, par, pct = codesize_regression + print(f"[gate] CODESIZE REGRESSION: ratio {par:.3f} -> {cur:.3f} ({pct:+.1f}%, " + f"tolerance {args.codesize_tolerance_pct}%)") + if ok: + print(f"[gate] {run['commit_sha'][:12]}: no regressions vs parent") + + if args.strict and not ok: + return 1 + if not ok: + print("[gate] (non-strict mode: not failing the build -- track-first 
policy)") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/metrics/grafana/dashboards/optimizer_regressions.json b/metrics/grafana/dashboards/optimizer_regressions.json new file mode 100644 index 00000000..a601ab22 --- /dev/null +++ b/metrics/grafana/dashboards/optimizer_regressions.json @@ -0,0 +1,111 @@ +{ + "id": null, + "uid": "tcc-optimizer-regressions", + "title": "TinyCC Optimizer Regressions", + "tags": ["tinycc", "optimizer"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "editable": true, + "time": { "from": "now-2y", "to": "now" }, + "refresh": "", + "templating": { "list": [] }, + "annotations": { "list": [] }, + "panels": [ + { + "id": 1, + "title": "Divergent seeds per profile (olevels)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "spanNulls": true } }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "format": "time_series", + "queryText": "SELECT r.commit_ts * 1000 AS time, c.profile AS metric, c.divergent_count AS value FROM correctness c JOIN runs r USING(run_id) WHERE c.oracle = 'olevels' ORDER BY time" + } + ] + }, + { + "id": 2, + "title": "Total divergence (all profiles, all oracles)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "bars", "spanNulls": false } }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "format": "time_series", + "queryText": "SELECT r.commit_ts * 1000 AS time, SUM(c.divergent_count) AS value FROM correctness c JOIN runs r USING(run_id) GROUP BY 
r.run_id ORDER BY time" + } + ] + }, + { + "id": 3, + "title": "Code size vs GCC (tcc_O2 / gcc_O2 ratio, )", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "spanNulls": true }, "unit": "none" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "format": "time_series", + "queryText": "SELECT r.commit_ts * 1000 AS time, cr.suite AS metric, cr.ratio AS value FROM codesize_rollup cr JOIN runs r USING(run_id) WHERE cr.suite = '' ORDER BY time" + } + ] + }, + { + "id": 4, + "title": "Compile time (code-size corpus, -O2)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "spanNulls": true }, "unit": "s" }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "format": "time_series", + "queryText": "SELECT r.commit_ts * 1000 AS time, ct.seconds AS value FROM compile_time ct JOIN runs r USING(run_id) WHERE ct.scope = 'codesize_corpus_o2' ORDER BY time" + } + ] + }, + { + "id": 5, + "title": "RP2350 cycles/iter (-O2, TCC vs GCC)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "fieldConfig": { "defaults": { "custom": { "drawStyle": "line", "spanNulls": true } }, "overrides": [] }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "format": "time_series", + "queryText": "SELECT r.commit_ts * 1000 AS time, p.benchmark || ' (' || p.compiler || ')' AS metric, p.cycles_per_iter 
AS value FROM perf p JOIN runs r USING(run_id) WHERE p.opt_level = 'o2' ORDER BY time" + } + ] + }, + { + "id": 6, + "title": "Regressed since parent commit", + "type": "table", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "targets": [ + { + "refId": "A", + "datasource": { "type": "frser-sqlite-datasource", "uid": "tcc-metrics-sqlite" }, + "format": "table", + "queryText": "SELECT r.commit_sha AS commit, r.subject AS subject, c.profile, c.oracle, c.divergent_count AS now, p.divergent_count AS parent, (c.divergent_count - p.divergent_count) AS delta FROM correctness c JOIN runs r USING(run_id) JOIN runs rp ON rp.commit_sha = r.parent_sha AND rp.host = r.host JOIN correctness p ON p.run_id = rp.run_id AND p.profile = c.profile AND p.oracle = c.oracle WHERE (c.divergent_count - p.divergent_count) > 0 ORDER BY delta DESC" + } + ] + } + ] +} diff --git a/metrics/grafana/docker-compose.yml b/metrics/grafana/docker-compose.yml new file mode 100644 index 00000000..17b6ecc2 --- /dev/null +++ b/metrics/grafana/docker-compose.yml @@ -0,0 +1,17 @@ +services: + grafana: + image: docker.io/grafana/grafana:latest + restart: unless-stopped + environment: + GF_INSTALL_PLUGINS: frser-sqlite-datasource + GF_AUTH_ANONYMOUS_ENABLED: "false" + volumes: + - /var/lib/tcc-metrics/metrics.db:/data/metrics.db:ro,Z + - ./provisioning:/etc/grafana/provisioning:Z + - ./dashboards:/etc/grafana/dashboards:Z + - grafana-data:/var/lib/grafana + ports: + - "3000:3000" + +volumes: + grafana-data: diff --git a/metrics/grafana/provisioning/dashboards/dashboards.yml b/metrics/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000..e398605f --- /dev/null +++ b/metrics/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: tcc-metrics + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + 
allowUiUpdates: true + options: + path: /etc/grafana/dashboards diff --git a/metrics/grafana/provisioning/datasources/sqlite.yml b/metrics/grafana/provisioning/datasources/sqlite.yml new file mode 100644 index 00000000..62b8ff2e --- /dev/null +++ b/metrics/grafana/provisioning/datasources/sqlite.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: TCC Metrics + type: frser-sqlite-datasource + uid: tcc-metrics-sqlite + isDefault: true + jsonData: + path: /data/metrics.db diff --git a/metrics/grafana/tcc-metrics-grafana.service b/metrics/grafana/tcc-metrics-grafana.service new file mode 100644 index 00000000..c807bf7e --- /dev/null +++ b/metrics/grafana/tcc-metrics-grafana.service @@ -0,0 +1,17 @@ +[Unit] +Description=TinyCC metrics Grafana dashboard (podman compose) +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +RemainAfterExit=yes +# Point this at a persistent clone of the repo on the Pi -- not the ephemeral +# actions/checkout workspace the CI job uses. Edit before installing. +WorkingDirectory=/opt/tcc-metrics/tinycc/metrics/grafana +ExecStart=/usr/bin/podman-compose up -d +ExecStop=/usr/bin/podman-compose down +TimeoutStartSec=0 + +[Install] +WantedBy=multi-user.target diff --git a/metrics/record.py b/metrics/record.py new file mode 100644 index 00000000..51d6b905 --- /dev/null +++ b/metrics/record.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +"""Record per-revision optimizer-regression metrics into the SQLite store. + +For one git revision this collects four metric families and upserts them into +metrics.db (schema: metrics/schema.sql), keyed by (commit_sha, host): + + 1. correctness -- O1/O2 divergence per fuzz profile (reuses tests/fuzz/sweep_all.py) + 2. code size -- instructions/function vs GCC (reuses scripts/regression_disasm.py) + 3. compile time -- wall time of the code-size corpus compile (coarse, deterministic) + 4. 
perf -- RP2350 hardware cycles (reuses tests/benchmarks/run_benchmark.py) + +Correctness + perf are measured against the tcc binary built IN PLACE at the repo +root (tests/fuzz/batch_sweep.py hardcodes armv8m-tcc and cannot be redirected), so +--rev must match the checked-out tree for those. Code size + compile time CAN be +measured against any revision via --backfill (build_tcc_at_rev + TCC_OVERRIDE). + +Idempotent: re-recording the same commit replaces its rows (no duplicates). + +Examples +-------- + # record HEAD (built in place), fast prescan band, no hardware perf + python3 metrics/record.py --db /var/lib/tcc-metrics/metrics.db \ + --rev HEAD --seed-lo 0 --seed-hi 2000 --mode prescan + + # nightly: full-recall triage band + per-function detail + RP2350 perf + python3 metrics/record.py --db "$METRICS_DB" --rev HEAD \ + --seed-lo 0 --seed-hi 20000 --mode triage --codesize-detail \ + --perf-host 127.0.0.1 --perf-identity ~/.ssh/id_rp + + # seed the code-size / compile-time graphs from history (slow, run once) + python3 metrics/record.py --db /var/lib/tcc-metrics/metrics.db --backfill 100 +""" + +import argparse +import csv +import io +import os +import shutil +import socket +import sqlite3 +import subprocess +import sys +import time +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent +SCHEMA_SQL = SCRIPT_DIR / "schema.sql" + +# Make the reused modules importable. 
+sys.path.insert(0, str(REPO_ROOT / "tests" / "fuzz"))
+sys.path.insert(0, str(REPO_ROOT / "scripts"))
+sys.path.insert(0, str(REPO_ROOT / "tests" / "benchmarks"))
+
+
+def warn(msg: str) -> None:
+    """Print a non-fatal warning to stderr."""
+    print(f"[metrics] WARN: {msg}", file=sys.stderr, flush=True)
+
+
+def info(msg: str) -> None:
+    """Print a progress message to stderr."""
+    print(f"[metrics] {msg}", file=sys.stderr, flush=True)
+
+
+def die(msg: str) -> None:
+    """Print a fatal error to stderr and exit with status 1 (raises SystemExit)."""
+    print(f"[metrics] FATAL: {msg}", file=sys.stderr, flush=True)
+    sys.exit(1)
+
+
+# --------------------------------------------------------------------------- git
+
+def _run_git(args: list[str]) -> str:
+    """Run a git command in REPO_ROOT, surfacing stderr on failure.
+
+    subprocess.CalledProcessError's default str() only includes the exit
+    code, not stderr -- that swallowed the actual git error the last time
+    this failed in CI (dubious-ownership in a container job), leaving just
+    an unhelpful "returned non-zero exit status 128" traceback.
+
+    Failures are therefore reported through die(), i.e. as SystemExit, and
+    never as CalledProcessError; callers that want to survive a git failure
+    must catch SystemExit.
+    """
+    proc = subprocess.run(
+        ["git", "-C", str(REPO_ROOT), *args], capture_output=True, text=True)
+    if proc.returncode != 0:
+        die(f"git {' '.join(args)} failed (exit {proc.returncode}): "
+            f"{proc.stderr.strip()}")
+    return proc.stdout
+
+
+def git_meta(rev: str) -> dict:
+    """Resolve `rev` to full commit metadata via one `git show -s`.
+
+    Returns a dict with commit_sha, parent_sha (first parent, or None for a
+    root commit), author, author_email, commit_ts (committer unix time, 0 if
+    git printed none) and subject.
+    """
+    fmt = "%H%n%P%n%an%n%ae%n%ct%n%s"
+    out = _run_git(["show", "-s", f"--format={fmt}", rev]).splitlines()
+    # Pad to six fields so an empty subject (or any short output) still unpacks.
+    sha, parents, author, email, cts, subject = (out + [""] * 6)[:6]
+    return {
+        "commit_sha": sha,
+        "parent_sha": (parents.split() or [None])[0],
+        "author": author,
+        "author_email": email,
+        "commit_ts": int(cts) if cts else 0,
+        "subject": subject,
+    }
+
+
+def rev_list(n: int, branch: str = "mob") -> list[str]:
+    """First-parent commit shas of `branch`, newest first, capped at n.
+
+    `branch` defaults to "mob" (the previous hard-coded value) so existing
+    callers are unaffected; do_backfill passes the --branch argument through.
+    """
+    return _run_git(
+        ["rev-list", "--first-parent", f"--max-count={n}", branch]).split()
+
+
+# ------------------------------------------------------------------------- db
+
+def connect(db_path: str) -> sqlite3.Connection:
+    """Open (creating if needed) the metrics db and apply schema.sql to it."""
+    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
+    conn = sqlite3.connect(db_path, timeout=60)
+    conn.execute("PRAGMA foreign_keys = ON")
+    conn.executescript(SCHEMA_SQL.read_text())  # self-initializing / idempotent
+    return conn
+
+
+def upsert_run(conn: sqlite3.Connection, meta: dict, host: str, branch: str,
+               trigger: str, seed_lo, seed_hi, mode, wall_seconds=None,
+               tcc_build_ok=1) -> int:
+    """Insert-or-replace the run row; return its run_id and wipe its child rows
+    so the caller can re-insert fresh metrics (idempotent replace)."""
+    conn.execute(
+        """INSERT INTO runs(commit_sha, parent_sha, branch, author, author_email,
+                            subject, commit_ts, run_ts, host, tcc_build_ok,
+                            wall_seconds, seed_lo, seed_hi, mode, trigger)
+           VALUES(:commit_sha,:parent_sha,:branch,:author,:author_email,:subject,
+                  :commit_ts,:run_ts,:host,:tcc_build_ok,:wall_seconds,
+                  :seed_lo,:seed_hi,:mode,:trigger)
+           ON CONFLICT(commit_sha, host) DO UPDATE SET
+               run_ts=excluded.run_ts, parent_sha=excluded.parent_sha,
+               subject=excluded.subject, commit_ts=excluded.commit_ts,
+               tcc_build_ok=excluded.tcc_build_ok, wall_seconds=excluded.wall_seconds,
+               seed_lo=excluded.seed_lo, seed_hi=excluded.seed_hi,
+               mode=excluded.mode, trigger=excluded.trigger""",
+        {**meta, "branch": branch, "run_ts": int(time.time()), "host": host,
+         "tcc_build_ok": tcc_build_ok, "wall_seconds": wall_seconds,
+         "seed_lo": seed_lo, "seed_hi": seed_hi, "mode": mode, "trigger": trigger})
+    run_id = conn.execute(
+        "SELECT run_id FROM runs WHERE commit_sha=? AND host=?",
+        (meta["commit_sha"], host)).fetchone()[0]
+    # Table names come from this fixed tuple, never from input, so the f-string
+    # interpolation below is safe.
+    for tbl in ("correctness", "correctness_seed", "codesize_rollup",
+                "codesize_func", "compile_time", "perf"):
+        conn.execute(f"DELETE FROM {tbl} WHERE run_id=?", (run_id,))
+    return run_id
+
+
+# ---------------------------------------------------------------- correctness
+
+def record_correctness(conn, run_id, lo, hi, mode, jobs) -> None:
+    """Sweep every profile and store per-(profile,oracle) divergence counts.
+    Mirrors sweep_all.run_profile's oracle selection so the numbers match a
+    manual `sweep_all.py` run exactly.
+
+    A sweep that reports an error is treated as "not measured": its row is
+    skipped so the graph shows a gap rather than a false 0. This applies to
+    the olevels sweep and to the separate triage vs-gcc pass alike.
+    """
+    import sweep_all as SW
+
+    def emit(line: str) -> None:
+        print(" " + line, file=sys.stderr, flush=True)
+
+    for name, oracle, _blurb in SW.PROFILES:
+        wants_vsgcc = oracle in ("vsgcc", "both")
+        merge_gcc = (mode != "triage" and wants_vsgcc)
+        ol, vg, gccbad, err = [], [], [], ""
+        vsgcc_measured = wants_vsgcc
+        if merge_gcc:
+            ol, vg, gccbad, err = SW.run_olevels_prescan_with_gcc(name, lo, hi, jobs, emit)
+        else:
+            if mode == "triage":
+                ol, err = SW.run_olevels_triage_sweep(name, lo, hi, jobs, emit)
+            else:
+                ol, err = SW.run_olevels_prescan(name, lo, hi, jobs, emit)
+            if not err and wants_vsgcc:  # triage vs-gcc pass
+                vg, verr = SW.run_vsgcc(name, lo, hi, jobs, emit)
+                if verr:
+                    # Same policy as the olevels error below: an errored
+                    # vs-gcc pass must not be stored as "0 divergent seeds".
+                    warn(f"{name} vs-gcc: {verr} -- skipping vsgcc row")
+                    vsgcc_measured = False
+
+        if err:
+            # A sweep error (e.g. QEMU/newlib not prepared) means "not measured":
+            # skip the row so the graph shows a gap rather than a false 0.
+            warn(f"{name} olevels: {err} -- skipping row")
+            continue
+
+        low = 1 if (mode != "triage" and name in SW.LOW_RECALL_ON_PRESCAN) else 0
+        conn.execute(
+            """INSERT INTO correctness(run_id,profile,oracle,divergent_count,
+                                       gccbad_count,seed_lo,seed_hi,mode,low_recall)
+               VALUES(?,?,'olevels',?,0,?,?,?,?)""",
+            (run_id, name, len(ol), lo, hi, mode, low))
+        conn.executemany(
+            "INSERT OR IGNORE INTO correctness_seed VALUES(?,?,'olevels',?)",
+            [(run_id, name, s) for s in ol])
+        if vsgcc_measured:
+            conn.execute(
+                """INSERT INTO correctness(run_id,profile,oracle,divergent_count,
+                                           gccbad_count,seed_lo,seed_hi,mode,low_recall)
+                   VALUES(?,?,'vsgcc',?,?,?,?,?,?)""",
+                (run_id, name, len(vg), len(gccbad), lo, hi, mode, low))
+            conn.executemany(
+                "INSERT OR IGNORE INTO correctness_seed VALUES(?,?,'vsgcc',?)",
+                [(run_id, name, s) for s in vg])
+        info(f"{name}: olevels={len(ol)} vsgcc={len(vg)} gccbad={len(gccbad)}")
+
+
+# ------------------------------------------------------------------- code size
+
+def _parse_codesize_csv(csv_text: str):
+    """Yield (suite, test, function, tcc_n, gcc_n) from run_csv_mode output.
+
+    Rows are parsed positionally (columns 0-4; at least six columns are
+    required), so the name of the gcc column in the header does not matter.
+    The header row and rows whose counts are not integers are skipped.
+    """
+    for row in csv.reader(io.StringIO(csv_text)):
+        if len(row) < 6 or row[0] == "suite":
+            continue
+        try:
+            yield row[0], row[1], row[2], int(row[3]), int(row[4])
+        except ValueError:
+            continue
+
+
+def record_codesize(conn, run_id, jobs, detail: bool, tcc_override=None) -> float:
+    """Record code size (rollup + optional per-function detail) and return the
+    corpus compile wall-time (the coarse compile-time proxy)."""
+    from regression_disasm import run_csv_mode
+    t0 = time.monotonic()
+    csv_text = run_csv_mode("-O2", None, "all", jobs, tcc_override=tcc_override)
+    elapsed = time.monotonic() - t0
+
+    rollup = {}  # suite -> [func_count, tcc, gcc]
+    tot = [0, 0, 0]
+    detail_rows = []
+    for suite, test, func, tcc_n, gcc_n in _parse_codesize_csv(csv_text):
+        r = rollup.setdefault(suite, [0, 0, 0])
+        r[0] += 1; r[1] += tcc_n; r[2] += gcc_n
+        tot[0] += 1; tot[1] += tcc_n; tot[2] += gcc_n
+        if detail:
+            ratio = (tcc_n / gcc_n) if gcc_n > 0 else 0.0
+            detail_rows.append((run_id, suite, test, func, tcc_n, gcc_n, ratio))
+
+    # Per-suite rows plus the grand-total row, which is stored under suite "".
+    for suite, (fc, tcc_n, gcc_n) in list(rollup.items()) + [("", tot)]:
+        ratio = (tcc_n / gcc_n) if gcc_n > 0 else 0.0
+        conn.execute(
+            "INSERT OR REPLACE INTO codesize_rollup VALUES(?,?,?,?,?,?)",
+            (run_id, suite, fc, tcc_n, gcc_n, ratio))
+    if detail_rows:
+        conn.executemany(
+            "INSERT OR REPLACE INTO codesize_func VALUES(?,?,?,?,?,?,?)", detail_rows)
+    info(f"codesize: {tot[0]} funcs, tcc={tot[1]} gcc={tot[2]} "
+         f"ratio={tot[1]/tot[2]:.3f} in {elapsed:.0f}s"
+         if tot[2] else f"codesize: {tot[0]} funcs")
+    return elapsed
+
+
+def record_compile_time(conn, run_id, corpus_secs, n_units) -> None:
+    """Store the code-size corpus compile wall time under scope 'codesize_corpus_o2'."""
+    conn.execute(
+        "INSERT OR REPLACE INTO compile_time VALUES(?,?,?,?)",
+        (run_id, "codesize_corpus_o2", corpus_secs, n_units))
+
+
+def import_codesize(conn, run_id, src_db_path, commit_sha) -> bool:
+    """Copy codesize_rollup/codesize_func/compile_time rows recorded for
+    `commit_sha` in another metrics db (e.g. a cloud-runner scratch db from a
+    faster build host) into `run_id`, instead of recomputing them locally.
+
+    Returns True when at least one codesize_rollup row was imported, False
+    when the source db has no run for `commit_sha` (or that run has no rollup).
+    Note that the success/skip path COMMITS whatever is pending on `conn`.
+    """
+    conn.execute("ATTACH DATABASE ? AS src", (src_db_path,))
+    try:
+        src_run = conn.execute(
+            "SELECT run_id FROM src.runs WHERE commit_sha=? ORDER BY run_ts DESC LIMIT 1",
+            (commit_sha,)).fetchone()
+        found = src_run is not None
+        if found:
+            src_run_id = src_run[0]
+            conn.execute(
+                """INSERT OR REPLACE INTO codesize_rollup
+                   SELECT ?, suite, func_count, tcc_o2, gcc_o2, ratio
+                   FROM src.codesize_rollup WHERE run_id=?""", (run_id, src_run_id))
+            conn.execute(
+                """INSERT OR REPLACE INTO codesize_func
+                   SELECT ?, suite, test, function, tcc_o2, gcc_o2, ratio
+                   FROM src.codesize_func WHERE run_id=?""", (run_id, src_run_id))
+            conn.execute(
+                """INSERT OR REPLACE INTO compile_time
+                   SELECT ?, scope, seconds, n_units
+                   FROM src.compile_time WHERE run_id=?""", (run_id, src_run_id))
+        else:
+            warn(f"no codesize data for {commit_sha[:12]} in {src_db_path} -- skipping import")
+        conn.commit()  # DETACH requires no pending transaction on `conn`, success or not
+        if not found:
+            return False
+        n = conn.execute(
+            "SELECT COUNT(*) FROM codesize_rollup WHERE run_id=?", (run_id,)).fetchone()[0]
+        info(f"imported codesize/compile_time from {src_db_path} ({n} codesize rows)")
+        return n > 0
+    except BaseException:
+        # On failure the transaction is still pending, and the DETACH below can
+        # then fail itself and hide the real error. Nothing would have been
+        # committed on this path anyway, so roll back before re-raising.
+        conn.rollback()
+        raise
+    finally:
+        conn.execute("DETACH DATABASE src")
+
+
+# ------------------------------------------------------------------------ perf
+
+def record_perf(conn, run_id, perf_host, perf_identity, scratch: Path) -> None:
+    """Run the RP2350 benchmark over SSH and store cycles/build-size. Any
+    failure (no host, SSH down, no board) is non-fatal: perf is simply absent
+    for this commit and the dashboard shows a gap."""
+    if not perf_host:
+        return
+    json_out = scratch / "perf.json"
+    cmd = [sys.executable, str(REPO_ROOT / "tests" / "benchmarks" / "run_benchmark.py"),
+           perf_host, "--opt-level", "all", "--save-data", str(json_out)]
+    if perf_identity:
+        cmd += ["--identity", perf_identity]
+    rc = subprocess.run(cmd, cwd=str(REPO_ROOT)).returncode
+    if rc != 0 or not json_out.exists():
+        warn("perf skipped: RP2350 benchmark did not produce data")
+        return
+    from run_benchmark import load_results_json
+    results = load_results_json(str(json_out))
+    n = 0
+    for key, res in results.items():
+        # key like 'tcc_o2' / 'gcc_o0'; res.compiler is 'TCC'/'GCC'
+        opt = key.split("_", 1)[1] if "_" in key else "o?"
+        bs = res.build_size or {}
+        for b in res.benchmarks:
+            conn.execute(
+                "INSERT OR REPLACE INTO perf VALUES(?,?,?,?,?,?,?,?,?)",
+                (run_id, b.name, res.compiler, opt, b.cycles_per_iter,
+                 bs.get("text"), bs.get("data"), bs.get("bss"), b.verify))
+            n += 1
+    info(f"perf: {n} benchmark rows from {len(results)} builds")
+
+
+# ---------------------------------------------------------------------- record
+
+def record_one(conn, meta, host, branch, trigger, args, tcc_override=None,
+               do_correctness=True, do_perf=True) -> None:
+    """Record every requested metric family for one commit and commit the result.
+
+    upsert_run wipes all child rows of the (commit, host) run first, so a
+    family that is skipped here (do_correctness/do_perf False, no --perf-host)
+    ends up absent for this run rather than stale.
+    """
+    t0 = time.monotonic()
+    run_id = upsert_run(conn, meta, host, branch, trigger,
+                        args.seed_lo, args.seed_hi, args.mode)
+    if do_correctness:
+        record_correctness(conn, run_id, args.seed_lo, args.seed_hi, args.mode, args.jobs)
+    if args.import_codesize_from:
+        import_codesize(conn, run_id, args.import_codesize_from, meta["commit_sha"])
+    else:
+        corpus_secs = record_codesize(conn, run_id, args.jobs, args.codesize_detail, tcc_override)
+        n_units = conn.execute(
+            "SELECT func_count FROM codesize_rollup WHERE run_id=? AND suite=''",
+            (run_id,)).fetchone()
+        record_compile_time(conn, run_id, corpus_secs, n_units[0] if n_units else None)
+    if do_perf and args.perf_host:
+        record_perf(conn, run_id, args.perf_host, args.perf_identity,
+                    Path(args.scratch or "."))
+    conn.execute("UPDATE runs SET wall_seconds=? WHERE run_id=?",
+                 (time.monotonic() - t0, run_id))
+    conn.commit()
+    info(f"recorded {meta['commit_sha'][:12]} ({meta['subject'][:50]}) "
+         f"in {time.monotonic()-t0:.0f}s")
+
+
+def do_backfill(conn, host, branch, args) -> None:
+    """Seed code-size + compile-time history across past revisions. Correctness
+    and perf are NOT backfillable (batch_sweep is in-place-only; perf needs the
+    board per rev), so those are skipped -- consistent with track-first.
+
+    Walks the last `args.backfill` first-parent commits of `branch`; revisions
+    that already have a codesize rollup for `host`, or that fail to resolve or
+    build, are skipped.
+    """
+    revs = rev_list(args.backfill, branch)
+    info(f"backfill: {len(revs)} revisions (codesize + compile-time only)")
+    for i, rev in enumerate(revs, 1):
+        try:
+            meta = git_meta(rev)
+        except SystemExit:
+            # _run_git reports git failures through die() (SystemExit), never
+            # CalledProcessError, so this is the exception that must be caught
+            # for one bad rev not to abort the whole backfill.
+            warn(f"skip {rev}: bad rev"); continue
+        if conn.execute("SELECT 1 FROM codesize_rollup r JOIN runs u USING(run_id) "
+                        "WHERE u.commit_sha=? AND u.host=?",
+                        (meta["commit_sha"], host)).fetchone():
+            info(f"[{i}/{len(revs)}] {rev[:12]} already has codesize -- skip")
+            continue
+        try:
+            from regression_disasm import build_tcc_at_rev
+            tcc_path, build_dir = build_tcc_at_rev(rev, args.jobs)
+        except SystemExit:
+            warn(f"[{i}/{len(revs)}] {rev[:12]} build failed -- skip"); continue
+        try:
+            info(f"[{i}/{len(revs)}] recording {rev[:12]} ...")
+            record_one(conn, meta, host, branch, "backfill", args,
+                       tcc_override=tcc_path, do_correctness=False, do_perf=False)
+        finally:
+            # Always remove the per-revision build tree, even if recording failed.
+            shutil.rmtree(build_dir, ignore_errors=True)
+
+
+def main(argv=None) -> int:
+    """CLI entry point: record one revision, or backfill history with --backfill."""
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--db", required=True, help="path to metrics.db")
+    p.add_argument("--rev", default="HEAD", help="git revision to record (default HEAD)")
+    p.add_argument("--seed-lo", type=int, default=0)
+    p.add_argument("--seed-hi", type=int, default=2000)
+    p.add_argument("--mode", choices=["prescan", "triage"], default="prescan")
+    p.add_argument("--codesize-detail", action="store_true",
+                   help="also store per-function code size (large; nightly)")
+    p.add_argument("--perf-host", help="SSH host for the RP2350 benchmark (omit to skip perf)")
+    p.add_argument("--perf-identity", help="SSH identity file for --perf-host")
+    p.add_argument("--jobs", type=int, default=os.cpu_count() or 4)
+    p.add_argument("--host", default=os.environ.get("METRICS_HOST") or socket.gethostname())
+    p.add_argument("--branch", default="mob")
+    p.add_argument("--trigger", default="manual")
+    p.add_argument("--scratch", help="scratch dir for perf JSON (default cwd)")
+    p.add_argument("--backfill", type=int, metavar="N",
+                   help="record codesize+compile-time for the last N first-parent commits")
+    p.add_argument("--no-correctness", action="store_true",
+                   help="skip the fuzz sweep (codesize/compile-time only)")
+    p.add_argument("--import-codesize-from", metavar="DB_PATH",
+                   help="skip local codesize/compile-time measurement; copy those rows "
+                        "from another metrics.db recorded for the same commit (e.g. a "
+                        "cloud-runner scratch db)")
+    args = p.parse_args(argv)
+
+    conn = connect(args.db)
+    try:
+        if args.backfill:
+            do_backfill(conn, args.host, args.branch, args)
+        else:
+            meta = git_meta(args.rev)
+            record_one(conn, meta, args.host, args.branch, args.trigger, args,
+                       do_correctness=not args.no_correctness)
+    finally:
+        conn.close()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/metrics/schema.sql b/metrics/schema.sql
new file mode 100644
index 00000000..954ef0bd
--- /dev/null
+++ b/metrics/schema.sql
@@ -0,0 +1,133 @@
+-- schema.sql -- per-revision optimizer-regression metrics store.
+--
+-- One SQLite file (default /var/lib/tcc-metrics/metrics.db on the Pi) written by
+-- metrics/record.py and read by Grafana (frser-sqlite-datasource).
+-- The x-axis for every dashboard panel is runs.commit_ts (committer unix time),
+-- so the graphs are commit-indexed, not wall-clock-indexed.
+--
+-- Idempotency contract (see record.py): `runs` is UNIQUE(commit_sha, host);
+-- child rows are DELETEd for a run_id and re-INSERTed inside one transaction, so
+-- re-recording a commit is a clean replace, never a duplicate.
+--
+-- Apply with: sqlite3 metrics.db < metrics/schema.sql (safe to re-run).
+
+PRAGMA journal_mode = WAL;  -- Grafana reads never block the recorder's writes
+PRAGMA foreign_keys = ON;
+
+-- One row per (commit, host). parent_sha = first parent, used by the
+-- "regressed since parent" panels. host matters because perf (cycles) is
+-- hardware-specific; correctness/codesize are host-independent but still keyed
+-- by host so one DB can hold more than one runner.
+CREATE TABLE IF NOT EXISTS runs (
+    run_id       INTEGER PRIMARY KEY,
+    commit_sha   TEXT NOT NULL,
+    parent_sha   TEXT,
+    branch       TEXT NOT NULL DEFAULT 'mob',
+    author       TEXT,
+    author_email TEXT,
+    subject      TEXT,
+    commit_ts    INTEGER NOT NULL,  -- committer unix ts = graph x-axis
+    run_ts       INTEGER NOT NULL,  -- when the recorder ran
+    host         TEXT NOT NULL,
+    tcc_build_ok INTEGER NOT NULL DEFAULT 1,
+    wall_seconds REAL,
+    seed_lo      INTEGER,           -- correctness band actually swept
+    seed_hi      INTEGER,
+    mode         TEXT,              -- 'prescan' | 'triage'
+    trigger      TEXT,              -- 'push' | 'schedule' | 'backfill' | 'manual'
+    notes        TEXT,
+    UNIQUE(commit_sha, host)
+);
+CREATE INDEX IF NOT EXISTS ix_runs_commit_ts ON runs(commit_ts);
+CREATE INDEX IF NOT EXISTS ix_runs_sha ON runs(commit_sha);
+
+-- (1) O1/O2 correctness divergence, one row per (run, profile, oracle).
+--   oracle='olevels' -> tcc -O0/-O1/-O2/-Os self-consistency
+--   oracle='vsgcc'   -> vs arm-none-eabi-gcc -O2 gold
+--   gccbad_count     -> seeds where gcc -O0 != gcc -O2 (oracle-unreliable, quarantined)
+--   low_recall       -> 1 for ptr/struct_byval under prescan (~80% recall caveat)
+CREATE TABLE IF NOT EXISTS correctness (
+    run_id          INTEGER NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+    profile         TEXT NOT NULL,
+    oracle          TEXT NOT NULL,
+    divergent_count INTEGER NOT NULL DEFAULT 0,
+    gccbad_count    INTEGER NOT NULL DEFAULT 0,
+    seed_lo         INTEGER NOT NULL,
+    seed_hi         INTEGER NOT NULL,
+    mode            TEXT NOT NULL,
+    low_recall      INTEGER NOT NULL DEFAULT 0,
+    PRIMARY KEY(run_id, profile, oracle)
+);
+
+-- Drill-down + gate input: the actual divergent seed ids.
+CREATE TABLE IF NOT EXISTS correctness_seed (
+    run_id  INTEGER NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+    profile TEXT NOT NULL,
+    oracle  TEXT NOT NULL,
+    seed    INTEGER NOT NULL,
+    PRIMARY KEY(run_id, profile, oracle, seed)
+);
+
+-- (2a) code-size ROLLUP -- always written, small (one row per suite + a
+-- '' grand-total row). ratio = tcc_o2 / gcc_o2.
+CREATE TABLE IF NOT EXISTS codesize_rollup (
+    run_id     INTEGER NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+    suite      TEXT NOT NULL,  -- '' for the grand total
+    func_count INTEGER NOT NULL,
+    tcc_o2     INTEGER NOT NULL,
+    gcc_o2     INTEGER NOT NULL,
+    ratio      REAL NOT NULL,
+    PRIMARY KEY(run_id, suite)
+);
+
+-- (2b) code-size DETAIL -- per-function; large (~thousands of rows/run), so
+-- written only when the recorder is invoked with --codesize-detail (nightly).
+CREATE TABLE IF NOT EXISTS codesize_func (
+    run_id   INTEGER NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+    suite    TEXT NOT NULL,
+    test     TEXT NOT NULL,
+    function TEXT NOT NULL,
+    tcc_o2   INTEGER NOT NULL,
+    gcc_o2   INTEGER NOT NULL,
+    ratio    REAL NOT NULL,
+    PRIMARY KEY(run_id, suite, test, function)
+);
+
+-- (3) compile time. scope='codesize_corpus_o2' is the wall time of the code-size
+-- corpus compile (deterministic, no hardware); n_units = function count for
+-- throughput normalization.
+CREATE TABLE IF NOT EXISTS compile_time (
+    run_id  INTEGER NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+    scope   TEXT NOT NULL,
+    seconds REAL NOT NULL,
+    n_units INTEGER,
+    PRIMARY KEY(run_id, scope)
+);
+
+-- (4) RP2350 hardware perf, one row per (run, benchmark, compiler, opt_level).
+CREATE TABLE IF NOT EXISTS perf (
+    run_id          INTEGER NOT NULL REFERENCES runs(run_id) ON DELETE CASCADE,
+    benchmark       TEXT NOT NULL,
+    compiler        TEXT NOT NULL,  -- 'TCC' | 'GCC'
+    opt_level       TEXT NOT NULL,  -- 'o0' | 'o1' | 'o2'
+    cycles_per_iter REAL NOT NULL,
+    build_text      INTEGER,
+    build_data      INTEGER,
+    build_bss       INTEGER,
+    verify          TEXT,           -- 'PASS'/'FAIL' from BenchmarkResult.verify
+    PRIMARY KEY(run_id, benchmark, compiler, opt_level)
+);
+
+-- Gate allowlist (track-first -> block): pre-existing / accepted divergences the
+-- gate must not fail on. A row with a concrete `seed` accepts exactly that seed;
+-- a row with seed IS NULL accepts a count baseline (`baseline`) for the profile.
+CREATE TABLE IF NOT EXISTS accepted_divergence (
+    profile  TEXT NOT NULL,
+    oracle   TEXT NOT NULL,
+    seed     INTEGER,
+    baseline INTEGER,
+    reason   TEXT NOT NULL,
+    added_by TEXT,
+    added_ts INTEGER NOT NULL,
+    PRIMARY KEY(profile, oracle, seed)
+);
+-- SQLite treats NULLs as distinct inside a PRIMARY KEY / UNIQUE constraint, so
+-- the key above does not deduplicate rows whose seed IS NULL. This partial
+-- unique index makes such a row unique per (profile, oracle), which also lets
+-- INSERT OR REPLACE replace it instead of appending a duplicate.
+CREATE UNIQUE INDEX IF NOT EXISTS ux_accepted_divergence_baseline
+    ON accepted_divergence(profile, oracle) WHERE seed IS NULL;
diff --git a/reduce.py b/reduce.py
new file mode 100644
index 00000000..078e4235
--- /dev/null
+++ b/reduce.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""Generic C-source reducer for the fuzz triage miscompiles.
+
+Oracle: an "interesting" predicate. We compile the candidate program at the
+bad opt-level and, when GOODSUM is given, at an oracle opt-level (default -O0 /
+known-good), run under QEMU via tests/ir_tests/run.py, and require:
+  * reduced-bad-output equals the originally-recorded bad checksum
+  * oracle output equals the recorded good checksum (only when GOODSUM is given)
+
+As long as BADSUM != GOODSUM the two outputs therefore DIFFER, so we never
+reduce to a trivially-correct program.
+
+Keeps the reduction faithful to the *original* miscompile.
+
+Usage:
+    python3 reduce.py FILE -O BAD_LEVEL [-g GOOD_LEVEL] BADSUM [GOODSUM]
+"""
+from __future__ import annotations
+import argparse, os, re, subprocess, sys, random, tempfile, shutil
+
+REPO = os.path.abspath(os.path.dirname(__file__))
+RUN = os.path.join(REPO, "tests", "ir_tests", "run.py")
+ENV = dict(os.environ, ASAN_OPTIONS="detect_leaks=0")
+ENV.pop("TCC_DISABLE_PASS", None)
+
+# Memo of finished runs, keyed by the (source, level) pair itself. Keying by
+# hash((src, level)) would let two different candidates that collide share a
+# result and silently corrupt the reduction.
+_cache: dict[tuple[bytes, str], tuple[str, str]] = {}
+
+
+def run(src: bytes, level: str) -> tuple[str, str]:
+    """Compile and run `src` at opt-level `level`; return (summary, stdout).
+
+    `summary` is the hex value of the first `checksum=...` in the runner's
+    stdout, or "ERR" (non-zero exit, no checksum) / "NOOUT" (zero exit, no
+    checksum). Results are memoized per (src, level).
+    """
+    key = (src, level)
+    if key in _cache:
+        return _cache[key]
+    with tempfile.NamedTemporaryFile("wb", suffix=".c", delete=False) as f:
+        f.write(src)
+        path = f.name
+    try:
+        # sys.executable: run the helper with the interpreter running this
+        # script; a bare "python" may be missing or a different interpreter.
+        p = subprocess.run([sys.executable, RUN, "-c", path, "--cflags=" + level],
+                           capture_output=True, text=True, env=ENV,
+                           cwd=os.path.join(REPO, "tests", "ir_tests"))
+        out = p.stdout
+        m = re.search(r"checksum=([0-9a-f]+)", out)
+        summ = m.group(1) if m else ("ERR" if p.returncode else "NOOUT")
+        res = (summ, out)
+    finally:
+        os.unlink(path)
+    _cache[key] = res
+    return res
+
+
+def main():
+    """Reduce FILE while it keeps reproducing the recorded checksums; write FILE.reduced.c."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("file")
+    ap.add_argument("-O", "--bad-level", required=True)
+    ap.add_argument("-g", "--good-level", default="-O0")
+    ap.add_argument("badsum")
+    ap.add_argument("goodsum", nargs="?")
+    args = ap.parse_args()
+
+    with open(args.file, "rb") as f:
+        src0 = f.read()
+
+    def interesting(src: bytes) -> bool:
+        bad, _ = run(src, args.bad_level)
+        if bad != args.badsum:
+            return False
+        if args.goodsum:
+            good, _ = run(src, args.good_level)
+            if good != args.goodsum:
+                return False
+        return True
+
+    # An explicit check rather than `assert`, which `python -O` strips.
+    if not interesting(src0):
+        sys.exit("original does not reproduce")
+    print(f"[start] {len(src0)} bytes", flush=True)
+
+    src = src0
+    # Strategy 1: drop contiguous line ranges
+    lines = src.split(b"\n")
+    improved = True
+    while improved:
+        improved = False
+        n = len(lines)
+        # try dropping larger chunks first
+        for span in [n, n//2, n//4, n//8, 16, 8, 4, 2, 1]:
+            if span < 1:
+                continue
+            i = 0
+            while i + span <= n:
+                cand = lines[:i] + lines[i+span:]
+                cs = b"\n".join(cand)
+                if interesting(cs):
+                    lines = cand
+                    n = len(lines)
+                    improved = True
+                    print(f"[drop {span} @ {i}] -> {len(lines)} lines", flush=True)
+                    continue  # retry at the same index: the next chunk slid into place
+                i += span
+    src = b"\n".join(lines)
+
+    # Strategy 2: replace every numeric literal with 0 in one shot, kept only
+    # if the result is still interesting. (Single pass: a second pass could
+    # not change anything further.)
+    new = re.sub(rb"(0x[0-9a-fA-F]+|[0-9]+u?)", b"0", src)
+    if new != src and interesting(new):
+        src = new
+        print("[num->0]", flush=True)
+
+    # Strategy 3: repeated token-level deletion
+    toks = src.split(b" ")
+    improved = True
+    while improved:
+        improved = False
+        n = len(toks)
+        for span in [n, n//2, n//4, 8, 4, 2, 1]:
+            if span < 1:
+                continue
+            i = 0
+            while i + span <= n:
+                cand = toks[:i] + toks[i+span:]
+                cs = b" ".join(cand)
+                if interesting(cs):
+                    toks = cand
+                    n = len(toks)
+                    improved = True
+                    print(f"[tokdrop {span} @ {i}] -> {n} toks", flush=True)
+                    continue  # retry at the same index
+                i += span
+    src = b" ".join(toks)
+
+    with open(args.file + ".reduced.c", "wb") as f:
+        f.write(src)
+    print(f"[done] wrote {args.file}.reduced.c ({len(src)} bytes)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/asan_sweep.py b/scripts/asan_sweep.py
new file mode 100755
index 00000000..c7ec4bdd
--- /dev/null
+++ b/scripts/asan_sweep.py
@@ -0,0 +1,341 @@
+#!/usr/bin/env python3
+"""
+asan_sweep.py — corpus enumeration + sweep driver + dedup/report for the
+tinycc ASAN/UBSan bug-hunting sweep (Phase BH, Track 1).
+
+This is a *helper* invoked by scripts/asan_sweep.sh; the bash script remains the
+entry point. It exists because robust corpus enumeration (gcc-torture builtins
+source expansion, shardable file lists), per-file compile invocation, sanitizer
+signature detection and stack-frame dedup are far cleaner in Python than in bash.
+
+The oracle is the sanitizer output printed by `armv8m-tcc` (built with
+-fsanitize=address by default). An ordinary "unsupported feature" compile error
+(nonzero exit, no sanitizer line) is NOT a hit; only a real sanitizer report is.
+
+Test/tooling only. Does not modify production code or config.mak.
+"""
+
+import argparse
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+REPO = Path(__file__).resolve().parent.parent
+
+# Sanitizer signatures that mark a genuine hit. We deliberately key on the
+# sanitizer's own markers, NOT on the compiler exit code (a plain "unsupported
+# feature" error also exits nonzero but prints none of these).
+SANITIZER_RE = re.compile(
+    r"(ERROR: AddressSanitizer"
+    r"|ERROR: LeakSanitizer"
+    r"|LeakSanitizer: detected memory leaks"
+    r"|runtime error:"  # UBSan
+    r"|SUMMARY: .*Sanitizer)"
+)
+
+# A SUMMARY line is the most human-readable one-liner for the report.
+SUMMARY_RE = re.compile(r"SUMMARY: .*?Sanitizer:.*")
+# UBSan runtime errors do not always emit a SUMMARY; capture the first one.
+UBSAN_RE = re.compile(r".*runtime error:.*") + +# Backtrace frame: " #3 0x... in (...)" +FRAME_RE = re.compile(r"#\d+\s+0x[0-9a-f]+\s+in\s+(\S+)") + +# Generic allocator / wrapper / runtime frames that are NOT the root cause and +# must be skipped when building a dedup key (otherwise every leak collapses into +# one bucket regardless of where it was actually allocated). +NOISE_FRAMES = { + "malloc", "calloc", "realloc", "free", "reallocarray", + "realloc.part.0", "malloc.part.0", + "operator new", "operator new[]", + "default_reallocator", "default_realloc", + "tcc_malloc", "tcc_mallocz", "tcc_realloc", "tcc_realloc_debug", + "tcc_malloc_debug", "tcc_mallocz_debug", "tcc_free", "tcc_strdup", + "__interceptor_malloc", "__interceptor_calloc", "__interceptor_realloc", + "__libc_start_main", "__libc_start_call_main", "_start", "main", + "__asan_memcpy", "__asan_memset", "__asan_memmove", + "__sanitizer_print_stack_trace", +} + + +def _is_noise(sym): + if sym in NOISE_FRAMES: + return True + # libasan internal frames have no real symbol of interest. + if sym.startswith("__asan_") or sym.startswith("__ubsan_") or sym.startswith("__lsan_"): + return True + if sym.startswith("__interceptor_"): + return True + return False + + +def meaningful_frames(stderr_text, k=3): + """Return the first k meaningful (non-noise) backtrace symbols across the + whole report, in order. This is the dedup key — the same bug across many + files collapses to a single entry.""" + frames = [] + for m in FRAME_RE.finditer(stderr_text): + sym = m.group(1) + if _is_noise(sym): + continue + frames.append(sym) + if len(frames) >= k: + break + return frames + + +def summary_line(stderr_text): + m = SUMMARY_RE.search(stderr_text) + if m: + return m.group(0).strip() + m = UBSAN_RE.search(stderr_text) + if m: + return m.group(0).strip()[:200] + # Fall back to the ERROR line. 
+ for line in stderr_text.splitlines(): + if "Sanitizer" in line and ("ERROR" in line or "WARNING" in line): + return line.strip() + return "Sanitizer report (no SUMMARY line)" + + +# -------------------------------------------------------------------------- +# Corpus enumeration +# -------------------------------------------------------------------------- + +def _gcc_torture_root(): + return REPO / "tests/gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture" + + +def expand_gcc_builtin_sources(source): + """Mirror tests/ir_tests/run.py:expand_gcc_builtin_sources — a builtins/ + execute test needs its -lib.c companion plus lib/main.c so the + compile actually exercises the same multi-TU shape the real harness uses.""" + extra = [] + if source.name.endswith("-lib.c"): + return extra + parent = source.parent + if parent.name != "builtins": + return extra + if parent.parent.name != "execute": + return extra + if parent.parent.parent.name != "gcc.c-torture": + return extra + lib_file = source.with_name(f"{source.stem}-lib.c") + builtins_main = parent / "lib" / "main.c" + for f in (lib_file, builtins_main): + if f.exists(): + extra.append(f) + return extra + + +def enumerate_corpus(corpus): + """Return a list of (primary_source: Path, extra_sources: [Path]) work items.""" + items = [] + + def add_gcc_torture(): + root = _gcc_torture_root() + if not root.exists(): + print(f"warning: gcc-torture not found at {root} " + f"(run 'make download-gcc-tests')", file=sys.stderr) + return + execute = root / "execute" + # Top-level + ieee + builtins, recursively; skip -lib.c companions and + # files inside lib/ (they are pulled in as extra sources, not compiled + # standalone). 
+ for c in sorted(execute.rglob("*.c")): + if c.name.endswith("-lib.c"): + continue + if c.parent.name == "lib": + continue + items.append((c, expand_gcc_builtin_sources(c))) + compile_dir = root / "compile" + if compile_dir.exists(): + for c in sorted(compile_dir.glob("*.c")): + items.append((c, [])) + + if corpus in ("gcc-torture", "all"): + add_gcc_torture() + if corpus in ("tests2", "all"): + for c in sorted((REPO / "tests/tests2").glob("*.c")): + items.append((c, [])) + if corpus in ("ir_tests", "all"): + for c in sorted((REPO / "tests/ir_tests").glob("*.c")): + items.append((c, [])) + + return items + + +def apply_shard_limit(items, shard, limit): + if shard: + i, n = shard + items = [it for idx, it in enumerate(items) if idx % n == (i - 1)] + if limit: + items = items[:limit] + return items + + +# -------------------------------------------------------------------------- +# Compile +# -------------------------------------------------------------------------- + +def build_compile_cmd(compiler, include_flags, abi_flags, opt, sources): + cmd = [str(compiler), f"-B{REPO}"] + cmd += abi_flags + cmd += include_flags + cmd += [opt, "-c"] + cmd += [str(s) for s in sources] + cmd += ["-o", "/dev/null"] + return cmd + + +def run_one(compiler, include_flags, abi_flags, opt, primary, extras, timeout): + sources = [primary] + list(extras) + cmd = build_compile_cmd(compiler, include_flags, abi_flags, opt, sources) + try: + proc = subprocess.run( + cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + timeout=timeout, + ) + stderr = proc.stderr.decode("utf-8", errors="replace") + rc = proc.returncode + except subprocess.TimeoutExpired as e: + stderr = (e.stderr or b"").decode("utf-8", errors="replace") + rc = -1 + return rc, stderr + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--compiler", default=str(REPO / "armv8m-tcc"), + help="path to the cross compiler (ASAN-built armv8m-tcc)") + ap.add_argument("--corpus", 
default="all", + choices=["gcc-torture", "tests2", "ir_tests", "all"]) + ap.add_argument("--olevels", default="-O0,-O1,-O2", + help="comma-separated optimization levels") + ap.add_argument("--shard", default=None, + help="i/N — sweep only shard i of N (1-based)") + ap.add_argument("--limit", type=int, default=0, + help="cap number of files swept (after sharding)") + ap.add_argument("--timeout", type=int, default=60, + help="per-compile timeout in seconds") + ap.add_argument("--include-flags", default="", + help="space-separated -I flags from the harness Makefile") + ap.add_argument("--abi-flags", default="", + help="space-separated ABI/codegen flags from the Makefile") + ap.add_argument("--report", default=None, + help="write the deduped report here (also printed to stdout)") + ap.add_argument("--list-hits-raw", default=None, + help="append every raw hit line (file|olevel|key) here") + ap.add_argument("--progress-every", type=int, default=100) + args = ap.parse_args() + + shard = None + if args.shard: + i, n = args.shard.split("/") + shard = (int(i), int(n)) + if not (1 <= shard[0] <= shard[1]): + print(f"error: bad shard {args.shard}", file=sys.stderr) + return 2 + + olevels = [o.strip() for o in args.olevels.split(",") if o.strip()] + include_flags = args.include_flags.split() + abi_flags = args.abi_flags.split() + + items = enumerate_corpus(args.corpus) + total_files = len(items) + items = apply_shard_limit(items, shard, args.limit) + + compiler = Path(args.compiler) + if not compiler.exists(): + print(f"error: compiler not found: {compiler}", file=sys.stderr) + return 2 + + # bug_key -> dict(summary, frames, count, repro=(file, olevel) of first hit, files=set of files) + bugs = {} + swept = 0 + hit_compiles = 0 + raw_hits = [] + + for idx, (primary, extras) in enumerate(items): + for opt in olevels: + swept += 1 + rc, stderr = run_one(compiler, include_flags, abi_flags, + opt, primary, extras, args.timeout) + if not SANITIZER_RE.search(stderr): + continue + hit_compiles += 1 + frames = 
meaningful_frames(stderr, k=3) + key = " <- ".join(frames) if frames else "(no meaningful frames)" + summ = summary_line(stderr) + rel = os.path.relpath(primary, REPO) + raw_hits.append(f"{rel}|{opt}|{key}") + b = bugs.setdefault(key, { + "summary": summ, + "frames": frames, + "count": 0, + "repro": None, + "files": set(), + }) + b["count"] += 1 + b["files"].add(rel) + if b["repro"] is None: + b["repro"] = (rel, opt) + # Prefer the most informative summary if a later one is richer. + if summ and len(summ) > len(b["summary"]): + b["summary"] = summ + if args.progress_every and (idx + 1) % args.progress_every == 0: + print(f" ... {idx + 1}/{len(items)} files, " + f"{len(bugs)} unique bug(s)", file=sys.stderr) + + # ---- report ---- + lines = [] + lines.append("=" * 78) + lines.append("ASAN/UBSan sweep report") + lines.append("=" * 78) + lines.append(f"corpus : {args.corpus}") + lines.append(f"olevels : {','.join(olevels)}") + if shard: + lines.append(f"shard : {shard[0]}/{shard[1]}") + if args.limit: + lines.append(f"limit : {args.limit}") + lines.append(f"files in corpus : {total_files}") + lines.append(f"files this run : {len(items)}") + lines.append(f"compiles run : {swept}") + lines.append(f"sanitizer hits : {hit_compiles} compile(s)") + lines.append(f"unique bugs : {len(bugs)}") + lines.append("") + + if bugs: + # Sort by count descending so the most-frequent bug is first. 
+ for n, (key, b) in enumerate( + sorted(bugs.items(), key=lambda kv: -kv[1]["count"]), 1): + repro_file, repro_opt = b["repro"] + lines.append(f"[BUG {n}] {key}") + lines.append(f" summary : {b['summary']}") + lines.append(f" seen in : {b['count']} compile(s) " + f"across {len(b['files'])} file(s)") + lines.append(f" repro : {repro_file} {repro_opt}") + lines.append("") + else: + lines.append("No sanitizer hits in this slice.") + lines.append("") + + report = "\n".join(lines) + print(report) + + if args.report: + Path(args.report).write_text(report) + if args.list_hits_raw and raw_hits: + with open(args.list_hits_raw, "a") as f: + for h in raw_hits: + f.write(h + "\n") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/asan_sweep.sh b/scripts/asan_sweep.sh new file mode 100755 index 00000000..4d9639c7 --- /dev/null +++ b/scripts/asan_sweep.sh @@ -0,0 +1,175 @@ +#!/usr/bin/env bash +# +# asan_sweep.sh — Phase BH / Track 1 ASAN+UBSan corpus sweep for tinycc. +# +# The cross compiler armv8m-tcc is built with AddressSanitizer ON by default +# (config.mak: -fsanitize=address), so compiling any corpus file *with* it makes +# tcc report ASAN/LeakSanitizer errors on its OWN heap bugs. The ORACLE is the +# sanitizer output printed by tcc, not the compile exit code: a plain +# "unsupported feature" compile error is NOT a hit. +# +# This sweeps the corpus (gcc-torture compile+execute, tests2, ir_tests) across +# -O0/-O1/-O2, greps stderr for sanitizer signatures, and dedups hits by the top +# meaningful backtrace frames so one bug across many files collapses to one entry. +# +# Test/tooling only. Does NOT modify production code. --with-ubsan builds a +# SEPARATE compiler out-of-band (config.mak is saved+restored) so the shared +# armv8m-tcc other agents depend on is never mutated. 
+# +# Usage: +# scripts/asan_sweep.sh [options] +# +# --corpus C gcc-torture | tests2 | ir_tests | all (default: all) +# --olevels L comma list of opt levels (default: -O0,-O1,-O2) +# --shard i/N sweep only shard i of N (1-based) for parallel runs +# --limit N cap number of files swept (after sharding) +# --timeout S per-compile timeout in seconds (default: 60) +# --compiler PATH compiler to use (default: ./armv8m-tcc; the ASAN build) +# --with-ubsan ALSO build an out-of-band UBSan compiler and sweep with it +# (rebuilds into a temp dir, restoring config.mak; SLOW) +# --report PATH write the deduped report to PATH (also printed) +# --raw-hits PATH append every raw hit line (file|olevel|key) to PATH +# -h | --help show this help +# +# Examples: +# # full sweep, all corpora, all O-levels: +# scripts/asan_sweep.sh --corpus all +# # one shard of gcc-torture for a parallel fleet: +# scripts/asan_sweep.sh --corpus gcc-torture --shard 3/40 +# # quick smoke: +# scripts/asan_sweep.sh --corpus tests2 --limit 30 +# +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +HELPER="$SCRIPT_DIR/asan_sweep.py" + +# ---- defaults ---- +CORPUS="all" +OLEVELS="-O0,-O1,-O2" +SHARD="" +LIMIT="0" +TIMEOUT="60" +COMPILER="$REPO/armv8m-tcc" +WITH_UBSAN="0" +REPORT="" +RAW_HITS="" + +usage() { sed -n '2,/^set -euo pipefail$/p' "${BASH_SOURCE[0]}" | sed -n 's/^# \{0,1\}//p'; } # header comment only: range ends at 'set -euo pipefail', non-'#' lines are dropped + +while [[ $# -gt 0 ]]; do + case "$1" in + --corpus) CORPUS="$2"; shift 2;; + --olevels) OLEVELS="$2"; shift 2;; + --shard) SHARD="$2"; shift 2;; + --limit) LIMIT="$2"; shift 2;; + --timeout) TIMEOUT="$2"; shift 2;; + --compiler) COMPILER="$2"; shift 2;; + --with-ubsan) WITH_UBSAN="1"; shift;; + --report) REPORT="$2"; shift 2;; + --raw-hits) RAW_HITS="$2"; shift 2;; + -h|--help) usage; exit 0;; + *) echo "unknown option: $1" >&2; usage; exit 2;; + esac +done + +# -------------------------------------------------------------------------- +# Reconstruct the EXACT include/ABI flags the real torture harness passes when +# CC is armv8m-tcc. Mirrors tests/ir_tests/qemu/mps2-an505/Makefile: +# GCC_ABI_FLAGS = -mcpu=cortex-m33 -mthumb -mfloat-abi=soft +# CFLAGS += -nostdlib -fvisibility=hidden $(GCC_ABI_FLAGS) -ffunction-sections +# (armv8m-tcc branch) -I libc_includes -I libc_imports -I newlib +# -I $(ARM_SYSROOT)/include -I $(TCC_PATH)/include +# -------------------------------------------------------------------------- +GCC_ABI_FLAGS="-mcpu=cortex-m33 -mthumb -mfloat-abi=soft" +ABI_FLAGS="-nostdlib -fvisibility=hidden $GCC_ABI_FLAGS -ffunction-sections" + +LIBC_INCLUDES="$(realpath "$REPO/tests/ir_tests/libc_includes")" +LIBC_IMPORTS="$(realpath "$REPO/tests/ir_tests/libc_imports")" +NEWLIB_INCLUDES="$LIBC_INCLUDES/newlib" +ARM_SYSROOT="$(arm-none-eabi-gcc $GCC_ABI_FLAGS --print-sysroot 2>/dev/null || echo /usr/arm-none-eabi)" +INCLUDE_FLAGS="-I$LIBC_INCLUDES -I$LIBC_IMPORTS -I$NEWLIB_INCLUDES -I$ARM_SYSROOT/include -I$REPO/include" + +run_sweep() { + local compiler="$1" tag="$2" report_arg=() + echo "================================================================" + echo " Sweep ($tag): 
$compiler" + echo "================================================================" + local report_path="" + if [[ -n "$REPORT" ]]; then + if [[ "$tag" == "ubsan" ]]; then + report_path="${REPORT%.txt}.ubsan.txt" + else + report_path="$REPORT" + fi + report_arg=(--report "$report_path") + fi + local raw_arg=() + [[ -n "$RAW_HITS" ]] && raw_arg=(--list-hits-raw "$RAW_HITS") + local shard_arg=() + [[ -n "$SHARD" ]] && shard_arg=(--shard "$SHARD") + + # Values that begin with '-' (olevels, the -I/-m flag bundles) are passed with + # '=' so argparse does not mistake them for options. + python3 "$HELPER" \ + --compiler "$compiler" \ + --corpus "$CORPUS" \ + --olevels="$OLEVELS" \ + --limit "$LIMIT" \ + --timeout "$TIMEOUT" \ + --include-flags="$INCLUDE_FLAGS" \ + --abi-flags="$ABI_FLAGS" \ + "${shard_arg[@]}" \ + "${report_arg[@]}" \ + "${raw_arg[@]}" +} + +# ---- ASAN sweep (the default, using the existing shared compiler) ---- +if [[ ! -x "$COMPILER" ]]; then + echo "error: compiler not found or not executable: $COMPILER" >&2 + echo " build it with 'make cross' first." >&2 + exit 2 +fi +run_sweep "$COMPILER" "asan" + +# ---- optional out-of-band UBSan sweep ---- +if [[ "$WITH_UBSAN" == "1" ]]; then + echo + echo "################################################################" + echo "# --with-ubsan: building a SEPARATE UBSan compiler out-of-band" + echo "# (config.mak is saved + restored; shared armv8m-tcc untouched)" + echo "################################################################" + + UBSAN_DIR="$(mktemp -d "${TMPDIR:-/tmp}/asan_sweep_ubsan.XXXXXX")" + CONFIG_BAK="$(mktemp "${TMPDIR:-/tmp}/config.mak.bak.XXXXXX")" + cp "$REPO/config.mak" "$CONFIG_BAK" + + restore_config() { + cp "$CONFIG_BAK" "$REPO/config.mak" + rm -f "$CONFIG_BAK" + echo "restored config.mak" + } + trap restore_config EXIT + + UBSAN_TCC="$UBSAN_DIR/armv8m-tcc" + ( + cd "$REPO" + # Reconfigure with UBSan (this rewrites config.mak — restored on exit). 
+ ./configure --enable-ubsan >/dev/null + # Build in-tree, which temporarily overwrites the shared armv8m-tcc with the + # UBSan build, then copy that binary into the temp dir. Nothing is restored + # from git here: the shared ASAN compiler is rebuilt by the 'make cross' + # that runs after config.mak has been restored, just below this subshell. + make cross >/dev/null 2>&1 || { echo "UBSan build failed" >&2; exit 1; } + cp "$REPO/armv8m-tcc" "$UBSAN_TCC" + ) + # Rebuild the shared ASAN compiler so concurrent agents see it unchanged. + restore_config + trap - EXIT + ( cd "$REPO" && make cross >/dev/null 2>&1 ) || \ + echo "warning: could not rebuild shared ASAN armv8m-tcc; run 'make cross'" >&2 + + run_sweep "$UBSAN_TCC" "ubsan" + rm -rf "$UBSAN_DIR" +fi diff --git a/scripts/bisect_opt.py b/scripts/bisect_opt.py new file mode 100755 index 00000000..1dc82c99 --- /dev/null +++ b/scripts/bisect_opt.py @@ -0,0 +1,415 @@ +#!/usr/bin/env python3 +"""Pinpoint the optimization pass / knob that flips a program's output. + +Given a seed (or a .c file) that diverges between two -O levels (e.g. tcc -O0 +correct, tcc -O1 wrong), this script tells you *exactly* what to look at: + + Phase A -- knob bisection (QEMU-confirmed, exact): + For every optimization knob ``-f`` known to the compiler, rebuild at + the failing level with ``-fno-`` and re-run under QEMU. Any knob + whose removal restores the reference signature is reported as a culprit. + + Phase B -- pass text-diff (narrows to the specific pass + IR line): + Dumps the IR after every optimization pass (``-dump-ir-passes=all``) at the + failing level, walks consecutive pass outputs, and flags the pass where a + memory read (LOAD / LOAD_INDEXED / ``***DEREF***``) at a given instruction + address turns into a constant ``#...`` -- the classic misfold signature. + Each flagged pass is correlated (via ir/opt_pipeline.c) to its gating knob + and printed with the before/after IR lines. 
+ +The two phases cross-check: Phase A names the culprit knob(s); Phase B names the +specific pass and the exact transformation, filtered to the culprit knob set so +the noise from unrelated constant folds is suppressed. + +This reuses tests/fuzz/fuzz_harness.py (QEMU + newlib plumbing). + +Usage: + python scripts/bisect_opt.py --seed 295 + python scripts/bisect_opt.py --seed 295 --low -O0 --high -O2 + python scripts/bisect_opt.py --file tests/fuzz/fuzz_triage_repros/seed295.c + python scripts/bisect_opt.py --file path.c --high -O1 --skip-knobs # IR only + +Exit code: 0 if a culprit was identified, 1 otherwise. +""" + +from __future__ import annotations + +import argparse +import re +import shlex +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +FUZZ_DIR = REPO_ROOT / "tests" / "fuzz" +if str(FUZZ_DIR) not in sys.path: + sys.path.insert(0, str(FUZZ_DIR)) + +import fuzz_harness as H # noqa: E402 +from gen_c import generate_program # noqa: E402 + +IR_TESTS_DIR = H.IR_TESTS_DIR +TCC = H.TCC_BIN + +# Include flags mirrored from tests/fuzz/runseed.sh so a direct armv8m-tcc +# invocation compiles the same program the Makefile-driven path does. 
+INC_FLAGS = [ + f"-I{IR_TESTS_DIR / 'libc_includes'}", + f"-I{IR_TESTS_DIR / 'libc_imports'}", + f"-I{IR_TESTS_DIR / 'libc_includes' / 'newlib'}", + "-I/include", + f"-I{REPO_ROOT / 'include'}", +] +BASE_TCC_FLAGS = [ + "-nostdlib", "-fvisibility=hidden", "-mcpu=cortex-m33", "-mthumb", + "-mfloat-abi=soft", "-ffunction-sections", +] + + +# --------------------------------------------------------------------------- +# Static introspection of the compiler's knob / pass tables +# --------------------------------------------------------------------------- + +def _parse_knobs() -> list[tuple[str, str]]: + """Extract (TCCState field, -f flag name) pairs for the optimization knobs in libtcc.c, sorted by flag name.""" + src = (REPO_ROOT / "libtcc.c").read_text() + return sorted(set(re.findall(r'offsetof\(TCCState, (opt_[a-z_]+)\), 0, "([a-z-]+)"', src)), + key=lambda t: t[1]) + + +def _parse_pass_to_knob() -> dict[str, str]: + """Map individual pass name -> knob field name (without the ``opt_`` prefix) from the PASS_GATED table. + + Reads the ``PASS_GATED("name", ..., FLAG(opt_X))`` entries in + ir/opt_pipeline.c. Note: the per-pass IR dump labels group-level phases + (e.g. ``entry_store_group``, ``propagation_group``) that aggregate several + such passes, so a dump label often does NOT appear in this map. Use + :func:`_parse_group_labels` to recognise group labels, and + :func:`_passes_for_knob` to list the individual passes a knob gates. 
+ """ + src = (REPO_ROOT / "ir" / "opt_pipeline.c").read_text() + out: dict[str, str] = {} + for m in re.finditer(r'PASS_GATED\(\s*"([^"]+)"[^)]*?FLAG\(opt_([a-z_]+)\)', src): + out.setdefault(m.group(1), m.group(2)) + return out + + +def _parse_group_labels() -> set[str]: + """Return the set of IRPassGroup variable names (dump labels that are + groups rather than individual passes).""" + src = (REPO_ROOT / "ir" / "opt_pipeline.c").read_text() + return set(re.findall(r'IRPassGroup\s+(\w+)\s*=', src)) + + +def _passes_for_knob(pass2knob: dict[str, str], knob: str) -> list[str]: + """Individual pass names gated by ``knob`` (a flag name, e.g. 'store-load-fwd').""" + field = knob.replace("-", "_") + return sorted(p for p, k in pass2knob.items() if k == field) + + +# --------------------------------------------------------------------------- +# Final-IR diff between ``high`` and ``high -fno-`` (the general fallback) +# --------------------------------------------------------------------------- + +def _dump_final_ir(source: Path, opt_level: str) -> str: + """Return the final optimized IR text (``-dump-ir``) for ``source``. + + Each function is delimited by an ``=== IR AFTER OPTIMIZATIONS ===`` block; + we keep all of them so a multi-function program diffs cleanly. + """ + cmd = [str(TCC), "-dump-ir", *shlex.split(opt_level), *BASE_TCC_FLAGS, *INC_FLAGS, + "-c", str(source), "-o", "/dev/null"] + proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if "=== IR AFTER OPTIMIZATIONS ===" not in (proc.stdout or ""): + raise RuntimeError(f"tcc dump failed:\n{proc.stderr.strip()}") + return proc.stdout + + +# Array-initializer stores: ``StackLoc[-NN] <-- #const [STORE]`` for |NN|>=32. +# These dominate the diff with pure noise (the program's literal initializers), +# so strip them when comparing two opt variants of the same program. 
+_INIT_STORE_RE = re.compile(r"StackLoc\[-\d+\] <-- #") + + +def _filter_final_ir(text: str) -> list[str]: + """Keep instruction lines, dropping section markers and array-init stores.""" + out = [] + for ln in text.splitlines(): + if ln.startswith("=== ") or _INIT_STORE_RE.search(ln): + continue + out.append(ln) + return out + + +def diff_knob(source: Path, high: str, knob: str) -> int: + """Print a unified diff of final IR: ``high`` vs ``high -fno-``. + + This is the general-purpose fallback that catches ANY class of miscompile + (const folds, dropped stores, control-flow rewrites) -- not just the + memory->constant folds Phase B heuristics flag. The knob must be one that + Phase A found to fix the divergence, so the two IRs differ exactly in what + that pass changes. + """ + print(f"\n[bisect] Phase C: final-IR diff ({high}) vs ({high} -fno-{knob})") + try: + a = _filter_final_ir(_dump_final_ir(source, high)) + b = _filter_final_ir(_dump_final_ir(source, f"{high} -fno-{knob}")) + except RuntimeError as e: + print(f"[bisect] could not dump final IR: {e}", file=sys.stderr) + return 0 + import difflib + ndiff = 0 + for line in difflib.unified_diff(a, b, + fromfile=f"{high} (buggy)", + tofile=f"{high} -fno-{knob} (correct)", + lineterm=""): + print(line) + if line[:1] in ("+", "-") and line[:2] not in ("++", "--"): + ndiff += 1 + if ndiff == 0: + print("[bisect] (no differences -- knob did not change final IR)") + return ndiff + + +# --------------------------------------------------------------------------- +# IR pass-dump parsing +# --------------------------------------------------------------------------- + +_PASS_HDR = re.compile(r"^=== AFTER (.+?) ===$") +_PASS_END = re.compile(r"^=== END AFTER .+? ===$") + + +def dump_passes(source: Path, opt_level: str) -> list[tuple[str, list[str]]]: + """Return [(pass_name, [ir_lines]), ...] 
in document order for ``source``.""" + cmd = [str(TCC), "-dump-ir-passes=all", *shlex.split(opt_level), *BASE_TCC_FLAGS, *INC_FLAGS, + "-c", str(source), "-o", "/dev/null"] + proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + # Compile errors surface on stderr and produce no dump -- bubble them up. + if proc.returncode != 0 or "=== AFTER" not in (proc.stdout or ""): + raise RuntimeError(f"tcc dump failed:\n{proc.stderr.strip()}") + blocks: list[tuple[str, list[str]]] = [] + cur_name, cur_lines = None, None + for ln in (proc.stdout or "").splitlines(): + hm = _PASS_HDR.match(ln) + if hm: + cur_name, cur_lines = hm.group(1), [] + continue + if _PASS_END.match(ln): + if cur_name is not None: + blocks.append((cur_name, cur_lines)) + cur_name, cur_lines = None, None + continue + if cur_name is not None: + cur_lines.append(ln) + return blocks + + +_ADDR_RE = re.compile(r"^\s*(\d+):\s*(.*)$") +# A memory read at a given instruction address: a load through a pointer +# (***DEREF***), a plain [LOAD], or a LOAD_INDEXED op. +_MEM_READ_RE = re.compile(r"(LOAD_INDEXED|\*\*\*DEREF\*\*\*|\[LOAD\])") +_CONST_ASSIGN_RE = re.compile(r"<--\s*#-?[0-9a-fA-Fx]+\b") + + +def _index_by_addr(lines: list[str]) -> dict[str, str]: + """Map instruction-address -> normalized RHS text for an IR block.""" + idx: dict[str, str] = {} + for ln in lines: + m = _ADDR_RE.match(ln) + if not m: + continue + addr, rhs = m.group(1), m.group(2) + idx[addr] = rhs + return idx + + +def find_const_folds(blocks: list[tuple[str, list[str]]]) -> list[dict]: + """Find passes that turned a memory read into a constant at the same addr. + + Returns a list of dicts: {pass, addr, before_line, after_line}. Operates on + consecutive block pairs in document order; pipeline restarts (new function) + produce totally different addr sets and are naturally ignored because no + shared addr is both a mem-read and a const-assign. 
+ """ + findings: list[dict] = [] + for (name_a, lines_a), (name_b, lines_b) in zip(blocks, blocks[1:]): + ia, ib = _index_by_addr(lines_a), _index_by_addr(lines_b) + for addr, rhs_b in ib.items(): + rhs_a = ia.get(addr) + if rhs_a is None: + continue + was_mem = bool(_MEM_READ_RE.search(rhs_a)) + now_const = bool(_CONST_ASSIGN_RE.search(rhs_b)) and ("[LOAD]" in rhs_b or "[ASSIGN]" in rhs_b) + # The interesting transition: was a real memory read, now a constant. + if was_mem and now_const and not _MEM_READ_RE.search(rhs_b): + findings.append({ + "pass": name_b, "addr": addr, + "before": f"{addr}: {rhs_a}", "after": f"{addr}: {rhs_b}", + }) + # previous block for next iteration + del ia, ib + return findings + + +# --------------------------------------------------------------------------- +# Phases +# --------------------------------------------------------------------------- + +def phase_knobs(source: Path, low: str, high: str, work_dir: Path) -> list[str]: + """Phase A: which -fno- flags at ``high`` restore the ``low`` signature.""" + ref = H.run_with_tcc(source, low, work_dir) + if not ref.ok: + print(f"[bisect] reference level {low} did not produce output: {ref.error}", file=sys.stderr) + return [] + ref_sig = ref.signature + print(f"[bisect] reference {low} signature = {ref_sig[0]!r}/{ref_sig[1]}") + + bad = H.run_with_tcc(source, high, work_dir) + if bad.ok and bad.signature == ref_sig: + print(f"[bisect] {high} already matches {low} -- nothing to bisect.") + return [] + if bad.ok: + print(f"[bisect] {high} signature = {bad.signature[0]!r}/{bad.signature[1]} (DIVERGENT)") + else: + print(f"[bisect] {high} failed to build/run: {bad.error}", file=sys.stderr) + + knobs = _parse_knobs() + print(f"[bisect] Phase A: testing {len(knobs)} knobs under QEMU ...") + fixes: list[str] = [] + for i, (opt_field, flag) in enumerate(knobs, 1): + cflags = f"{high} -fno-{flag}" + r = H.run_with_tcc(source, cflags, work_dir) + restored = r.ok and r.signature == ref_sig + tag = 
"FIXES" if restored else " " + if restored: + fixes.append(flag) + print(f" [{i:2d}/{len(knobs)}] {tag} -fno-{flag:<22} -> " + f"{r.signature[0]!r}/{r.signature[1]}" + ("" if r.ok else " (build/run fail)")) + return fixes + + +def phase_passes(source: Path, high: str, culprit_knobs: list[str]) -> int: + """Phase B: dump passes, find memory->constant folds, correlate with knobs. + Returns the number of folds surfaced (after culprit filtering).""" + pass2knob = _parse_pass_to_knob() + group_labels = _parse_group_labels() + try: + blocks = dump_passes(source, high) + except RuntimeError as e: + print(f"[bisect] could not dump IR passes: {e}", file=sys.stderr) + return 0 + print(f"\n[bisect] Phase B: {len(blocks)} pass blocks dumped at {high}; " + f"scanning for memory->constant folds ...") + folds = find_const_folds(blocks) + if not folds: + print("[bisect] no memory->constant folds detected between consecutive passes.") + return 0 + + def label_knob(label: str) -> str: + if label in pass2knob: + return pass2knob[label].replace("_", "-") # map values already lack "opt_"; convert field spelling to the dashed -f flag name used in culprit_knobs + if label in group_labels: + return "" + return "?" + + culprit_set = set(culprit_knobs) + shown = 0 + seen_passes: set[str] = set() + for f in folds: + knob = label_knob(f["pass"]) + seen_passes.add(f["pass"]) + # When we have culprit knobs, only surface folds whose pass is gated by + # one of them; otherwise show everything. Group labels aggregate many + # passes, so we always show them (the LLM greps the label in ir/). 
def main(argv=None) -> int:
    """Command-line entry point for the optimization-knob bisector.

    Parses the options, picks (or generates) the C source to bisect, then
    either runs the standalone final-IR diff for ``--diff-knob`` or walks
    through the knob sweep, the pass scan and the final-IR diff in turn.

    Args:
        argv: argument list handed to argparse; ``None`` means ``sys.argv``.

    Returns:
        0 when ``--diff-knob`` ran, or when a culprit knob was found or the
        knob sweep was skipped; 1 when the sweep found no culprit (or QEMU is
        unusable and ``--require-qemu`` was given); 2 when QEMU is unusable
        and it was needed but not required.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    input_group = parser.add_mutually_exclusive_group()
    input_group.add_argument("--seed", type=int,
                             help="gen_c.py seed to (re)generate")
    input_group.add_argument("--file", type=str,
                             help="existing .c file to bisect")
    parser.add_argument("--low", default="-O0",
                        help="reference (correct) opt level")
    parser.add_argument("--high", default="-O1",
                        help="divergent opt level")
    parser.add_argument("--work-dir", type=str, default=None)
    parser.add_argument("--skip-knobs", action="store_true",
                        help="skip Phase A (QEMU knob sweep); do IR-only Phase B")
    parser.add_argument("--diff-knob", type=str, default=None,
                        help="run only Phase C: diff final IR at vs -fno-")
    parser.add_argument("--require-qemu", action="store_true")
    opts = parser.parse_args(argv)

    # QEMU is only mandatory for the knob sweep; the IR-only paths
    # (--skip-knobs / --diff-knob) carry on without it.
    qemu_ok, why = H.qemu_available()
    needs_qemu = not (opts.skip_knobs or opts.diff_knob)
    if needs_qemu and not qemu_ok:
        print(f"[bisect] QEMU/newlib not usable: {why}", file=sys.stderr)
        return 1 if opts.require_qemu else 2

    if opts.work_dir:
        scratch = Path(opts.work_dir)
    else:
        scratch = FUZZ_DIR / "results" / "_bisect"
    scratch.mkdir(parents=True, exist_ok=True)

    if opts.file:
        program = Path(opts.file)
    else:
        # Without --seed the script falls back to seed 295.
        seed = 295 if opts.seed is None else opts.seed
        program = scratch / f"fuzz_{seed}.c"
        program.write_text(generate_program(seed))

    print(f"[bisect] source: {program}")
    print(f"[bisect] low={opts.low} high={opts.high}")

    # Phase C standalone: diff final IR for a single knob and exit.
    if opts.diff_knob:
        diff_knob(program, opts.high, opts.diff_knob)
        return 0

    culprits = []
    if not opts.skip_knobs:
        culprits = phase_knobs(program, opts.low, opts.high, scratch)
        if not culprits:
            print("\n[bisect] no single -fno- restored the reference.")
        else:
            print(f"\n[bisect] >> Culprit knob(s) [QEMU-confirmed]: {culprits}")

    # Phase B; its return value (number of folds shown) is not needed here.
    phase_passes(program, opts.high, culprits)

    # Phase C: with a known culprit, always show the final-IR delta it
    # induces. Knobs earlier in this tuple are preferred; if none of them is a
    # culprit, the first culprit reported by the sweep is used.
    if culprits:
        preference = ("jump-threading", "store-load-fwd", "loop-unroll",
                      "dead-store-elim", "disp-fusion")
        selected = next((k for k in preference if k in culprits), culprits[0])
        diff_knob(program, opts.high, selected)

    for hint in (
        "\n[bisect] next steps:",
        " 1. read the BEFORE/AFTER fold line (Phase B) and/or the final-IR diff (Phase C)",
        " 2. open the implicated pass function in ir/opt_*.c (see docs/debugging_fuzz_divergences.md)",
        " 3. add a reduced regression test in tests/ir_tests/ before fixing",
    ):
        print(hint)

    if culprits or opts.skip_knobs:
        return 0
    return 1
# Copy whichever of the compiler/runtime/config files exist at the repo top;
# a missing file is skipped (best-effort, never fatal).
for f in armv8m-tcc armv8m-tcc.exe armv8m-libtcc1.a config.mak; do
  [ -f "$TOP/$f" ] && cp "$TOP/$f" "$OUT/" || true
done

# 3) Work dirs of the failed tests only.
# The embedded Python reads the JUnit report, derives the tmp-dir prefix of
# every failed/errored testcase, and copies the matching directories from each
# run dir under $BASETEMP_ROOT until the $MAX_TESTDIR_BYTES budget is used up.
if [ -f "$OUT/junit.xml" ] && [ -d "$BASETEMP_ROOT" ]; then
  python3 - "$OUT/junit.xml" "$BASETEMP_ROOT" "$OUT/failed-test-dirs" "$MAX_TESTDIR_BYTES" <<'PY' || true
import os, re, shutil, sys, xml.etree.ElementTree as ET

junit, basetemp_root, dest, max_bytes = sys.argv[1:5]
max_bytes = int(max_bytes)

try:
    root = ET.parse(junit).getroot()
except Exception as e:
    print(f"collect: could not parse junit ({e})", file=sys.stderr)
    sys.exit(0)

# pytest names a test's tmp dir from re.sub(r"\W","_", node_name)[:30] + a number.
prefixes = {
    re.sub(r"\W", "_", tc.get("name", ""))[:30]
    for tc in root.iter("testcase")
    if tc.find("failure") is not None or tc.find("error") is not None
}
# A testcase without a name yields an empty prefix, and str.startswith("")
# is true for every directory -- that would copy ALL tmp dirs, not just the
# failed ones. Drop it.
prefixes.discard("")
if not prefixes:
    print("collect: no failed testcases in junit")
    sys.exit(0)
prefix_tuple = tuple(prefixes)  # str.startswith accepts a tuple of candidates


def dir_size(p):
    """Total size in bytes of the regular files under p (symlinks skipped)."""
    total = 0
    for r, _, files in os.walk(p):
        for f in files:
            fp = os.path.join(r, f)
            if not os.path.islink(fp) and os.path.exists(fp):
                total += os.path.getsize(fp)
    return total


os.makedirs(dest, exist_ok=True)
total = copied = 0
for run in sorted(os.listdir(basetemp_root)):
    run_dir = os.path.join(basetemp_root, run)
    if not os.path.isdir(run_dir):
        continue
    for d in sorted(os.listdir(run_dir)):
        src = os.path.join(run_dir, d)
        if not os.path.isdir(src) or not d.startswith(prefix_tuple):
            continue
        sz = dir_size(src)
        if total + sz > max_bytes:
            # Report the cap actually in effect (MAX_TESTDIR_BYTES is
            # configurable), then stop: nothing further is copied.
            print(f"collect: {max_bytes}-byte cap reached at {total} bytes; "
                  f"skipping remaining dirs", file=sys.stderr)
            print(f"collect: copied {copied} failed-test dir(s), {total} bytes")
            sys.exit(0)
        # Prefix the run dir name so same-named test dirs from different
        # pytest runs do not overwrite each other.
        shutil.copytree(src, os.path.join(dest, f"{run}__{d}"), dirs_exist_ok=True)
        total += sz
        copied += 1
print(f"collect: copied {copied} failed-test dir(s), {total} bytes")
PY
fi

# 4) One archive for upload.
+( cd "$(dirname "$OUT")" && tar czf "$(basename "$OUT").tar.gz" "$(basename "$OUT")" ) || true +echo "collect: bundle at $OUT.tar.gz" +ls -la "$OUT" 2>/dev/null || true diff --git a/scripts/diff_olevels.py b/scripts/diff_olevels.py new file mode 100644 index 00000000..0b41c855 --- /dev/null +++ b/scripts/diff_olevels.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +"""Track 2 -- optimization-level self-consistency differential. + +Oracle: a program's observable output (stdout + exit code) must be **identical** +at ``-O0``, ``-O1`` and ``-O2``. Any divergence means an optimization changed +behaviour -> a candidate miscompile, with the offending O-level pinned. + +For each seed we generate a UB-free random C program (``tests/fuzz/gen_c.py``), +compile it with ``armv8m-tcc`` at each O-level, run each under QEMU +``mps2-an505`` (reusing the ``tests/ir_tests`` plumbing via +``tests/fuzz/fuzz_harness.py``), and compare the (stdout, exit) signatures. + +On divergence the offending ``.c`` and the per-level outputs are saved to a +results directory and the seed is reported. Because the generator is UB-free by +construction, a divergence here is a real self-consistency failure (re-check the +generator's guarantees before filing, per the plan's rules). + +Usage: + python scripts/diff_olevels.py --seeds 0-49 + python scripts/diff_olevels.py --seed 0 --seed 7 --seed 42 + python scripts/diff_olevels.py --count 100 --start 0 --results-dir /tmp/fuzz_olevels + python scripts/diff_olevels.py --file path/to/program.c # one fixed file + +Exit code: 0 if all consistent, 1 if any divergence (or harness unusable when +``--require-qemu`` is given). +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +# Make tests/fuzz importable. 
def parse_seed_spec(args) -> list[int]:
    """Build the ordered, duplicate-free seed list from the parsed CLI options.

    Sources are consumed in this order: ``args.seeds`` (comma-separated
    integers and inclusive ``lo-hi`` ranges), ``args.seed`` (a list or
    ``None``), then ``args.count`` seeds counted up from ``args.start``.
    When nothing was requested and ``args.file`` is unset, seeds 0..19 are
    used. The first occurrence of a repeated seed wins.
    """
    collected: list[int] = []

    for piece in (args.seeds.split(",") if args.seeds else ()):
        piece = piece.strip()
        if "-" in piece:
            # "lo-hi" is an inclusive range.
            first, last = piece.split("-", 1)
            collected += range(int(first), int(last) + 1)
        elif piece:
            collected.append(int(piece))

    collected += args.seed or []

    if args.count:
        collected += range(args.start, args.start + args.count)

    if not (collected or args.file):
        collected = list(range(20))  # sensible default

    # dict keys keep insertion order, so this de-duplicates while preserving
    # the first-seen position of every seed.
    return list(dict.fromkeys(collected))
def main(argv=None) -> int:
    """Command-line entry point for the O-level self-consistency check.

    Runs either one fixed ``--file`` or every requested generated seed through
    ``check_one`` at the configured opt levels, prints one status line per
    program, and saves a repro directory for each divergence.

    Args:
        argv: argument list handed to argparse; ``None`` means ``sys.argv``.

    Returns:
        1 if any program diverged (or QEMU is unusable and ``--require-qemu``
        was given), otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--seed", type=int, action="append",
                        help="a single seed (repeatable)")
    parser.add_argument("--seeds", type=str,
                        help="comma list / ranges, e.g. '0-49,100'")
    parser.add_argument("--count", type=int, default=0,
                        help="number of seeds from --start")
    parser.add_argument("--start", type=int, default=0,
                        help="first seed for --count")
    parser.add_argument("--file", type=str, default=None,
                        help="diff a fixed .c file instead of generated seeds")
    parser.add_argument("--opt-levels", type=str,
                        default=",".join(DEFAULT_OPT_LEVELS),
                        help="comma-separated opt levels (default -O0,-O1,-O2)")
    parser.add_argument("--results-dir", type=str, default=None,
                        help="where to save divergences (default tests/fuzz/results/olevels)")
    parser.add_argument("--work-dir", type=str, default=None,
                        help="scratch build dir (default /_build)")
    parser.add_argument("--require-qemu", action="store_true",
                        help="exit non-zero if QEMU/newlib is unprepared (default: skip)")
    opts = parser.parse_args(argv)

    # Without a usable harness the run is skipped (exit 0) unless the caller
    # asked for that to count as a failure.
    qemu_ok, why = H.qemu_available()
    if not qemu_ok:
        print(f"[diff_olevels] QEMU/newlib not usable: {why}", file=sys.stderr)
        return 1 if opts.require_qemu else 0

    levels = [lvl.strip() for lvl in opts.opt_levels.split(",") if lvl.strip()]
    if opts.results_dir:
        results_dir = Path(opts.results_dir)
    else:
        results_dir = FUZZ_DIR / "results" / "olevels"
    scratch = Path(opts.work_dir) if opts.work_dir else results_dir / "_build"
    scratch.mkdir(parents=True, exist_ok=True)

    def describe(run) -> str:
        """Signature of one run; failed runs add the first line of their error."""
        text = f"{run.label}={run.stdout.strip()!r}/{run.exit_code}"
        if not run.ok and run.error.strip():
            text += " (" + run.error.strip().splitlines()[0] + ")"
        return text

    n_checked = 0
    n_diverged = 0

    if opts.file:
        program = Path(opts.file)
        same, runs = check_one(program, levels, scratch)
        n_checked += 1
        summary = " | ".join(
            f"{run.label}={run.stdout.strip()!r}/{run.exit_code}" for run in runs
        )
        print(f"[{'OK ' if same else 'DIVERGE'}] {program.name}: {summary}")
        if not same:
            n_diverged += 1
            saved = _save_divergence(results_dir, program.stem, program, runs)
            print(f" saved -> {saved}")
    else:
        for seed in parse_seed_spec(opts):
            program = scratch / f"fuzz_{seed}.c"
            program.write_text(generate_program(seed))
            same, runs = check_one(program, levels, scratch)
            n_checked += 1
            if same:
                first = runs[0]
                print(f"[OK ] seed {seed}: {first.stdout.strip()!r} exit={first.exit_code}")
                continue
            n_diverged += 1
            summary = " | ".join(describe(run) for run in runs)
            print(f"[DIVERGE] seed {seed}: {summary}")
            saved = _save_divergence(results_dir, f"seed_{seed}", program, runs)
            print(f" repro saved -> {saved}")

    print(f"\n[diff_olevels] checked={n_checked} divergences={n_diverged} "
          f"opt_levels={levels}")
    return 1 if n_diverged else 0
UB-freedom is guaranteed by the + generator, so a divergence is a real wrong-output bug (re-verify generator + guarantees before filing, per plan rules). + +``--mode torture`` + Run the existing gcc c-torture **execute** tests through tcc. These tests + are self-checking -- they ``abort()`` (non-zero exit) on a wrong result -- + so we treat a non-zero exit as a candidate miscompile, triaged against the + suite's known skip / xfail lists (reused from ``tests/gcctestsuite``). No + gcc run is needed in this mode (the program is its own oracle). + +Usage: + python scripts/diff_vs_gcc.py --seeds 0-49 + python scripts/diff_vs_gcc.py --mode random --count 100 --start 0 + python scripts/diff_vs_gcc.py --file prog.c --gcc-opt -O2 + python scripts/diff_vs_gcc.py --mode torture --limit 200 + +Exit code: 0 if everything matched gcc / passed; 1 on any candidate miscompile +(or harness unusable with --require-qemu). +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +FUZZ_DIR = REPO_ROOT / "tests" / "fuzz" +if str(FUZZ_DIR) not in sys.path: + sys.path.insert(0, str(FUZZ_DIR)) + +import fuzz_harness as H # noqa: E402 +from gen_c import generate_program # noqa: E402 + +DEFAULT_TCC_OPT_LEVELS = ["-O0", "-O1", "-O2"] + + +# --------------------------------------------------------------------------- +# seed parsing (shared shape with diff_olevels) +# --------------------------------------------------------------------------- + +def parse_seed_spec(args) -> list[int]: + seeds: list[int] = [] + if args.seeds: + for token in args.seeds.split(","): + token = token.strip() + if "-" in token: + lo, hi = token.split("-", 1) + seeds.extend(range(int(lo), int(hi) + 1)) + elif token: + seeds.append(int(token)) + seeds.extend(args.seed or []) + if args.count: + seeds.extend(range(args.start, args.start + args.count)) + if not seeds and not args.file: + seeds = list(range(0, 20)) + seen, out = 
set(), [] + for s in seeds: + if s not in seen: + seen.add(s) + out.append(s) + return out + + +def _save_divergence(results_dir: Path, tag: str, source: Path, ref, tcc_results) -> Path: + results_dir.mkdir(parents=True, exist_ok=True) + case_dir = results_dir / tag + case_dir.mkdir(parents=True, exist_ok=True) + (case_dir / source.name).write_text(Path(source).read_text()) + lines = [f"# tcc-vs-gcc divergence: {tag}", ""] + lines.append(f"[{ref.label} REFERENCE] ok={ref.ok} exit={ref.exit_code} " + f"stdout={ref.stdout.strip()!r} err={ref.error.strip()!r}") + for r in tcc_results: + agree = "MATCH" if (r.ok and ref.ok and r.signature == ref.signature) else "DIFF" + lines.append(f"[{r.label}] {agree} ok={r.ok} exit={r.exit_code} " + f"stdout={r.stdout.strip()!r} err={r.error.strip()!r}") + (case_dir / "outputs.txt").write_text("\n".join(lines) + "\n") + return case_dir + + +# --------------------------------------------------------------------------- +# Mode: random +# --------------------------------------------------------------------------- + +def run_random(args) -> int: + ok_ref, reason = H.gcc_reference_available() + if not ok_ref: + print(f"[diff_vs_gcc] gcc reference not usable: {reason}", file=sys.stderr) + return 1 if args.require_qemu else 0 + + tcc_opts = [o.strip() for o in args.tcc_opt_levels.split(",") if o.strip()] + gcc_opt = args.gcc_opt + results_dir = Path(args.results_dir) if args.results_dir else (FUZZ_DIR / "results" / "vs_gcc") + work_dir = Path(args.work_dir) if args.work_dir else (results_dir / "_build") + work_dir.mkdir(parents=True, exist_ok=True) + + divergences = 0 + checked = 0 + + def diff_source(source: Path, tag: str): + nonlocal divergences, checked + ref = H.run_with_gcc(source, gcc_opt, work_dir) + checked += 1 + if not ref.ok: + print(f"[GCC-FAIL] {tag}: reference build/run failed: " + f"{ref.error.strip().splitlines()[0] if ref.error.strip() else '?'}") + return + tcc_results = [H.run_with_tcc(source, o, work_dir) for o in 
def run_torture(args) -> int:
    """Torture mode: run the self-checking gcc execute tests through tcc.

    Loads ``tests/gcctestsuite/conftest.py`` as a module to reuse its test
    discovery and skip/xfail lists, then compiles and runs every remaining
    test at each requested tcc opt level. A run counts as a candidate
    miscompile unless it succeeds with exit code 0; each candidate is printed
    and logged to a text file in the results directory.

    Returns:
        1 if any candidate was found, or if the harness / torture sources are
        unavailable and ``--require-qemu`` was given; otherwise 0.
    """
    qemu_ok, why = H.qemu_available()
    if not qemu_ok:
        print(f"[diff_vs_gcc] QEMU/newlib not usable: {why}", file=sys.stderr)
        return 1 if args.require_qemu else 0

    # Reuse the gcctestsuite discovery + skip/xfail lists.
    import importlib.util
    conftest_path = REPO_ROOT / "tests" / "gcctestsuite" / "conftest.py"
    conf_spec = importlib.util.spec_from_file_location("gcc_conftest", conftest_path)
    suite = importlib.util.module_from_spec(conf_spec)
    conf_spec.loader.exec_module(suite)

    if not suite.GCC_TORTURE_PATH.exists():
        print(f"[diff_vs_gcc:torture] torture tests not found at "
              f"{suite.GCC_TORTURE_PATH}; run 'make download-gcc-tests'",
              file=sys.stderr)
        return 1 if args.require_qemu else 0

    levels = [lvl.strip() for lvl in args.tcc_opt_levels.split(",") if lvl.strip()]
    if args.results_dir:
        results_dir = Path(args.results_dir)
    else:
        results_dir = FUZZ_DIR / "results" / "torture"
    scratch = Path(args.work_dir) if args.work_dir else results_dir / "_build"
    scratch.mkdir(parents=True, exist_ok=True)

    tests = suite.discover_gcc_execute_tests()
    if args.limit:
        tests = tests[: args.limit]

    n_candidates = n_ran = n_skipped = 0

    for case in tests:
        # Both predicates are evaluated for every test, as before.
        known_skip = suite.should_skip_gcc_test(case.source)
        known_xfail = suite.is_xfail_test(case.source)
        if known_skip or known_xfail:
            n_skipped += 1
            continue

        for level in levels:
            # Per-test dg-options, when present, are appended to the opt level.
            flags = f"{level} {case.dg_options}" if case.dg_options else level
            # Reuse the tcc QEMU path; the program self-checks via abort().
            outcome = H.run_with_tcc(case.source, flags, scratch)
            n_ran += 1

            # A self-checking execute test passes iff it exits 0.
            if outcome.ok and outcome.exit_code == 0:
                continue

            n_candidates += 1
            err_text = outcome.error.strip()
            if err_text:
                cause = err_text.splitlines()[0]
            else:
                cause = f"exit={outcome.exit_code}"
            print(f"[CANDIDATE] {case.source.stem} {level}: {cause}")

            results_dir.mkdir(parents=True, exist_ok=True)
            report = results_dir / f"{case.source.stem}{level.replace('-', '')}.txt"
            report.write_text(
                f"# torture candidate miscompile: {case.source} {level}\n"
                f"exit={outcome.exit_code} ok={outcome.ok}\n"
                f"stdout={outcome.stdout.strip()!r}\n"
                f"error={outcome.error.strip()!r}\n"
            )

    print(f"\n[diff_vs_gcc:torture] ran={n_ran} candidates={n_candidates} "
          f"skipped(known)={n_skipped} tcc_opts={levels}")
    return 1 if n_candidates else 0
'0-49,100'") + ap.add_argument("--count", type=int, default=0, help="number of seeds from --start") + ap.add_argument("--start", type=int, default=0, help="first seed for --count") + ap.add_argument("--file", type=str, default=None, help="diff a fixed .c file") + ap.add_argument("--gcc-opt", type=str, default="-O2", help="gcc reference O-level") + ap.add_argument("--tcc-opt-levels", type=str, default=",".join(DEFAULT_TCC_OPT_LEVELS), + help="comma-separated tcc opt levels") + # torture-mode inputs + ap.add_argument("--limit", type=int, default=0, + help="(torture) cap the number of discovered tests") + # shared + ap.add_argument("--results-dir", type=str, default=None) + ap.add_argument("--work-dir", type=str, default=None) + ap.add_argument("--require-qemu", action="store_true", + help="exit non-zero if QEMU/newlib is unprepared (default: skip)") + args = ap.parse_args(argv) + + if args.mode == "torture": + return run_torture(args) + return run_random(args) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/reduce_divergence.py b/scripts/reduce_divergence.py new file mode 100644 index 00000000..46fd1a1a --- /dev/null +++ b/scripts/reduce_divergence.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +"""Delta-reduce a divergent C program to a smaller repro (Phase BH helper). + +Given a ``.c`` file that produces different output under armv8m-tcc at two +different optimization levels (the "interestingness" property), greedily delete +top-level functions and individual statement lines while the divergence persists, +yielding a smaller program with the same bug. Reuses the QEMU harness +(``tests/fuzz/fuzz_harness.py``) so the reduced program is still validated +end-to-end on the real target. + +This is intentionally simple (line/function granularity, not a full C reducer +like creduce) -- enough to hand a much smaller repro to bug-fix work. 
def reduce_text(text: str, low: str, high: str, work_dir: Path) -> str:
    """Greedily delete lines from ``text`` while the divergence persists.

    Each unprotected line is tentatively removed; the removal is kept only if
    ``diverges`` still reports a divergence between ``low`` and ``high``.
    Full sweeps over the program repeat until one sweep removes nothing.

    Returns:
        The reduced program, newline-terminated.

    Raises:
        AssertionError: if the input does not diverge to begin with.
    """
    work_dir.mkdir(parents=True, exist_ok=True)
    assert diverges(text, low, high, work_dir), "input does not diverge"

    # Lines that must survive so the program stays compilable and keeps
    # printing something: includes, the main signature, printf/return, the
    # struct S / checksum declarations, anything mentioning csmix, and
    # brace-only lines.
    keep_prefixes = ("#include", "int main", "printf", "return",
                     "struct S", "unsigned cs =")

    def protected(ln: str) -> bool:
        stripped = ln.strip()
        if stripped in ("{", "}") or "csmix" in stripped:
            return True
        return stripped.startswith(keep_prefixes)

    remaining = text.splitlines(keepends=False)
    while True:
        removed_any = False
        pos = 0
        while pos < len(remaining):
            if not protected(remaining[pos]):
                candidate = remaining[:pos] + remaining[pos + 1:]
                if diverges("\n".join(candidate) + "\n", low, high, work_dir):
                    # Keep the deletion and stay at `pos`: the next line has
                    # shifted into this slot.
                    remaining = candidate
                    removed_any = True
                    continue
            pos += 1
        if not removed_any:
            break
    return "\n".join(remaining) + "\n"
# Locate runseed.sh relative to this script (<repo>/scripts/ ->
# <repo>/tests/fuzz/runseed.sh) instead of hard-coding one developer's
# checkout path, so the reducer works from any clone.
RUNSEED = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    "tests", "fuzz", "runseed.sh",
)


def sig(path, olevel):
    """Run runseed.sh on ``path`` at ``olevel`` and return its signature.

    The signature is the last line of the script's stripped stdout,
    ``"NO_OUTPUT"`` when it printed nothing, or ``"TIMEOUT"`` when the run
    exceeded 60 seconds.
    """
    try:
        out = subprocess.run(["bash", RUNSEED, path, olevel], capture_output=True,
                             text=True, timeout=60).stdout.strip().splitlines()
        return out[-1] if out else "NO_OUTPUT"
    except subprocess.TimeoutExpired:
        return "TIMEOUT"


def interesting(lines, lo, hi, tmpdir):
    """True iff both O-levels print a ``checksum=`` line and the lines differ.

    The candidate program is written to ``cand.c`` inside ``tmpdir``. The
    ``hi`` level is only run when the ``lo`` run produced a checksum line.
    """
    src = "\n".join(lines) + "\n"
    p = os.path.join(tmpdir, "cand.c")
    with open(p, "w") as f:
        f.write(src)
    a = sig(p, lo)
    if not a.startswith("checksum="):
        return False
    b = sig(p, hi)
    if not b.startswith("checksum="):
        return False
    return a != b


def block_end(lines, i):
    """If line i opens a block ({ at end), return index of matching close.

    Braces are counted textually from line ``i`` onwards. Returns ``None``
    when no balancing line is found within roughly 200 lines or before the
    end of ``lines``.
    """
    depth = 0
    opened = False
    for j in range(i, len(lines)):
        depth += lines[j].count("{") - lines[j].count("}")
        if lines[j].count("{"):
            opened = True
        if opened and depth <= 0:
            return j
        if j > i + 200:
            break
    return None


def main():
    """Reduce ``argv[1]`` and write the result to ``argv[4]``.

    ``argv[2]`` / ``argv[3]`` are the two O-levels whose checksums must keep
    differing. At most 6 greedy rounds are run. The scratch directory is
    removed on exit, including on error.
    """
    if len(sys.argv) < 5:
        print(f"usage: {sys.argv[0]} SRC.c LOW_OLEVEL HIGH_OLEVEL OUT.c",
              file=sys.stderr)
        raise SystemExit(2)
    src, lo, hi, out = sys.argv[1:5]
    with open(src) as f:
        lines = f.read().splitlines()

    with tempfile.TemporaryDirectory(prefix="rreduce") as tmpdir:
        # Explicit check rather than `assert`, which `python -O` strips.
        if not interesting(lines, lo, hi, tmpdir):
            raise SystemExit("[rreduce] original not interesting!")
        changed = True
        rounds = 0
        while changed and rounds < 6:
            changed = False
            rounds += 1
            i = 0
            while i < len(lines):
                line = lines[i].strip()
                if not line or line.startswith("#include") or line.startswith("return"):
                    # deleting a return from a helper whose value is used
                    # introduces UB (uninitialized r0) — the divergence then
                    # tracks garbage, not the original bug
                    i += 1
                    continue
                # try deleting a whole block first if the line opens one
                cand = None
                if line.endswith("{") or ("{" in line and "}" not in line):
                    j = block_end(lines, i)
                    if j is not None and j > i:
                        cand = lines[:i] + lines[j+1:]
                        if interesting(cand, lo, hi, tmpdir):
                            lines = cand
                            changed = True
                            print(f"[rreduce] deleted block {i}..{j} ({len(lines)} lines left)", flush=True)
                            continue
                # then the single line
                cand = lines[:i] + lines[i+1:]
                if interesting(cand, lo, hi, tmpdir):
                    lines = cand
                    changed = True
                    print(f"[rreduce] deleted line {i} ({len(lines)} lines left)", flush=True)
                    continue
                i += 1

    with open(out, "w") as f:
        f.write("\n".join(lines) + "\n")
    print(f"[rreduce] done: {len(lines)} lines -> {out}")


# Only run when executed as a script, so importing the module has no side effects.
if __name__ == "__main__":
    main()
+ +Given a sweep-report entry like ``longlong 3161`` this script recollects, in +one run, every artifact the per-bug investigation loop in +``docs/debugging_fuzz_divergences.md`` starts from: + + / + seed.c the generated program (gen_c.py --profile --seed N) + outputs.txt tcc signatures at every O-level -- FULL stdout, so a + HardFault keeps its PC=/CFSR=/BFAR= register dump + gcc_reference.txt arm-none-eabi-gcc -O2 ground truth (the trusted oracle) + reduced.c line-granularity reduction that preserves the divergence + bisect.txt scripts/bisect_opt.py Phase A/B/C output (reduced repro) + crash_disasm.txt (crash signatures only) force-thumb disassembly window + around the faulting PC of the divergent tcc ELF + SUMMARY.md one-page digest of all of the above + +The sweep reports (fuzz_triage_*.md) list seeds PER SUITE/PROFILE: ``ptr 5759`` +is seed 5759 of gen_c.py's ``ptr`` profile, which is NOT the program that +``diff_olevels.py --seed 5759`` (default profile) generates. This script owns +that mapping so nobody has to re-derive it. + +Usage: + python3 scripts/triage_seed.py --suite longlong --seed 3161 + python3 scripts/triage_seed.py --suite ptr --seed 5759 --olevels -O0,-O2 + python3 scripts/triage_seed.py --file repro.c # existing repro + python3 scripts/triage_seed.py --suite ptr --seed 5759 --skip-reduce + +Exit code: 0 = consistent (nothing to triage), 1 = divergence collected, +2 = harness/infra error. +""" + +from __future__ import annotations + +import argparse +import re +import subprocess +import sys +from pathlib import Path + +# Make tests/fuzz importable (same pattern as diff_olevels.py). 
def crash_pc(stdout: str):
    """Return the first ``PC=0x<hex>`` value found in ``stdout`` as an int.

    Returns ``None`` when the text contains no such field.
    """
    hit = re.search(r"PC=0x([0-9A-Fa-f]+)", stdout)
    if hit is None:
        return None
    return int(hit.group(1), 16)
A HardFault PC + inside what objdump renders as garbage/data usually means execution fell + into a literal pool or jump table -- exactly the layout-bug signature.""" + cmd = [OBJDUMP, "-d", "-M", "force-thumb", + f"--start-address={max(pc - before, 0):#x}", + f"--stop-address={pc + after:#x}", str(elf)] + r = subprocess.run(cmd, capture_output=True, text=True) + header = f"$ {' '.join(cmd)}\n(faulting PC: {pc:#x})\n\n" + return header + (r.stdout or r.stderr) + + +def main(argv=None) -> int: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--suite", "--profile", dest="suite", type=str, default=None, + choices=sorted(PROFILES.keys()), + help="gen_c.py profile the seed belongs to (sweep-report section name)") + ap.add_argument("--seed", type=int, default=None, help="seed within --suite") + ap.add_argument("--file", type=str, default=None, + help="triage an existing .c repro instead of generating one") + ap.add_argument("--olevels", type=str, default=",".join(DEFAULT_OPT_LEVELS), + help=f"comma-separated opt levels (default {','.join(DEFAULT_OPT_LEVELS)})") + ap.add_argument("--out", type=str, default=None, + help="artifact dir (default tests/fuzz/results/triage/_)") + ap.add_argument("--skip-reduce", action="store_true", + help="skip reduce_divergence.py (bisect runs on the full repro)") + ap.add_argument("--skip-bisect", action="store_true", + help="skip bisect_opt.py") + args = ap.parse_args(argv) + + if args.file is None and (args.suite is None or args.seed is None): + ap.error("either --suite AND --seed, or --file is required") + + usable, reason = H.qemu_available() + if not usable: + log(f"QEMU/newlib not usable: {reason}") + return 2 + + opt_levels = [o.strip() for o in args.olevels.split(",") if o.strip()] + if "-O0" not in opt_levels: + opt_levels.insert(0, "-O0") # -O0 is the trusted self-consistency oracle + + tag = f"{args.suite}_{args.seed}" if args.file is None else 
Path(args.file).stem + out_dir = Path(args.out) if args.out else REPO_ROOT / "tests" / "fuzz" / "results" / "triage" / tag + out_dir.mkdir(parents=True, exist_ok=True) + build_dir = out_dir / "_build" + + # -- 1. the program ----------------------------------------------------- + source = out_dir / "seed.c" + if args.file is not None: + source.write_text(Path(args.file).read_text()) + log(f"copied repro {args.file} -> {source}") + else: + source.write_text(generate_program(args.seed, profile=args.suite)) + log(f"generated {args.suite} seed {args.seed} -> {source}") + + # -- 2. tcc signatures at every O-level (full output, incl. fault dumps) - + results = {} + elfs = {} + for o in opt_levels: + res, elf = run_tcc_keep_elf(source, o, build_dir) + results[o], elfs[o] = res, elf + log(f"{res.label}: exit={res.exit_code} stdout={res.stdout.strip()!r}" + + (f" err={res.error}" if res.error else "")) + outputs = [f"[{results[o].label}] ok={results[o].ok} exit={results[o].exit_code}\n" + f"{results[o].stdout.rstrip()}\n" for o in opt_levels] + (out_dir / "outputs.txt").write_text("\n".join(outputs)) + + ref_sig = results["-O0"].signature + divergent = [o for o in opt_levels + if o != "-O0" and (not results[o].ok or results[o].signature != ref_sig)] + + # -- 3. 
gcc ground truth (must equal tcc -O0) --------------------------- + gcc_line = "unavailable" + gcc_ok, gcc_reason = H.gcc_reference_available() + if gcc_ok: + gcc_res = H.run_with_gcc(source, "-O2", out_dir / "_gccbuild") + gcc_line = f"exit={gcc_res.exit_code} stdout={gcc_res.stdout.strip()!r}" + (out_dir / "gcc_reference.txt").write_text( + f"[{gcc_res.label}] ok={gcc_res.ok} {gcc_line}\n") + log(f"gcc -O2 reference: {gcc_line}") + if gcc_res.ok and gcc_res.signature != ref_sig: + log("WARNING: gcc -O2 disagrees with tcc -O0 -- one oracle is " + "miscompiling; cross-check before trusting either " + "(see the gcc-bad quarantine cases in the sweep reports)") + else: + log(f"gcc reference skipped: {gcc_reason}") + + if not divergent: + (out_dir / "SUMMARY.md").write_text( + f"# {tag}: CONSISTENT\n\nAll of {', '.join(opt_levels)} produced " + f"{ref_sig!r}; gcc -O2: {gcc_line}.\nNothing to triage.\n") + log(f"CONSISTENT across {','.join(opt_levels)} -- nothing to triage") + return 0 + + high = divergent[0] + log(f"DIVERGENT at {','.join(divergent)}; using --high={high}") + + # -- 4. crash disassembly (before reduce: layout bugs die under reduction + # of a DIFFERENT kind, and the full seed is what actually faulted) -- + pc = crash_pc(results[high].stdout) if results[high].ok else None + if pc is not None and elfs[high] is not None: + (out_dir / "crash_disasm.txt").write_text(disassemble_window(elfs[high], pc)) + log(f"crash at PC={pc:#x}: disassembly window -> crash_disasm.txt") + + # -- 5. reduce ---------------------------------------------------------- + reduced = out_dir / "reduced.c" + bisect_input = source + if args.skip_reduce: + log("reduction skipped (--skip-reduce)") + else: + log(f"reducing (low=-O0 high={high}) ... 
this can take a few minutes") + r = subprocess.run( + [sys.executable, str(REPO_ROOT / "scripts" / "reduce_divergence.py"), + str(source), f"--low=-O0", f"--high={high}", "-o", str(reduced)], + capture_output=True, text=True) + if r.returncode == 0 and reduced.exists(): + bisect_input = reduced + log(f"reduced -> {reduced} ({sum(1 for _ in open(reduced))} lines)") + else: + log(f"reduction failed (rc={r.returncode}); bisecting the full seed\n" + + (r.stderr or r.stdout).strip()) + + # -- 6. bisect (Phase A knobs / Phase B folds / Phase C final-IR diff) --- + culprits = "not run" + if args.skip_bisect: + log("bisect skipped (--skip-bisect)") + else: + log(f"bisecting {bisect_input.name} at {high} ...") + r = subprocess.run( + [sys.executable, str(REPO_ROOT / "scripts" / "bisect_opt.py"), + "--file", str(bisect_input), f"--high={high}"], + capture_output=True, text=True) + (out_dir / "bisect.txt").write_text(r.stdout + r.stderr) + m = re.search(r"Culprit knob\(s\).*?:\s*(.*)", r.stdout) + culprits = m.group(1).strip() if m else "none found (see bisect.txt)" + log(f"culprit knob(s): {culprits}") + + # -- 7. 
summary ---------------------------------------------------------- + lines = [f"# Triage data: {tag}", ""] + if args.file is None: + lines.append(f"Suite/profile: `{args.suite}` seed: `{args.seed}`") + lines += [ + "", + "| level | exit | output |", + "|---|---|---|", + *[f"| `tcc {o}` | {results[o].exit_code} | `{results[o].stdout.strip()!r}` |" + for o in opt_levels], + f"| `gcc -O2` (oracle) | | `{gcc_line}` |", + "", + f"Divergent level(s): **{', '.join(divergent)}**", + f"Culprit knob(s) [Phase A]: **{culprits}**", + ] + if pc is not None: + lines += [ + f"Crash: faulting PC `{pc:#x}` -- see `crash_disasm.txt`.", + "", + "> Many unrelated \"fixing\" knobs + a wild PC/BFAR usually means a", + "> layout-sensitive BACKEND bug (literal pool / IT block / branch", + "> range), not an IR misfold: read `crash_disasm.txt` around the PC", + "> first (is it inside pool data? right after an IT block?).", + ] + lines += [ + "", + "Artifacts: `seed.c`, `outputs.txt`, `gcc_reference.txt`, " + "`reduced.c`, `bisect.txt`" + + (", `crash_disasm.txt`" if pc is not None else "") + ".", + "", + "Next steps: docs/debugging_fuzz_divergences.md sections 3-5 " + "(read the IR, write the regression test FIRST, then fix).", + ] + (out_dir / "SUMMARY.md").write_text("\n".join(lines) + "\n") + log(f"summary -> {out_dir / 'SUMMARY.md'}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tcc.h b/tcc.h index a12b3d1a..10c3e3c0 100644 --- a/tcc.h +++ b/tcc.h @@ -1459,6 +1459,15 @@ struct TCCState } *vla_param_exprs; int nb_vla_param_exprs; + /* Inner (nested) VLA dimension token streams saved on a SYM_FIELD's + vla_array_str. Materialization (func_vla_arg_code) frees and NULLs them at + a function definition's entry, but an inner VLA inside an abstract / + function-pointer declarator (e.g. a typedef `void(*)(int[][n()])`) is never + materialized, so its heap token stream would leak. Tracked here so any + unconsumed buffer is reclaimed at end of translation unit. 
*/ + int **vla_inner_exprs; + int nb_vla_inner_exprs; + /* linker script support */ char *linker_script; /* path to linker script file (-T option) */ struct LDScript *ld_script; /* parsed linker script */ diff --git a/tccgen.c b/tccgen.c index e5bca23e..65ac38a1 100644 --- a/tccgen.c +++ b/tccgen.c @@ -67,6 +67,56 @@ static int local_scope; static int func_param_decl_depth; ST_DATA char debug_modes; +typedef struct FuncallScratch +{ + SValue *saved_args; + unsigned char **saved_args_cid; + int *saved_args_cid_size; + int saved_arg_count; + struct FuncallScratch *next; +} FuncallScratch; + +static FuncallScratch *funcall_scratch_stack; + +static void funcall_scratch_free(FuncallScratch *fs) +{ + int i; + + if (!fs) + return; + tcc_free(fs->saved_args); + for (i = 0; i < fs->saved_arg_count; i++) + tcc_free(fs->saved_args_cid[i]); + tcc_free(fs->saved_args_cid); + tcc_free(fs->saved_args_cid_size); + tcc_free(fs); +} + +static void funcall_scratch_pop_free(FuncallScratch *fs) +{ + FuncallScratch **p; + + for (p = &funcall_scratch_stack; *p; p = &(*p)->next) + { + if (*p == fs) + { + *p = fs->next; + break; + } + } + funcall_scratch_free(fs); +} + +static void funcall_scratch_free_all(void) +{ + while (funcall_scratch_stack) + { + FuncallScratch *next = funcall_scratch_stack->next; + funcall_scratch_free(funcall_scratch_stack); + funcall_scratch_stack = next; + } +} + typedef struct PendingAliasDef { Sym *alias_sym; @@ -1084,11 +1134,45 @@ ST_FUNC void tccgen_finish(TCCState *s1) tcc_ir_func_write_summary_clear_all(); /* Same for the TU-wide read/call summary used by dead-static-store elim. */ tcc_ir_tu_func_summary_clear_all(); + funcall_scratch_free_all(); tcc_free(pending_aliases); pending_aliases = NULL; nb_pending_aliases = 0; + /* Reclaim inner VLA dimension token streams that were never materialized + (abstract / function-pointer declarators, e.g. `typedef void(*)(int[][n()])`). + Consumed ones were already freed and NULLed in func_vla_arg_code. 
*/ + if (s1->vla_inner_exprs) + { + for (int i = 0; i < s1->nb_vla_inner_exprs; i++) + tcc_free(s1->vla_inner_exprs[i]); + tcc_free(s1->vla_inner_exprs); + s1->vla_inner_exprs = NULL; + s1->nb_vla_inner_exprs = 0; + } + + /* Free any label-difference fixups left over from a symbol/label diff + (e.g. `int z = &"s"[1] - &"s"[0];`) that appeared in a GLOBAL initializer + with no enclosing function: gen_function's resolver only runs per function + body, so a global-only translation unit would leak the fixup node. We only + RECLAIM them here, deliberately not re-applying the st_value-difference + patch: the slot already holds the addend difference written by init_putv, + and re-resolving at global scope changes that emitted value (the existing + resolver is meant for in-function computed-goto label diffs). Leaving the + value untouched keeps codegen identical to before — this is purely a leak + fix. */ + { + LabelDiffFixup *f = s1->label_diff_fixups; + while (f) + { + LabelDiffFixup *next = f->next; + tcc_free(f); + f = next; + } + s1->label_diff_fixups = NULL; + } + /* If compilation aborted while generating a function, the per-function IR block allocated in gen_function() may not have been released (because we unwind via longjmp). Free it here to avoid leaks on compile errors. 
*/ @@ -1631,6 +1715,11 @@ ST_FUNC void sym_pop(Sym **ptop, Sym *b, int keep) ps = &ts->sym_identifier; *ps = s->prev_tok; } + if (!keep && s->const_init_data) + { + tcc_free(s->const_init_data); + s->const_init_data = NULL; + } /* Don't free symbols that have been exported to ELF (sym->c != 0) as they may still be referenced by IR instructions */ if (!keep && s->c == 0) @@ -4065,6 +4154,29 @@ static void gen_opl(int op) /* FALLTHROUGH */ case '*': t = vtop->type.t; /* Save type for lbuild at end */ + /* Speculative / code-suppressed contexts (try_inline_const_eval, if(0) + * dead branches, constant-expression and data-only evaluation) run with + * nocode_wanted set, where tcc_ir_put is a no-op (see ir/core.c) and gv() + * is suppressed. The generic 64x64 lexpand/lbuild expansion below assumes + * real register codegen and walks vtop off the vstack into the heap in + * that state. No code is emitted here, so just collapse the two operands + * into a single 64-bit result, mirroring the +/-/&/|/^ IR paths above. + * (CODE_OFF_BIT-only dead code after return still needs real IR for + * backpatching, so exclude it — same predicate tcc_ir_put uses.) */ + if (nocode_wanted & ~CODE_OFF_BIT) + { + vtop--; + vtop->type.t = VT_LLONG | (t & VT_UNSIGNED); + vtop->r = 0; + if (tcc_state->ir) + { + vtop->vr = tcc_ir_get_vreg_temp(tcc_state->ir); + tcc_ir_set_llong_type(tcc_state->ir, vtop->vr); + } + else + vtop->vr = -1; + break; + } /* Widening-multiply peephole: when both 64-bit operands are 32->64 * extensions (zero or sign), emit a single 32x32->64 UMULL/SMULL * instead of the generic 64x64 expansion. */ @@ -14221,7 +14333,20 @@ static int post_type(CType *type, AttributeDef *ad, int storage, int td) { /* for function args, the top dimension is converted to pointer */ if ((t1 & VT_VLA) && ((td & TYPE_NEST) || (func_param_decl_depth && !(td & TYPE_PARAM)))) + { s->vla_array_str = vla_array_str; + /* Track for end-of-TU reclamation. 
func_vla_arg_code frees this at a + function definition's entry (and drops it from the list there), but + an inner VLA dimension inside an abstract / function-pointer + declarator is never materialized and would otherwise leak. */ + if (vla_array_str_on_heap) + { + int vi = tcc_state->nb_vla_inner_exprs++; + tcc_state->vla_inner_exprs = tcc_realloc(tcc_state->vla_inner_exprs, + tcc_state->nb_vla_inner_exprs * sizeof(*tcc_state->vla_inner_exprs)); + tcc_state->vla_inner_exprs[vi] = vla_array_str; + } + } else if ((t1 & VT_VLA) && (td & TYPE_PARAM)) { /* Outermost VLA dimension of a function param: save the token string @@ -15699,9 +15824,15 @@ static void unary_funcall(void) if (pc > saved_args_cap) saved_args_cap = pc; } - SValue *saved_args = tcc_mallocz(saved_args_cap * sizeof(SValue)); - unsigned char **saved_args_cid = tcc_mallocz(saved_args_cap * sizeof(unsigned char *)); - int *saved_args_cid_size = tcc_mallocz(saved_args_cap * sizeof(int)); + FuncallScratch *saved_scratch = tcc_mallocz(sizeof(*saved_scratch)); + saved_scratch->saved_args = tcc_mallocz(saved_args_cap * sizeof(SValue)); + saved_scratch->saved_args_cid = tcc_mallocz(saved_args_cap * sizeof(unsigned char *)); + saved_scratch->saved_args_cid_size = tcc_mallocz(saved_args_cap * sizeof(int)); + saved_scratch->next = funcall_scratch_stack; + funcall_scratch_stack = saved_scratch; + SValue *saved_args = saved_scratch->saved_args; + unsigned char **saved_args_cid = saved_scratch->saved_args_cid; + int *saved_args_cid_size = saved_scratch->saved_args_cid_size; int saved_arg_count = 0; int can_try_fold = 0; int can_inline_builtin = 0; @@ -16171,6 +16302,7 @@ static void unary_funcall(void) aapcs_last_const_init = NULL; } saved_arg_count++; + saved_scratch->saved_arg_count = saved_arg_count; } else { @@ -16289,7 +16421,10 @@ static void unary_funcall(void) { saved_args[nb_args - 1 - n] = *vtop; if (n == 0) + { saved_arg_count = nb_args; + saved_scratch->saved_arg_count = saved_arg_count; + } } /* We 
evaluate right-to-left; assign 0-based parameter indices @@ -17483,11 +17618,8 @@ static void unary_funcall(void) } } } /* end of else block for non-folded function calls */ - tcc_free(saved_args); - for (int ci = 0; ci < saved_arg_count; ci++) - tcc_free(saved_args_cid[ci]); - tcc_free(saved_args_cid); - tcc_free(saved_args_cid_size); + saved_scratch->saved_arg_count = saved_arg_count; + funcall_scratch_pop_free(saved_scratch); if (s->f.func_noreturn) { if (debug_modes) @@ -17772,19 +17904,12 @@ static void __attribute__((noinline)) unary_builtin_fp(void) vset(&uint_type, VT_LOCAL | VT_LVAL, tmp_loc + high_word_offset); vtop->vr = vr_tmp; - if (fp_size == 4) - { - /* Match GCC __builtin_signbitf runtime behavior: return the raw - * sign mask (0x80000000) for negative float values. */ - vpushi(0x80000000u); - gen_op('&'); - } - else - { - /* Runtime double stays normalized to 0/1. */ - vpushi(31); - gen_op(TOK_SHR); - } + /* Match arm-none-eabi-gcc runtime behaviour: it emits + * `and r0, , #0x80000000` for both signbitf and signbit, + * returning the raw sign mask (0x80000000 = -2147483648 as signed int) + * for negative values and 0 otherwise. */ + vpushi(0x80000000u); + gen_op('&'); } break; } @@ -22964,6 +23089,8 @@ static __attribute__((noinline)) int unary_primary(void) if (s->c <= 0) s->c = -3; /* LABEL_ADDR_TAKEN marker */ func_has_label_addr = 1; + if (tcc_state->ir) + tcc_state->ir->func_has_label_addr = 1; /* mirror for the IR layer (regalloc) */ if ((s->type.t & VT_BTYPE) != VT_PTR) { s->type.t = VT_VOID; @@ -27042,6 +27169,7 @@ static void decl_initializer(init_params *p, CType *type, unsigned long c, int f tcc_error("unhandled string literal merging"); while (tok == TOK_STR || tok == TOK_LSTR) { + int tok_width = (tok == TOK_STR) ? 
1 : (int)sizeof(nwchar_t); if (initstr.size) initstr.size -= size1; if (tok == TOK_STR) @@ -27049,7 +27177,25 @@ static void decl_initializer(init_params *p, CType *type, unsigned long c, int f else len += tokc.str.size / sizeof(nwchar_t); len--; - cstr_cat(&initstr, tokc.str.data, tokc.str.size); + if (tok_width == size1) + { + cstr_cat(&initstr, tokc.str.data, tokc.str.size); + } + else if (size1 == (int)sizeof(nwchar_t) && tok == TOK_STR) + { + /* Mixing a narrow piece into a wide initializer (C permits e.g. + * `L"a" "b"`): widen each byte to an nwchar_t element instead of + * byte-copying it, which would otherwise be read back at the wider + * element stride below and over-read initstr. */ + const unsigned char *np = (const unsigned char *)tokc.str.data; + for (int z = 0; z < tokc.str.size; z++) + cstr_wccat(&initstr, np[z]); + } + else + { + /* A wide piece in a narrow (char) array is not representable. */ + tcc_error("unhandled string literal merging"); + } next(); } if (tok != ')' && tok != '}' && tok != ',' && tok != ';' && tok != TOK_EOF) @@ -27996,7 +28142,7 @@ static void decl_initializer_alloc(CType *type, AttributeDef *ad, int r, int has /* restore parse state if needed */ if (init_str) { - end_macro(); + end_macro_to(init_str); next(); } @@ -28033,7 +28179,14 @@ static void func_vla_arg_code(Sym *arg) vswap(); vstore(); vpop(); - /* Free the VLA expression token buffer now that it's been evaluated */ + /* Free the VLA expression token buffer now that it's been evaluated, and + drop it from the end-of-TU reclamation list so it is not double-freed. 
*/ + for (int i = 0; i < tcc_state->nb_vla_inner_exprs; i++) + if (tcc_state->vla_inner_exprs[i] == arg->type.ref->vla_array_str) + { + tcc_state->vla_inner_exprs[i] = NULL; + break; + } tcc_free(arg->type.ref->vla_array_str); arg->type.ref->vla_array_str = NULL; } @@ -28977,41 +29130,14 @@ static void gen_instrument_call(Sym *cur_func_sym, const char *hook_name) } #ifdef CONFIG_TCC_DEBUG -/* Returns 1 if `pass_name` matches the comma-separated list in - * s->dump_ir_passes (or the list contains the special token "all"). - * Used by DUMP_AFTER_PASS to gate per-pass IR dumps. */ -static int dump_ir_passes_match(TCCState *s, const char *pass_name) -{ - if (!s->dump_ir_passes || !pass_name) - return 0; - const char *p = s->dump_ir_passes; - size_t name_len = strlen(pass_name); - while (*p) - { - const char *comma = strchr(p, ','); - size_t tok_len = comma ? (size_t)(comma - p) : strlen(p); - if (tok_len == 3 && !memcmp(p, "all", 3)) - return 1; - if (tok_len == name_len && !memcmp(p, pass_name, name_len)) - return 1; - if (!comma) - break; - p = comma + 1; - } - return 0; -} - /* If pass_name matches -dump-ir-passes selection, dump the IR labeled with * the pass name. Intended to be called immediately after a - * tcc_ir_opt_() call to bisect which pass corrupts the IR. */ + * tcc_ir_opt_() call to bisect which pass corrupts the IR. Thin wrapper + * over the shared implementation in ir/dump.c (also used by the SSA driver). */ static void dump_ir_after_pass(TCCState *s, TCCIRState *ir, const char *pass_name) { - if (!dump_ir_passes_match(s, pass_name)) - return; - tcc_ir_dump_set_show_physical_regs(0); - printf("=== AFTER %s ===\n", pass_name); - tcc_ir_show(ir); - printf("=== END AFTER %s ===\n", pass_name); + (void)s; + tcc_ir_dump_after_pass(ir, pass_name); } /* Run a pass call and dump if selected. 
`expr` is the call, `name` is a @@ -29293,6 +29419,13 @@ static void gen_function(Sym *sym) #endif + /* Carry narrow plain-STORE access widths onto their value operands before any + * pass converts a plain STORE (width from dest) into a STORE_INDEXED (width + * from the value operand) — so a char/short store is not widened to a word. + * Run again before regalloc to catch widths lost to later value forwarding. */ + if (tcc_state->optimize > 0) + tcc_ir_opt_narrow_store_value_btype(ir); + /* Block copy init: replace memset(0) + consecutive stores with BLOCK_COPY * from a pre-built rodata block. Run once before the iterative loop. */ { void dbg_scan_overlap(TCCIRState*,const char*); dbg_scan_overlap(ir,"pre-block_copy_init"); } @@ -29614,7 +29747,6 @@ static void gen_function(Sym *sym) break; } } - /* Post-SL_FWD cleanup: the SL_FWD loop's DCE may have killed dead branches * that were the only remaining defs of a VAR (e.g. `fail = 1` in a dead * printf path). Re-run const_prop + branch_folding + DCE so the now- @@ -29640,16 +29772,31 @@ static void gen_function(Sym *sym) if (tcc_state->opt_store_load_fwd && !ir->has_static_chain) { int padrof_changed = tcc_ir_opt_param_addrof_const_fold(ir) > 0; +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_padrof"); +#endif int ladrof_changed = tcc_ir_opt_local_addrof_const_fold(ir) > 0; +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_ladrof"); +#endif int aofvar_changed = 0; int gslfwd_changed = 0; int iglh_changed = 0; if (tcc_state->opt_const_prop) aofvar_changed = tcc_ir_opt_addrof_var_fwd(ir) > 0; +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_aofvar"); +#endif if (tcc_state->opt_store_load_fwd) gslfwd_changed = tcc_ir_opt_global_sl_fwd(ir) > 0; +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_gslfwd"); +#endif if (tcc_state->opt_store_load_fwd) iglh_changed = tcc_ir_opt_invariant_global_load_hoist(ir) > 0; +#ifdef CONFIG_TCC_DEBUG + 
dump_ir_after_pass(tcc_state, ir, "ZZ_iglh"); +#endif if (padrof_changed || ladrof_changed || aofvar_changed || gslfwd_changed || iglh_changed) { if (tcc_state->opt_const_prop) @@ -29731,6 +29878,9 @@ static void gen_function(Sym *sym) * overwritten by a subsequent CALL, using the callee's write summary. */ if (tcc_state->opt_dead_store) tcc_ir_opt_dead_init_via_call(ir); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_dead_init_via_call"); +#endif /* Late cleanup: store elimination, dead var/addrvar elimination, redundant assign. * Run with max_iterations=2 so dead_addrvar_elim → DSE cascade works. @@ -29753,6 +29903,9 @@ static void gen_function(Sym *sym) tcc_ir_opt_ctx_init(&cleanup_ctx, ir); tcc_ir_opt_run_group(&cleanup_ctx, cleanup_group); tcc_ir_opt_ctx_free(&cleanup_ctx); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_late_cleanup_1"); +#endif if (tcc_state->opt_dead_store) { for (int iter = 0; iter < 4; iter++) { @@ -29791,6 +29944,9 @@ static void gen_function(Sym *sym) * and before IV strength reduction which benefits from rotated layout. */ if (tcc_state->opt_loop_rotation) tcc_ir_opt_loop_rotation(ir); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_loop_rotation"); +#endif /* Phase 4c.5: First-iteration-exit peeling. Rewrites a loop's exit * JUMPIF to unconditional JUMP when the header test is provably true @@ -29916,6 +30072,9 @@ static void gen_function(Sym *sym) { if (tcc_ir_opt_diamond_store_fwd(ir) > 0) { +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_diamond_store_fwd"); +#endif for (int dsf_iter = 0; dsf_iter < 6; dsf_iter++) { int dsf_ch = 0; @@ -29967,6 +30126,9 @@ static void gen_function(Sym *sym) tcc_ir_opt_compact_nops(ir); } (void)total_lcs_changes; +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_loop_const_sim"); +#endif } /* Phase 5a: Loop Unrolling - fully unroll small constant-trip-count loops. 
@@ -30008,6 +30170,9 @@ static void gen_function(Sym *sym) ch2 += tcc_ir_opt_value_tracking(ir); } while (ch2 > 0 && ++iter2 < 10); } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_loop_unroll"); +#endif } /* Phase 5: Loop-Invariant Code Motion - DISABLED * The LICM pass has a bug in hoist_const_exprs_from_loop(): instruction @@ -30033,6 +30198,9 @@ static void gen_function(Sym *sym) tcc_ir_opt_iv_strength_reduction(ir); } tcc_ir_free_loops(licm_loops); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ_iv_strength_red"); +#endif /* Local ALU CSE: dedupe pure arithmetic ops within a basic block. * Catches `arr[i].x` + `arr[i].y` patterns where the same `i*stride+base` @@ -30065,6 +30233,10 @@ static void gen_function(Sym *sym) fprintf(stderr, "[local_alu_cse] %d changes in %d iterations\n", total_changes, loops); } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_alu_cse"); +#endif + /* Phase 6b: Pointer store-to-load forwarding — after local_alu_cse has * CSE'd identical address computations (e.g. 5x `T = hstent + 12` collapsed * to one), bitfield read-modify-write chains now use the same address vreg. @@ -30146,6 +30318,10 @@ static void gen_function(Sym *sym) } } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_psl_fwd"); +#endif + if (tcc_state->opt_redundant_store) { if (tcc_ir_opt_rmw_byte_clear(ir) > 0) @@ -30160,6 +30336,9 @@ static void gen_function(Sym *sym) if (tcc_state->opt_strength_red) dbg_scan_overlap(ir,"Q3-before-strength_reduction"); tcc_ir_opt_strength_reduction(ir); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_strength_red"); +#endif /* Late copy propagation + dead store elimination. 
* Late passes (IV strength reduction, loop rotation) may introduce @@ -30171,6 +30350,9 @@ static void gen_function(Sym *sym) if (late_cp > 0 && tcc_state->opt_dead_store) tcc_ir_opt_dse(ir); } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_late_cp"); +#endif if (tcc_state->opt_const_prop) { @@ -30186,6 +30368,9 @@ static void gen_function(Sym *sym) tcc_ir_opt_compact_nops(ir); } } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_sas"); +#endif /* Late memmove→indexed-stores: earlier calls miss patterns where the * destination address is computed through inline-parameter VAR chains @@ -30228,6 +30413,9 @@ static void gen_function(Sym *sym) if (tcc_state->opt_dce) tcc_ir_opt_dce(ir); } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_shl32"); +#endif /* OR-bool-diamond — fold `acc |= (cond ? 1 : 0)` materialization. */ if (tcc_state->opt_const_prop) @@ -30237,6 +30425,9 @@ static void gen_function(Sym *sym) * their defining deref expressions, creating STORE+CMP deref pairs. */ if (tcc_state->opt_const_prop) tcc_ir_opt_deref_fwd(ir); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_deref_fwd"); +#endif /* Late VAR→TMP forwarding is deferred to after final compact_nops + * eliminate_fallthrough (below), because the forward scan needs clean @@ -30256,6 +30447,9 @@ static void gen_function(Sym *sym) * does not run again after this point. */ if (tcc_state->opt_copy_prop) tcc_ir_opt_postinc_assign_fold(ir); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_paf"); +#endif /* Combine `V = V ± C1; V = V ± C2; ...` chains into a single update. 
* Produced by loop unrolling of pointer-increment loops once @@ -30277,6 +30471,9 @@ static void gen_function(Sym *sym) tcc_ir_opt_compact_nops(ir); } } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_csaf"); +#endif /* Loop-aware post-increment fusion — fuse embedded deref in loop body with * latch pointer increment into LOAD_POSTINC. Must run after IV strength @@ -30295,6 +30492,9 @@ static void gen_function(Sym *sym) dbg_scan_overlap(ir,"Q4-before-decrement_to_zero"); tcc_ir_opt_decrement_to_zero(ir); dbg_scan_overlap(ir,"Q4b-after-decrement_to_zero"); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_dtz"); +#endif /* Redundant Init Elimination - remove function-entry VAR inits that are * always killed before use. Must run after decrement-to-zero (which NOPs @@ -30319,6 +30519,9 @@ static void gen_function(Sym *sym) } } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_dle1"); +#endif tcc_ir_opt_dce(ir); /* Final pass to mark unreachable code as NOP */ /* Re-run dead loop elimination after final DCE: earlier loops may now have @@ -30370,6 +30573,9 @@ static void gen_function(Sym *sym) tcc_ir_opt_dse(ir); } } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_vtf"); +#endif /* PACK64 tautology — collapse PACK64(low(X), X>>32) into ASSIGN X. * Must run AFTER late var_tmp_fwd + copy_prop: those passes resolve the @@ -30390,6 +30596,9 @@ static void gen_function(Sym *sym) if (tcc_state->opt_dce) tcc_ir_opt_dce(ir); } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_p64t"); +#endif /* ADD-immediate + DEREF fold into LOAD_INDEXED — DISABLED. * The fold moves the memory load from the DEREF use site to the ADD @@ -30421,6 +30630,9 @@ static void gen_function(Sym *sym) tcc_ir_opt_eliminate_fallthrough(ir); } } +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_lr"); +#endif /* Redundant zero-trip entry-guard elimination. 
Sequential counted loops * sharing a counter (memclr's 3 loops over i) keep a pre-loop guard on the @@ -30447,6 +30659,9 @@ static void gen_function(Sym *sym) * half setup and compare. */ dbg_scan_overlap(ir,"P3-before-cmp_narrow_64"); dbg_scan_overlap(ir,"R4-just-before-cmp_narrow"); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_lge"); +#endif tcc_ir_opt_cmp_narrow_64(ir); /* ASSIGN fusion — fold `T_new = X OP Y; T_final = T_new ASSIGN` into a @@ -30456,6 +30671,9 @@ static void gen_function(Sym *sym) dbg_scan_overlap(ir,"P4-before-assign_fuse"); tcc_ir_opt_assign_fuse(ir); dbg_scan_overlap(ir,"P4b-after-assign_fuse"); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_af"); +#endif /* Phase 8: Conditional Select - replace if/else diamonds with SELECT. * Must run late, after all other optimizations have simplified the IR, @@ -30468,6 +30686,9 @@ static void gen_function(Sym *sym) * SELECT's flag-setting CMP is not deleted by a downstream orphan-CMP pass. */ if (tcc_state->optimize > 0) tcc_ir_opt_setif_neg_to_select(ir); +#ifdef CONFIG_TCC_DEBUG + dump_ir_after_pass(tcc_state, ir, "ZZ2_sel"); +#endif /* Recompute leafness after IR optimizations. * IR construction marks the function non-leaf as soon as a call op is @@ -30804,6 +31025,12 @@ static void gen_function(Sym *sym) if (tcc_state->optimize > 0) tcc_ir_opt_shift64_dead_half(ir); + /* Carry narrow plain-STORE access widths onto their value operands so the + * later STORE_INDEXED conversions (which take the store width from the value + * operand, not the dest) do not widen a char/short store to a word. 
*/ + if (tcc_state->optimize > 0) + tcc_ir_opt_narrow_store_value_btype(ir); + /* Register allocation (SSA-based linear scan) */ { const RegAllocTarget *ra_target = arm_get_regalloc_target(); diff --git a/tccir.h b/tccir.h index 36d8bb15..16000b0f 100644 --- a/tccir.h +++ b/tccir.h @@ -229,6 +229,16 @@ typedef enum TccIrOp TCCIR_OP_SMULL, } TccIrOp; +/* Size (in bytes) at or above which the backend lowers a TCCIR_OP_BLOCK_COPY to + * a real memcpy() call instead of an inline LDM/STM sequence (see + * tcc_gen_machine_block_copy_mop in arm-thumb-gen.c). A memcpy call clobbers + * the caller-saved registers, so register allocation must treat a block copy of + * at least this size as a call site (ra_build_call_prefix in ir/regalloc.c) and + * force any value live across it off r0-r3/r12. The inline path below this size + * preserves everything it touches via scratch save/restore, so it is not a call. + * The two sites must agree on this threshold; keep them in sync via this macro. */ +#define TCCIR_BLOCK_COPY_MEMCPY_MIN_BYTES 64 + /* FUNCPARAMVAL encoding helpers: * src2.c.i encodes both parameter index (lower 16 bits) and call_id (upper 16 bits) * This keeps call/param binding explicit and makes the IR more compact. @@ -585,6 +595,12 @@ typedef struct TCCIRState uint32_t *orig_ir_to_code_mapping; int orig_ir_to_code_mapping_size; + /* Mirror of tccgen's func_has_label_addr for the current function: set when the + * body takes a label address (GCC labels-as-values, `&&label`). Kept on the IR + * state so the IR layer (regalloc) can consult it without referencing a tccgen + * global (which the standalone unit-test link does not provide). */ + int func_has_label_addr; + LSLiveIntervalState ls; /* Extra scratch allocation flags to apply during materialization for the current IR instruction. */ @@ -620,6 +636,16 @@ typedef struct TCCIRState * Entry = lsb (bits 0-7) | (width << 8); width >= 1 so a real BFI entry is * never 0. Consumed by tcc_gen_machine_bfi_mop. 
*/ uint16_t *bfi_params; + + /* Codegen temporaries owned by tcc_ir_codegen_generate while it is running. + * They are normally freed before return; tcc_ir_free also releases them when + * a compile error longjmps out of codegen. */ + int *codegen_return_jump_addrs; + int *codegen_dry_insn_scratch; + uint16_t *codegen_dry_insn_saves; + void *codegen_mop_cache; + uint32_t *codegen_cbz_dry_mapping; + uint8_t *codegen_branch_target_reset; } TCCIRState; TCCIRState *tcc_ir_allocate_block(); @@ -659,9 +685,14 @@ void tcc_ir_assign_physical_register(TCCIRState *ir, int vreg, int offset, int r const char *tcc_ir_get_op_name(TccIrOp op); void tcc_ir_show(TCCIRState *ir); void tcc_ir_dump_set_show_physical_regs(int show); +/* -dump-ir-passes= helpers (shared by the legacy optimize loop in tccgen.c and + * the SSA optimizer driver in ir/opt/ssa_opt.c). */ +int tcc_ir_dump_passes_match(TCCState *s, const char *pass_name); +void tcc_ir_dump_after_pass(TCCIRState *ir, const char *pass_name); void tcc_ir_set_addrtaken(TCCIRState *ir, int vreg); IRLiveInterval *tcc_ir_get_live_interval(TCCIRState *ir, int vreg); +IRLiveInterval *tcc_ir_try_get_live_interval(TCCIRState *ir, int vreg); void tcc_ir_backpatch(TCCIRState *ir, int t, int target_address); void tcc_ir_backpatch_to_here(TCCIRState *ir, int t); void tcc_ir_backpatch_first(TCCIRState *ir, int t, int target_address); @@ -842,6 +873,17 @@ static inline void tcc_ir_set_src1(TCCIRState *ir, int index, IROperand irop) if (!irop_config[q->op].has_src1) return; int off = irop_config[q->op].has_dest; + /* A STORE_INDEXED / STORE_POSTINC derives its store width from the VALUE + * (src1) operand's btype. A value rewrite (e.g. copy-propagation forwarding + * a wider temp into a char/short bitfield store) must not widen it — that + * would turn a byte/half store into a word store and clobber adjacent memory. + * Preserve the existing narrow access width. 
*/ + if (q->op == TCCIR_OP_STORE_INDEXED || q->op == TCCIR_OP_STORE_POSTINC) { + uint8_t old_bt = ir->iroperand_pool[q->operand_base + off].btype; + if ((old_bt == IROP_BTYPE_INT8 || old_bt == IROP_BTYPE_INT16) && + irop.btype == IROP_BTYPE_INT32) + irop.btype = old_bt; + } ir->iroperand_pool[q->operand_base + off] = irop; } diff --git a/tccir_operand.c b/tccir_operand.c index 26289004..786148e1 100644 --- a/tccir_operand.c +++ b/tccir_operand.c @@ -808,29 +808,15 @@ int irop_compare_svalue(const TCCIRState *ir, const SValue *sv, IROperand op, co mismatch = 1; } - /* Compare CValue (c union) - compare multiple members for better diagnosis */ + /* Compare CValue (c union). Only compare c.i: union padding bytes in the + * unused portions of CValue can differ between two semantically-equal + * values, so a full memcmp would report false mismatches. */ if (reconstructed.c.i != sv->c.i) { fprintf(stderr, "%s: c.i mismatch: reconstructed=0x%016llx, expected=0x%016llx\n", context, (unsigned long long)reconstructed.c.i, (unsigned long long)sv->c.i); mismatch = 1; } - else if (memcmp(&reconstructed.c, &sv->c, sizeof(CValue)) != 0) - { - /* Check string members if i matches but bytes differ (likely padding or str variant) */ - if (reconstructed.c.str.data != sv->c.str.data || reconstructed.c.str.size != sv->c.str.size) - { - fprintf(stderr, "%s: c.str mismatch: data=%p/%p, size=%d/%d\n", context, (void *)reconstructed.c.str.data, - (void *)sv->c.str.data, reconstructed.c.str.size, sv->c.str.size); - } - else - { - fprintf(stderr, "%s: c mismatch: bytes differ (likely padding)\n", context); - fprintf(stderr, " reconstructed.c.i = 0x%016llx\n", (unsigned long long)reconstructed.c.i); - fprintf(stderr, " expected.c.i = 0x%016llx\n", (unsigned long long)sv->c.i); - } - mismatch = 1; - } /* Compare sym pointer */ if (reconstructed.sym != sv->sym) diff --git a/tccir_operand.h b/tccir_operand.h index 6980d56b..5971c500 100644 --- a/tccir_operand.h +++ b/tccir_operand.h @@ -52,7 
+52,18 @@ typedef enum TCCIR_VREG_TYPE #define IROP_TAG_NONE 0 /* sentinel for unused operand */ #define IROP_TAG_VREG 1 /* pure vreg with no additional data */ #define IROP_TAG_IMM32 2 /* payload.imm32: signed 32-bit immediate */ -#define IROP_TAG_STACKOFF 3 /* payload.imm32: signed 32-bit FP-relative offset */ +#define IROP_TAG_STACKOFF 3 /* payload.imm32: signed 32-bit FP-relative offset + * + * IMPORTANT: not every STACKOFF operand is a real + * stack slot reference. A *direct* stack location + * has tag == STACKOFF, is_local == 1, is_lval == 1 + * AND vreg_type == 0. When a VAR or PARAM is + * referenced via its potential spill encoding, + * vreg_type is non-zero and the offset field is + * only metadata about where it *would* spill; the + * program reads from the vreg, not from that slot. + * New passes that inspect stack operands MUST + * check vreg_type == 0 to avoid miscompiles. */ #define IROP_TAG_F32 4 /* payload.f32_bits: 32-bit float bits (inline) */ #define IROP_TAG_I64 5 /* payload.pool_idx: index into pool_i64[] */ #define IROP_TAG_F64 6 /* payload.pool_idx: index into pool_f64[] */ @@ -97,7 +108,10 @@ typedef struct __attribute__((packed)) IROperand uint32_t is_local : 1; /* VT_LOCAL: stack-relative (23) */ uint32_t is_const : 1; /* VT_CONST: constant value (24) */ uint32_t btype : 3; /* IROP_BTYPE_* (25-27) */ - uint32_t vreg_type : 4; /* TCCIR_VREG_TYPE_* (28-31) */ + uint32_t vreg_type : 4; /* TCCIR_VREG_TYPE_* (28-31). + For IROP_TAG_STACKOFF: zero means a real + direct StackLoc reference; non-zero means a + vreg-backed spill encoding (see above). */ }; }; union @@ -179,6 +193,9 @@ int irop_compare_svalue(const struct TCCIRState *ir, const struct SValue *sv, IR /* Position sentinel value: max 17-bit value means "no position" */ #define IROP_POSITION_NONE 0x1FFFF +/* Forward declaration: defined below after all helpers it needs. 
*/ +static inline int32_t irop_get_vreg(const IROperand op); + /* Check if operand encodes a negative vreg (sentinel pattern). * Excludes IROP_NONE (vr == -1) which also matches the sentinel bit pattern. */ static inline int irop_is_neg_vreg(const IROperand op) @@ -191,8 +208,7 @@ static inline int irop_is_neg_vreg(const IROperand op) /* Check if operand has no associated vreg */ static inline int irop_has_no_vreg(const IROperand op) { - /* Either negative vreg sentinel OR the old vr < 0 check for IROP_NONE */ - return irop_is_neg_vreg(op) || (op.position == IROP_POSITION_NONE && op.vreg_type == 0); + return irop_get_vreg(op) == -1; } /* Extract tag from operand (using bitfield) */ @@ -543,7 +559,7 @@ static inline uint32_t irop_get_pool_idx(const IROperand op) /* Check if operand is an lvalue (needs dereference) - uses bitfield */ static inline int irop_op_is_lval(const IROperand op) { - if (op.vr < 0) + if (irop_get_tag(op) == IROP_TAG_NONE) return 0; return op.is_lval; } @@ -551,7 +567,7 @@ static inline int irop_op_is_lval(const IROperand op) /* Check if operand has VT_LOCAL semantics - uses bitfield */ static inline int irop_op_is_local(const IROperand op) { - if (op.vr < 0) + if (irop_get_tag(op) == IROP_TAG_NONE) return 0; return op.is_local; } @@ -559,7 +575,7 @@ static inline int irop_op_is_local(const IROperand op) /* Check if operand has VT_LLOCAL semantics (double indirection) - uses bitfield */ static inline int irop_op_is_llocal(const IROperand op) { - if (op.vr < 0) + if (irop_get_tag(op) == IROP_TAG_NONE) return 0; return op.is_llocal; } @@ -567,7 +583,7 @@ static inline int irop_op_is_llocal(const IROperand op) /* Check if operand is constant - uses bitfield */ static inline int irop_op_is_const(const IROperand op) { - if (op.vr < 0) + if (irop_get_tag(op) == IROP_TAG_NONE) return 0; return op.is_const; } diff --git a/tccls.c b/tccls.c index 3a4e14d6..b8307df7 100644 --- a/tccls.c +++ b/tccls.c @@ -266,6 +266,31 @@ uint32_t 
tcc_ls_compute_live_regs(LSLiveIntervalState *ls, int instruction_idx) return live_regs; } +/* True when physical register `reg` is claimed at instruction `pos` by any + * live interval other than `skip`. Post-RA register rewriters (move + * coalescing, the phase-3 scratch-conflict fixup) deliberately make two + * overlapping intervals share one register (in-place two-address ops), so a + * single live_regs_by_instruction bit can carry two claims. When a rewrite + * moves one claimant away it must leave the bit set wherever another claimant + * is still live, or the bitmap under-reports and a later rewrite allocates + * the register on top of a live value. */ +int tcc_ls_reg_held_by_other(const LSLiveIntervalState *ls, int reg, int pos, const LSLiveInterval *skip) +{ + for (int i = 0; i < ls->next_interval_index; ++i) + { + const LSLiveInterval *iv = &ls->intervals[i]; + if (iv == skip) + continue; + if (iv->stack_location != 0) + continue; + if (iv->r0 != reg && iv->r1 != reg) + continue; + if (iv->start <= (uint32_t)pos && iv->end >= (uint32_t)pos) + return 1; + } + return 0; +} + int tcc_ls_find_free_scratch_reg(LSLiveIntervalState *ls, int instruction_idx, uint32_t exclude_regs, int is_leaf) { uint32_t live_regs = exclude_regs; @@ -282,39 +307,30 @@ int tcc_ls_find_free_scratch_reg(LSLiveIntervalState *ls, int instruction_idx, u live_regs |= (1 << 15); + /* Union the precomputed per-instruction bitmap with a fresh interval scan. + * ra_build_live_regs_bitmap deliberately OMITS any interval that carries a + * stack_location (it assumes a spilled value does not hold a register across + * its whole range). That assumption is FALSE for a loop-carried value kept + * live in a register across the loop body while also owning a spill slot + * (r0 >= 0 AND stack_location != 0): the bitmap then under-reports that + * register as free, and the scratch picker can hand it out, clobbering the + * still-live value (random-C O2 wrong-code, Finding #15). 
tcc_ls_compute_live_regs + * scans the intervals directly (ignoring stack_location) and DOES report it, + * so unioning the two is correct and strictly conservative: it can only mark + * MORE registers live, never fewer, so it can never introduce a new clobber. */ if (ls->live_regs_by_instruction && instruction_idx >= 0 && instruction_idx < ls->live_regs_by_instruction_size) - { live_regs |= ls->live_regs_by_instruction[instruction_idx]; - LS_DBG(" Using precomputed liveness: 0x%x", live_regs); - } + + if (ls->cached_instruction_idx == instruction_idx) + live_regs |= ls->cached_live_regs; else { - if (ls->cached_instruction_idx == instruction_idx) - { - live_regs |= ls->cached_live_regs; - LS_DBG(" Using cached liveness: 0x%x", live_regs); - } - else - { - uint32_t computed = tcc_ls_compute_live_regs(ls, instruction_idx); - ls->cached_instruction_idx = instruction_idx; - ls->cached_live_regs = computed; - live_regs |= computed; - LS_DBG(" Computed live registers: 0x%x", live_regs); - } + uint32_t computed = tcc_ls_compute_live_regs(ls, instruction_idx); + ls->cached_instruction_idx = instruction_idx; + ls->cached_live_regs = computed; + live_regs |= computed; } - - /* DEBUG: 90_struct scratch-divergence. At idx 70/75/80 (printf-arg LEAs) the - * device returns PREG_NONE (R0-R3 all live) but QEMU returns R0 — diff the - * raw liveness to see if live_regs_by_instruction[idx] differs. */ - if (funcname && !strcmp((const char *)funcname, "test_init_struct_from_struct") && - (instruction_idx == 70 || instruction_idx == 72 || instruction_idx == 75 || instruction_idx == 80)) - fprintf(stderr, "FSR idx=%d excl=0x%x live=0x%x arr=%p sz=%d raw[idx]=0x%x avail_low=0x%x\n", instruction_idx, - exclude_regs, live_regs, (void *)ls->live_regs_by_instruction, ls->live_regs_by_instruction_size, - (ls->live_regs_by_instruction && instruction_idx < ls->live_regs_by_instruction_size) - ? 
ls->live_regs_by_instruction[instruction_idx] - : 0xDEADu, - (~live_regs) & 0xFu); + LS_DBG(" Liveness (bitmap ∪ interval-scan): 0x%x", live_regs); { const uint32_t avail_low = (~live_regs) & 0xFu; diff --git a/tccls.h b/tccls.h index 1a26fe6b..2d343c93 100644 --- a/tccls.h +++ b/tccls.h @@ -87,6 +87,8 @@ void tcc_ls_reset_scratch_cache(LSLiveIntervalState *ls); uint32_t tcc_ls_compute_live_regs(LSLiveIntervalState *ls, int instruction_idx); +int tcc_ls_reg_held_by_other(const LSLiveIntervalState *ls, int reg, int pos, const LSLiveInterval *skip); + int tcc_ls_find_free_scratch_reg(LSLiveIntervalState *ls, int instruction_idx, uint32_t exclude_regs, int is_leaf); void tcc_ls_recompute_dirty_registers(LSLiveIntervalState *ls); diff --git a/tests/Makefile b/tests/Makefile index bd45befd..53ba78d3 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -21,6 +21,7 @@ TESTS = \ llong_test-run \ tests2-dir \ pp-dir \ + frontend-dir \ memtest \ dlltest \ cross-test @@ -373,4 +374,6 @@ clean: rm -f ex? tcc_g weaktest.*.txt *.def *.pdb *.obj libtcc_test_mt @$(MAKE) -C tests2 $@ @$(MAKE) -C pp $@ + @$(MAKE) -C frontend $@ + @rm -rf linker/build debug/build runtime/build diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..0dbc70ea --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +"""Shared pytest options for the tinycc tests tree.""" + + +def pytest_addoption(parser): + parser.addoption( + "--compiler", + action="store", + default=None, + help="Path to the armv8m-tcc cross compiler", + ) diff --git a/tests/debug/conftest.py b/tests/debug/conftest.py new file mode 100644 index 00000000..a0e6f0f9 --- /dev/null +++ b/tests/debug/conftest.py @@ -0,0 +1,41 @@ +"""Shared pytest configuration for the debug-info coverage layer.""" + +from pathlib import Path + +import pytest + +DEBUG_DIR = Path(__file__).parent +TINYCC_DIR = DEBUG_DIR / "../.." 
+ + +def _find_compiler(compiler_override=None): + """Resolve the cross compiler using the requested fallback chain.""" + if compiler_override is not None: + p = Path(compiler_override) + if not p.exists(): + raise FileNotFoundError(f"--compiler not found: {p}") + return p + + candidates = [ + TINYCC_DIR / "armv8m-tcc", + TINYCC_DIR / "bin" / "armv8m-tcc", + ] + for cand in candidates: + if cand.exists(): + return cand + raise FileNotFoundError( + "No armv8m-tcc cross compiler found. " + "Build one with `make cross` in libs/tinycc, or pass --compiler." + ) + + +def pytest_configure(config): + """Register custom markers used by the debug test layers.""" + config.addinivalue_line("markers", "debug: debug-info coverage test") + config.addinivalue_line("markers", "debug_dwarf: DWARF debug-info test") + config.addinivalue_line("markers", "debug_stab: STAB debug-info test") + + +@pytest.fixture(scope="session") +def debug_compiler(pytestconfig): + return _find_compiler(pytestconfig.getoption("compiler")) diff --git a/tests/debug/dwarf/01_compile_unit.c b/tests/debug/dwarf/01_compile_unit.c new file mode 100644 index 00000000..99d7c0af --- /dev/null +++ b/tests/debug/dwarf/01_compile_unit.c @@ -0,0 +1,7 @@ +/* Minimal TU to inspect DWARF compile unit DIE. */ +static int static_var = 42; +int global_var; + +int compute(int x) { + return x + static_var + global_var; +} diff --git a/tests/debug/dwarf/02_function_var.c b/tests/debug/dwarf/02_function_var.c new file mode 100644 index 00000000..ee55adf6 --- /dev/null +++ b/tests/debug/dwarf/02_function_var.c @@ -0,0 +1,5 @@ +/* DWARF DIEs for functions, parameters, and local variables. */ +int add(int a, int b) { + int local = a + b; + return local; +} diff --git a/tests/debug/dwarf/03_line_info.c b/tests/debug/dwarf/03_line_info.c new file mode 100644 index 00000000..4d09b0a7 --- /dev/null +++ b/tests/debug/dwarf/03_line_info.c @@ -0,0 +1,5 @@ +/* DWARF line-number program for a tiny function. 
*/ +int line_func(int x) { + int y = x + 1; + return y; +} diff --git a/tests/debug/stab/01_placeholder.c b/tests/debug/stab/01_placeholder.c new file mode 100644 index 00000000..e599b0a5 --- /dev/null +++ b/tests/debug/stab/01_placeholder.c @@ -0,0 +1,3 @@ +/* STAB output is currently disabled in this fork (put_stabs* are no-ops). + This placeholder documents the gap; the debug harness skips STAB cases. */ +int stab_func(int x) { return x; } diff --git a/tests/debug/test_debug.py b/tests/debug/test_debug.py new file mode 100644 index 00000000..200d277f --- /dev/null +++ b/tests/debug/test_debug.py @@ -0,0 +1,168 @@ +"""Phase 5: debug-info coverage tests. + +Each test cross-compiles a tiny C case with ``-g`` and inspects the resulting +object with arm-none-eabi-readelf and arm-none-eabi-objdump. The assertions +are characterizations of the current DWARF output; STAB output is currently +disabled in this fork (put_stabs* are no-ops) so those cases are skipped. +""" + +import re +import subprocess +from pathlib import Path + +import pytest + +ROOT = Path(__file__).parent.parent.parent # libs/tinycc +TCC = ROOT / "armv8m-tcc" +DEBUG_DIR = Path(__file__).parent +BUILD_DIR = DEBUG_DIR / "build" + +READELF = "arm-none-eabi-readelf" +OBJDUMP = "arm-none-eabi-objdump" + + +def _compile(name, subdir): + """Cross-compile a case in /.c to a relocatable object with -g.""" + src = DEBUG_DIR / subdir / f"{name}.c" + obj = BUILD_DIR / subdir / f"{name}.o" + obj.parent.mkdir(parents=True, exist_ok=True) + + cflags = [ + "-O1", + "-g", + "-nostdlib", + "-fvisibility=hidden", + "-mcpu=cortex-m33", + "-mthumb", + "-mfloat-abi=soft", + "-ffunction-sections", + "-c", + ] + cmd = [str(TCC)] + cflags + [str(src), "-o", str(obj)] + result = subprocess.run( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + errors="replace", + ) + if result.returncode != 0: + raise RuntimeError( + f"Compile failed for {subdir}/{name}: 
{cmd}\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" + ) + return obj + + +def _readelf_debug_sections(obj): + """Return set of debug section names.""" + result = subprocess.run( + [READELF, "-S", str(obj)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + errors="replace", + ) + assert result.returncode == 0, f"readelf -S failed for {obj}: {result.stderr}" + + debug_sections = set() + for line in result.stdout.splitlines(): + if ".debug_" in line or ".debug_line" in line: + m = re.search(r"\.debug_\w+", line) + if m: + debug_sections.add(m.group(0)) + return debug_sections + + +def _readelf_debug_info(obj): + """Return the raw --debug-dump=info output.""" + result = subprocess.run( + [READELF, "--debug-dump=info", str(obj)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + errors="replace", + ) + assert result.returncode == 0, f"readelf --debug-dump=info failed for {obj}: {result.stderr}" + return result.stdout + + +def _readelf_debug_line(obj): + """Return the raw --debug-dump=line output.""" + result = subprocess.run( + [READELF, "--debug-dump=line", str(obj)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + errors="replace", + ) + assert result.returncode == 0, f"readelf --debug-dump=line failed for {obj}: {result.stderr}" + return result.stdout + + +# ----------------------------------------------------------------------------- +# dwarf/ +# ----------------------------------------------------------------------------- +@pytest.mark.debug +@pytest.mark.debug_dwarf +def test_dwarf_compile_unit(): + obj = _compile("01_compile_unit", "dwarf") + sections = _readelf_debug_sections(obj) + + # DWARF5 CU info needs at least these sections. 
+ required = {".debug_info", ".debug_abbrev", ".debug_line", ".debug_str"} + missing = required - sections + assert not missing, f"missing DWARF sections: {missing}" + + info = _readelf_debug_info(obj) + assert "DW_TAG_compile_unit" in info + assert "DW_AT_producer" in info + assert "DW_AT_name" in info + + +@pytest.mark.debug +@pytest.mark.debug_dwarf +def test_dwarf_function_and_variables(): + obj = _compile("02_function_var", "dwarf") + info = _readelf_debug_info(obj) + + # Function and parameter/variable DIEs. + assert "DW_TAG_subprogram" in info + assert "add" in info + assert "DW_TAG_formal_parameter" in info + assert "DW_TAG_variable" in info + + +@pytest.mark.debug +@pytest.mark.debug_dwarf +def test_dwarf_line_info(): + obj = _compile("03_line_info", "dwarf") + line = _readelf_debug_line(obj) + + # Line number program should reference the source file and function lines. + assert "DWARF Version" in line + assert "line_func" in line or "03_line_info.c" in line + assert "Line Number Statements" in line + + +# ----------------------------------------------------------------------------- +# stab/ +# ----------------------------------------------------------------------------- +@pytest.mark.debug +@pytest.mark.debug_stab +def test_stab_disabled(): + """STAB emission is currently disabled in this fork. + + The source still contains the STAB records (tccdbg.c put_stabs*), but the + output functions are no-ops and no .stab / .stabstr sections are emitted. + This test documents that state; if STAB support is restored it should be + replaced with real golden assertions. + """ + obj = _compile("01_placeholder", "stab") + sections = _readelf_debug_sections(obj) + + # With -g the compiler emits DWARF, not STAB. 
+ assert ".stab" not in sections + assert ".stabstr" not in sections + assert ".debug_info" in sections + + pytest.skip("STAB output is disabled in this fork; only DWARF is emitted") diff --git a/tests/frontend/Makefile b/tests/frontend/Makefile new file mode 100644 index 00000000..f323c8bb --- /dev/null +++ b/tests/frontend/Makefile @@ -0,0 +1,28 @@ +# Frontend coverage tests for ARMv8-M TinyCC. +# +# Mirrors tests/pp/Makefile but drives the pytest harness instead of raw +# preprocessor invocations. + +TOP = ../.. +TCC = $(TOP)/bin/armv8m-tcc +PYTHON = python3 + +.PHONY: all test update pp types diagnostics clean + +all test: + $(PYTHON) -m pytest $(CURDIR) -q + +pp: + $(PYTHON) -m pytest $(CURDIR)/test_frontend.py -q -m frontend_pp + +types: + $(PYTHON) -m pytest $(CURDIR)/test_frontend.py -q -m frontend_types + +diagnostics: + $(PYTHON) -m pytest $(CURDIR)/test_frontend.py -q -m frontend_diagnostics + +update: + $(PYTHON) -m pytest $(CURDIR) --update -q + +clean: + find $(CURDIR) -name '*.o' -delete diff --git a/tests/frontend/conftest.py b/tests/frontend/conftest.py new file mode 100644 index 00000000..20e5cc0f --- /dev/null +++ b/tests/frontend/conftest.py @@ -0,0 +1,64 @@ +"""Shared pytest configuration for the frontend coverage layer.""" + +from pathlib import Path + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--update", + action="store_true", + default=False, + help="Regenerate golden files from current compiler output", + ) + # --compiler is normally provided by the parent tests/conftest.py, but that + # conftest is not loaded when pytest is invoked from inside tests/frontend/ + # (as `make test-frontend` does). Register it here too, tolerating the + # duplicate when both conftests are active (running from tests/). 
+ try: + parser.addoption( + "--compiler", + action="store", + default=None, + help="Path to the armv8m-tcc cross compiler", + ) + except ValueError: + pass + + +def _find_compiler(compiler_override=None): + """Resolve the cross compiler using the requested fallback chain.""" + if compiler_override is not None: + p = Path(compiler_override) + if not p.exists(): + raise FileNotFoundError(f"--compiler not found: {p}") + return p + + tinycc = Path(__file__).parent.parent.parent + candidates = [ + tinycc / "armv8m-tcc", + tinycc / "bin" / "armv8m-tcc", + ] + for cand in candidates: + if cand.exists(): + return cand + raise FileNotFoundError( + "No armv8m-tcc cross compiler found. " + "Build one with `make cross` in libs/tinycc, or pass --compiler." + ) + + +def pytest_configure(config): + """Register custom markers used by the frontend test layers.""" + config.addinivalue_line("markers", "frontend: frontend coverage test") + config.addinivalue_line("markers", "frontend_pp: preprocessor/lexer test") + config.addinivalue_line("markers", "frontend_types: type-system / semantic test") + config.addinivalue_line( + "markers", "frontend_diagnostics: expected-error diagnostic test" + ) + + +@pytest.fixture(scope="session") +def frontend_compiler(pytestconfig): + return _find_compiler(pytestconfig.getoption("compiler")) diff --git a/tests/frontend/diagnostics/01_undeclared.c b/tests/frontend/diagnostics/01_undeclared.c new file mode 100644 index 00000000..aaee6fea --- /dev/null +++ b/tests/frontend/diagnostics/01_undeclared.c @@ -0,0 +1 @@ +int f(void) { return undeclared_var; } diff --git a/tests/frontend/diagnostics/01_undeclared.stderr b/tests/frontend/diagnostics/01_undeclared.stderr new file mode 100644 index 00000000..966b6de7 --- /dev/null +++ b/tests/frontend/diagnostics/01_undeclared.stderr @@ -0,0 +1,2 @@ +error: +'undeclared_var' undeclared diff --git a/tests/frontend/diagnostics/02_redefinition.c b/tests/frontend/diagnostics/02_redefinition.c new file mode 100644 
index 00000000..0a2205b0 --- /dev/null +++ b/tests/frontend/diagnostics/02_redefinition.c @@ -0,0 +1,2 @@ +int x = 1; +int x = 2; diff --git a/tests/frontend/diagnostics/02_redefinition.stderr b/tests/frontend/diagnostics/02_redefinition.stderr new file mode 100644 index 00000000..a9d7d640 --- /dev/null +++ b/tests/frontend/diagnostics/02_redefinition.stderr @@ -0,0 +1,2 @@ +error: +redefinition of 'x' diff --git a/tests/frontend/diagnostics/02_type_mismatch.c b/tests/frontend/diagnostics/02_type_mismatch.c new file mode 100644 index 00000000..589e5ea9 --- /dev/null +++ b/tests/frontend/diagnostics/02_type_mismatch.c @@ -0,0 +1 @@ +int f(void) { int x; x = "hello"; return 0; } diff --git a/tests/frontend/diagnostics/02_type_mismatch.stderr b/tests/frontend/diagnostics/02_type_mismatch.stderr new file mode 100644 index 00000000..83ffd6a8 --- /dev/null +++ b/tests/frontend/diagnostics/02_type_mismatch.stderr @@ -0,0 +1,2 @@ +error: +assignment makes integer from pointer diff --git a/tests/frontend/diagnostics/03_incompatible_types.c b/tests/frontend/diagnostics/03_incompatible_types.c new file mode 100644 index 00000000..0d85efde --- /dev/null +++ b/tests/frontend/diagnostics/03_incompatible_types.c @@ -0,0 +1,3 @@ +int f(int x) { + return x + "hello"; +} diff --git a/tests/frontend/diagnostics/03_incompatible_types.stderr b/tests/frontend/diagnostics/03_incompatible_types.stderr new file mode 100644 index 00000000..83ffd6a8 --- /dev/null +++ b/tests/frontend/diagnostics/03_incompatible_types.stderr @@ -0,0 +1,2 @@ +error: +assignment makes integer from pointer diff --git a/tests/frontend/diagnostics/03_redefinition.c b/tests/frontend/diagnostics/03_redefinition.c new file mode 100644 index 00000000..c1146dba --- /dev/null +++ b/tests/frontend/diagnostics/03_redefinition.c @@ -0,0 +1,5 @@ +int f(void) { + int x; + int x; + return 0; +} diff --git a/tests/frontend/diagnostics/03_redefinition.stderr b/tests/frontend/diagnostics/03_redefinition.stderr new file mode 
100644 index 00000000..85426ce3 --- /dev/null +++ b/tests/frontend/diagnostics/03_redefinition.stderr @@ -0,0 +1,2 @@ +error: +redeclaration of 'x' diff --git a/tests/frontend/diagnostics/04_invalid_lvalue.c b/tests/frontend/diagnostics/04_invalid_lvalue.c new file mode 100644 index 00000000..de772e38 --- /dev/null +++ b/tests/frontend/diagnostics/04_invalid_lvalue.c @@ -0,0 +1 @@ +int f(void) { int a[2]; a = 0; return 0; } diff --git a/tests/frontend/diagnostics/04_invalid_lvalue.stderr b/tests/frontend/diagnostics/04_invalid_lvalue.stderr new file mode 100644 index 00000000..ad755c64 --- /dev/null +++ b/tests/frontend/diagnostics/04_invalid_lvalue.stderr @@ -0,0 +1,2 @@ +error: +lvalue expected diff --git a/tests/frontend/diagnostics/05_incompatible_call.c b/tests/frontend/diagnostics/05_incompatible_call.c new file mode 100644 index 00000000..8024552f --- /dev/null +++ b/tests/frontend/diagnostics/05_incompatible_call.c @@ -0,0 +1,2 @@ +void g(int x); +void h(void) { g("hello"); } diff --git a/tests/frontend/diagnostics/05_incompatible_call.stderr b/tests/frontend/diagnostics/05_incompatible_call.stderr new file mode 100644 index 00000000..83ffd6a8 --- /dev/null +++ b/tests/frontend/diagnostics/05_incompatible_call.stderr @@ -0,0 +1,2 @@ +error: +assignment makes integer from pointer diff --git a/tests/frontend/diagnostics/break_outside_loop.c b/tests/frontend/diagnostics/break_outside_loop.c new file mode 100644 index 00000000..3fd466ab --- /dev/null +++ b/tests/frontend/diagnostics/break_outside_loop.c @@ -0,0 +1,4 @@ +int f(void) { + break; + return 0; +} diff --git a/tests/frontend/diagnostics/break_outside_loop.stderr b/tests/frontend/diagnostics/break_outside_loop.stderr new file mode 100644 index 00000000..cf48f319 --- /dev/null +++ b/tests/frontend/diagnostics/break_outside_loop.stderr @@ -0,0 +1,2 @@ +error: +cannot break diff --git a/tests/frontend/diagnostics/continue_outside_loop.c b/tests/frontend/diagnostics/continue_outside_loop.c new file mode 
100644 index 00000000..3679f912 --- /dev/null +++ b/tests/frontend/diagnostics/continue_outside_loop.c @@ -0,0 +1,4 @@ +int f(void) { + continue; + return 0; +} diff --git a/tests/frontend/diagnostics/continue_outside_loop.stderr b/tests/frontend/diagnostics/continue_outside_loop.stderr new file mode 100644 index 00000000..24664c55 --- /dev/null +++ b/tests/frontend/diagnostics/continue_outside_loop.stderr @@ -0,0 +1,2 @@ +error: +cannot continue diff --git a/tests/frontend/diagnostics/duplicate_label.c b/tests/frontend/diagnostics/duplicate_label.c new file mode 100644 index 00000000..9b9ac577 --- /dev/null +++ b/tests/frontend/diagnostics/duplicate_label.c @@ -0,0 +1,6 @@ +int f(void) { +label: + ; +label: + return 0; +} diff --git a/tests/frontend/diagnostics/duplicate_label.stderr b/tests/frontend/diagnostics/duplicate_label.stderr new file mode 100644 index 00000000..203b3203 --- /dev/null +++ b/tests/frontend/diagnostics/duplicate_label.stderr @@ -0,0 +1,2 @@ +error: +duplicate label diff --git a/tests/frontend/diagnostics/invalid_lvalue.c b/tests/frontend/diagnostics/invalid_lvalue.c new file mode 100644 index 00000000..6ce4bf0c --- /dev/null +++ b/tests/frontend/diagnostics/invalid_lvalue.c @@ -0,0 +1,5 @@ +int f(void) { + int x; + x + 1 = 2; + return 0; +} diff --git a/tests/frontend/diagnostics/invalid_lvalue.stderr b/tests/frontend/diagnostics/invalid_lvalue.stderr new file mode 100644 index 00000000..ad755c64 --- /dev/null +++ b/tests/frontend/diagnostics/invalid_lvalue.stderr @@ -0,0 +1,2 @@ +error: +lvalue expected diff --git a/tests/frontend/diagnostics/missing_closing_brace.c b/tests/frontend/diagnostics/missing_closing_brace.c new file mode 100644 index 00000000..743b8ae7 --- /dev/null +++ b/tests/frontend/diagnostics/missing_closing_brace.c @@ -0,0 +1 @@ +int f(void) { diff --git a/tests/frontend/diagnostics/missing_closing_brace.stderr b/tests/frontend/diagnostics/missing_closing_brace.stderr new file mode 100644 index 00000000..c8a30790 --- 
/dev/null +++ b/tests/frontend/diagnostics/missing_closing_brace.stderr @@ -0,0 +1,2 @@ +error: +expression expected before diff --git a/tests/frontend/diagnostics/missing_semicolon.c b/tests/frontend/diagnostics/missing_semicolon.c new file mode 100644 index 00000000..4a3e45e7 --- /dev/null +++ b/tests/frontend/diagnostics/missing_semicolon.c @@ -0,0 +1 @@ +int x diff --git a/tests/frontend/diagnostics/missing_semicolon.stderr b/tests/frontend/diagnostics/missing_semicolon.stderr new file mode 100644 index 00000000..e4a402d6 --- /dev/null +++ b/tests/frontend/diagnostics/missing_semicolon.stderr @@ -0,0 +1,2 @@ +error: +';' expected diff --git a/tests/frontend/diagnostics/type_mismatch.c b/tests/frontend/diagnostics/type_mismatch.c new file mode 100644 index 00000000..44c85d64 --- /dev/null +++ b/tests/frontend/diagnostics/type_mismatch.c @@ -0,0 +1,5 @@ +int f(void) { + int x; + x = &x; + return 0; +} diff --git a/tests/frontend/diagnostics/type_mismatch.stderr b/tests/frontend/diagnostics/type_mismatch.stderr new file mode 100644 index 00000000..83ffd6a8 --- /dev/null +++ b/tests/frontend/diagnostics/type_mismatch.stderr @@ -0,0 +1,2 @@ +error: +assignment makes integer from pointer diff --git a/tests/frontend/diagnostics/undeclared_identifier.c b/tests/frontend/diagnostics/undeclared_identifier.c new file mode 100644 index 00000000..0fad89e7 --- /dev/null +++ b/tests/frontend/diagnostics/undeclared_identifier.c @@ -0,0 +1,3 @@ +int f(void) { + return x; +} diff --git a/tests/frontend/diagnostics/undeclared_identifier.stderr b/tests/frontend/diagnostics/undeclared_identifier.stderr new file mode 100644 index 00000000..252ae7ed --- /dev/null +++ b/tests/frontend/diagnostics/undeclared_identifier.stderr @@ -0,0 +1,2 @@ +error: +'x' undeclared diff --git a/tests/frontend/diagnostics/void_variable.c b/tests/frontend/diagnostics/void_variable.c new file mode 100644 index 00000000..35a53e9c --- /dev/null +++ b/tests/frontend/diagnostics/void_variable.c @@ -0,0 +1 @@ 
+void v; diff --git a/tests/frontend/diagnostics/void_variable.stderr b/tests/frontend/diagnostics/void_variable.stderr new file mode 100644 index 00000000..14b45024 --- /dev/null +++ b/tests/frontend/diagnostics/void_variable.stderr @@ -0,0 +1,2 @@ +error: +declaration of void object diff --git a/tests/frontend/pp/01_macro_expand.c b/tests/frontend/pp/01_macro_expand.c new file mode 100644 index 00000000..cc77180d --- /dev/null +++ b/tests/frontend/pp/01_macro_expand.c @@ -0,0 +1,2 @@ +#define VALUE 42 +int x = VALUE; diff --git a/tests/frontend/pp/01_macro_expand.expect b/tests/frontend/pp/01_macro_expand.expect new file mode 100644 index 00000000..642ca52e --- /dev/null +++ b/tests/frontend/pp/01_macro_expand.expect @@ -0,0 +1 @@ +int x = 42; diff --git a/tests/frontend/pp/01_simple_macro.c b/tests/frontend/pp/01_simple_macro.c new file mode 100644 index 00000000..08753e6b --- /dev/null +++ b/tests/frontend/pp/01_simple_macro.c @@ -0,0 +1,2 @@ +#define ADD(a, b) (a + b) +int x = ADD(1, 2); diff --git a/tests/frontend/pp/01_simple_macro.expect b/tests/frontend/pp/01_simple_macro.expect new file mode 100644 index 00000000..97ccef91 --- /dev/null +++ b/tests/frontend/pp/01_simple_macro.expect @@ -0,0 +1 @@ +int x = (1 + 2); diff --git a/tests/frontend/pp/02_stringify.c b/tests/frontend/pp/02_stringify.c new file mode 100644 index 00000000..9839ca85 --- /dev/null +++ b/tests/frontend/pp/02_stringify.c @@ -0,0 +1,2 @@ +#define STR(x) #x +char *s = STR(hello); diff --git a/tests/frontend/pp/02_stringify.expect b/tests/frontend/pp/02_stringify.expect new file mode 100644 index 00000000..c23d8a24 --- /dev/null +++ b/tests/frontend/pp/02_stringify.expect @@ -0,0 +1 @@ +char *s = "hello"; diff --git a/tests/frontend/pp/03_token_paste.c b/tests/frontend/pp/03_token_paste.c new file mode 100644 index 00000000..2ae43a5b --- /dev/null +++ b/tests/frontend/pp/03_token_paste.c @@ -0,0 +1,2 @@ +#define CAT(a, b) a ## b +int xy = CAT(x, y); diff --git 
a/tests/frontend/pp/03_token_paste.expect b/tests/frontend/pp/03_token_paste.expect new file mode 100644 index 00000000..6b09dfd4 --- /dev/null +++ b/tests/frontend/pp/03_token_paste.expect @@ -0,0 +1 @@ +int xy = xy; diff --git a/tests/frontend/pp/04_if_expr.c b/tests/frontend/pp/04_if_expr.c new file mode 100644 index 00000000..ea4090e7 --- /dev/null +++ b/tests/frontend/pp/04_if_expr.c @@ -0,0 +1,5 @@ +#if 1 + 1 == 2 +int yes; +#else +int no; +#endif diff --git a/tests/frontend/pp/04_if_expr.expect b/tests/frontend/pp/04_if_expr.expect new file mode 100644 index 00000000..2ab8b3ff --- /dev/null +++ b/tests/frontend/pp/04_if_expr.expect @@ -0,0 +1 @@ +int yes; diff --git a/tests/frontend/pp/04_variadic.c b/tests/frontend/pp/04_variadic.c new file mode 100644 index 00000000..a79094d3 --- /dev/null +++ b/tests/frontend/pp/04_variadic.c @@ -0,0 +1,2 @@ +#define LOG(fmt, ...) printf(fmt, __VA_ARGS__) +LOG("value: %d", 42); diff --git a/tests/frontend/pp/04_variadic.expect b/tests/frontend/pp/04_variadic.expect new file mode 100644 index 00000000..bdf21fbc --- /dev/null +++ b/tests/frontend/pp/04_variadic.expect @@ -0,0 +1 @@ +printf("value: %d", 42); diff --git a/tests/frontend/pp/05_ifdef.c b/tests/frontend/pp/05_ifdef.c new file mode 100644 index 00000000..90757674 --- /dev/null +++ b/tests/frontend/pp/05_ifdef.c @@ -0,0 +1,6 @@ +#define FLAG +#ifdef FLAG +int enabled = 1; +#else +int enabled = 0; +#endif diff --git a/tests/frontend/pp/05_ifdef.expect b/tests/frontend/pp/05_ifdef.expect new file mode 100644 index 00000000..c4869c45 --- /dev/null +++ b/tests/frontend/pp/05_ifdef.expect @@ -0,0 +1 @@ +int enabled = 1; diff --git a/tests/frontend/pp/06_recursive_macro.c b/tests/frontend/pp/06_recursive_macro.c new file mode 100644 index 00000000..aec3929f --- /dev/null +++ b/tests/frontend/pp/06_recursive_macro.c @@ -0,0 +1,3 @@ +#define A A B +#define B B A +int x = A; diff --git a/tests/frontend/pp/06_recursive_macro.expect 
b/tests/frontend/pp/06_recursive_macro.expect new file mode 100644 index 00000000..a6f2d85f --- /dev/null +++ b/tests/frontend/pp/06_recursive_macro.expect @@ -0,0 +1 @@ +int x = A B A; diff --git a/tests/frontend/pp/07_stringify_escapes.c b/tests/frontend/pp/07_stringify_escapes.c new file mode 100644 index 00000000..7af11c12 --- /dev/null +++ b/tests/frontend/pp/07_stringify_escapes.c @@ -0,0 +1,4 @@ +#define STR(x) #x +char *a = STR("he said \"hi\""); +char *b = STR(a \ b); +char *c = STR( spaced out ); diff --git a/tests/frontend/pp/07_stringify_escapes.expect b/tests/frontend/pp/07_stringify_escapes.expect new file mode 100644 index 00000000..718eb11e --- /dev/null +++ b/tests/frontend/pp/07_stringify_escapes.expect @@ -0,0 +1,3 @@ +char *a = "\"he said \\\"hi\\\"\""; +char *b = "a \ b"; +char *c = "spaced out"; diff --git a/tests/frontend/pp/08_paste_multiple.c b/tests/frontend/pp/08_paste_multiple.c new file mode 100644 index 00000000..e5955860 --- /dev/null +++ b/tests/frontend/pp/08_paste_multiple.c @@ -0,0 +1,4 @@ +#define TRIPLE(a, b, c) a ## b ## c +int xyz = TRIPLE(x, y, z); +#define MKID(a, b) a##_##b +int foo_bar = MKID(foo, bar); diff --git a/tests/frontend/pp/08_paste_multiple.expect b/tests/frontend/pp/08_paste_multiple.expect new file mode 100644 index 00000000..3ebe294c --- /dev/null +++ b/tests/frontend/pp/08_paste_multiple.expect @@ -0,0 +1,2 @@ +int xyz = xyz; +int foo_bar = foo_bar; diff --git a/tests/frontend/pp/08_paste_operator_adjacent.c b/tests/frontend/pp/08_paste_operator_adjacent.c new file mode 100644 index 00000000..fea44052 --- /dev/null +++ b/tests/frontend/pp/08_paste_operator_adjacent.c @@ -0,0 +1,15 @@ +/* Pasting an identifier/number with an adjacent punctuator that does not + recombine into a single valid preprocessing token (C11 6.10.3.3p3: "If the + result is not a valid preprocessing token, the behavior is undefined"). 
+ tcc recovers by re-lexing the pasted text as however many tokens it + naturally splits into, emits a "does not give a valid preprocessing + token" warning, and keeps going -- this is a permitted (if idiosyncratic) + recovery strategy for UB, not a standard violation. This test pins tcc's + current recovery output (including its formatting quirks) so a future + change to the recovery path is a deliberate, visible decision. */ +#define PLUSPLUS(a) a ## ++ +int i = 1; +int j = PLUSPLUS(i); +#define NEG(a) - ## a +int k = 5; +int m = NEG(3); diff --git a/tests/frontend/pp/08_paste_operator_adjacent.expect b/tests/frontend/pp/08_paste_operator_adjacent.expect new file mode 100644 index 00000000..4489c31a --- /dev/null +++ b/tests/frontend/pp/08_paste_operator_adjacent.expect @@ -0,0 +1,6 @@ +int i = 1; +int j = + i ++; +int k = 5; +int m = + - 3; diff --git a/tests/frontend/pp/09_elif_chain.c b/tests/frontend/pp/09_elif_chain.c new file mode 100644 index 00000000..cf5d2f8f --- /dev/null +++ b/tests/frontend/pp/09_elif_chain.c @@ -0,0 +1,25 @@ +#define X 3 + +#if X == 1 +int v = 1; +#elif X == 2 +int v = 2; +#elif X == 3 +int v = 3; +#elif X == 4 +int v = 4; +#else +int v = -1; +#endif + +#if -1 > 0U +int neg_vs_unsigned = 1; +#else +int neg_vs_unsigned = 0; +#endif + +#if 0xFFFFFFFFU == -1 +int allones = 1; +#else +int allones = 0; +#endif diff --git a/tests/frontend/pp/09_elif_chain.expect b/tests/frontend/pp/09_elif_chain.expect new file mode 100644 index 00000000..c15823d6 --- /dev/null +++ b/tests/frontend/pp/09_elif_chain.expect @@ -0,0 +1,3 @@ +int v = 3; +int neg_vs_unsigned = 1; +int allones = 0; diff --git a/tests/frontend/pp/09_if_defined_complex.c b/tests/frontend/pp/09_if_defined_complex.c new file mode 100644 index 00000000..04fda830 --- /dev/null +++ b/tests/frontend/pp/09_if_defined_complex.c @@ -0,0 +1,13 @@ +#define FOO 1 +#define BAR 0 +#if defined(FOO) && (defined BAR || !defined(BAZ)) +int a = 1; +#else +int a = 0; +#endif + +#if (defined(FOO) ? 
BAR : FOO) == 0 +int b = 1; +#else +int b = 0; +#endif diff --git a/tests/frontend/pp/09_if_defined_complex.expect b/tests/frontend/pp/09_if_defined_complex.expect new file mode 100644 index 00000000..5e511a32 --- /dev/null +++ b/tests/frontend/pp/09_if_defined_complex.expect @@ -0,0 +1,2 @@ +int a = 1; +int b = 1; diff --git a/tests/frontend/pp/10_nested_macro_call_args.c b/tests/frontend/pp/10_nested_macro_call_args.c new file mode 100644 index 00000000..8bc1d326 --- /dev/null +++ b/tests/frontend/pp/10_nested_macro_call_args.c @@ -0,0 +1,7 @@ +#define ADD(a, b) ((a) + (b)) +#define CALL_ADD(x, y) ADD(x, y) +#define TWICE(f, x) f(x, x) +int p = CALL_ADD(2, 3); +int q = TWICE(ADD, 5); +#define APPLY(f, ...) f(__VA_ARGS__) +int r = APPLY(ADD, 4, 6); diff --git a/tests/frontend/pp/10_nested_macro_call_args.expect b/tests/frontend/pp/10_nested_macro_call_args.expect new file mode 100644 index 00000000..89efaa3b --- /dev/null +++ b/tests/frontend/pp/10_nested_macro_call_args.expect @@ -0,0 +1,3 @@ +int p = ((2) + (3)); +int q = ((5) + (5)); +int r = ((4) + (6)); diff --git a/tests/frontend/pp/11_string_concat_macro.c b/tests/frontend/pp/11_string_concat_macro.c new file mode 100644 index 00000000..126755b3 --- /dev/null +++ b/tests/frontend/pp/11_string_concat_macro.c @@ -0,0 +1,7 @@ +#define GREETING "Hello" +char *s1 = GREETING ", " "World"; +#define STR(x) #x +#define XSTR(x) STR(x) +#define VERSION_MAJOR 1 +#define VERSION_MINOR 2 +char *ver = XSTR(VERSION_MAJOR) "." XSTR(VERSION_MINOR); diff --git a/tests/frontend/pp/11_string_concat_macro.expect b/tests/frontend/pp/11_string_concat_macro.expect new file mode 100644 index 00000000..e9485a5b --- /dev/null +++ b/tests/frontend/pp/11_string_concat_macro.expect @@ -0,0 +1,2 @@ +char *s1 = "Hello" ", " "World"; +char *ver = "1" "." 
"2"; diff --git a/tests/frontend/pp/12_line_file_include.c b/tests/frontend/pp/12_line_file_include.c new file mode 100644 index 00000000..a8ac120f --- /dev/null +++ b/tests/frontend/pp/12_line_file_include.c @@ -0,0 +1,18 @@ +/* __LINE__ must reset relative to each file (1 at the top of the included + header, then resume counting in the includer after the #include returns), + and must reflect the *use* site when expanded from inside a function-like + macro body, not the macro's definition site. + Note: __FILE__'s value is intentionally not printed here -- the test + harness invokes the compiler with an absolute path to this very file, so + asserting on __FILE__'s exact text would bake the repo checkout's + absolute filesystem path into the golden and break on any other clone + location. The #ifdef below only checks that __FILE__ is a recognized, + always-defined macro inside an included file (not just the main file). */ +int main_line1 = __LINE__; +#include "line_hdr.h" +int main_line3 = __LINE__; +#define WRAP_LINE() __LINE__ +int wrapped = WRAP_LINE(); +#ifdef __FILE__ +int file_macro_defined = 1; +#endif diff --git a/tests/frontend/pp/12_line_file_include.expect b/tests/frontend/pp/12_line_file_include.expect new file mode 100644 index 00000000..2d2b21ef --- /dev/null +++ b/tests/frontend/pp/12_line_file_include.expect @@ -0,0 +1,5 @@ +int main_line1 = 11; +int hdr_line = 1; +int main_line3 = 13; +int wrapped = 15; +int file_macro_defined = 1; diff --git a/tests/frontend/pp/13_pragma_unknown.c b/tests/frontend/pp/13_pragma_unknown.c new file mode 100644 index 00000000..30c72abd --- /dev/null +++ b/tests/frontend/pp/13_pragma_unknown.c @@ -0,0 +1,4 @@ +#pragma some_unknown_pragma foo bar +int before = 1; +#pragma STDC FP_CONTRACT ON +int after = 2; diff --git a/tests/frontend/pp/13_pragma_unknown.expect b/tests/frontend/pp/13_pragma_unknown.expect new file mode 100644 index 00000000..30c72abd --- /dev/null +++ b/tests/frontend/pp/13_pragma_unknown.expect @@ -0,0 
+1,4 @@ +#pragma some_unknown_pragma foo bar +int before = 1; +#pragma STDC FP_CONTRACT ON +int after = 2; diff --git a/tests/frontend/pp/14_pragma_operator_currently_unsupported.c b/tests/frontend/pp/14_pragma_operator_currently_unsupported.c new file mode 100644 index 00000000..64a1ce82 --- /dev/null +++ b/tests/frontend/pp/14_pragma_operator_currently_unsupported.c @@ -0,0 +1,22 @@ +/* BUG (regression pin): the `_Pragma(string-literal)` unary operator + (C11 6.10.9) is not implemented at all in tccpp.c -- there is no + TOK__Pragma / keyword recognition anywhere in the lexer, only the + `#pragma` *directive* form is handled (pragma_parse() in tccpp.c). + Per the standard, `_Pragma("X")` must be destringized and processed as + if by `#pragma X` right there in the token stream (this is what lets + `#define DO_PRAGMA(x) _Pragma(#x)` conditionally emit pragmas from + macros -- a very common portable-header idiom). + Current (wrong) behavior: under `-E`, the `_Pragma(...)` call is passed + through completely untouched instead of being rewritten to + `#pragma message "hi"` (compare: gcc -E performs the rewrite). This + golden pins that passthrough. The effect is worse than a cosmetic -E + difference: a real (non -E) compile of `_Pragma("message \"hi\"")` + fails outright, e.g. at file scope with + `error: identifier expected`, or inside a function body with + `warning: implicit declaration of function '_Pragma'` followed by + `error: ';' expected`, because `_Pragma` is parsed as an ordinary + (unrecognized) identifier/call rather than a preprocessor operator. + Once `_Pragma` support is added, this golden must be updated to the + destringized-and-rewritten form. 
*/ +_Pragma("message \"hi\"") +int x = 1; diff --git a/tests/frontend/pp/14_pragma_operator_currently_unsupported.expect b/tests/frontend/pp/14_pragma_operator_currently_unsupported.expect new file mode 100644 index 00000000..90783692 --- /dev/null +++ b/tests/frontend/pp/14_pragma_operator_currently_unsupported.expect @@ -0,0 +1,2 @@ +_Pragma("message \"hi\"") +int x = 1; diff --git a/tests/frontend/pp/15_variadic_edge.c b/tests/frontend/pp/15_variadic_edge.c new file mode 100644 index 00000000..98ed5d38 --- /dev/null +++ b/tests/frontend/pp/15_variadic_edge.c @@ -0,0 +1,11 @@ +#define LOG0(fmt, ...) printf(fmt, ##__VA_ARGS__) +LOG0("no args"); +LOG0("one arg: %d", 7); + +#define COUNT(...) VA_COUNT(__VA_ARGS__, 5, 4, 3, 2, 1) +#define VA_COUNT(a1, a2, a3, a4, a5, N, ...) N +int n1 = COUNT(a); +int n2 = COUNT(a, b, c); + +#define TRAIL(a, ...) a, __VA_ARGS__ +int arr[] = { TRAIL(1, 2, 3,) }; diff --git a/tests/frontend/pp/15_variadic_edge.expect b/tests/frontend/pp/15_variadic_edge.expect new file mode 100644 index 00000000..07a8b5da --- /dev/null +++ b/tests/frontend/pp/15_variadic_edge.expect @@ -0,0 +1,5 @@ +printf("no args"); +printf("one arg: %d",7); +int n1 = 1; +int n2 = 3; +int arr[] = { 1, 2, 3, }; diff --git a/tests/frontend/pp/16_multiline_backslash_comment.c b/tests/frontend/pp/16_multiline_backslash_comment.c new file mode 100644 index 00000000..8eff034e --- /dev/null +++ b/tests/frontend/pp/16_multiline_backslash_comment.c @@ -0,0 +1,8 @@ +#define ADD3(a, b, c) \ + ((a) + /* first add */ \ + (b) + \ + (c)) /* final */ +int total = ADD3(1, 2, 3); + +#define COMMENTED_VALUE /* leading comment */ 99 /* trailing comment */ +int val = COMMENTED_VALUE; diff --git a/tests/frontend/pp/16_multiline_backslash_comment.expect b/tests/frontend/pp/16_multiline_backslash_comment.expect new file mode 100644 index 00000000..4b9b8667 --- /dev/null +++ b/tests/frontend/pp/16_multiline_backslash_comment.expect @@ -0,0 +1,2 @@ +int total = ((1) + (2) + (3)); +int val = 
99; diff --git a/tests/frontend/pp/17_redefinition_identical.c b/tests/frontend/pp/17_redefinition_identical.c new file mode 100644 index 00000000..7ff4ab10 --- /dev/null +++ b/tests/frontend/pp/17_redefinition_identical.c @@ -0,0 +1,11 @@ +#define SIZE 10 +#define SIZE 10 +int arr[SIZE]; + +#define FN(x) ((x) * 2) +#define FN(x) ((x) * 2) +int y = FN(5); + +#define SP_TEST 1 + 2 +#define SP_TEST 1 + 2 +int z = SP_TEST; diff --git a/tests/frontend/pp/17_redefinition_identical.expect b/tests/frontend/pp/17_redefinition_identical.expect new file mode 100644 index 00000000..423f2b64 --- /dev/null +++ b/tests/frontend/pp/17_redefinition_identical.expect @@ -0,0 +1,3 @@ +int arr[10]; +int y = ((5) * 2); +int z = 1 + 2; diff --git a/tests/frontend/pp/18_empty_arg_and_placemarker_paste.c b/tests/frontend/pp/18_empty_arg_and_placemarker_paste.c new file mode 100644 index 00000000..407faf90 --- /dev/null +++ b/tests/frontend/pp/18_empty_arg_and_placemarker_paste.c @@ -0,0 +1,11 @@ +#define PAIR(a, b) a##b +int e1 = PAIR(, 5); +int e2 = PAIR(5, ); +int e3 = PAIR(, ); + +#define TWO(a, b) [a][b] +int e4 TWO(, x); + +#define OPEQ(a, b) a ## b +int v = 1; +v OPEQ(+, =) 3; diff --git a/tests/frontend/pp/18_empty_arg_and_placemarker_paste.expect b/tests/frontend/pp/18_empty_arg_and_placemarker_paste.expect new file mode 100644 index 00000000..24069505 --- /dev/null +++ b/tests/frontend/pp/18_empty_arg_and_placemarker_paste.expect @@ -0,0 +1,6 @@ +int e1 = 5; +int e2 = 5; +int e3 = ; +int e4 [][x]; +int v = 1; +v += 3; diff --git a/tests/frontend/pp/empty_macro.c b/tests/frontend/pp/empty_macro.c new file mode 100644 index 00000000..13318671 --- /dev/null +++ b/tests/frontend/pp/empty_macro.c @@ -0,0 +1,2 @@ +#define EMPTY +int x EMPTY = 1; diff --git a/tests/frontend/pp/empty_macro.expect b/tests/frontend/pp/empty_macro.expect new file mode 100644 index 00000000..46481df0 --- /dev/null +++ b/tests/frontend/pp/empty_macro.expect @@ -0,0 +1 @@ +int x = 1; diff --git 
a/tests/frontend/pp/include_guard.c b/tests/frontend/pp/include_guard.c new file mode 100644 index 00000000..d13753b5 --- /dev/null +++ b/tests/frontend/pp/include_guard.c @@ -0,0 +1,4 @@ +#ifndef GUARD_H +#define GUARD_H +int guarded; +#endif diff --git a/tests/frontend/pp/include_guard.expect b/tests/frontend/pp/include_guard.expect new file mode 100644 index 00000000..f60a294a --- /dev/null +++ b/tests/frontend/pp/include_guard.expect @@ -0,0 +1 @@ +int guarded; diff --git a/tests/frontend/pp/line_continuation.c b/tests/frontend/pp/line_continuation.c new file mode 100644 index 00000000..d715914b --- /dev/null +++ b/tests/frontend/pp/line_continuation.c @@ -0,0 +1,3 @@ +#define LONG \ + 123 +int x = LONG; diff --git a/tests/frontend/pp/line_continuation.expect b/tests/frontend/pp/line_continuation.expect new file mode 100644 index 00000000..0d0efa38 --- /dev/null +++ b/tests/frontend/pp/line_continuation.expect @@ -0,0 +1 @@ +int x = 123; diff --git a/tests/frontend/pp/line_hdr.h b/tests/frontend/pp/line_hdr.h new file mode 100644 index 00000000..816cf419 --- /dev/null +++ b/tests/frontend/pp/line_hdr.h @@ -0,0 +1 @@ +int hdr_line = __LINE__; diff --git a/tests/frontend/pp/macro_indirection.c b/tests/frontend/pp/macro_indirection.c new file mode 100644 index 00000000..1b53a6f1 --- /dev/null +++ b/tests/frontend/pp/macro_indirection.c @@ -0,0 +1,3 @@ +#define A B +#define B 3 +int x = A; diff --git a/tests/frontend/pp/macro_indirection.expect b/tests/frontend/pp/macro_indirection.expect new file mode 100644 index 00000000..3694828b --- /dev/null +++ b/tests/frontend/pp/macro_indirection.expect @@ -0,0 +1 @@ +int x = 3; diff --git a/tests/frontend/pp/macro_undef.c b/tests/frontend/pp/macro_undef.c new file mode 100644 index 00000000..8865e629 --- /dev/null +++ b/tests/frontend/pp/macro_undef.c @@ -0,0 +1,4 @@ +#define FOO 1 +int a = FOO; +#undef FOO +int b = FOO; diff --git a/tests/frontend/pp/macro_undef.expect b/tests/frontend/pp/macro_undef.expect new file mode 
100644 index 00000000..3aa14e59 --- /dev/null +++ b/tests/frontend/pp/macro_undef.expect @@ -0,0 +1,2 @@ +int a = 1; +int b = FOO; diff --git a/tests/frontend/pp/pragma_once.c b/tests/frontend/pp/pragma_once.c new file mode 100644 index 00000000..8a08415d --- /dev/null +++ b/tests/frontend/pp/pragma_once.c @@ -0,0 +1,2 @@ +#pragma once +int once; diff --git a/tests/frontend/pp/pragma_once.expect b/tests/frontend/pp/pragma_once.expect new file mode 100644 index 00000000..4c7ad461 --- /dev/null +++ b/tests/frontend/pp/pragma_once.expect @@ -0,0 +1 @@ +int once; diff --git a/tests/frontend/pp/predefined_macros.c b/tests/frontend/pp/predefined_macros.c new file mode 100644 index 00000000..e0e9f8f6 --- /dev/null +++ b/tests/frontend/pp/predefined_macros.c @@ -0,0 +1,3 @@ +int line = __LINE__; +const char *date = __DATE__; +const char *time = __TIME__; diff --git a/tests/frontend/pp/predefined_macros.expect b/tests/frontend/pp/predefined_macros.expect new file mode 100644 index 00000000..94b9d4f0 --- /dev/null +++ b/tests/frontend/pp/predefined_macros.expect @@ -0,0 +1,3 @@ +int line = 1; +const char *date = ""; +const char *time = "