diff --git a/.gitignore b/.gitignore index bbfc3cbd..6f465adc 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .#* *.o *.so +*.elf *.a *.exe *.dll @@ -85,3 +86,19 @@ tmp/ .venv bin/ +.yasos-build +tests/ir_tests/pch_header_check +tests/ir_tests/pch_usage_check +pch/ +config.h.cross +.opencode +test.txt +tests/ir_tests/dump_ir.txt +tests/ir_tests/dump.txt +tests/ir_tests/dump_fine.txt +tests/ir_tests/dump_ir_fine.txt +.aider* +.claude +.cache +scripts/.disasm_cache.json +scripts/.disasm_cache.pending.json diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index 5652322c..00000000 --- a/AGENTS.md +++ /dev/null @@ -1,476 +0,0 @@ -# TinyCC for ARMv8-M - Agent Guide - -## Project Overview - -This is a specialized fork of **TinyCC (Tiny C Compiler)** focused on **ARMv8-M architecture** support (Cortex-M33, Cortex-M23, and similar ARMv8-M microcontrollers). It features a custom Intermediate Representation (IR) and code generation pipeline optimized for embedded ARM targets. - -### Key Characteristics - -- **Primary Target**: ARMv8-M (Cortex-M33) with Thumb-2 instruction set -- **Architecture**: IR-based compilation with separate front-end and back-end -- **Floating Point**: Multiple FP options (software, VFPv4-sp, VFPv5-dp, RP2350 DCP) -- **Library**: Can be used as `libtcc.a` library for JIT compilation -- **License**: GNU Lesser General Public License (LGPL) - -## Project Structure - -``` -. 
-├── Core Compiler Sources -│ ├── tcc.c # Main driver/CLI entry point -│ ├── tccpp.c # C preprocessor -│ ├── tccgen.c # C parser and type system -│ ├── tccir.c # Intermediate Representation (IR) generator -│ ├── tccir.h # IR definitions and opcodes -│ ├── tccir_operand.c # IR operand handling -│ ├── tccir_operand.h # IR operand definitions -│ ├── tccls.c # Liveness analysis and register allocation -│ ├── tccld.c # Linker -│ ├── tccelf.c # ELF file format support -│ ├── tccasm.c # Inline assembler -│ ├── tccdbg.c # Debug info generation -│ ├── tccdebug.c # Debug utilities -│ ├── libtcc.c # Library API implementation -│ └── tccyaff.c # YAFF (Yet Another File Format) support -│ -├── ARM-Specific Sources -│ ├── arm-thumb-gen.c # ARM Thumb-2 code generator (from IR) -│ ├── arm-thumb-opcodes.c# Thumb-2 opcode builders -│ ├── arm-thumb-opcodes.h# Thumb-2 instruction definitions -│ ├── arm-thumb-asm.c # ARM assembler parser -│ ├── arm-thumb-callsite.c# Call site handling for ARM -│ ├── arm-thumb-defs.h # ARM-specific definitions -│ ├── arm-link.c # ARM linker support -│ ├── arch/armv8m.c # ARMv8-M architecture configuration -│ └── arch/arm_aapcs.c # ARM Procedure Call Standard support -│ -├── Headers -│ ├── tcc.h # Main compiler header -│ ├── libtcc.h # Public library API -│ ├── tcctok.h # Token definitions -│ ├── tccld.h # Linker interface -│ ├── tccls.h # Liveness analysis interface -│ ├── tccabi.h # ABI definitions -│ ├── thumb-tok.h # ARM Thumb token definitions -│ └── svalue.h # Stack value definitions -│ -├── Libraries -│ ├── lib/ # Runtime library sources (libtcc1.a) -│ │ ├── libtcc1.c # Core runtime functions -│ │ ├── armeabi.c # ARM EABI helper functions -│ │ ├── armv8m_eabi.c # ARMv8-M EABI specific -│ │ └── fp/ # Floating point libraries -│ │ ├── soft/ # Software FP implementation -│ │ ├── arm/vfpv4-sp/ # VFPv4 single-precision -│ │ ├── arm/vfpv5-dp/ # VFPv5 double-precision -│ │ └── arm/rp2350/ # RP2350 DCP support -│ └── include/ # System headers (tcclib.h, 
stddef.h, etc.) -│ -├── Tests -│ ├── tests/ir_tests/ # IR-level tests (pytest-based) -│ ├── tests/thumb/armv8m/# Assembly instruction tests -│ ├── tests/tests2/ # C language compliance tests -│ ├── tests/pp/ # Preprocessor tests -│ └── tests/benchmarks/ # Performance benchmarks -│ -├── Build System -│ ├── configure # Configuration script (POSIX shell) -│ ├── Makefile # Main build rules -│ ├── config.mak # Generated configuration -│ └── config.h # Generated C headers -│ -└── Documentation - ├── tcc-doc.texi # Texinfo documentation source - ├── LAZY_SECTION_LOADING.md # Lazy loading design doc - └── asm_port.md # Assembler porting notes -``` - -## Build System - -### Prerequisites - -- GCC or Clang compiler -- GNU Make -- Python 3 with virtualenv (for tests) -- `arm-none-eabi-gcc` (for ARMv8-M cross-compilation) - -### Configure Options - -```bash -./configure [options] - --prefix=PREFIX # Installation prefix [/usr/local] - --enable-cross # Build cross compilers - --debug # Include debug info - --enable-asan # Enable AddressSanitizer - --disable-static # Build shared library (libtcc.so) -``` - -### Build Commands - -```bash -# Configure for native build (x86_64) -./configure - -# Build ARMv8-M cross compiler -make cross - -# Build everything including fp-libs -make cross fp-libs - -# Run tests (use -j16 for parallel execution) -make test -j16 - -# Clean build artifacts -make clean - -# Install (default: /usr/local) -make install -``` - -### Output Files - -- `armv8m-tcc` - ARMv8-M cross compiler executable -- `armv8m-libtcc1.a` - Runtime library for ARMv8-M -- `libtcc1-fp-*.a` - Floating point libraries for different FPU configs -- `libtcc.a` or `libtcc.so` - Library version of compiler - -### Docker Environment - -A Dockerfile is provided for a reproducible build environment with all dependencies pre-installed. The CI workflow also uses this Dockerfile for consistent testing. 
- -**Build the container image using Make:** -```bash -# Build with default settings (localhost/tinycc-armv8m:latest) -make container-build - -# Build for GitHub Container Registry (GHCR) -make container-build DOCKER_REGISTRY=ghcr.io DOCKER_IMAGE_NAME=yourusername/tinycc-armv8m - -# Build for Docker Hub -make container-build DOCKER_REGISTRY=docker.io DOCKER_IMAGE_NAME=yourusername/tinycc-armv8m -``` - -**Push the container image to registry:** -```bash -# Push to GitHub Container Registry (must be logged in: docker/podman login ghcr.io) -make container-push DOCKER_REGISTRY=ghcr.io DOCKER_IMAGE_NAME=yourusername/tinycc-armv8m - -# Push to Docker Hub (must be logged in: docker/podman login docker.io) -make container-push DOCKER_REGISTRY=docker.io DOCKER_IMAGE_NAME=yourusername/tinycc-armv8m -``` - -**Examples:** -```bash -# Build and push to GHCR for this repo (moby/tinycc) -make container-push DOCKER_REGISTRY=ghcr.io DOCKER_IMAGE_NAME=moby/tinycc-armv8m DOCKER_IMAGE_TAG=v1.0 - -# Build and push to Docker Hub -make container-push DOCKER_REGISTRY=docker.io DOCKER_IMAGE_NAME=myuser/tinycc-armv8m DOCKER_IMAGE_TAG=latest -``` - -**CI/CD:** -The CI workflow (`.github/workflows/ci.yml`) pulls the pre-built image from `ghcr.io/USERNAME/tinycc-armv8m:latest` and runs tests inside it. The container image is built and pushed by `.github/workflows/docker-build.yml` when the Dockerfile changes or manually via workflow dispatch. - -**Legacy aliases:** `make docker-build` and `make docker-push` also work. - -**Manual Docker usage:** -```bash -# Build manually -docker build -t tinycc-armv8m . 
- -# Interactive shell -docker run -it --rm -v $(pwd):/workspace tinycc-armv8m - -# Run tests directly -docker run --rm -v $(pwd):/workspace tinycc-armv8m bash -c "\ - virtualenv .venv && \ - source .venv/bin/activate && \ - make test -j$(nproc)" -``` - -**Docker image includes:** -- Ubuntu 24.04 base -- GCC, G++, Make, Git -- Python 3 with virtualenv support -- ARM cross-compilation toolchain (`gcc-arm-none-eabi`) -- QEMU user-mode for ARM emulation -- GDB multi-arch for debugging - -## Testing - -### Test Structure - -The project uses multiple testing frameworks: - -1. **IR Tests** (`tests/ir_tests/`): pytest-based functional tests - - Test C code compilation to IR and execution via QEMU - - Requirements: `pytest`, `pytest-xdist`, `pexpect` - - Tests are numbered: `01_hello_world.c`, `20_op_add.c`, etc. - - Each `.c` file has a corresponding `.expect` file with expected output - -2. **GCC Torture Tests** (`tests/gcctestsuite/`): GCC c-torture test suite - - ~2000 compile tests and ~1700 execute tests from GCC - - Git submodule at `tests/gcctestsuite/gcc-testsuite` - - Run via `make test-all` or `pytest tests/gcctestsuite/` - -3. **Assembly Tests** (`tests/thumb/armv8m/`): pytest-based assembler tests - - Test individual Thumb-2 instructions - - Compares TCC output against `arm-none-eabi-gcc` - -4. 
**Legacy Tests** (`tests/tests2/`, `tests/pp/`): Makefile-based tests - - C language compliance tests (curated subset run via IR tests) - - Preprocessor tests - -### Running Tests - -```bash -# Initialize GCC testsuite submodule (one-time) -git submodule update --init --depth 1 tests/gcctestsuite/gcc-testsuite - -# Run IR tests (includes curated tests2) -make test -j16 - -# Run GCC torture tests -make test-all - -# Run only IR tests -make test-venv test-prepare -cd tests/ir_tests && pytest -s -n auto - -# Run only assembly tests -make test-asm -j16 - -# Run legacy tests -make test-legacy -j16 - -# Run AEABI host tests -make test-aeabi-host -j16 -``` - -### Quick Test Runner (run.py) - -For quick manual testing, use `tests/ir_tests/run.py`: - -```bash -cd tests/ir_tests - -# Compile and run a single file with default flags -python run.py -c mytest.c - -# Compile with optimization flags -python run.py -c mytest.c --cflags="-O1" - -# Dump IR while running -python run.py -c mytest.c --cflags="-O1" --dump-ir - -# Use GCC instead of TCC for comparison -python run.py -c mytest.c --gcc=/usr/bin/arm-none-eabi-gcc - -# Run a pre-compiled ELF file -python run.py -f build/mytest.elf - -# Enable GDB debugging (QEMU waits for debugger) -python run.py -c mytest.c --gdb - -# Pass command-line arguments to the test program -python run.py -c mytest.c --args arg1 arg2 arg3 -``` - -### Test Requirements for IR Tests - -The first run will build newlib for the ARM target: -```bash -cd tests/ir_tests/qemu/mps2-an505 && sh ./build_newlib.sh -``` - -This creates `newlib_build/arm-none-eabi/newlib/libc.a` needed for linking. 
- -## Code Architecture - -### Compilation Pipeline - -``` -C Source (.c) - ↓ -Preprocessor (tccpp.c) - macro expansion, includes - ↓ -Parser (tccgen.c) - semantic analysis, type checking - ↓ -IR Generation (tccir.c) - platform-independent IR - ↓ -IR Optimization - constant folding, dead code elimination - ↓ -Register Allocation (tccls.c) - liveness analysis, register assignment - ↓ -Code Generation (arm-thumb-gen.c) - Thumb-2 machine code - ↓ -ELF Output (tccelf.c) - relocations, sections, symbols -``` - -### IR (Intermediate Representation) - -The IR is a three-address code representation with: - -- **Operations**: `TCCIR_OP_ADD`, `TCCIR_OP_LOAD`, `TCCIR_OP_FUNCCALLVAL`, etc. -- **Operands**: Registers, immediates, memory references, symbols -- **Types**: `IR_TYPE_S32`, `IR_TYPE_F32`, `IR_TYPE_F64`, etc. - -Key files: -- `tccir.h` - IR opcodes and structures -- `tccir_operand.h` - Operand types and accessors -- `tccir.c` - IR generation from AST -- `arm-thumb-gen.c` - IR to Thumb-2 code generation - -### Register Allocation - -Two-phase register allocation in `tccls.c`: - -1. **Liveness Analysis**: Compute live ranges for virtual registers -2. 
**Register Allocation**: Assign physical registers using linear scan - -Architecture configuration in `arch/armv8m.c`: -```c -ArchitectureConfig architecture_config = { - .pointer_size = 4, - .stack_align = 8, - .reg_size = 4, - .parameter_registers = 4, // r0-r3 for arguments - .has_fpu = 0, -}; -``` - -## Coding Conventions - -### Style Guidelines - -check .clang-format - -Example: -```c -void function_name(int arg) -{ - if (condition) { - do_something(); - } else { - do_other(); - } -} -``` - -### Compiler Warnings - -The build uses strict warnings: -```makefile -CFLAGS += -std=c11 -Wunused-function -Wno-declaration-after-statement -Werror -``` - -### Debug Macros - -Enable debug output with build flags: -```bash -make CFLAGS+='-DPARSE_DEBUG' # Parser debug -make CFLAGS+='-DPP_DEBUG' # Preprocessor debug -make CFLAGS+='-DASM_DEBUG' # Assembler debug -make CFLAGS+='-DCONFIG_TCC_DEBUG' # IR dump (-dump-ir) -make CFLAGS+='-DTCC_LS_DEBUG' # Register allocator debug (linear scan) -``` - -The `TCC_LS_DEBUG` flag enables detailed logging of the linear scan register allocator: -- Live interval creation and range information -- Register assignment decisions (including callee-saved vs caller-saved) -- Spilling decisions and stack slot allocation -- Active interval expiration -- Scratch register allocation -- Final register allocation summary - -## Floating Point Support - -The compiler supports multiple FP configurations via `lib/fp/`: - -| FPU Type | Library | Description | -|----------|---------|-------------| -| Software | `libsoftfp.{a,so}` | Pure C soft-float (no FPU) | -| VFPv4-sp | `libvfpv4sp.{a,so}` | Cortex-M4F (single-precision) | -| VFPv5-dp | `libvfpv5dp.{a,so}` | Cortex-M7 (double-precision) | -| RP2350 | `librp2350fp.{a,so}` | RP2350 double coprocessor | - -Build specific FP library: -```bash -cd lib/fp && make FPU=vfpv4-sp # static .a -cd lib/fp && make FPU=vfpv4-sp build-shared # shared .so -``` - -## Key Development Notes - -### Adding a New IR 
Instruction - -1. Add opcode to `TccIrOp` enum in `tccir.h` -2. Add lowering logic in `arm-thumb-gen.c` -3. Add test case in `tests/ir_tests/` - -### Adding Assembly Instructions - -1. Add opcode builder in `arm-thumb-opcodes.c` -2. Add token definition in `thumb-tok.h` -3. Add parser support in `arm-thumb-asm.c` -4. Add test case in `tests/thumb/armv8m/` - -### Important Limitations - -- This fork is specifically tailored for ARMv8-M (Cortex-M33) -- Native compilation on x86_64 is not the primary use case -- Some standard C features may be incomplete (check test suite) - -## Library API (libtcc) - -The compiler can be used as a library for JIT compilation: - -```c -#include <libtcc.h> - -TCCState *s = tcc_new(); -tcc_set_output_type(s, TCC_OUTPUT_MEMORY); -tcc_compile_string(s, "int square(int x) { return x*x; }"); -tcc_relocate(s); -int (*square)(int) = tcc_get_symbol(s, "square"); -int result = square(5); -tcc_delete(s); -``` - -See `libtcc.h` for full API and `tests/libtcc_test.c` for examples. - -## Security Considerations - -- The compiler processes untrusted C code; input validation is essential -- Buffer bounds are checked in most places but fuzzing is recommended -- The `-b` option enables runtime bounds checking (when available) -- Stack protector support varies by target - -## Troubleshooting - -### Common Build Issues - -1. **Missing `config.mak`**: Run `./configure` first -2. **Missing `arm-none-eabi-gcc`**: Install ARM GNU toolchain -3. 
**Tests fail with QEMU errors**: Ensure qemu-arm is installed - -### Debug Techniques - -```bash -# Dump IR for a file -./armv8m-tcc -dump-ir -c test.c - -# Show verbose output -./armv8m-tcc -vv -c test.c - -# Enable bounds checking -./armv8m-tcc -b -run test.c -``` - -## Related Documentation - -- `README` - Original TinyCC README -- `LAZY_SECTION_LOADING.md` - Design for lazy section loading -- `asm_port.md` - Assembler porting notes -- `lib/fp/README.md` - Floating point library documentation -- `tcc-doc.html` - Full documentation (requires `makeinfo`) diff --git a/CLAUDE.md b/CLAUDE.md index b8251553..6adb9f25 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -138,16 +138,36 @@ void function_name(int arg) Build uses `-std=c11 -Wunused-function -Werror`. -## Debug Flags +## Debug Logging -Pass via `CFLAGS+=` to `make`: +Unified logging system defined in `log.h`. Each scope is a compile-time switch: ```bash -make CFLAGS+='-DPARSE_DEBUG' # parser debug -make CFLAGS+='-DPP_DEBUG' # preprocessor debug -make CFLAGS+='-DASM_DEBUG' # assembler debug +make CFLAGS+='-DTCC_LOG_ALL=1' # enable ALL logging scopes +make CFLAGS+='-DTCC_LOG_IR_GEN=1' # IR generation & optimization passes +make CFLAGS+='-DTCC_LOG_LOOP_OPT=1' # loop optimization (induction vars) +make CFLAGS+='-DTCC_LOG_IV_SR=1' # induction variable / strength reduction +make CFLAGS+='-DTCC_LOG_LICM=1' # loop-invariant code motion +make CFLAGS+='-DTCC_LOG_LS=1' # linear scan register allocator +make CFLAGS+='-DTCC_LOG_STACK_ALLOC=1' # stack frame allocation +make CFLAGS+='-DTCC_LOG_CODEGEN=1' # frontend code generation (tccgen.c) +make CFLAGS+='-DTCC_LOG_INLINE_STRUCT=1' # inline struct return expansion +make CFLAGS+='-DTCC_LOG_CALLSITE=1' # call site processing +make CFLAGS+='-DTCC_LOG_YAFF=1' # YAFF object format +make CFLAGS+='-DTCC_LOG_THOP=1' # thumb opcode encoding trace +make CFLAGS+='-DTCC_LOG_THUMB=1' # thumb code generation (general) +make CFLAGS+='-DTCC_LOG_MACH=1' # machine-level store/assign +make 
CFLAGS+='-DTCC_LOG_BRANCH_OPT=1' # branch size optimization +make CFLAGS+='-DTCC_LOG_SCRATCH=1' # scratch register management +make CFLAGS+='-DTCC_LOG_RELOC=1' # ELF relocation processing +make CFLAGS+='-DTCC_LOG_POOL=1' # IR memory pool +``` + +Use `LOG_<SCOPE>(fmt, ...)` macros in code. Output goes to stderr with `[SCOPE]` prefix. + +Other debug flags (not part of log.h): +```bash make CFLAGS+='-DCONFIG_TCC_DEBUG' # enables -dump-ir flag -make CFLAGS+='-DTCC_LS_DEBUG' # register allocator detail ``` At runtime: diff --git a/Makefile b/Makefile index 95765abb..937648f4 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,28 @@ ifeq (-$(GCC_MAJOR)-$(findstring $(GCC_MINOR),56789)-,-4--) CFLAGS += -D_FORTIFY_SOURCE=0 endif +ENABLE_GC_SECTIONS ?= no +ENABLE_LTO ?= no +RELEASE ?= no + +ifneq ($(filter 1 yes true,$(RELEASE)),) + ENABLE_GC_SECTIONS := yes + ENABLE_LTO := yes + CFLAGS := $(filter-out -g,$(CFLAGS)) + CFLAGS += -DNDEBUG + LDFLAGS += -s +endif + +ifneq ($(filter 1 yes true,$(ENABLE_GC_SECTIONS)),) + CFLAGS += -ffunction-sections -fdata-sections + LDFLAGS += -Wl,--gc-sections +endif + +ifneq ($(filter 1 yes true,$(ENABLE_LTO)),) + CFLAGS += -flto + LDFLAGS += -flto +endif + LIBTCC = libtcc.a LIBTCC1 = libtcc1.a LINK_LIBTCC = @@ -147,9 +169,14 @@ endif PROGS_CROSS = $(foreach X,$(TCC_X),$X-tcc$(EXESUF)) LIBTCC1_CROSS = $(foreach X,$(LIBTCC1_X),$X-libtcc1.a) +AUTO_PCH_COMMON_HEADERS = stdio.h stdlib.h string.h +AUTO_PCH_STAMPS = $(foreach X,$(TCC_X),$(TOP)/pch/.$X-auto-pch.stamp) $(info $(LIBTCC1_CROSS)) # build cross compilers & libs +# PCH disabled on YasOS (unused; costs runtime heap + startup probe time) — +# auto-PCH generation dropped here. Re-add $(AUTO_PCH_STAMPS) (and re-enable +# the loader in tccpp.c / pch_auto_enabled) to restore precompiled headers. 
cross: $(LIBTCC1_CROSS) $(PROGS_CROSS) $(FP_LIBS_CROSS) # build specific cross compiler & lib @@ -190,6 +217,42 @@ $(FP_LIBS_STAMP_DIR)/.%-fp-libs.stamp: $(FP_LIBS_STAMP_DIR)/.%-tcc.checksum $(FP @# Save the checksum that was used for this build @cp $(abspath $(FP_LIBS_STAMP_DIR)/.$*-tcc.checksum) $(abspath $(FP_LIBS_STAMP_DIR)/.$*-fp-libs.checksum.saved) +$(TOP)/pch/.%-auto-pch.stamp: %-tcc$(EXESUF) + @mkdir -p "$(TOP)/pch/$*" + @dir="$(abspath $(TOP)/pch/$*)"; \ + index="$$dir/auto.index"; \ + tool="./$*-tcc$(EXESUF) -B$(TOP)"; \ + rm -f "$$index"; \ + for hdr in $(AUTO_PCH_COMMON_HEADERS); do rm -f "$$dir/$$hdr.pch" "$$dir/$$hdr.opt.pch"; done; \ + includes="$$($$tool -print-search-dirs 2>/dev/null | awk 'BEGIN { in_include = 0 } /^include:$$/ { in_include = 1; next } /^[^ ]/ { if (in_include) exit } in_include { sub(/^ /, ""); if ($$0 != "-") print }' || true)"; \ + for hdr in $(AUTO_PCH_COMMON_HEADERS); do \ + src=""; \ + for inc in $$includes; do \ + if [ -f "$$inc/$$hdr" ]; then \ + src="$$inc/$$hdr"; \ + break; \ + fi; \ + done; \ + [ -n "$$src" ] || continue; \ + for opt in 0 1; do \ + if [ "$$opt" = 0 ]; then oflags=""; pch="$$hdr.pch"; else oflags="-O$$opt"; pch="$$hdr.opt.pch"; fi; \ + if $$tool $$oflags -generate-pch "$$src" -o "$$dir/$$pch" >/dev/null 2>&1; then \ + probe="$$dir/.$$hdr.probe.c"; \ + printf '#include <%s>\nint main(void){return 0;}\n' "$$hdr" > "$$probe"; \ + out="$$($$tool $$oflags -use-pch "$$dir/$$pch" -E "$$probe" 2>&1 >/dev/null || true)"; \ + rm -f "$$probe"; \ + if ! 
printf '%s' "$$out" | grep -q 'ignoring PCH'; then \ + printf '%s\t%s\n' "$$src" "$$pch" >> "$$index"; \ + else \ + rm -f "$$dir/$$pch"; \ + fi; \ + else \ + rm -f "$$dir/$$pch"; \ + fi; \ + done; \ + done; \ + touch "$@" + install: ; @$(MAKE) --no-print-directory install$(CFG) install-strip: ; @$(MAKE) --no-print-directory install$(CFG) CONFIG_strip=yes uninstall: ; @$(MAKE) --no-print-directory uninstall$(CFG) @@ -235,11 +298,13 @@ LIB-$(TR) ?= {B}:/usr/$(TRIPLET-$T)/lib:/usr/lib/$(MARCH-$T) INC-$(TR) ?= {B}/include:/usr/$(TRIPLET-$T)/include:/usr/include endif -IR_FILES = ir/type.c ir/pool.c ir/vreg.c ir/stack.c ir/live.c ir/dump.c ir/codegen.c ir/opt.c ir/opt_jump_thread.c ir/licm.c ir/core.c ir/machine_op.c +IR_FILES = ir/type.c ir/pool.c ir/vreg.c ir/stack.c ir/dump.c ir/codegen.c ir/opt.c ir/opt_du.c ir/opt_xform.c ir/opt_utils.c ir/opt_alias.c ir/opt_loop_utils.c ir/opt_engine.c ir/opt_pipeline.c ir/opt_hash.c ir/opt_gens_fusion.c ir/opt_gens_bool.c ir/opt_gens_call_result.c ir/opt_gens_branch.c ir/opt_loop.c ir/opt_loop_dead.c ir/opt_memory.c ir/opt_jump_thread.c ir/opt_pack64.c ir/opt_dce.c ir/opt_constfold.c ir/opt_branch.c ir/opt_copyprop.c ir/opt_fusion.c ir/opt_promote.c ir/opt_constprop.c ir/opt_knownbits.c ir/opt_dead_lea_store.c ir/opt_const_aggregate.c ir/opt_dead_vla.c ir/opt_loop_const_sim.c ir/opt_switch_data.c ir/opt_reroll.c ir/opt_neg_chain.c ir/opt_bitfield.c ir/opt_cmp_fuse.c ir/opt_setif_or_taut.c ir/licm.c ir/cfg.c ir/ssa.c ir/opt/ssa_opt.c ir/opt/ssa_opt_dce.c ir/opt/ssa_opt_cprop.c ir/opt/ssa_opt_fold.c ir/opt/ssa_opt_phi.c ir/opt/ssa_opt_strength.c ir/opt/ssa_opt_gvn.c ir/opt/ssa_opt_reassoc.c ir/opt/ssa_opt_narrow.c ir/opt/ssa_opt_branch.c ir/opt/ssa_opt_sccp.c ir/opt/ssa_opt_load_cse.c ir/opt/ssa_opt_dead_loop.c ir/opt/ssa_opt_cmp_eq.c ir/regalloc.c ir/core.c ir/machine_op.c CORE_FILES = tccir_operand.c tccls.c tcc.c tcctools.c libtcc.c tccpp.c tccgen.c tccdbg.c tccelf.c tccasm.c tccyaff.c tccld.c tccdebug.c svalue.c tccmachine.c 
tccopt.c $(IR_FILES) -CORE_FILES += tcc.h config.h libtcc.h tcctok.h tccir.h tccir_operand.h tccld.h tccmachine.h tccopt.h +CORE_FILES += tcc.h config.h libtcc.h tcctok.h tccir.h tccir_operand.h tccld.h tccmachine.h tccopt.h log.h CORE_FILES += $(wildcard ir/*.h) -armv8m_FILES = $(CORE_FILES) arch/arm_aapcs.c arch/armv8m.c arm-thumb-opcodes.c arm-thumb-gen.c arm-thumb-callsite.c arm-link.c arm-thumb-asm.c arm-thumb-defs.h thumb-tok.h +armv8m_FILES = $(CORE_FILES) arm-thumb-gen.c arm-thumb-callsite.c arm-link.c arm-thumb-asm.c arm-thumb-defs.h thumb-tok.h arch/arm/thumb/thumb.h arch/arm/arm.h +armv8m_ARCH = arm +armv8m_ARCH_LIB = $(X)arch/arm/libarm.a TCCDEFS_H$(subst yes,,$(CONFIG_predefs)) = tccdefs_.h @@ -249,10 +314,11 @@ LIBTCC_SRC = $(filter-out tcc.c tcctools.c,$(filter %.c,$($T_FILES))) # Compile from separate objects LIBTCC_OBJ = $(patsubst %.c,$(X)%.o,$(LIBTCC_SRC)) LIBTCC_INC = $(filter %.h %-gen.c %-link.c,$($T_FILES)) -TCC_FILES = $(X)tcc.o $(LIBTCC_OBJ) +ARCH_LIB = $($T_ARCH_LIB) +TCC_FILES = $(X)tcc.o $(LIBTCC_OBJ) $(ARCH_LIB) $(X)tccpp.o : $(TCCDEFS_H) -DEFINES += -I$(TOP) -I$(TOP)/ir +DEFINES += -I$(TOP) -I$(TOP)/ir -I$(TOP)/ir/opt GITHASH:=$(shell git rev-parse --abbrev-ref HEAD 2>/dev/null || echo no) ifneq ($(GITHASH),no) @@ -275,9 +341,13 @@ endif $(X)%.o : %.c $(LIBTCC_INC) $S$(CC) -o $@ -c $< $(addsuffix ,$(DEFINES) $(CFLAGS)) -$(X)arch/%.o : arch/%.c $(LIBTCC_INC) - @mkdir -p $(dir $@) - $S$(CC) -o $@ -c $< $(addsuffix ,$(DEFINES) $(CFLAGS)) +# Architecture library — built by nested Makefile +TARGET_ARCH_NAME = $($T_ARCH) +$(ARCH_LIB): FORCE + @mkdir -p $(dir $(ARCH_LIB)) + $S$(MAKE) --no-print-directory -C arch ARCH=$(TARGET_ARCH_NAME) \ + TOP=$(CURDIR) BUILD_DIR=$(CURDIR)/$(dir $(ARCH_LIB)) \ + CC="$(CC)" AR="$(AR)" CFLAGS="$(CFLAGS)" DEFINES="$(DEFINES)" $(X)ir/%.o : ir/%.c $(LIBTCC_INC) @mkdir -p $(dir $@) @@ -364,6 +434,7 @@ install-unx: $(call IFw,$(TOPSRC)/lib/fp/libsoftfp.a $(TOPSRC)/lib/fp/libvfpv4sp.a $(TOPSRC)/lib/fp/libvfpv5dp.a 
$(TOPSRC)/lib/fp/librp2350fp.a,"$(libdir)") $(call IFw,$(TOPSRC)/lib/fp/libsoftfp.so $(TOPSRC)/lib/fp/libvfpv4sp.so $(TOPSRC)/lib/fp/libvfpv5dp.so $(TOPSRC)/lib/fp/librp2350fp.so,"$(libdir)") $(call IF,$(TOPSRC)/include/*.h $(TOPSRC)/tcclib.h,"$(tccdir)/include") + @if [ -d "$(TOPSRC)/pch" ]; then echo "-> $(tccdir)/pch : $(TOPSRC)/pch" ; mkdir -p "$(tccdir)/pch" && cp -r "$(TOPSRC)/pch"/. "$(tccdir)/pch" ; fi $(call $(if $(findstring .so,$(LIBTCC)),IBw,IFw),$(LIBTCC),"$(libdir)") $(call IF,$(TOPSRC)/libtcc.h,"$(includedir)") $(call IFw,tcc.1,"$(mandir)/man1") @@ -436,10 +507,33 @@ VENV_PIP := $(VENV_BINDIR)/pip IRTESTS_DIR := tests/ir_tests IRTESTS_REQUIREMENTS := $(IRTESTS_DIR)/requirements.txt IRTESTS_VENV_STAMP := $(VENV_DIR)/.irtests-requirements.stamp +PCH_BENCHMARK_SCRIPT := $(IRTESTS_DIR)/benchmark_pch.py +PCH_PREPARE_SCRIPT := $(IRTESTS_DIR)/prepare_pch.py NEWLIB_DIR := $(IRTESTS_DIR)/qemu/mps2-an505/newlib_build/arm-none-eabi/newlib NEWLIB_LIBC_A := $(NEWLIB_DIR)/libc.a +# newlib is a vendored submodule (its include dir is symlinked into +# libc_includes/newlib). We must not commit edits into it; instead keep local +# fixups as patches under tests/ir_tests/patches and apply them idempotently +# before any target that consumes the headers (warn-check, test-prepare). 
+NEWLIB_SRC := $(IRTESTS_DIR)/qemu/mps2-an505/libs/newlib +NEWLIB_PATCH_DIR := $(IRTESTS_DIR)/patches + +.PHONY: patch-newlib +patch-newlib: + @for p in $$(ls $(NEWLIB_PATCH_DIR)/*.patch 2>/dev/null | sort); do \ + ap=$$(cd $$(dirname "$$p") && pwd)/$$(basename "$$p"); \ + if git -C $(NEWLIB_SRC) apply --reverse --check "$$ap" >/dev/null 2>&1; then \ + : ; \ + elif git -C $(NEWLIB_SRC) apply --check "$$ap" >/dev/null 2>&1; then \ + echo "------------ newlib: applying patch $$(basename $$p) ------------"; \ + git -C $(NEWLIB_SRC) apply "$$ap"; \ + else \ + echo "WARNING: newlib patch $$(basename $$p) does not apply cleanly (skipping)"; \ + fi; \ + done + # Host tests for soft-float aeabi functions AEABI_HOST_TESTS = test_aeabi_all test_host test_dmul_host AEABI_HOST_TEST_DIR = lib/fp/soft @@ -473,12 +567,31 @@ $(IRTESTS_VENV_STAMP): $(IRTESTS_REQUIREMENTS) touch "$@" .PHONY: test-prepare -test-prepare: +test-prepare: patch-newlib @set -e; \ if [ -f "$(NEWLIB_LIBC_A)" ]; then exit 0; fi; \ echo "------------ ir_tests: building newlib (first run) ------------"; \ cd $(IRTESTS_DIR)/qemu/mps2-an505 && sh ./build_newlib.sh +.PHONY: rebuild-newlib +rebuild-newlib: + @echo "------------ ir_tests: rebuilding newlib ------------" + @rm -rf $(IRTESTS_DIR)/qemu/mps2-an505/newlib_build + @cd $(IRTESTS_DIR)/qemu/mps2-an505 && sh ./build_newlib.sh + +.PHONY: prepare-pch benchmark-pch benchmark-pch-libc benchmark-pch-libtcc +prepare-pch: cross + @$(PYTHON) "$(PCH_PREPARE_SCRIPT)" $(PCH_PREPARE_ARGS) + +benchmark-pch: cross + @$(PYTHON) "$(PCH_BENCHMARK_SCRIPT)" $(PCH_BENCHMARK_ARGS) + +benchmark-pch-libc: cross + @$(PYTHON) "$(PCH_BENCHMARK_SCRIPT)" --scenario libc-common $(PCH_BENCHMARK_ARGS) + +benchmark-pch-libtcc: cross + @$(PYTHON) "$(PCH_BENCHMARK_SCRIPT)" --scenario libtcc $(PCH_BENCHMARK_ARGS) + ASMTESTS_DIR := tests/thumb/armv8m @@ -498,13 +611,50 @@ test-asm: cross test-venv $(PYTEST) --tb=short -q -n $(J) .; \ fi +# Check that cross-compilation produces no unexpected 
warnings or errors. +# Rebuilds libtcc1.a and compiles test files with -c, failing on any +# "warning:" or "error:" in stderr. +WARN_CHECK_SRCS = \ + tests/tests2/15_recursion.c \ + tests/tests2/14_if.c \ + tests/tests2/04_for.c \ + tests/tests2/08_while.c \ + tests/tests2/09_do_while.c \ + tests/tests2/06_case.c \ + tests/tests2/07_function.c + +.PHONY: warn-check +warn-check: armv8m-tcc$(EXESUF) patch-newlib + @echo "------------ warn-check: libtcc1.a build ------------" + @rm -f armv8m-libtcc1.a + @log=$$($(MAKE) --no-print-directory armv8m-libtcc1.a 2>&1) ; \ + warns=$$(echo "$$log" | grep -c -E 'warning:|error:') ; \ + if [ "$$warns" -ne 0 ]; then \ + echo "FAIL: unexpected warnings/errors building libtcc1.a:" ; \ + echo "$$log" | grep -E 'warning:|error:' ; \ + exit 1 ; \ + fi + @echo "------------ warn-check: test file compilation ------------" + @fail=0 ; \ + wc_inc="-nostdinc -I$(IRTESTS_DIR)/libc_includes -I$(IRTESTS_DIR)/libc_imports -I$(IRTESTS_DIR)/libc_includes/newlib -Iinclude" ; \ + for f in $(WARN_CHECK_SRCS); do \ + out=$$(./armv8m-tcc$(EXESUF) $$wc_inc -c "$$f" -o /dev/null 2>&1) ; \ + if echo "$$out" | grep -qE 'warning:|error:'; then \ + echo "FAIL: $$f:" ; \ + echo "$$out" | grep -E 'warning:|error:' ; \ + fail=1 ; \ + fi ; \ + done ; \ + if [ "$$fail" -ne 0 ]; then exit 1; fi + @echo "------------ warn-check: passed ------------" + # run IR tests via pytest (preferred) -test: cross test-aeabi-host test-asm test-venv test-prepare download-gcc-tests +test: cross test-aeabi-host test-asm warn-check test-venv test-prepare download-gcc-tests ut @echo "------------ ir_tests (pytest) ------------" @if [ "$(USE_VENV)" = "1" ]; then \ - cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -s -n $(J); \ + cd $(IRTESTS_DIR) && "$(VENV_PY)" -m pytest -s -n $(J) --durations=10; \ else \ - cd $(IRTESTS_DIR) && $(PYTEST) -s -n $(J); \ + cd $(IRTESTS_DIR) && $(PYTEST) -s -n $(J) --durations=10; \ fi # legacy tests (kept for reference) @@ -529,6 +679,7 @@ 
test-install: $(TCCDEFS_H) clean: @rm -f tcc *-tcc tcc_p tcc_c @rm -f tags ETAGS *.o *.a *.so* *.out *.log lib*.def *.exe *.dll + @rm -rf *-ir/ *-arch/ @rm -f a.out *.dylib *_.h *.pod *.tcov @$(MAKE) -s -C lib $@ @$(MAKE) -s -C tests $@ @@ -606,7 +757,14 @@ test-all: cross test-aeabi-host test-asm test-venv test-prepare test-gcc-torture test-valgrind: $(MAKE) test VALGRIND=1 -.PHONY: all cross fp-libs clean test test-valgrind test-aeabi-host test-legacy test-tests2 test-gcc-torture test-gcc-torture-compile test-gcc-torture-execute test-full test-all download-gcc-tests tar tags ETAGS doc distclean install uninstall FORCE +# host-native internal unit tests (see tests/unit/README for the design) +ut: + $(MAKE) -C tests/unit run + +ut-clean: + $(MAKE) -C tests/unit clean + +.PHONY: all cross fp-libs clean test test-valgrind test-aeabi-host test-legacy test-tests2 test-gcc-torture test-gcc-torture-compile test-gcc-torture-execute test-full test-all rebuild-newlib download-gcc-tests tar tags ETAGS doc distclean install uninstall ut ut-clean FORCE # Container image settings (auto-detect docker or podman) DOCKER_REGISTRY ?= ghcr.io @@ -668,6 +826,8 @@ help: @echo " $(wordlist 9,99,$(TCC_X))" @echo "make test" @echo " rebuild + initialize GCC testsuite + run pytest in tests/ir_tests" + @echo "make rebuild-newlib" + @echo " wipe and rebuild newlib used by ir_tests/qemu (mps2-an505)" @echo "make test-legacy" @echo " run legacy make-based tests (tests/Makefile)" @echo "make tests2.all / make tests2.37 / make tests2.37+" diff --git a/PLAN.md b/PLAN.md new file mode 100644 index 00000000..974bc6f7 --- /dev/null +++ b/PLAN.md @@ -0,0 +1,222 @@ +# SSA Optimization Plan: Fold Inlined Check Functions to 7 Instructions + +## Goal + +Reduce `main` in `test_llong_load_signed.c` from 100 instructions to 7 (matching GCC -O2). +GCC's output is: `push; ldr; bl puts; ldr; bl puts; movs r0,#0; pop`. + +All 3 inlined `check_s64` comparisons must be proven always-equal and eliminated. 
+ +## Current State + +After pre-SSA optimizations and loop rotation, the SSA optimizer receives this IR for `main`: + +``` +0000: PARAM0[call_1] ... +0001: CALL puts ;; puts("Testing...") +0002: V0 <-- #0 +0003: T0 <-- GlobalSym(g1)***DEREF*** ;; T0 = g1 +0004: T1 <-- GlobalSym(g2)***DEREF*** ;; T1 = g2 +0005: StackLoc[-16] <-- T1 ;; arr[1] = g2 +0006: StackLoc[-8] <-- #-1099511627776 ;; arr[2] = -(1LL<<40) + +;; --- inlined check_s64("arr0", arr[0], g1) --- +0012: V2 <-- "arr0" +0013: T5 <-- T0 ;; got = T0 (= g1, from arr[0] forwarded by pre-SSA) +0014: V3 <-- T5 +0015: T6 <-- GlobalSym(g1)***DEREF*** ;; exp = reload g1 +0016: V4 <-- T6 +0017: CMP T5, T6 ;; ← SHOULD FOLD: both are g1 +0018: JMP == skip1 + ... printf FAIL path + RETURNVALUE #1 ... +skip1: + +;; --- inlined check_s64("arr1", arr[1], g2) --- +0030: T12 <-- T11***DEREF*** ;; got = *Addr[StackLoc[-16]] = arr[1] +0034: T14 <-- GlobalSym(g2)***DEREF*** ;; exp = g2 +0036: CMP T13, T14 ;; ← SHOULD FOLD: T12 loaded from StackLoc[-16] which holds T1 = g2 + ... printf FAIL path + RETURNVALUE #1 ... + +;; --- inlined check_s64("local", local, -(1LL<<40)) --- +0046: T18 <-- &V0 +0047: V9 <-- T18 +0050: T20 <-- V9 ;; T20 = &V0 +0051: T21 <-- V10 ;; T21 = -(1LL<<40) (from StackLoc[-8]) +0052: T20***DEREF*** <-- T21 ;; *&V0 = -(1LL<<40), i.e. V0 = -(1LL<<40) +0054: T22 <-- V0 [LOAD] ;; T22 = V0 = -(1LL<<40) +0056: CMP T22, #-1099511627776 ;; ← SHOULD FOLD: both are -(1LL<<40) + ... printf FAIL path + RETURNVALUE #1 ... + +0066: PARAM0 ... +0067: CALL puts ;; puts("PASS") +0068: RETURNVALUE #0 +``` + +After SSA optimization, **nothing folds** — all 3 CMPs and their dead error paths survive. + +--- + +## Three Comparisons, Three Root Causes + +### CMP 1: `CMP T5, T6` — Global Load CSE not firing + +**What happens:** T0 and T6 both load `GlobalSym(g1)***DEREF***`. No store to g1 between them. 
+ +**Initial hypothesis (rejected):** `ssa_opt_load_cse` correctly tracks global loads, and any CALL **invalidates all tracked loads** (lines 86-88 of `ssa_opt_load_cse.c`), so the `CALL puts` at instruction 1 looked like the culprit. However, that CALL comes before the T0 load at instruction 3, so T0 is registered after the call. T6 is at instruction 15. Between instructions 3 and 15, there are no CALLs or aliasing stores. **Load CSE should fire.** + +**Actual root cause:** The load CSE pass handles this correctly in theory. But between T0 (instruction 3) and T6 (instruction 15), there is a **basic block boundary** (the JUMPIF at instruction 18 creates a branch). T0 is in the entry block; T6 is in a dominated block after the first check_s64 branch. Since `gload_process_block` passes `state` by value to domtree children, the T0 entry should be visible in the block containing T6. + +**Error-path CALL:** between T0 and T6, there's a `CALL GlobalSym(printf)` at instruction 23 (the error path) which invalidates the load state. However, that CALL is in a **different basic block** (the error arm). Since load_cse walks the dominator tree, the error block is a child that gets its own copy of state. The continuation block (post-CMP) should still see T0. + +**Investigation needed:** Check if the CFG/dominator tree structure puts T6 in a block dominated by T0's block, and that the error-arm invalidation doesn't leak into the continuation. Also confirm that T0 and T6 really are in different blocks: in the listing above, the JUMPIF (instruction 18) comes after T6 (instruction 15), so any block boundary between them would have to come from the elided instructions 7-11. 
**Likely a dominator-tree issue or block-boundary issue.** + +### CMP 2: `CMP T13, T14` — Stack store-load forwarding through pointer + +**What happens:** +- `StackLoc[-16] <-- T1` stores g2 to arr[1] +- Later: `T10 <-- Addr[StackLoc[-16]]; V5 <-- T10; T11 <-- V5; T12 <-- T11***DEREF***` loads arr[1] through a pointer chain +- T14 loads `GlobalSym(g2)` again + +**Root cause:** The load of arr[1] goes through a VAR-indirected pointer (`T11 = V5 = Addr[StackLoc[-16]]`), not a direct `StackLoc[-16]` load. The pre-SSA SL-forward and the SSA optimizer don't resolve the pointer chain to recognize this is a stack load. + +**Fix:** After SSA cprop resolves `T11 → V5 → T10 → Addr[StackLoc[-16]]`, the load `T12 <-- T11***DEREF***` becomes `T12 <-- *Addr[StackLoc[-16]]` = load from StackLoc[-16]. Then: +1. Stack store-load forwarding: StackLoc[-16] holds T1 → T12 = T1 +2. Load CSE: T14 = GlobalSym(g2) = T1 (since T1 was loaded from g2) +3. CMP T12, T14 → CMP T1, T1 → fold + +**This requires:** SSA cprop to propagate through VARs into pointer dereferences, and then a store-load forwarding pass for stack slots. + +### CMP 3: `CMP T22, #-1099511627776` — SCCP through store-via-pointer + +**What happens:** +- `V0 = 0` initially +- `T20 = &V0; *T20 = -(1LL<<40)` stores through pointer +- `T22 = V0` loads the value + +**Root cause:** SCCP's `sccp_resolve_var` scans backward for store-through-pointer patterns. It finds `T20***DEREF*** <-- T21 [STORE]` and tries to trace T20 back to `&V0`. But T20 is defined as `T20 = V9`, and V9 is a VAR (not `&V0` directly). The backward scan doesn't follow through VAR indirection. 
+ +**Fix:** Either: +- (a) Run cprop before SCCP so T20 is simplified to `T20 = &V0` directly, or +- (b) Teach `sccp_resolve_var`'s backward pointer scan to follow through ASSIGN chains and VAR stores + +--- + +## Implementation Plan + +### Step 1: Fix pass ordering — run cprop before SCCP + +In `tcc_ir_ssa_opt_run` ([ssa_opt.c:405](ir/opt/ssa_opt.c#L405)): + +```c +// Current: +changes += ssa_opt_sccp(ctx); +changes += ssa_opt_cprop(ctx); + +// Change to: +changes += ssa_opt_cprop(ctx); +changes += ssa_opt_sccp(ctx); +``` + +**Why:** cprop resolves copy chains like `T20 = V9 = T18 = &V0` into direct `T20 = &V0`. SCCP's backward pointer scan then finds the `&V0` pattern directly. + +This alone should fix **CMP 3** (the `-(1LL<<40)` constant case). + +### Step 2: Extend SCCP to resolve store-through-pointer with LOAD sources + +In `sccp_resolve_var` ([ssa_opt_sccp.c:138](ir/opt/ssa_opt_sccp.c#L138)), when handling `STORE *T = src`: + +Currently, only constant-immediate sources are handled (`if (irop_is_immediate(src))`). Extend to resolve `src` through the SCCP lattice: + +```c +/* After finding *T = src where T = &V: */ +IROperand src = tcc_ir_op_get_src1(ir, q); +if (irop_is_immediate(src)) { + *out = irop_get_imm64_ex(ir, src); + return SCCP_CONST; +} +/* NEW: check if src TEMP has a known constant in the lattice */ +int32_t src_vr = irop_get_vreg(src); +SCCPCell *src_cell = sccp_cell(s, src_vr); +if (src_cell && src_cell->state == SCCP_CONST) { + *out = src_cell->value; + return SCCP_CONST; +} +return SCCP_BOTTOM; +``` + +**Why:** The stored value may come from a LOAD of a constant stack slot (e.g., `T21 = V10 [LOAD]` where V10 was assigned from `StackLoc[-8]` which holds `-(1LL<<40)`). After cprop + earlier SCCP iterations, T21 may be known-constant in the lattice. 
+ +### Step 3: Add CMP-of-same-vreg folding to ssa_opt_branch + +In `ssa_fold_cmp_jumpif` ([ssa_opt_branch.c:44](ir/opt/ssa_opt_branch.c#L44)): + +Already implemented at lines 66-77 — checks `vr1 == vr2`. This handles the case where load_cse converts the second load to an ASSIGN from the first, and cprop propagates it. **No change needed here.** + +### Step 4: Debug/fix load_cse dominator-tree traversal + +The `ssa_opt_load_cse` pass should already deduplicate the two `GlobalSym(g1)` loads (T0 at line 3, T6 at line 15). Verify that: + +1. The CFG correctly places T0 and T6 in blocks where T0's block dominates T6's block +2. No CALL or aliasing STORE between T0 and T6 invalidates the entry +3. The `GLoadState` passed by value to child blocks preserves T0's entry + +If load_cse fires correctly: +- T6 becomes `ASSIGN T0` +- cprop propagates: CMP T5, T6 → CMP T0, T0 +- branch fold: CMP identical vregs → always equal → JMP/NOP +- DCE removes the dead error path + +**Test:** Add `fprintf(stderr, ...)` in `gload_process_block` to trace tracked entries and invalidations per block. + +### Step 5: Add SSA stack store-load forwarding (for CMP 2) + +Create a new pass or extend `ssa_opt_load_cse` to handle stack-slot forwarding: + +**Pattern:** +``` +StackLoc[N] <-- Tx [STORE] +... +Ty <-- *Addr[StackLoc[N]] ;; after cprop resolved pointer chain +``` + +**Transform:** Replace `Ty` with `Tx` (the stored value). + +**Implementation in `ssa_opt_load_cse`:** Track `(StackLoc offset → result_vreg)` alongside global loads. On a LOAD where src1 is a TEMP that was assigned `Addr[StackLoc[N]]`, look up whether StackLoc[N] was previously stored. If so, replace the LOAD with ASSIGN from the stored vreg. + +**Invalidation:** Any STORE to the same StackLoc or any CALL or any aliasing store (to a non-local address) invalidates the entry. Stack-local stores to *different* offsets are safe. 
+ +### Step 6: Cascading cleanup (already implemented) + +After Steps 1-5 make the CMPs foldable, the existing passes cascade: + +1. **load_cse** → T6 = T0 (dedup global loads) +2. **cprop** → propagate copies +3. **branch** → CMP x,x → fold to always-equal → JUMP/NOP +4. **dce** → remove dead printf paths + RETURNVALUE #1 +5. **dce** → remove dead VAR stores (V2, V3, V4, etc.) + +Result: `main` = `puts + puts + RETURNVALUE #0` = 7 instructions. + +--- + +## Execution Order + +| # | Task | File(s) | Risk | Impact | +|---|------|---------|------|--------| +| 1 | Reorder cprop before sccp | `ssa_opt.c` | Low | Fixes CMP 3 | +| 2 | Extend SCCP lattice lookup for store sources | `ssa_opt_sccp.c` | Low | Strengthens CMP 3 | +| 3 | Debug/fix load_cse for CMP 1 | `ssa_opt_load_cse.c` | Medium | Fixes CMP 1 | +| 4 | Add stack store-load forwarding | `ssa_opt_load_cse.c` | Medium | Fixes CMP 2 | +| 5 | Run full test suite | — | — | Verify no regressions | + +## Verification + +```bash +./scripts/compare_disasm.py tests/ir_tests/test_llong_load_signed.c +# Expected: main = 7 instructions, Ratio = 1.00x + +make test -j16 # IR test suite +make test-asm -j16 # Assembly tests +``` diff --git a/PLAN_nested_functions.md b/PLAN_nested_functions.md deleted file mode 100644 index 7034d557..00000000 --- a/PLAN_nested_functions.md +++ /dev/null @@ -1,1141 +0,0 @@ -# Plan: Supporting GCC Nested Functions (20000822-1.c) - -## Problem Statement - -``` -❯ python run.py -c ../gcctestsuite/gcc-testsuite/gcc/testsuite/gcc.c-torture/execute/20000822-1.c --cflags="-O0" -Using CFLAGS: -O0 -Compilation failed: - 20000822-1.c:15: error: cannot use local functions -``` - -The test `20000822-1.c` uses **GCC nested functions** — a GNU C extension that allows defining functions inside other functions, with access to the enclosing scope's variables. TinyCC currently rejects this with a hard error at `tccgen.c:11393`. 
- ---- - -## Test Analysis - -```c -/* { dg-require-effective-target trampolines } */ -void abort(void); - -int f0(int (*fn)(int *), int *p) { - return (*fn)(p); // indirect call via function pointer -} - -int f1(void) { - int i = 0; - - int f2(int *p) { // (1) nested function definition - i = 1; // (2) writes to parent's local variable - return *p + 1; // (3) reads *p (which points to i) - } - - return f0(f2, &i); // (4) takes address of nested function → trampoline -} - -int main() { - if (f1() != 2) // expected: f2 sets i=1, returns *(&i)+1 = 2 - abort(); - return 0; -} -``` - -### GNU C Features Required - -| # | Feature | Complexity | Description | -|---|---------|------------|-------------| -| 1 | Nested function definition | Medium | `f2` defined inside `f1`'s body | -| 2 | Parent scope variable capture | High | `f2` reads/writes `i` from `f1`'s stack frame | -| 3 | Address-of nested function | High | `f2` passed as `int (*)(int*)` to `f0` | -| 4 | Trampoline / indirect call | High | `f0` calls `f2` through a function pointer — requires trampoline to set up static chain | - ---- - -## Affected GCC Torture Tests (14 total) - -All require `dg-require-effective-target trampolines`: - -| Test | Features Used | -|------|---------------| -| `20000822-1.c` | Nested func, capture, address-of, indirect call | -| `920428-2.c` | Nested function with capture | -| `920501-7.c` | Nested function with capture | -| `920612-2.c` | Nested function with capture | -| `921017-1.c` | Nested function with capture | -| `921215-1.c` | Nested function with capture | -| `931002-1.c` | Nested function with capture | -| `comp-goto-2.c` | Nested function + computed goto | -| `nestfunc-1.c` | Nested function basics | -| `nestfunc-2.c` | Nested function arguments | -| `nestfunc-3.c` | Nested function with struct returns | -| `nestfunc-5.c` | Nested function + `__label__` | -| `nestfunc-6.c` | Nested function + nonlocal goto | -| `pr24135.c` | Nested function + `__label__` + nonlocal goto | 
- ---- - -## Current Codebase State - -### Where the error originates - -```c -// tccgen.c:11391-11393 -if (tok == '{') { - if (l != VT_CONST) - tcc_error("cannot use local functions"); -``` - -`decl()` is called with `l = VT_LOCAL` when parsing block-scope declarations. -Only `l = VT_CONST` (file scope) is permitted to have function bodies. - -### Compilation pipeline (current) - -``` -decl(VT_CONST) → parse type + declarator → gen_function(sym) - ↓ - tcc_ir_alloc() ← one IR state per function - block(0) ← parse body, emit IR - optimization passes - register allocation - tcc_ir_codegen_generate() ← emit Thumb-2 - tcc_ir_free() -``` - -### Global state consumed by gen_function - -These globals must be saved/restored when suspending parent compilation: - -| Global | Type | Purpose | -|--------|------|---------| -| `tcc_state->ir` | `TCCIRState*` | Current IR state (per-function, alloc'd by `tcc_ir_alloc`) | -| `loc` | `int` | Current local stack offset (grows negative) | -| `ind` | `int` | Current code output index in `cur_text_section` | -| `rsym` | `int` | Return symbol jump chain (-1 sentinel) | -| `func_ind` | `int` | Function start index | -| `funcname` | `const char*` | Current function name | -| `func_vt` | `CType` | Function return type | -| `func_var` | `int` | Variadic flag | -| `cur_scope` | `struct scope*` | Current scope (linked list) | -| `root_scope` | `struct scope*` | Root scope of current function | -| `loop_scope` | `struct scope*` | Current loop scope | -| `local_stack` | `Sym*` | Local symbol stack | -| `local_label_stack` | `Sym*` | Local labels | -| `global_label_stack` | `Sym*` | Global label stack (saved per-function) | -| `nocode_wanted` | `int` | Code generation suppression flag | -| `local_scope` | `int` | Local scope depth counter | -| `nb_temp_local_vars` | `int` | Temp local variable count | -| `arr_temp_local_vars` | `struct[8]` | Temp local variable info | -| `cur_text_section` | `Section*` | Current output section | -| `cur_switch` 
| `struct switch_t*` | Current switch (should be NULL at nested func) | - -### Key constraints - -- **One `TCCIRState` per function** — nested function compilation would need to suspend the parent's state -- **No static chain concept** — IR locals are simple FP offsets with no cross-frame access -- **No trampoline infrastructure** — no code exists for generating executable trampolines -- **ARM FP register is R7** (Thumb convention), not R11 — affects static chain register choice -- **Inline functions** already use `skip_or_save_block` + reparse model — we should reuse this pattern - -### ARM calling convention (AAPCS) - -- R0-R3: argument registers -- R7: frame pointer (Thumb) -- R12 (IP): scratch / intra-procedure call -- R10: platform register (available as static chain in GCC) -- LR (R14): link register -- No existing use of R10 as static chain - ---- - -## Architecture Decision: Save-Tokens + Reparse (like inline functions) - -### Why not suspend/resume? - -Suspending the parent's `gen_function()` mid-compilation (saving all globals, allocating a new `TCCIRState`, compiling the nested function, restoring) is fragile: - -- `gen_function()` has deep call stacks: `gen_function → block → block → decl → ???` -- The C stack state (return addresses, local variables in `block()`, `decl()`, etc.) cannot be saved -- Many optimization passes assume they run on a complete function — partial IR state is invalid - -### Why save-tokens + reparse? - -TCC already has a proven model: **inline functions**. When a `static inline` function is encountered, TCC: - -1. Calls `skip_or_save_block(&fn->func_str)` to tokenize the entire body -2. Stores the `TokenString` for later -3. When the function is actually used, replays via `begin_macro(fn->func_str, 1)` + `gen_function()` - -We use the **same pattern** for nested functions: - -1. When we see a nested function definition inside `decl(VT_LOCAL)`, save its body as a `TokenString` -2. 
Record metadata (captured variables, parent scope info) -3. Jump past the body (the parent continues parsing normally) -4. **Before** the parent's `gen_function()` returns (after `block(0)` but before optimizations), compile all nested functions - -### What about VLA-style token caching? - -VLAs also use `skip_or_save_block` for array dimension expressions (`vla_array_tok`). The nested function approach is the same concept at a larger scale — we're caching a complete function body instead of a single expression. - -### Storage: NestedFunc array on TCCIRState - -We store nested function descriptors in an array on the parent's `TCCIRState`, similar to how `inline_fns` are stored on `TCCState`: - -```c -typedef struct NestedFunc { - TokenString *func_str; // saved token stream of body - Sym *sym; // symbol (with mangled name like f1.f2) - CType func_type; // function type - int *captured_offsets; // parent FP offsets of captured vars - int nb_captured; // number of captured vars - int trampoline_needed; // 1 if address-of is taken - char parent_filename[1]; // filename for error reporting -} NestedFunc; -``` - ---- - -## Implementation Plan - -### Phase 1: Parser — Save Nested Function Bodies as Tokens - -**Effort**: 2-3 days -**Files**: `tccgen.c`, `tcc.h`, `tccir.h` - -#### 1.1 Data structures - -```c -// tcc.h additions: - -// Nested function descriptor — stored before compilation -typedef struct NestedFunc { - TokenString *func_str; // saved token stream of function body - Sym *sym; // function symbol in parent's local scope - CType type; // full function type - AttributeDef ad; // function attributes - int v; // token id (function name) - char filename[256]; // source filename for error messages -} NestedFunc; - -// tccir.h additions to TCCIRState: -// NestedFunc *nested_funcs; -// int nb_nested_funcs; -// int has_static_chain; // 1 if this function is itself nested -// int static_chain_vreg; // vreg holding the chain (R10 on entry) -``` - -#### 1.2 Pseudocode: 
Modify `decl(VT_LOCAL)` to save nested function body - -``` -function decl(l): - ...existing type parsing... - - if tok == '{': - if l == VT_LOCAL: - // ── NEW: nested function definition ── - assert (type.t & VT_BTYPE) == VT_FUNC - - // Validate parameters (same as file-scope path) - foreach param in type.ref->next: - if param has no identifier: error("expected identifier") - if param is void: param.type = int_type - - merge_funcattr(&type.ref->f, &ad.f) - - // Create a mangled symbol: "parent.child" - mangled_name = concat(funcname, ".", get_tok_str(v)) - - // Push symbol into LOCAL scope so the parent body can reference it - type.t &= ~VT_EXTERN - sym = sym_push(v, &type, VT_CONST, 0) // VT_CONST: it's a function - put_extern_sym(sym, cur_text_section, 0, 0) // placeholder - - // Save the token stream (reuse inline function pattern) - ir = tcc_state->ir - nf = &ir->nested_funcs[ir->nb_nested_funcs++] - nf->sym = sym - nf->type = type - nf->ad = ad - nf->v = v - strcpy(nf->filename, file->filename) - skip_or_save_block(&nf->func_str) // saves '{' ... '}' - - break // continue parsing parent body - else: - // existing file-scope path - ... -``` - -#### 1.3 Pseudocode: Compile nested functions after parent body - -Insert nested function compilation in `gen_function()`, **after** `block(0)` returns but **before** IR optimization. At this point: -- The parent's `loc` is finalized (all locals allocated) -- Captured variable FP-offsets are known -- The parent's token stream is exhausted (nested body was already skipped) - -``` -function gen_function(sym): - ...existing setup... - - ir = tcc_ir_alloc() - tcc_state->ir = ir - ...existing param processing... - block(0) - tcc_ir_backpatch_to_here(ir, rsym) - - // ── NEW: compile nested functions ── - if ir->nb_nested_funcs > 0: - compile_nested_functions(ir, sym) - - ...existing optimization passes... - ...existing register allocation... - ...existing codegen... 
- tcc_ir_free(ir) - -function compile_nested_functions(parent_ir, parent_sym): - // Save ALL parent global state - saved = { - .ir = tcc_state->ir, - .loc = loc, - .ind = ind, - .rsym = rsym, - .func_ind = func_ind, - .funcname = funcname, - .func_vt = func_vt, - .func_var = func_var, - .cur_scope = cur_scope, - .root_scope = root_scope, - .loop_scope = loop_scope, - .local_stack = local_stack, - .local_label_stack = local_label_stack, - .global_label_stack = global_label_stack, - .nocode_wanted = nocode_wanted, - .local_scope = local_scope, - .nb_temp_local_vars = nb_temp_local_vars, - .cur_text_section = cur_text_section, - .cur_switch = cur_switch, - } - memcpy(saved.arr_temp_local_vars, arr_temp_local_vars, sizeof arr_temp_local_vars) - - // Record parent's finalized stack layout for capture resolution - parent_loc = loc // deepest local offset — all offsets are known - - for each nf in parent_ir->nested_funcs: - // Replay the saved token stream (same as inline function expansion) - tccpp_putfile(nf->filename) - begin_macro(nf->func_str, 1) - next() // prime the first token - - // The nested function compiles into the SAME text section - cur_text_section = saved.cur_text_section - - // gen_function() handles everything: IR alloc, block(), optimize, codegen - gen_function(nf->sym) - - end_macro() - - // Restore ALL parent state - tcc_state->ir = saved.ir - loc = saved.loc - ind = saved.ind - rsym = saved.rsym - func_ind = saved.func_ind - funcname = saved.funcname - func_vt = saved.func_vt - func_var = saved.func_var - cur_scope = saved.cur_scope - root_scope = saved.root_scope - loop_scope = saved.loop_scope - local_stack = saved.local_stack - local_label_stack = saved.local_label_stack - global_label_stack = saved.global_label_stack - nocode_wanted = saved.nocode_wanted - local_scope = saved.local_scope - nb_temp_local_vars = saved.nb_temp_local_vars - cur_text_section = saved.cur_text_section - cur_switch = saved.cur_switch - memcpy(arr_temp_local_vars, 
saved.arr_temp_local_vars, sizeof arr_temp_local_vars) -``` - -#### 1.4 Why after `block(0)` but before optimizations? - -- **After `block(0)`**: All parent locals have been allocated, so we know exact FP offsets for captured variables. The token stream has been fully consumed. -- **Before optimizations**: The parent's IR is complete but not yet optimized. Nested function code goes into the `.text` section at `ind` (which gen_function modifies). After we restore `ind`, the parent's codegen continues where it left off. -- **Note**: `gen_function()` calls `next()` at the end which consumes the closing `}`. Since we use `begin_macro/end_macro` to replay, this is handled correctly — the nested function body is self-contained in the `TokenString`. - -#### 1.5 Symbol visibility during parent body parsing - -After `skip_or_save_block`, the nested function's symbol (`f2`) is on `local_stack`. When the parent body references `f2` (e.g., `f0(f2, &i)`), it resolves via `sym_find()` to a function symbol — just like any other function. No special handling needed for **direct calls**. - -For **address-of** (`&f2` or passing `f2` as function pointer), the symbol resolution produces a function reference. The trampoline logic (Phase 3) intercepts this. - ---- - -### Phase 2: Static Chain — Captured Variable Access - -**Effort**: 3-5 days -**Files**: `tccgen.c`, `tcc.h`, `tccir.h`, `ir/core.c`, `ir/core.h`, `tccls.c`, `arch/armv8m.c` - -#### 2.1 Static chain register: R10 - -Following GCC's ARM convention, use **R10** as the static chain register. When a nested function is called, R10 points to the parent's stack frame (= parent's FP value at the time of the call). 
- -```c -// arm-thumb-defs.h -#define REG_STATIC_CHAIN 10 // R10: static chain for nested functions -``` - -#### 2.2 Architecture config addition - -```c -// arch/armv8m.c — extend ArchitectureConfig -ArchitectureConfig architecture_config = { - .pointer_size = 4, - .stack_align = 8, - .reg_size = 4, - .parameter_registers = 4, - .has_fpu = 0, - .static_chain_reg = 10, // NEW: R10 for nested function static chain -}; -``` - -#### 2.3 Identifying captured variables - -During the reparse of the nested function body (inside `gen_function` called for the nested func), variable lookups that resolve to parent-scope locals need special treatment. - -**Problem**: After `skip_or_save_block` saved the nested function's tokens and we later replay them, `sym_find()` for captured variables must still resolve. But `pop_local_syms(NULL, 0)` in the parent's `gen_function()` hasn't run yet (we compile nested functions before that). So the parent's local symbols are still on `local_stack`. - -**Approach**: We need a way to detect "this symbol is from the parent scope, not our own scope" during nested function compilation. 
- -``` -// Pseudocode for captured variable detection: - -// Before compiling nested function, save the boundary of the parent's local_stack -parent_locals_boundary = local_stack // top of parent's locals - -// During nested function compilation, in sym_find/variable resolution: -function resolve_var_in_nested_func(tok): - sym = sym_find(tok) - if sym == NULL: return NULL - - if sym belongs to parent scope (sym->prev chain crosses parent_locals_boundary): - // This is a captured variable - mark_as_captured(sym) - return create_chain_access(sym) // returns an SValue with chain-relative addressing - else: - return sym // local to nested function, normal access -``` - -**Alternative simpler approach**: Since we know the nested function's own locals are pushed after we enter `gen_function(nf->sym)`, any `VT_LOCAL` symbol that was already on the stack at entry is a parent local: - -``` -// Pseudocode: -// In compile_nested_functions(), before calling gen_function(nf->sym): -parent_local_stack_top = local_stack // save parent's local stack position - -// Inside the nested gen_function, if we resolve a VT_LOCAL sym: -if sym->r & VT_LOCAL && sym is on local_stack && sym was pushed before parent_local_stack_top: - // This is a captured variable access - // sym->c is its FP-relative offset in the parent's frame - // Emit: LOAD/STORE via R10 (static chain) + sym->c -``` - -#### 2.4 Captured variable IR generation - -When we detect a captured variable access inside a nested function, instead of the normal `VT_LOCAL | VT_LVAL` SValue (which means "FP + offset"), we produce an SValue that means "chain_reg + offset": - -``` -// Pseudocode for generating IR for captured variable access: - -function svalue_for_captured_var(sym): - // Option A: New SValue kind — VT_CHAIN_LOCAL - sv.r = VT_CHAIN_LOCAL | VT_LVAL // new flag meaning "relative to static chain reg" - sv.c.i = sym->c // parent FP offset (already known) - sv.type = sym->type - return sv - - // Option B: Reuse VT_LOCAL but 
with a different base register hint - // The IR emitter checks ir->has_static_chain when it sees a VT_LOCAL - // and the sym_scope indicates parent scope → redirect to chain reg -``` - -**Option B is simpler** — it avoids a new SValue kind. We distinguish captured variables by checking if the symbol's scope is outside the current function. - -#### 2.5 IR-level handling of captured variables - -No new IR opcodes needed. Captured variable access becomes: - -``` -// Normal local: LOAD dest, [FP + offset] → FP is implicit base for VT_LOCAL -// Captured local: LOAD dest, [V_chain + offset] → V_chain is a vreg holding R10 - -// In IR generation (tccir.c or tccgen.c), when loading a captured var: -// 1. The static chain vreg is allocated once at function entry -// 2. Captured access: emit TCCIR_OP_LOAD with src1 = chain_vreg, offset = parent_offset -``` - -Pseudocode for chain vreg setup: - -``` -function gen_function_for_nested(sym): - ...standard gen_function() setup... - - if sym is a nested function (ir->has_static_chain): - // Allocate a vreg that holds R10 (static chain) - // This vreg is live for the entire function - ir->static_chain_vreg = tcc_ir_alloc_vreg(ir, IR_TYPE_PTR) - - // Emit IR instruction that says "chain_vreg = R10 on entry" - // This is like a parameter but in R10 instead of R0-R3 - emit TCCIR_OP_ASSIGN chain_vreg <- STATIC_CHAIN_REG -``` - -#### 2.6 Register allocation changes - -``` -// Pseudocode for register allocator changes: - -function tcc_ls_allocate_registers(ls, params, float_params, spill_base): - ...existing setup... 
- - if current function has_static_chain: - // Remove R10 from the allocatable register set - ls->registers_map &= ~(1ULL << 10) - - // The chain vreg must be assigned to R10 - // Mark it with incoming_reg = R10 (similar to how params get R0-R3) - chain_interval = find_interval_for_vreg(ls, ir->static_chain_vreg) - chain_interval->r0 = 10 // pre-assigned to R10 -``` - -#### 2.7 Captured variable marking in parent - -Variables captured by nested functions must be forced to stack (cannot be register-only): - -``` -// Pseudocode: In compile_nested_functions(), after parsing all nested func bodies -// but we actually need this DURING block(0) of the parent... - -// Better approach: During the first parse of the parent body, whenever we -// define a nested function via skip_or_save_block(), we can't yet know which -// parent vars are captured (we haven't parsed the nested body yet!) - -// Solution: Two-pass or lazy capture marking: -// -// OPTION A — Lazy: During nested function gen_function(), when we encounter -// a captured var access, set sym->addrtaken = 1 on the parent's symbol. -// Since the parent's IR is already generated, we need to retroactively fix -// the parent's liveness info to mark these as spilled. -// -// OPTION B — Pre-scan: After skip_or_save_block() saves the nested body tokens, -// do a quick token scan looking for identifier references that match parent locals. -// Mark those as captured immediately. -// -// OPTION C — Reparse approach (simplest, matches our architecture): -// Since nested functions are compiled AFTER the parent's block(0) but BEFORE -// optimization, the parent's IR is complete. 
At this point: -// - Parent locals have known FP offsets (loc is finalized) -// - We compile the nested function which uses these offsets via chain reg -// - The parent never needs to "know" about captures — the nested function -// accesses parent memory through R10, which is transparent to the parent -// -// Wait — there IS a problem: if the parent's register allocator puts a -// "captured" variable in a register only and never spills it, the nested -// function's R10-relative access would read stale stack memory. -// -// SOLUTION: Mark variables as addrtaken in the parent's IR generation. -// During block(0), when we encounter a nested function that MIGHT capture -// parent vars, conservatively mark ALL parent locals as addrtaken. -// Or better: do a token pre-scan of the saved body to find which vars are used. - -function prescan_captured_vars(nf, parent_local_stack): - // Walk the saved TokenString looking for identifiers - // that match parent local variable names. - // Mark matching parent syms as addrtaken (forces stack spill). - - tokens = tok_str_buf(nf->func_str) - pos = 0 - while tokens[pos] != TOK_EOF: - t = tokens[pos] - if t >= TOK_IDENT: - sym = lookup in parent_local_stack for token t - if sym != NULL && sym->r & VT_LOCAL: - sym->type.t |= VT_ADDRTAKEN // force to stack - // Record in nf->captured_offsets for later - nf->captured_offsets[nf->nb_captured++] = sym->c // FP offset - pos = advance past token + associated data - - // This runs during decl(VT_LOCAL) right after skip_or_save_block, - // BEFORE the parent's block(0) continues parsing. So the addrtaken - // flag is set BEFORE the parent's IR generation decisions. -``` - -**Critical insight**: The pre-scan must happen at parse time (during `decl(VT_LOCAL)`) before the parent's `block(0)` generates IR for variables that might be captured. Otherwise the parent's IR could put them in registers. 
- -#### 2.8 Direct call convention for nested functions - -When the parent calls a nested function directly (not via function pointer): - -``` -// Parent's IR for: f2(arg) -// 1. Load R10 = current FP (R7) -// MOV R10, R7 — or emit IR: ASSIGN R10 <- FP -// 2. Normal call: BL f1.f2 - -// Pseudocode in tccgen.c gfunc_call path: -function gen_call(func_sym, args): - if func_sym is a nested function: - // Set up static chain before call - emit IR: STORE R10, current_FP (or MOV R10, R7) - // Then proceed with normal call - emit IR: FUNCCALLVAL func_sym, args... -``` - -The IR can represent this as a regular `FUNCCALLVAL` where the call site metadata records "needs chain setup". Or emit a new `TCCIR_OP_SET_CHAIN` instruction before the call. - ---- - -### Phase 3: Trampoline Generation (Address-of Nested Function) - -**Effort**: 5-7 days -**Files**: `tccgen.c`, `arm-thumb-gen.c`, `arm-thumb-opcodes.c`, `tccelf.c` - -This is the most complex phase. Required when a nested function's address is taken (e.g., `f0(f2, &i)` where `f2` is passed as a function pointer). - -#### 3.1 Why not executable stack trampolines? - -GCC's approach generates small code snippets on the stack. Ruled out for ARMv8-M: the stack is non-executable when MPU is enabled. 
- -#### 3.2 Chosen approach: Static trampoline in `.text` + writable chain slot in `.data` - -Each nested function whose address is taken gets a trampoline: - -```asm -; In .text — trampoline for f1.f2: -; Thumb-2 encoding, 4 instructions + 2 data words = 16+8 = 24 bytes -__tramp_f1__f2: - LDR r10, [pc, #8] ; r10 = *(PC+8) = chain_slot address - LDR r10, [r10] ; r10 = *chain_slot = parent FP value - LDR pc, [pc, #4] ; pc = *(PC+4) = f1__f2 address (tail call) - NOP ; alignment padding (Thumb-2) -.Ltramp_f1__f2_func: - .word f1__f2 ; R_ARM_ABS32 relocation to lifted function -.Ltramp_f1__f2_chain_ptr: - .word __chain_slot_f1__f2 ; R_ARM_ABS32 reloc to .data slot - -; In .data — writable slot: -__chain_slot_f1__f2: - .word 0 ; parent writes FP here at runtime -``` - -When the parent takes the address of the nested function: - -``` -// Pseudocode for generating IR when &f2 is referenced as a value: - -function gen_addr_of_nested_func(nested_sym): - // 1. Write current FP to the chain slot - // STR R7, [chain_slot_addr] - emit IR: chain_slot_addr <- SYMBOL(__chain_slot_f1__f2) - emit IR: STORE [chain_slot_addr], FP - - // 2. 
Return the trampoline address as the "function pointer" - // The caller will call __tramp_f1__f2 thinking it's a normal function - emit IR: result <- SYMBOL(__tramp_f1__f2) - return result -``` - -**Pseudocode for trampoline emission** (during the nested function's `gen_function` or a post-pass): - -``` -function emit_trampoline(nested_sym, parent_ir): - // Save current output position - saved_ind = ind - - // Emit Thumb-2 trampoline code: - // All offsets relative to PC which is 4 bytes ahead in Thumb mode - - // LDR r10, [pc, #8] — Thumb-2 T3 encoding - emit_thumb32(0xF8DF, 0xA008) // LDR.W r10, [pc, #8] - - // LDR r10, [r10, #0] — dereference the chain slot pointer - emit_thumb32(0xF8DA, 0xA000) // LDR.W r10, [r10, #0] - - // LDR pc, [pc, #4] — jump to the actual function - emit_thumb32(0xF8DF, 0xF004) // LDR.W pc, [pc, #4] - - // NOP for alignment - emit_thumb16(0xBF00) // NOP - - // Data words (with relocations): - emit_word_with_reloc(nested_sym) // R_ARM_ABS32 → f1__f2 - emit_word_with_reloc(chain_slot_sym) // R_ARM_ABS32 → chain slot in .data - - // Create the chain slot in .data section - chain_slot_sym = create_data_slot(".data", 4) // 4-byte writable slot - - // Register trampoline symbol - trampoline_sym = put_extern_sym_2(...) - - // Store trampoline info so parent can reference it - nested_sym->trampoline_sym = trampoline_sym - nested_sym->chain_slot_sym = chain_slot_sym -``` - -#### 3.3 Re-entrancy limitation - -This approach is **NOT re-entrant**: if the parent function recurses, each recursive invocation writes the same `.data` chain slot. The last writer wins, corrupting earlier invocations' nested function pointers. - -**Acceptable for now**: Most GCC torture tests don't combine recursion + nested function pointers. Document the limitation. 
- -**Future fix**: Stack-allocated trampoline descriptors (Phase 3b, deferred): -- Allocate a `{func_addr, chain_value}` pair on the parent's stack -- Trampoline code in `.text` reads from a descriptor whose address is passed via R12 (IP) -- Requires an `alloca`-like mechanism or reserving stack space statically - -#### 3.4 Detecting when address-of is needed - -In `tccgen.c`, when a nested function symbol is used in a non-call context (i.e., its address is taken): - -``` -// Pseudocode in expression evaluation: - -function handle_symbol_reference(sym): - if sym is a nested function: - if context is a direct function call (immediately followed by '('): - // Direct call — no trampoline needed, just set up R10 - gen_call_nested_direct(sym, args) - else: - // Address taken — need trampoline - sym->nested_addr_taken = 1 - gen_addr_of_nested_func(sym) -``` - -The `trampoline_needed` flag on the `NestedFunc` descriptor must be checked after the parent's `block(0)` to decide whether to emit a trampoline. - ---- - -### Phase 4: IR Integration & Optimization Safety - -**Effort**: 3-4 days -**Files**: `ir/core.c`, `ir/core.h`, `ir/codegen.c`, `ir/live.c`, `tccir.h` - -#### 4.1 New fields on TCCIRState - -```c -// tccir.h additions to TCCIRState: -typedef struct NestedFunc NestedFunc; // forward decl - -struct TCCIRState { - ...existing fields... 
- - // Nested function support - NestedFunc *nested_funcs; // array of nested function descriptors - int nb_nested_funcs; // count - int nested_funcs_capacity; // allocated capacity - - uint8_t has_static_chain; // 1 if this function is itself nested - int static_chain_vreg; // vreg holding R10 (chain pointer) - int parent_loc; // parent's `loc` value (for offset validation) -}; -``` - -#### 4.2 Chain vreg as a parameter-like entity - -The static chain register (R10) is modeled as a special parameter: - -``` -// Pseudocode for chain vreg initialization during nested gen_function: - -function gen_function_nested_setup(ir): - if not ir->has_static_chain: return - - // Allocate a vreg for the chain. It behaves like parameter but in R10. - chain_vreg = tcc_ir_alloc_local_vreg(ir) - ir->static_chain_vreg = chain_vreg - - // Mark in liveness: chain_vreg is live-in at instruction 0 - // Its live range spans the entire function (conservative) - interval = find_or_create_interval(chain_vreg) - interval->start = 0 - interval->end = ir->next_instruction_index // updated at end - interval->incoming_reg = REG_STATIC_CHAIN // R10 - interval->addrtaken = 0 // it's a pointer, not an addressed var -``` - -#### 4.3 Optimization safety for captured variable accesses - -Captured variable loads/stores go through the chain pointer (an indirection through R10). These must not be eliminated by: - -- **Store-load forwarding**: Chain loads are through a different base register — the optimizer already treats different bases as distinct memory locations (no issue if using indexed LOAD/STORE with chain_vreg as base) -- **Dead store elimination**: A store through the chain modifies the parent's frame — it's externally visible. Mark chain stores as having side effects. 
-- **Constant propagation**: Cannot propagate through chain loads (the parent's memory could change between calls if the parent resumes) -- **CSE**: Chain loads from the same offset CAN be CSE'd within a basic block (the parent frame doesn't change while the nested function runs) - -``` -// Pseudocode: Mark chain-relative operations appropriately - -function emit_chain_load(ir, dest_vreg, parent_offset): - // Use regular LOAD but with chain_vreg as base - src_op = make_operand_vreg_plus_offset(ir->static_chain_vreg, parent_offset) - dest_op = make_operand_vreg(dest_vreg) - tcc_ir_put_op(ir, TCCIR_OP_LOAD, src_op, NONE, dest_op) - // No special flags needed — the load uses a non-FP base register, - // so the optimizer already treats it as a memory access, not a stack local - -function emit_chain_store(ir, parent_offset, src_vreg): - dest_op = make_operand_vreg_plus_offset(ir->static_chain_vreg, parent_offset) - src_op = make_operand_vreg(src_vreg) - tcc_ir_put_op(ir, TCCIR_OP_STORE, src_op, NONE, dest_op) - // Store through chain — the optimizer must not eliminate this - // Since the base is a vreg (not FP), existing conservative rules apply -``` - -#### 4.4 Parent IR: chain setup before direct calls - -When the parent calls a nested function directly, it must pass its FP in R10: - -``` -// Pseudocode for parent's call to nested function: - -function gen_call_to_nested_func(ir, nested_sym, args): - // Before the call, set R10 = current FP - // This is modeled as: MOV R10, R7 - // In IR terms: allocate temp vreg, emit FP read, then a "call annotation" - - // Option A: Emit explicit ASSIGN from FP to a vreg assigned to R10 - tmp = alloc_temp_vreg() - emit TCCIR_OP_ASSIGN tmp <- FP_OPERAND - // The call instruction metadata records: R10 must hold `tmp` at call time - emit TCCIR_OP_FUNCCALLVAL nested_sym, args, chain_vreg=tmp - - // Option B: Add a pre-call setup instruction - emit TCCIR_OP_SET_CHAIN (implicit: R10 <- FP) - emit TCCIR_OP_FUNCCALLVAL nested_sym, args - - 
// Option B is simpler and avoids complex register constraints at call sites -``` - ---- - -### Phase 5: ARM Code Generation - -**Effort**: 3-5 days -**Files**: `arm-thumb-gen.c`, `arm-thumb-opcodes.c`, `arm-thumb-opcodes.h`, `ir/codegen.c` - -#### 5.1 Nested function prologue/epilogue - -``` -// Pseudocode for modified prologue generation: - -function gen_func_prologue(ir): - push_mask = compute_callee_saved_registers(ir) - - if ir->has_static_chain: - // R10 must be saved (it's callee-saved anyway on ARM) - push_mask |= (1 << 10) - // R10 arrives pre-loaded with chain value - // No additional setup needed — the chain vreg IS R10 - - emit PUSH {push_mask} - if need_frame_pointer: - emit MOV R7, SP - emit SUB SP, SP, #frame_size - -function gen_func_epilogue(ir): - // Standard epilogue — R10 restored from push - emit ADD SP, SP, #frame_size - emit POP {push_mask | (1 << PC)} // or MOV PC, LR for leaf -``` - -#### 5.2 Chain-relative load/store codegen - -``` -// Pseudocode for lowering chain LOAD/STORE to Thumb-2: - -function codegen_load_via_chain(ir, instruction): - // Instruction: LOAD dest <- [chain_vreg + offset] - // chain_vreg has been assigned to R10 by register allocator - - base_reg = get_physical_reg(instruction.src1) // should be R10 - offset = instruction.offset - dest_reg = get_physical_reg(instruction.dest) - - if offset fits in Thumb-2 LDR immediate (0..4095): - emit LDR.W dest_reg, [base_reg, #offset] - else: - // Large offset — materialize in scratch - scratch = get_scratch_register() - emit_movw_movt(scratch, offset) - emit LDR dest_reg, [base_reg, scratch] - -function codegen_store_via_chain(ir, instruction): - base_reg = get_physical_reg(instruction.dest_addr) // R10 - offset = instruction.offset - src_reg = get_physical_reg(instruction.src1) - - if offset fits in Thumb-2 STR immediate: - emit STR.W src_reg, [base_reg, #offset] - else: - scratch = get_scratch_register() - emit_movw_movt(scratch, offset) - emit STR src_reg, [base_reg, scratch] 
-``` - -#### 5.3 `SET_CHAIN` instruction codegen (for parent calling nested func) - -``` -// Pseudocode for SET_CHAIN instruction lowering: - -function codegen_set_chain(ir, instruction): - // Emit: MOV R10, R7 (copy frame pointer to static chain register) - // This is a Thumb-2 MOV register instruction - emit_thumb16_mov(10, 7) // MOV R10, R7 -``` - -#### 5.4 Trampoline code emission - -``` -// Pseudocode for emitting trampoline after nested function is compiled: - -function emit_trampoline_code(nested_sym, chain_slot_sym): - // Emit into .text section, after the nested function's code - - // First, create the trampoline function symbol - tramp_name = concat("__tramp_", nested_sym->name) - tramp_start = ind - - // Thumb-2: LDR R10, [PC, #8] — load address of chain slot - // PC at this point = tramp_start + 4 (Thumb pipeline) - // We want data at tramp_start + 16 (after 4 instructions × 4 bytes) - // Offset = 16 - 4 = 12... but actual Thumb-2 LDR literal encoding - // matters. Use proper opcode builder: - arm_thumb_ldr_literal_w(R10, chain_ptr_offset) - - // Thumb-2: LDR R10, [R10, #0] — dereference: r10 = *chain_slot - arm_thumb_ldr_imm_w(R10, R10, 0) - - // Thumb-2: LDR PC, [PC, #offset] — jump to nested function - // This loads the function address from the literal pool entry below - arm_thumb_ldr_literal_w(PC, func_addr_offset) - - // Padding NOP if needed for alignment - arm_thumb_nop() - - // Data: function address (with R_ARM_ABS32 relocation) - emit_word(0) - add_relocation(R_ARM_ABS32, nested_sym, ind - 4) - - // Data: chain slot address (with R_ARM_ABS32 relocation) - emit_word(0) - add_relocation(R_ARM_ABS32, chain_slot_sym, ind - 4) - - // Create & register trampoline symbol - put_extern_sym_2(tramp_sym, cur_text_section, tramp_start + 1, ind - tramp_start, 0) - // +1 for Thumb bit - - // Store on nested func descriptor for the parent to reference - nested_sym->trampoline_sym_index = tramp_sym->c -``` - -#### 5.5 Chain slot creation in `.data` - -``` 
-// Pseudocode: - -function create_chain_slot(nested_sym): - // Allocate 4 bytes in .data section - data_sec = tcc_state->data_section // or bss_section - offset = section_add(data_sec, 4, 4) // 4 bytes, 4-byte aligned - - // Create a symbol for it - chain_slot_name = concat("__chain_", nested_sym->name) - chain_slot_sym = put_elf_sym(...) - - // Initialize to 0 - write_word_at(data_sec, offset, 0) - - return chain_slot_sym -``` - ---- - -### Phase 6: Linker Support - -**Effort**: 1-2 days -**Files**: `arm-link.c`, `tccelf.c` - -#### 6.1 Relocations - -The trampoline uses standard `R_ARM_ABS32` relocations for both the function address and chain slot address data words. No new relocation types needed. - -``` -// Pseudocode: Relocation handling (should work with existing code) - -// In arm-link.c, relocate_section(): -// R_ARM_ABS32 cases already handle: -// *(uint32_t*)ptr += sym_addr -// This covers both: -// .word f1__f2 → resolved to f1__f2's .text address (with +1 Thumb bit) -// .word __chain_f1__f2 → resolved to chain slot's .data address -``` - -#### 6.2 Symbol visibility - -Nested function symbols (`f1.f2` or `f1__f2`) should be `STB_LOCAL` in ELF — they are not externally visible: - -``` -// Pseudocode: - -function create_nested_func_symbol(mangled_name, type): - sym = external_sym(mangled_name_token, type, 0, &ad) - // Force local binding — nested functions are not exported - ELF32_ST_INFO(elfsym(sym)) = ELF32_ST_INFO(STB_LOCAL, STT_FUNC) - return sym -``` - -Trampoline symbols (`__tramp_f1__f2`) and chain slot symbols (`__chain_f1__f2`) are also `STB_LOCAL`. 
- ---- - -### Phase 7: Testing & Validation - -**Effort**: 3-5 days -**Files**: `tests/ir_tests/`, `tests/gcctestsuite/conftest.py` - -#### 7.1 Incremental test plan - -| Test | Phase Required | What it validates | -|------|----------------|-------------------| -| `nested_basic.c` | 1 | Nested function def + direct call, no capture | -| `nested_capture_read.c` | 1+2 | Nested function reads parent variable via chain | -| `nested_capture_write.c` | 1+2 | Nested function writes parent variable via chain | -| `nested_direct_call_args.c` | 1+2 | Passing arguments + capturing parent vars | -| `nested_funcptr.c` | 1+2+3 | Address of nested function → trampoline | -| `nested_funcptr_indirect.c` | 1+2+3 | Nested func passed through another function (20000822-1 pattern) | -| `nested_multi_level.c` | 1+2 | Double-nested: f → g → h with capture | -| `nested_recursive_parent.c` | 1+2+3 | Recursive parent + nested function call | -| `20000822-1.c` | 1+2+3 | The original GCC torture test | - -#### 7.2 Test: `nested_basic.c` (Phase 1 validation) - -```c -// No capture, just direct call -int main() { - int add1(int x) { return x + 1; } - if (add1(41) != 42) abort(); - return 0; -} -``` - -Expected IR for `main`: -- Defines symbol `main.add1` -- `BL main.add1` with R10 = R7 (chain, unused by add1) - -Expected IR for `main.add1`: -- Normal function, just happens to be nested -- No chain access, `has_static_chain = 0` (or 1 but unused) - -#### 7.3 Test: `nested_capture_write.c` (Phase 2 validation) - -```c -int main() { - int x = 10; - void set_x(int val) { x = val; } - set_x(42); - if (x != 42) abort(); - return 0; -} -``` - -Expected IR for `main.set_x`: -- `has_static_chain = 1` -- Loads chain pointer from R10 -- Stores `val` to `[R10 + offset_of_x]` - -#### 7.4 GCC torture test integration - -``` -// Pseudocode for conftest.py update: - -// Remove skip entries for these 14 tests: -// 20000822-1.c, 920428-2.c, 920501-7.c, 920612-2.c, 921017-1.c, -// 921215-1.c, 931002-1.c, 
comp-goto-2.c, nestfunc-1.c, nestfunc-2.c, -// nestfunc-3.c, nestfunc-5.c, nestfunc-6.c, pr24135.c -// -// Keep comp-goto-2.c, nestfunc-5.c, nestfunc-6.c, pr24135.c skipped -// initially — they require computed goto / nonlocal goto extensions -``` - ---- - -## Dependency Graph - -``` -Phase 1 ──→ Parser: save nested func body as TokenString - │ + compile after parent's block(0) - │ -Phase 2 ──→ Static chain: R10 convention, captured var access - │ via pre-scan + chain vreg - │ -Phase 3 ──→ Trampolines: .text code + .data chain slot - │ for address-of nested function - │ -Phase 4 ──→ IR: chain vreg management, optimization safety - │ -Phase 5 ──→ ARM codegen: prologue R10 save, chain load/store, - │ trampoline emission, SET_CHAIN lowering - │ -Phase 6 ──→ Linker: R_ARM_ABS32 relocs (mostly existing) - │ -Phase 7 ──→ Testing: incremental + 14 GCC torture tests -``` - -In practice, Phases 1-5 are interleaved: you can't test Phase 1 without at least stub codegen (Phase 5), and Phase 2 needs IR support (Phase 4). The recommended implementation order: - -1. **Phase 1 + Phase 4 (core) + Phase 5 (stub)**: Get `nested_basic.c` working (no capture) -2. **Phase 2 + Phase 4 (capture) + Phase 5 (chain codegen)**: Get `nested_capture_*.c` working -3. **Phase 3 + Phase 5 (trampoline) + Phase 6**: Get `20000822-1.c` working -4. **Phase 7**: Run full GCC torture suite - ---- - -## Estimated Total Effort - -| Phase | Effort | Cumulative | -|-------|--------|------------| -| 1: Parser (save + reparse) | 2-3 days | 3 days | -| 2: Static chain + capture | 3-5 days | 8 days | -| 3: Trampolines | 5-7 days | 15 days | -| 4: IR integration | 3-4 days | 19 days | -| 5: ARM codegen | 3-5 days | 24 days | -| 6: Linker | 1-2 days | 26 days | -| 7: Testing | 3-5 days | 31 days | - -**Total: ~4-5 weeks** for full nested function support with trampolines. -**Milestone 1 (~1 week)**: Direct nested function calls, no capture (`nested_basic.c`). 
-**Milestone 2 (~2 weeks)**: Capture support (`nested_capture_*.c`). -**Milestone 3 (~3.5 weeks)**: Full trampoline support, `20000822-1.c` passes. -**Milestone 4 (~4.5 weeks)**: All applicable GCC torture tests passing. - ---- - -## Risks & Open Questions - -1. **Re-entrancy**: Static `.text` trampolines with `.data` chain slots are not re-entrant for recursive parent functions. Is this acceptable, or do we need `alloca`-based descriptors? (Acceptable for now — document limitation.) - -2. **`gen_function()` calls `next()` at the end**: The reparse model via `begin_macro`/`end_macro` must correctly handle this. Verify that the token stream terminates cleanly after the `}` of the nested function body. - -3. **Symbol mangling**: Names like `f1.f2` may conflict with C identifiers. Use `f1__nested__f2` or an internal-only token ID to avoid collisions. - -4. **Nested-inside-nested**: Multi-level nesting (f → g → h) requires chasing chain pointers: `h` accesses `g`'s frame via its chain, and `g`'s chain to reach `f`. Each level adds one indirection. The chain vreg in `h` points to `g`'s frame, which contains `g`'s chain vreg pointing to `f`'s frame. Needs chain-of-chains support. - -5. **Inline functions**: If a nested function is defined inside an inline function, the token-save method works naturally (inline expansion replays the outer tokens, which include the nested function save logic). But trampoline symbols need unique names per instantiation. - -6. **`__label__` / nonlocal goto**: Tests `nestfunc-5.c`, `nestfunc-6.c`, and `pr24135.c` use nonlocal goto from nested functions. This requires stack unwinding support. Defer to a future phase. - -7. **Optimization interaction**: Chain loads/stores must not be eliminated by store-load forwarding or dead store elimination. Since they use a non-FP base register (chain vreg → R10), existing conservative rules should suffice. Verify with test cases. - -8. **Thread safety**: Static `.data` chain slots are not thread-safe. 
Acceptable for single-threaded embedded targets (Cortex-M33). - -9. **Token pre-scan accuracy**: The `prescan_captured_vars` function does a shallow token scan — it cannot resolve scoping correctly (e.g., if the nested function declares a local with the same name as a parent variable, the pre-scan would over-mark). Conservative over-marking is safe (forces unnecessary stack spills) but suboptimal. Could refine later with a proper scope-aware scan. diff --git a/README b/README index 809dd8d4..4972e52c 100644 --- a/README +++ b/README @@ -1,96 +1,217 @@ -Tiny C Compiler - C Scripting Everywhere - The Smallest ANSI C compiler ------------------------------------------------------------------------ - -Features: --------- - -- SMALL! You can compile and execute C code everywhere, for example on - rescue disks. - -- FAST! tcc generates optimized x86 code. No byte code - overhead. Compile, assemble and link about 7 times faster than 'gcc - -O0'. - -- UNLIMITED! Any C dynamic library can be used directly. TCC is - heading toward full ISOC99 compliance. TCC can of course compile - itself. - -- SAFE! tcc includes an optional memory and bound checker. Bound - checked code can be mixed freely with standard code. - -- Compile and execute C source directly. No linking or assembly - necessary. Full C preprocessor included. - -- C script supported : just add '#!/usr/local/bin/tcc -run' at the first - line of your C source, and execute it directly from the command - line. - -Documentation: -------------- - -1) Installation on a i386/x86_64/arm/aarch64/riscv64 - Linux/macOS/FreeBSD/NetBSD/OpenBSD hosts. - - ./configure - make - make test - make install - - Notes: For FreeBSD, NetBSD and OpenBSD, gmake should be used instead of make. - For Windows read tcc-win32.txt. - -makeinfo must be installed to compile the doc. By default, tcc is -installed in /usr/local/bin. ./configure --help shows configuration -options. - - -2) Introduction - -We assume here that you know ANSI C. 
Look at the example ex1.c to know -what the programs look like. - -The include file <tcclib.h> can be used if you want a small basic libc -include support (especially useful for floppy disks). Of course, you -can also use standard headers, although they are slower to compile. +TinyCC for ARMv8-M — Tiny C Compiler fork for ARMv8-M (Cortex-M33, Cortex-M23) +================================================================================= + +This is a fork of the Tiny C Compiler (TCC) by Fabrice Bellard, modified for +**ARMv8-M architecture** with a custom IR, register allocator, and Thumb-2 +code generator. + +Differences from Original TinyCC +-------------------------------- + +**1. Target Architecture** +Original TCC targets x86/x86_64/aarch64/riscv64 on desktop/server OSes. +This fork targets **ARMv8-M** microcontrollers (Cortex-M33, Cortex-M23, etc.) +with the Thumb-2 instruction set. + +**2. Custom IR (Intermediate Representation)** +The original TCC uses a simple, direct translation to machine code. This fork +introduces a **three-address code IR** with explicit register operands, enabling +separate front-end and back-end. Key files: `tccir.c`, `tccir.h`, +`tccir_operand.h`. + +**3. Register Allocation** +This fork includes a **two-phase linear scan register allocator** (`tccls.c`) +that performs liveness analysis and assigns physical registers. The original TCC +uses a simpler approach without liveness analysis. + +**4. Code Generation** +Instead of x86 code generation, this fork generates **Thumb-2 machine code** +via `arm-thumb-gen.c` and `arm-thumb-opcodes.c`. It supports the ARMv8-M +instruction set including DSP extensions. + +**5.
Floating Point Options** +Multiple FP back-ends are supported: +- Software FP (pure C) — **currently the only working option** +- VFPv4-sp (single-precision, Cortex-M4F) — infrastructure in place, not yet functional +- VFPv5-dp (double-precision, Cortex-M7) — infrastructure in place, not yet functional +- RP2350 DCP (double coprocessor) — infrastructure in place, not yet functional + +Hardware FP support is not yet implemented; only soft-float can be used. + +**6. Library Mode** +Can be used as `libtcc.a` for **JIT compilation** in host applications. + +**7. ARM-Specific Features** +- ARM Procedure Call Standard (AAPCS) support +- ARMv8-M EABI helper functions +- ARM assembler parser for inline assembly +- ARM-specific ELF linking (`arm-link.c`) + +**8. Runtime Library** +Includes a custom runtime library (`libtcc1`) with ARM EABI helpers in +`lib/armeabi.c` and `lib/armv8m_eabi.c`. + +Project Structure +----------------- + +``` +. +├── Core Compiler Sources +│ ├── tcc.c # Main driver/CLI entry point +│ ├── tccpp.c # C preprocessor +│ ├── tccgen.c # C parser and type system +│ ├── tccir.c # IR generator +│ ├── tccir.h # IR definitions and opcodes +│ ├── tccir_operand.c # IR operand handling +│ ├── tccir_operand.h # IR operand definitions +│ ├── tccls.c # Liveness analysis and register allocation +│ ├── tccld.c # Linker +│ ├── tccelf.c # ELF file format support +│ ├── tccasm.c # Inline assembler +│ ├── tccdbg.c # Debug info generation +│ ├── tccdebug.c # Debug utilities +│ ├── libtcc.c # Library API implementation +│ └── tccyaff.c # YAFF support +│ +├── ARM-Specific Sources +│ ├── arm-thumb-gen.c # Thumb-2 code generator +│ ├── arm-thumb-opcodes.c# Thumb-2 opcode builders +│ ├── arm-thumb-opcodes.h# Thumb-2 instruction definitions +│ ├── arm-thumb-asm.c # ARM assembler parser +│ ├── arm-thumb-callsite.c# Call site handling +│ ├── arm-thumb-defs.h # ARM-specific definitions +│ ├── arm-link.c # ARM linker support +│ ├── arch/armv8m.c # ARMv8-M architecture config +│ 
└── arch/arm_aapcs.c # ARM Procedure Call Standard +│ +├── Libraries +│ ├── lib/ # Runtime library sources +│ │ ├── libtcc1.c # Core runtime functions +│ │ ├── armeabi.c # ARM EABI helpers +│ │ ├── armv8m_eabi.c # ARMv8-M EABI specific +│ │ └── fp/ # Floating point libraries +│ ├── include/ # System headers +│ +├── Tests +│ ├── tests/ir_tests/ # IR-level tests (pytest) +│ ├── tests/thumb/armv8m/# Thumb-2 instruction tests +│ ├── tests/tests2/ # C language compliance tests +│ ├── tests/pp/ # Preprocessor tests +│ └── tests/benchmarks/ # Performance benchmarks +│ +├── Build System +│ ├── configure # Configuration script +│ ├── Makefile # Main build rules +│ └── config.h # Generated configuration +│ +└── Documentation + ├── tcc-doc.texi # Texinfo documentation source + ├── LAZY_SECTION_LOADING.md + └── asm_port.md +``` + +Build +----- + +```bash +# Configure +./configure [options] +``` + +`./configure` accepts the following options: + +| Flag | Description | +|------|-------------| +| `--enable-O2` | Build an optimized TCC compiler | +| `--enable-cross` | Build the ARMv8-M cross compiler (`armv8m-tcc`) | +| `--debug` | Enable IR debug output (`-dump-ir`) and compile TCC with debug symbols | + +Example: +```bash +./configure --enable-cross --enable-O2 --debug +make cross +``` + +Run `./configure --help` for more options. + +```bash +# Build ARMv8-M cross compiler +make cross + +# Build everything including FP libraries +make cross fp-libs + +# Run tests +make test -j16 +``` + +Docker +------ + +A Dockerfile provides a reproducible build environment: + +```bash +# Build container +make container-build + +# Run tests inside container +docker run --rm -v $(pwd):/workspace tinycc-armv8m bash -c "\ + virtualenv .venv && \ + source .venv/bin/activate && \ + make test -j$(nproc)" +``` + +Testing +------- -You can begin your C script with '#!/usr/local/bin/tcc -run' on the first -line and set its execute bits (chmod a+x your_script). 
Then, you can -launch the C code as a shell or perl script :-) The command line -arguments are put in 'argc' and 'argv' of the main functions, as in -ANSI C. +```bash +# Initialize GCC testsuite submodule (one-time) +git submodule update --init --depth 1 tests/gcctestsuite/gcc-testsuite -3) Examples +# Run IR tests +make test -j16 -ex1.c: simplest example (hello world). Can also be launched directly -as a script: './ex1.c'. +# GCC torture tests +make test-all +``` -ex2.c: more complicated example: find a number with the four -operations given a list of numbers (benchmark). +Quick Test Runner +----------------- -ex3.c: compute fibonacci numbers (benchmark). +```bash +cd tests/ir_tests -ex4.c: more complicated: X11 program. Very complicated test in fact -because standard headers are being used ! As for ex1.c, can also be launched -directly as a script: './ex4.c'. +# Compile and run a single file +python run.py -c mytest.c -ex5.c: 'hello world' with standard glibc headers. +# With optimization +python run.py -c mytest.c --cflags="-O1" -tcc.c: TCC can of course compile itself. Used to check the code -generator. +# Dump IR +python run.py -c mytest.c --cflags="-O1" --dump-ir +``` -tcctest.c: auto test for TCC which tests many subtle possible bugs. Used -when doing 'make test'. +Debugging +--------- -4) Full Documentation +```bash +# Show IR output +./armv8m-tcc -dump-ir -c test.c -Please read tcc-doc.html to have all the features of TCC. +# Verbose output +./armv8m-tcc -vv -c test.c +``` -Additional information is available for the Windows port in tcc-win32.txt. +Enable debug logging at build time: +```bash +make CFLAGS+='-DTCC_LS_DEBUG' # Register allocator debug +make CFLAGS+='-DCONFIG_TCC_DEBUG' # IR dump support +``` -License: +License ------- -TCC is distributed under the GNU Lesser General Public License (see -COPYING file). +TCC is distributed under the GNU Lesser General Public License (LGPL). +See the COPYING file for details. -Fabrice Bellard. 
+This fork is maintained for ARMv8-M embedded development. diff --git a/arch/Makefile b/arch/Makefile new file mode 100644 index 00000000..515afa01 --- /dev/null +++ b/arch/Makefile @@ -0,0 +1,21 @@ +# Architecture build dispatcher +# +# Called from top-level Makefile: +# make -C arch ARCH=arm BUILD_DIR=... CC=... CFLAGS=... DEFINES=... +# +# To add a new architecture: +# 1. Create arch/<name>/ with sources and a Makefile +# 2. Add <name>_FILES and ARCH_OBJS_<name> to the top-level Makefile + +ARCH ?= arm +TOP ?= .. +BUILD_DIR ?= . + +all: + $(MAKE) -C $(ARCH) TOP=$(TOP) BUILD_DIR=$(BUILD_DIR) \ + CC="$(CC)" AR="$(AR)" CFLAGS="$(CFLAGS)" DEFINES="$(DEFINES)" + +clean: + $(MAKE) -C $(ARCH) clean BUILD_DIR=$(BUILD_DIR) + +.PHONY: all clean diff --git a/arch/arm/Makefile b/arch/arm/Makefile new file mode 100644 index 00000000..b9ea7338 --- /dev/null +++ b/arch/arm/Makefile @@ -0,0 +1,51 @@ +# ARM Architecture build +# +# Called from arch/Makefile: +# make -C arm TOP=... BUILD_DIR=... CC=... CFLAGS=... DEFINES=... +# +# Builds architecture-level objects (arm.c, arm_aapcs.c, arm_regalloc.c, ssa_opt_arm.c), +# dispatches to ISA subdirectories (thumb/), and bundles +# everything into $(BUILD_DIR)/libarm.a. + +TOP ?= ../.. +BUILD_DIR ?= .
+CC ?= gcc +AR ?= ar +CFLAGS ?= +DEFINES ?= + +SRCS = arm.c arm_aapcs.c arm_regalloc.c ssa_opt_arm.c +OBJS = $(addprefix $(BUILD_DIR)/, $(SRCS:.c=.o)) + +ISA_DIRS = thumb +ISA_LIBS = $(foreach d,$(ISA_DIRS),$(BUILD_DIR)/lib$(d).a) + +LIB = $(BUILD_DIR)/libarm.a + +all: $(LIB) + +$(LIB): $(OBJS) $(ISA_LIBS) + printf 'create $@\n' > $(BUILD_DIR)/_libarm.mri + @for o in $(OBJS); do printf 'addmod %s\n' $$o >> $(BUILD_DIR)/_libarm.mri; done + @for l in $(ISA_LIBS); do printf 'addlib %s\n' $$l >> $(BUILD_DIR)/_libarm.mri; done + @printf 'save\nend\n' >> $(BUILD_DIR)/_libarm.mri + $(AR) -M < $(BUILD_DIR)/_libarm.mri + @rm -f $(BUILD_DIR)/_libarm.mri + +$(BUILD_DIR)/lib%.a: FORCE + $(MAKE) --no-print-directory -C $* \ + TOP=$(TOP) BUILD_DIR=$(BUILD_DIR) \ + CC="$(CC)" AR="$(AR)" CFLAGS="$(CFLAGS)" DEFINES="$(DEFINES)" + +$(BUILD_DIR)/%.o: %.c + @mkdir -p $(dir $@) + $(CC) -o $@ -c $< $(CFLAGS) $(DEFINES) -I$(TOP) -I$(TOP)/ir -I$(TOP)/ir/opt + +clean: + rm -f $(OBJS) $(LIB) $(ISA_LIBS) + @for dir in $(ISA_DIRS); do \ + $(MAKE) --no-print-directory -C $$dir clean BUILD_DIR=$(BUILD_DIR); \ + done + +FORCE: +.PHONY: all clean FORCE diff --git a/arch/arm/arm.c b/arch/arm/arm.c new file mode 100644 index 00000000..6ba6962e --- /dev/null +++ b/arch/arm/arm.c @@ -0,0 +1,111 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2025 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define USING_GLOBALS +#include "arch/arm/arm.h" +#include "arch/arm/thumb/thumb.h" +#include "tcc.h" + +/* ───── Internal profile / FPU resolution ───── */ +/* Look up the FloatingPointConfig for the given mfpu string. Currently a stub: the argument is ignored and NULL is always returned. */ +static const FloatingPointConfig *arm_resolve_fpu(const char *mfpu) +{ + /* TODO: link to actual FPU configs once they live under arch/arm/fpu/ */ + (void)mfpu; + return NULL; +} +/* ARM-specific target state: written by arm_target_init(), read by tcc_target_has(). */ +struct target_dependent_config arm_target_dependent; +/* Architecture description: written by arm_target_init(). */ +ArchitectureConfig architecture_config; +/* Initialise arm_target_dependent and architecture_config from the target selection. Features come from thumb_resolve_features(march, mfpu, extra_feat_bits); mcpu is stored unchanged as mcpu_name; march_name falls back to "armv8-m.main" when march is NULL. fp_reg_count is 64 with fp_dp_d32, 32 with vfp_dp or vfp_sp, otherwise 0. fpu/has_fpu start as NULL/0 and are only reassigned when mfpu is non-NULL; because arm_resolve_fpu() is still a stub returning NULL, has_fpu stays 0 for now. */ +void arm_target_init(const char *march, const char *mfpu, const char *mcpu, uint64_t extra_feat_bits) +{ + thop_feat feat = thumb_resolve_features(march, mfpu, extra_feat_bits); + + arm_target_dependent = (struct target_dependent_config){ + .mcpu_name = mcpu, + .feat = feat, + .is_secure_tz = feat.sec != 0, + }; + + architecture_config = (ArchitectureConfig){ + .pointer_size = 4, + .stack_align = 8, + .reg_size = 4, + .parameter_registers = 4, + .has_fpu = 0, + .static_chain_reg = 10, /* presumably register R10 — TODO confirm against the codegen */ + .fpu = NULL, + + .march_name = march ? march : "armv8-m.main", + .int_reg_count = 13, + .fp_reg_count = feat.fp_dp_d32 ? 64 + : feat.vfp_dp ? 32 + : feat.vfp_sp ?
32 + : 0, + .default_align = 4, + .big_endian = 0, + + .target_dependent = &arm_target_dependent, + }; + + if (mfpu) + { + const FloatingPointConfig *fpu = arm_resolve_fpu(mfpu); + architecture_config.fpu = fpu; + architecture_config.has_fpu = fpu != NULL; + } +} +/* Report whether the configured target provides capability `cap`. Each tcc_target_cap handled below maps directly to one field of arm_target_dependent.feat, so the answer reflects whatever arm_target_init() last stored there. A cap value with no matching case yields false. NOTE(review): the switch has no default label, presumably so the compiler can flag unhandled enumerators — confirm. */ +bool tcc_target_has(tcc_target_cap cap) +{ + const thop_feat f = arm_target_dependent.feat; + switch (cap) + { + case TCC_CAP_HW_DIVIDE: + return f.div; + case TCC_CAP_HW_FP_SP: + return f.vfp_sp; + case TCC_CAP_HW_FP_DP: + return f.vfp_dp; + case TCC_CAP_HW_FP_HP: + return f.fp16; + case TCC_CAP_DSP_SIMD: + return f.dsp; + case TCC_CAP_SATURATING_ARITH: + return f.sat; + case TCC_CAP_BITFIELD_INSTRS: + return f.bfx; + case TCC_CAP_COND_EXEC: + return f.it; + case TCC_CAP_MOVE_IMM_WIDE: + return f.movw_movt; + case TCC_CAP_VECTOR: + return f.mve_int; + case TCC_CAP_SECURITY: + return f.sec; + case TCC_CAP_POINTER_AUTH: + return f.pacbti; + case TCC_CAP_LOW_OVERHEAD_LOOP: + return f.lob; + } + return false; /* cap matched none of the cases above */ +} diff --git a/arch/arm/arm.h b/arch/arm/arm.h new file mode 100644 index 00000000..b2697aee --- /dev/null +++ b/arch/arm/arm.h @@ -0,0 +1,25 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2025 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include + +void arm_target_init(const char *march, const char *mfpu, const char *mcpu, uint64_t extra_feat_bits); \ No newline at end of file diff --git a/arch/arm_aapcs.c b/arch/arm/arm_aapcs.c similarity index 99% rename from arch/arm_aapcs.c rename to arch/arm/arm_aapcs.c index c3a19b87..b8e9b9eb 100644 --- a/arch/arm_aapcs.c +++ b/arch/arm/arm_aapcs.c @@ -18,7 +18,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include "../tcc.h" +#include "tcc.h" #include "tccabi.h" #include #include diff --git a/arch/arm/arm_regalloc.c b/arch/arm/arm_regalloc.c new file mode 100644 index 00000000..69ef08a9 --- /dev/null +++ b/arch/arm/arm_regalloc.c @@ -0,0 +1,59 @@ +/* + * TCC - Tiny C Compiler + * + * ARM register set definitions for SSA register allocator. + * + * Copyright (c) 2025 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "arm_regalloc.h" + +/* AAPCS: R0-R3 caller-saved, R4-R11 callee-saved, R12(IP) caller-saved */ +static const int arm_caller_saved[] = {0, 1, 2, 3, 12}; +static const int arm_callee_saved[] = {4, 5, 6, 7, 8, 9, 10, 11}; + +/* VFP: S0-S15 caller-saved, S16-S31 callee-saved */ +static const int arm_fp_caller_saved[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +static const int arm_fp_callee_saved[] = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + +static const RegAllocTarget arm_target = { + .int_class = + { + .num_regs = 13, /* R0-R12 */ + .caller_saved = arm_caller_saved, + .num_caller_saved = 5, + .callee_saved = arm_callee_saved, + .num_callee_saved = 8, + .pair_align = 1, /* even-aligned pairs for 64-bit */ + }, + .fp_class = + { + .num_regs = 32, /* S0-S31 */ + .caller_saved = arm_fp_caller_saved, + .num_caller_saved = 16, + .callee_saved = arm_fp_callee_saved, + .num_callee_saved = 16, + .pair_align = 1, /* even-aligned for double */ + }, + .param_regs = 4, /* R0-R3 */ + .static_chain_reg = 10, /* R10 */ +}; + +const RegAllocTarget *arm_get_regalloc_target(void) +{ + return &arm_target; +} diff --git a/arch/arm/arm_regalloc.h b/arch/arm/arm_regalloc.h new file mode 100644 index 00000000..a4dd1771 --- /dev/null +++ b/arch/arm/arm_regalloc.h @@ -0,0 +1,28 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2025 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef TCC_ARM_REGALLOC_H +#define TCC_ARM_REGALLOC_H + +#include "ir/regalloc.h" + +const RegAllocTarget *arm_get_regalloc_target(void); + +#endif /* TCC_ARM_REGALLOC_H */ diff --git a/arch/arm/ssa_opt_arm.c b/arch/arm/ssa_opt_arm.c new file mode 100644 index 00000000..53249077 --- /dev/null +++ b/arch/arm/ssa_opt_arm.c @@ -0,0 +1,936 @@ +/* + * TCC IR - SSA Target-Specific Optimization Generators (ARM Thumb-2) + * + * Copyright (c) 2025 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation. + */ + +#define USING_GLOBALS +#include "ir.h" +#include "ssa_opt.h" +#include "ssa_opt_arm.h" + +/* ============================================================================ + * ssa_gen_arm_fuse_mul_add_to_mla + * + * Pattern: t1 = MUL(a, b); t2 = ADD(t1, c) where t1 has single use + * Result: t2 = MLA(a, b, c); NOP the MUL + * + * ARM Thumb-2 MLA executes in 1 cycle vs MUL(1) + ADD(1) = 2 cycles. 
+ * ============================================================================ */ + +int ssa_gen_arm_fuse_mul_add_to_mla(IRSSAOptCtx *ctx, int instr_idx) +{ + TCCIRState *ir = ctx->ir; + IRQuadCompact *mul_q = &ir->compact_instructions[instr_idx]; + + IROperand mul_dest = tcc_ir_op_get_dest(ir, mul_q); + int32_t mul_vr = irop_get_vreg(mul_dest); + if (mul_vr < 0) + return 0; + + IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, mul_vr); + if (!vi || vi->use_count != 1) + return 0; + + IRSSAUse *use = &vi->uses[0]; + if (use->kind != SSA_USE_INSTR) + return 0; + + int add_idx = use->idx; + IRQuadCompact *add_q = &ir->compact_instructions[add_idx]; + if (add_q->op != TCCIR_OP_ADD) + return 0; + + /* 64-bit MLA not supported on Cortex-M */ + if (mul_dest.btype == IROP_BTYPE_INT64) + return 0; + + /* Identify which ADD operand is the MUL result and which is the accumulator */ + IROperand add_src1 = tcc_ir_op_get_src1(ir, add_q); + IROperand add_src2 = tcc_ir_op_get_src2(ir, add_q); + IROperand accum; + + if (irop_get_vreg(add_src1) == mul_vr) + accum = add_src2; + else if (irop_get_vreg(add_src2) == mul_vr) + accum = add_src1; + else + return 0; + + /* If the accumulator is defined by a SHL/SHR, the ARM backend can fold the + * shift into the ADD's barrel-shifter operand (e.g. `add Rd, Rn, Rm, lsl + * #N`). MLA has no barrel-shifter on its accumulator, so fusing would + * defeat that lowering and produce wrong results for bitfield arithmetic + * (test gcc.c-torture/execute/20000113-1). Keep the MUL/ADD form so the + * backend can pick the better encoding. 
*/ + int32_t accum_vr_chk = irop_get_vreg(accum); + if (accum_vr_chk >= 0) { + IRSSAVregInfo *avi = ssa_opt_vinfo(ctx, accum_vr_chk); + if (avi && avi->def_instr >= 0) { + int def_op = ir->compact_instructions[avi->def_instr].op; + if (def_op == TCCIR_OP_SHL || def_op == TCCIR_OP_SAR || + def_op == TCCIR_OP_SHR) + return 0; + } + } + + /* Skip if barrel-shift fusion already absorbed a shift into the ADD's + * src2 operand: that op was rewritten to consume the SHR's input vreg + * with the shift kind/amount recorded in ir->barrel_shifts[]. The + * original SHR def is now a NOP, so the def_op check above doesn't + * fire — without this guard the MLA fusion would drop the shift. */ + if (ir->barrel_shifts && add_q->orig_index >= 0 && + add_q->orig_index <= ir->max_orig_index && + ir->barrel_shifts[add_q->orig_index] != 0) + return 0; + + /* Place the MLA at the ADD's position. By SSA dominance, MUL's inputs and + * the accumulator are all defined before the ADD, so this is always valid. + * Placing the MLA at the MUL's position would require the accumulator to + * dominate the MUL — that's the rarer case. */ + IROperand add_dest = tcc_ir_op_get_dest(ir, add_q); + IROperand mul_src1 = tcc_ir_op_get_src1(ir, mul_q); + IROperand mul_src2 = tcc_ir_op_get_src2(ir, mul_q); + + /* Allocate fresh pool space for the MLA's 4 operands (dest, src1, src2, + * accum). Reusing the ADD's operand_base would clobber the next + * instruction's operands at base+2 and base+3. */ + int nb = ir->iroperand_pool_count; + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + if (nb + 3 >= ir->iroperand_pool_capacity) + return 0; + + add_q->op = TCCIR_OP_MLA; + add_q->operand_base = nb; + ir->iroperand_pool[nb + 0] = add_dest; + ir->iroperand_pool[nb + 1] = mul_src1; + ir->iroperand_pool[nb + 2] = mul_src2; + ir->iroperand_pool[nb + 3] = accum; + + /* MUL's result is no longer used; NOP it. 
*/ + ssa_opt_nop_instr(ctx, instr_idx); + + /* The ADD already had uses recorded for mul_vr and accum_vr at add_idx. + * After the rewrite, the MLA at add_idx uses mul_src1, mul_src2, accum. + * Add uses for mul_src1/mul_src2 (previously they were used by the now- + * NOP'd MUL only), and remove the dead use of mul_vr. */ + IRSSAVregInfo *s1vi = ssa_opt_vinfo(ctx, irop_get_vreg(mul_src1)); + if (s1vi) + ssa_opt_add_use_instr(s1vi, add_idx); + IRSSAVregInfo *s2vi = ssa_opt_vinfo(ctx, irop_get_vreg(mul_src2)); + if (s2vi) + ssa_opt_add_use_instr(s2vi, add_idx); + + IRSSAVregInfo *mvi = ssa_opt_vinfo(ctx, mul_vr); + if (mvi) { + ssa_opt_remove_use_instr(mvi, add_idx); + mvi->def_instr = -1; + } + + return 1; +} + +/* ============================================================================ + * ssa_gen_arm_fuse_shl_add_to_load_indexed + * + * Pattern: t1 = SHL(idx, #scale); t2 = ADD(base, t1); t3 = LOAD(t2) + * where t1 and t2 are single-use + * Result: t3 = LOAD_INDEXED(base, idx, #scale); NOP SHL, ADD + * + * Maps directly to ARM Thumb-2: LDR Rd, [Rn, Rm, LSL #scale] + * ============================================================================ */ + +int ssa_gen_arm_fuse_shl_add_to_load_indexed(IRSSAOptCtx *ctx, int instr_idx) +{ + TCCIRState *ir = ctx->ir; + IRQuadCompact *shl_q = &ir->compact_instructions[instr_idx]; + + /* SHL must have immediate scale */ + IROperand shl_src2 = tcc_ir_op_get_src2(ir, shl_q); + if (shl_src2.tag != IROP_TAG_IMM32) + return 0; + int32_t scale = (int32_t)irop_get_imm64_ex(ir, shl_src2); + if (scale < 0 || scale > 3) + return 0; + + IROperand shl_dest = tcc_ir_op_get_dest(ir, shl_q); + int32_t shl_vr = irop_get_vreg(shl_dest); + if (shl_vr < 0) + return 0; + + IRSSAVregInfo *shl_vi = ssa_opt_vinfo(ctx, shl_vr); + if (!shl_vi || shl_vi->use_count != 1) + return 0; + if (shl_vi->uses[0].kind != SSA_USE_INSTR) + return 0; + + /* Find the ADD that uses the SHL result */ + int add_idx = shl_vi->uses[0].idx; + IRQuadCompact *add_q = 
&ir->compact_instructions[add_idx]; + if (add_q->op != TCCIR_OP_ADD) + return 0; + + IROperand add_src1 = tcc_ir_op_get_src1(ir, add_q); + IROperand add_src2 = tcc_ir_op_get_src2(ir, add_q); + IROperand base; + + if (irop_get_vreg(add_src1) == shl_vr) + base = add_src2; + else if (irop_get_vreg(add_src2) == shl_vr) + base = add_src1; + else + return 0; + + /* Bail if base would require its own deref (e.g. stack-spilled VLA + * pointer represented as StackLoc[N] with is_lval=1). LOAD_INDEXED + * treats its base as a single address, not as an lvalue to be loaded. */ + if (base.is_lval) + return 0; + + IROperand add_dest = tcc_ir_op_get_dest(ir, add_q); + int32_t add_vr = irop_get_vreg(add_dest); + if (add_vr < 0) + return 0; + + IRSSAVregInfo *add_vi = ssa_opt_vinfo(ctx, add_vr); + if (!add_vi || add_vi->use_count != 1) + return 0; + if (add_vi->uses[0].kind != SSA_USE_INSTR) + return 0; + + /* Find the LOAD that uses the ADD result */ + int load_idx = add_vi->uses[0].idx; + IRQuadCompact *load_q = &ir->compact_instructions[load_idx]; + if (load_q->op != TCCIR_OP_LOAD) + return 0; + + IROperand load_src = tcc_ir_op_get_src1(ir, load_q); + if (irop_get_vreg(load_src) != add_vr) + return 0; + if (!load_src.is_lval) + return 0; + + /* Rewrite LOAD → LOAD_INDEXED(base, index, scale) */ + IROperand shl_src1 = tcc_ir_op_get_src1(ir, shl_q); + IROperand load_dest = tcc_ir_op_get_dest(ir, load_q); + + load_q->op = TCCIR_OP_LOAD_INDEXED; + + /* Allocate NEW pool space for 4 operands (dest, base, index, scale). + * The original LOAD only had 2 slots; reusing operand_base would overwrite + * the next instruction's operands at pool[lb+2] and pool[lb+3]. 
*/ + int lb = ir->iroperand_pool_count; + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + if (lb + 3 >= ir->iroperand_pool_capacity) { + load_q->op = TCCIR_OP_LOAD; + return 0; + } + load_q->operand_base = lb; + + /* base: clear lval since LOAD_INDEXED handles the deref */ + base.is_lval = 0; + + ir->iroperand_pool[lb + 0] = load_dest; + ir->iroperand_pool[lb + 1] = base; + ir->iroperand_pool[lb + 2] = shl_src1; + ir->iroperand_pool[lb + 3] = shl_src2; + + /* Update use-def chains */ + int32_t base_vr = irop_get_vreg(base); + IRSSAVregInfo *bvi = ssa_opt_vinfo(ctx, base_vr); + if (bvi) + ssa_opt_add_use_instr(bvi, load_idx); + + int32_t idx_vr = irop_get_vreg(shl_src1); + IRSSAVregInfo *ivi = ssa_opt_vinfo(ctx, idx_vr); + if (ivi) + ssa_opt_add_use_instr(ivi, load_idx); + + /* Clear intermediate vreg info */ + shl_vi->use_count = 0; + shl_vi->def_instr = -1; + add_vi->use_count = 0; + add_vi->def_instr = -1; + + /* NOP SHL and ADD */ + ssa_opt_nop_instr(ctx, instr_idx); + ssa_opt_nop_instr(ctx, add_idx); + + return 1; +} + +/* ============================================================================ + * ssa_gen_arm_fuse_shl_add_to_store_indexed + * + * Pattern: t1 = SHL(idx, #scale); t2 = ADD(base, t1); STORE(t2, val) + * where t1 and t2 are single-use + * Result: STORE_INDEXED(base, val, idx, #scale); NOP SHL, ADD + * + * Maps to ARM Thumb-2: STR Rd, [Rn, Rm, LSL #scale] + * ============================================================================ */ + +int ssa_gen_arm_fuse_shl_add_to_store_indexed(IRSSAOptCtx *ctx, int instr_idx) +{ + TCCIRState *ir = ctx->ir; + IRQuadCompact *shl_q = &ir->compact_instructions[instr_idx]; + + IROperand shl_src2 = tcc_ir_op_get_src2(ir, shl_q); + if (shl_src2.tag != IROP_TAG_IMM32) + return 0; + int32_t scale = (int32_t)irop_get_imm64_ex(ir, shl_src2); + if (scale < 0 || scale > 3) + return 0; + + IROperand shl_dest = 
tcc_ir_op_get_dest(ir, shl_q); + int32_t shl_vr = irop_get_vreg(shl_dest); + if (shl_vr < 0) + return 0; + + IRSSAVregInfo *shl_vi = ssa_opt_vinfo(ctx, shl_vr); + if (!shl_vi || shl_vi->use_count != 1) + return 0; + if (shl_vi->uses[0].kind != SSA_USE_INSTR) + return 0; + + int add_idx = shl_vi->uses[0].idx; + IRQuadCompact *add_q = &ir->compact_instructions[add_idx]; + if (add_q->op != TCCIR_OP_ADD) + return 0; + + IROperand add_src1 = tcc_ir_op_get_src1(ir, add_q); + IROperand add_src2 = tcc_ir_op_get_src2(ir, add_q); + IROperand base; + + if (irop_get_vreg(add_src1) == shl_vr) + base = add_src2; + else if (irop_get_vreg(add_src2) == shl_vr) + base = add_src1; + else + return 0; + + /* Bail if base would require its own deref (e.g. stack-spilled VLA + * pointer represented as StackLoc[N] with is_lval=1). STORE_INDEXED + * treats its base as a single address, not as an lvalue to be loaded. */ + if (base.is_lval) + return 0; + + IROperand add_dest = tcc_ir_op_get_dest(ir, add_q); + int32_t add_vr = irop_get_vreg(add_dest); + if (add_vr < 0) + return 0; + + IRSSAVregInfo *add_vi = ssa_opt_vinfo(ctx, add_vr); + if (!add_vi || add_vi->use_count != 1) + return 0; + if (add_vi->uses[0].kind != SSA_USE_INSTR) + return 0; + + int store_idx = add_vi->uses[0].idx; + IRQuadCompact *store_q = &ir->compact_instructions[store_idx]; + if (store_q->op != TCCIR_OP_STORE) + return 0; + + IROperand store_dest = tcc_ir_op_get_dest(ir, store_q); + if (irop_get_vreg(store_dest) != add_vr) + return 0; + + /* Rewrite STORE → STORE_INDEXED(base, src, index, scale) */ + IROperand shl_src1 = tcc_ir_op_get_src1(ir, shl_q); + IROperand store_src = tcc_ir_op_get_src1(ir, store_q); + + store_q->op = TCCIR_OP_STORE_INDEXED; + + /* Allocate NEW pool space for 4 operands (base, value, index, scale). + * The original STORE only had 2 slots; reusing operand_base would overwrite + * the next instruction's operands at pool[sb+2] and pool[sb+3]. 
*/ + int sb = ir->iroperand_pool_count; + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + if (sb + 3 >= ir->iroperand_pool_capacity) { + store_q->op = TCCIR_OP_STORE; + return 0; + } + store_q->operand_base = sb; + + base.is_lval = 0; + + ir->iroperand_pool[sb + 0] = base; + ir->iroperand_pool[sb + 1] = store_src; + ir->iroperand_pool[sb + 2] = shl_src1; + ir->iroperand_pool[sb + 3] = shl_src2; + + /* Update use-def chains */ + int32_t base_vr = irop_get_vreg(base); + IRSSAVregInfo *bvi = ssa_opt_vinfo(ctx, base_vr); + if (bvi) + ssa_opt_add_use_instr(bvi, store_idx); + + int32_t idx_vr = irop_get_vreg(shl_src1); + IRSSAVregInfo *ivi = ssa_opt_vinfo(ctx, idx_vr); + if (ivi) + ssa_opt_add_use_instr(ivi, store_idx); + + shl_vi->use_count = 0; + shl_vi->def_instr = -1; + add_vi->use_count = 0; + add_vi->def_instr = -1; + + ssa_opt_nop_instr(ctx, instr_idx); + ssa_opt_nop_instr(ctx, add_idx); + + return 1; +} + +/* ============================================================================ + * ssa_gen_arm_reduce_mul_to_shift + * + * Pattern: dest = MUL(src, #pow2) or MUL(#pow2, src) + * Result: dest = SHL(src, #log2(pow2)) + * + * SHL is 1-cycle single-issue vs MUL which uses the multiplier pipeline. 
+ * ============================================================================ */
+
+int ssa_gen_arm_reduce_mul_to_shift(IRSSAOptCtx *ctx, int instr_idx)
+{
+  TCCIRState *ir = ctx->ir;
+  IRQuadCompact *q = &ir->compact_instructions[instr_idx];
+
+  /* Strength reduction is only valid for a multiply.  Without this check
+   * any two-operand instruction carrying a power-of-two immediate (an ADD
+   * of #4, say) would be rewritten into a shift. */
+  if (q->op != TCCIR_OP_MUL)
+    return 0;
+
+  IROperand src1 = tcc_ir_op_get_src1(ir, q);
+  IROperand src2 = tcc_ir_op_get_src2(ir, q);
+  IROperand imm_op, var_op;
+
+  /* Multiplication commutes, so the constant may sit on either side. */
+  if (src2.tag == IROP_TAG_IMM32) {
+    imm_op = src2;
+    var_op = src1;
+  } else if (src1.tag == IROP_TAG_IMM32) {
+    imm_op = src1;
+    var_op = src2;
+  } else {
+    return 0;
+  }
+
+  /* Accept strictly positive powers of two only (exactly one bit set). */
+  int64_t val = irop_get_imm64_ex(ir, imm_op);
+  if (val <= 0 || (val & (val - 1)) != 0)
+    return 0;
+
+  /* shift = log2(val); a multiply by 1 becomes a shift by 0. */
+  int shift = 0;
+  int64_t v = val;
+  while (v > 1) { shift++; v >>= 1; }
+
+  /* Reuse the immediate operand for the shift amount and put the variable
+   * operand first, as SHL expects. */
+  q->op = TCCIR_OP_SHL;
+  imm_op.u.imm32 = shift;
+  tcc_ir_op_set_src1(ir, q, var_op);
+  tcc_ir_op_set_src2(ir, q, imm_op);
+
+  return 1;
+}
+
+/* ============================================================================
+ * ssa_gen_arm_fuse_load_through_add_imm
+ *
+ * Pattern: t_lea = ADD(base, #imm); t_val = LOAD(t_lea_deref)
+ * Result:  t_val = LOAD_INDEXED(base, #imm, scale=0)
+ *
+ * Unlike the SHL-based fusion this does NOT require single-use of t_lea —
+ * multiple LOADs through the same address each get rewritten, and DCE
+ * cleans up the dead ADD if it ends up with no users. Mapping to
+ * LOAD_INDEXED with scale=0 + immediate index also enables the LDRD
+ * pairing peephole in ir/codegen.c which only fires on adjacent
+ * LOAD_INDEXED instructions with matching base + offset+4.
+ *
+ * Range guard: only fire when the immediate fits the [Rn, #imm]
+ * encoding (`abs(imm) <= 4095` for the word forms). Beyond that, the
+ * backend would materialize the immediate into a register and lose the
+ * benefit of the fusion.
+ * ============================================================================ */ + +static int arm_extract_add_imm_base(TCCIRState *ir, IRSSAOptCtx *ctx, + int32_t lea_vr, IROperand *out_base, + int32_t *out_imm, int *out_lea_idx) +{ + if (lea_vr < 0) + return 0; + if (TCCIR_DECODE_VREG_TYPE(lea_vr) != TCCIR_VREG_TYPE_TEMP) + return 0; + + IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, lea_vr); + if (!vi || vi->def_count != 1 || vi->def_instr < 0) + return 0; + + /* Every use of the LEA temp must be an ADDRESS use (deref) — i.e. it + * appears as the pointer operand of a LOAD or STORE. If it has a + * "value" use (used as data for ADD/SUB/MUL/ASSIGN/etc., or as the + * source value of a STORE), the LEA is a real pointer that participates + * in further computation — typically a loop-carried induction variable. + * Rewriting one address use to LOAD_INDEXED(base, #imm) extends base's + * liveness past the LEA, and the regalloc may then coalesce base with + * the post-update phi-copy temp, producing wrong addresses. + * + * When every use is an address use, after we rewrite them all the LEA + * becomes dead and DCE cleans up the ADD — no lifetime extension. */ + for (int u = 0; u < vi->use_count; u++) { + IRSSAUse use = vi->uses[u]; + if (use.kind != SSA_USE_INSTR) + return 0; + IRQuadCompact *uq = &ir->compact_instructions[use.idx]; + if (uq->op == TCCIR_OP_LOAD) { + IROperand s = tcc_ir_op_get_src1(ir, uq); + if (!s.is_lval || irop_get_vreg(s) != lea_vr) + return 0; + } else if (uq->op == TCCIR_OP_STORE) { + IROperand d = tcc_ir_op_get_dest(ir, uq); + if (irop_get_vreg(d) != lea_vr) + return 0; + /* If the LEA is being used as the STORE's value (not the address), + * reject. 
*/ + IROperand sv = tcc_ir_op_get_src1(ir, uq); + if (irop_get_vreg(sv) == lea_vr) + return 0; + } else { + return 0; + } + } + + IRQuadCompact *dq = &ir->compact_instructions[vi->def_instr]; + if (dq->op != TCCIR_OP_ADD) + return 0; + + IROperand a = tcc_ir_op_get_src1(ir, dq); + IROperand b = tcc_ir_op_get_src2(ir, dq); + + IROperand base_op; + IROperand imm_op; + if (a.tag == IROP_TAG_IMM32 && b.tag != IROP_TAG_IMM32) { + imm_op = a; base_op = b; + } else if (b.tag == IROP_TAG_IMM32 && a.tag != IROP_TAG_IMM32) { + imm_op = b; base_op = a; + } else { + return 0; + } + + if (base_op.is_lval) + return 0; + /* Refuse SYMREF bases here — LOAD_INDEXED with a SYMREF base + imm + * isn't materially better than the existing LEA, and the backend's + * fast path for symbol+offset uses different code. */ + if (base_op.tag != IROP_TAG_VREG) + return 0; + + int32_t imm = irop_get_imm32(imm_op); + int abs_imm = imm < 0 ? -imm : imm; + if (abs_imm > 4095) + return 0; + + *out_base = base_op; + *out_imm = imm; + *out_lea_idx = vi->def_instr; + return 1; +} + +int ssa_gen_arm_fuse_load_through_add_imm(IRSSAOptCtx *ctx, int instr_idx) +{ + TCCIRState *ir = ctx->ir; + IRQuadCompact *load_q = &ir->compact_instructions[instr_idx]; + if (load_q->op != TCCIR_OP_LOAD) + return 0; + + IROperand load_dest_chk = tcc_ir_op_get_dest(ir, load_q); + /* 64-bit loads: skip. The STORE/LOAD handlers for 64-bit pointer-deref + * deliberately use two 32-bit ops (not LDRD/STRD) to tolerate unaligned + * packed-struct addresses; the LOAD_INDEXED/STORE_INDEXED 64-bit paths + * use LDRD/STRD which faults on misalignment. Don't fuse here. 
*/ + if (irop_get_btype(load_dest_chk) == IROP_BTYPE_INT64 || + irop_get_btype(load_dest_chk) == IROP_BTYPE_FLOAT64) + return 0; + + IROperand load_src = tcc_ir_op_get_src1(ir, load_q); + if (!load_src.is_lval) + return 0; + if (load_src.is_local || load_src.is_llocal) + return 0; + if (load_src.tag != IROP_TAG_VREG) + return 0; + + int32_t lea_vr = irop_get_vreg(load_src); + IROperand base; + int32_t imm; + int lea_idx; + if (!arm_extract_add_imm_base(ir, ctx, lea_vr, &base, &imm, &lea_idx)) + return 0; + + IROperand load_dest = tcc_ir_op_get_dest(ir, load_q); + + /* Build the new operand pool entry for LOAD_INDEXED(base, imm, scale=0). */ + int lb = ir->iroperand_pool_count; + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + if (lb + 3 >= ir->iroperand_pool_capacity) + return 0; + + IROperand idx_op = irop_make_imm32(0, imm, load_dest.btype); + IROperand scale_op = irop_make_imm32(0, 0, IROP_BTYPE_INT32); + IROperand base_clean = base; + base_clean.is_lval = 0; + + load_q->op = TCCIR_OP_LOAD_INDEXED; + load_q->operand_base = lb; + ir->iroperand_pool[lb + 0] = load_dest; + ir->iroperand_pool[lb + 1] = base_clean; + ir->iroperand_pool[lb + 2] = idx_op; + ir->iroperand_pool[lb + 3] = scale_op; + + /* Use-def chain maintenance: + * - Drop the use of lea_vr from this LOAD (no longer references it). + * - Add a use of base_vr at this LOAD. + * The defining ADD becomes dead when lea_vr.use_count hits 0; DCE will + * remove it. 
*/ + IRSSAVregInfo *lea_vi = ssa_opt_vinfo(ctx, lea_vr); + if (lea_vi) + ssa_opt_remove_use_instr(lea_vi, instr_idx); + + int32_t base_vr = irop_get_vreg(base_clean); + IRSSAVregInfo *base_vi = ssa_opt_vinfo(ctx, base_vr); + if (base_vi) + ssa_opt_add_use_instr(base_vi, instr_idx); + + return 1; +} + +/* ============================================================================ + * ssa_gen_arm_fuse_store_through_add_imm + * + * Mirror of the load variant for STORE. + * Pattern: t_lea = ADD(base, #imm); STORE(t_lea_deref, val) + * Result: STORE_INDEXED(base, val, #imm, scale=0) + * ============================================================================ */ + +int ssa_gen_arm_fuse_store_through_add_imm(IRSSAOptCtx *ctx, int instr_idx) +{ + TCCIRState *ir = ctx->ir; + IRQuadCompact *store_q = &ir->compact_instructions[instr_idx]; + if (store_q->op != TCCIR_OP_STORE) + return 0; + + IROperand store_src = tcc_ir_op_get_src1(ir, store_q); + /* See LOAD variant: skip 64-bit to avoid STRD on packed/misaligned addresses. 
*/ + if (irop_get_btype(store_src) == IROP_BTYPE_INT64 || + irop_get_btype(store_src) == IROP_BTYPE_FLOAT64) + return 0; + + IROperand store_dest = tcc_ir_op_get_dest(ir, store_q); + if (store_dest.is_local || store_dest.is_llocal) + return 0; + if (store_dest.tag != IROP_TAG_VREG) + return 0; + + int32_t lea_vr = irop_get_vreg(store_dest); + IROperand base; + int32_t imm; + int lea_idx; + if (!arm_extract_add_imm_base(ir, ctx, lea_vr, &base, &imm, &lea_idx)) + return 0; + + int sb = ir->iroperand_pool_count; + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + tcc_ir_pool_add(ir, IROP_NONE); + if (sb + 3 >= ir->iroperand_pool_capacity) + return 0; + + IROperand idx_op = irop_make_imm32(0, imm, store_src.btype); + IROperand scale_op = irop_make_imm32(0, 0, IROP_BTYPE_INT32); + IROperand base_clean = base; + base_clean.is_lval = 0; + + store_q->op = TCCIR_OP_STORE_INDEXED; + store_q->operand_base = sb; + ir->iroperand_pool[sb + 0] = base_clean; + ir->iroperand_pool[sb + 1] = store_src; + ir->iroperand_pool[sb + 2] = idx_op; + ir->iroperand_pool[sb + 3] = scale_op; + + IRSSAVregInfo *lea_vi = ssa_opt_vinfo(ctx, lea_vr); + if (lea_vi) + ssa_opt_remove_use_instr(lea_vi, instr_idx); + + int32_t base_vr = irop_get_vreg(base_clean); + IRSSAVregInfo *base_vi = ssa_opt_vinfo(ctx, base_vr); + if (base_vi) + ssa_opt_add_use_instr(base_vi, instr_idx); + + return 1; +} + +/* ============================================================================ + * ssa_gen_arm_fuse_mla_accum_through_add_imm + * + * Pattern: t_lea = ADD(base, #imm); MLA dest, src1, src2 + t_lea_deref + * where t_lea is single-use (only as MLA's accum deref). + * Result: t_lea = LOAD_INDEXED(base, #imm, scale=0) + * MLA dest, src1, src2 + t_lea (accum non-deref) + * + * The MLA accumulator carries a memory-deref operand directly in the IR — + * codegen materialises it as `LEA + LDR` (2 insns). 
Rewriting the LEA's
+ * defining ADD into LOAD_INDEXED collapses both into a single
+ * `LDR rD, [base, #imm]`, saving one instruction.  This mirrors the
+ * LOAD-side fusion but reuses the LEA's instruction slot for the LOAD
+ * (no IR insertion needed).
+ *
+ * The transform is destructive on t_lea's value (it no longer holds an
+ * address, but the loaded value), so it only fires when t_lea is used
+ * exactly once and that use is the MLA accum deref.
+ * ============================================================================ */
+
+int ssa_gen_arm_fuse_mla_accum_through_add_imm(IRSSAOptCtx *ctx, int instr_idx)
+{
+  TCCIRState *ir = ctx->ir;
+  IRQuadCompact *mla_q = &ir->compact_instructions[instr_idx];
+  if (mla_q->op != TCCIR_OP_MLA)
+    return 0;
+
+  /* Accum is at operand_base + 3; it must be a plain deref of a TEMP vreg. */
+  IROperand accum = ir->iroperand_pool[mla_q->operand_base + 3];
+  if (!accum.is_lval || accum.is_llocal || accum.is_local || accum.is_sym)
+    return 0;
+  if (accum.tag != IROP_TAG_VREG)
+    return 0;
+  int32_t lea_vr = irop_get_vreg(accum);
+  if (lea_vr < 0 || TCCIR_DECODE_VREG_TYPE(lea_vr) != TCCIR_VREG_TYPE_TEMP)
+    return 0;
+
+  /* 64-bit accumulators aren't supported by MLA on Cortex-M; this also
+   * sidesteps the LDRD-alignment trap that the LOAD-side fusion guards
+   * against. */
+  IROperand mla_dest = ir->iroperand_pool[mla_q->operand_base + 0];
+  if (irop_get_btype(mla_dest) == IROP_BTYPE_INT64)
+    return 0;
+  if (irop_get_btype(accum) == IROP_BTYPE_INT64 ||
+      irop_get_btype(accum) == IROP_BTYPE_FLOAT64)
+    return 0;
+
+  /* t_lea must be single-use (only this MLA's accum) and defined by ADD
+   * with a register base + immediate offset that fits the LDR encoding. */
+  IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, lea_vr);
+  if (!vi || vi->def_count != 1 || vi->def_instr < 0 || vi->use_count != 1)
+    return 0;
+  if (vi->uses[0].kind != SSA_USE_INSTR || vi->uses[0].idx != instr_idx)
+    return 0;
+
+  IRQuadCompact *dq = &ir->compact_instructions[vi->def_instr];
+  if (dq->op != TCCIR_OP_ADD)
+    return 0;
+
+  /* ADD is commutative: accept the immediate on either side, but not both. */
+  IROperand a = tcc_ir_op_get_src1(ir, dq);
+  IROperand b = tcc_ir_op_get_src2(ir, dq);
+  IROperand base_op, imm_op;
+  if (a.tag == IROP_TAG_IMM32 && b.tag != IROP_TAG_IMM32) {
+    imm_op = a; base_op = b;
+  } else if (b.tag == IROP_TAG_IMM32 && a.tag != IROP_TAG_IMM32) {
+    imm_op = b; base_op = a;
+  } else {
+    return 0;
+  }
+  if (base_op.is_lval || base_op.tag != IROP_TAG_VREG)
+    return 0;
+
+  int32_t imm = irop_get_imm32(imm_op);
+  /* Range-test without negating: -imm is UB for INT32_MIN and wraps negative. */
+  if (imm < -4095 || imm > 4095)
+    return 0;
+
+  IROperand lea_dest = tcc_ir_op_get_dest(ir, dq);
+  /* t_lea now carries the loaded value, so type it like the accum deref
+   * (same retyping the STORE-src variant applies to its LEA dest). */
+  IROperand lea_dest_new = lea_dest;
+  lea_dest_new.btype = irop_get_btype(accum);
+  IROperand base_clean = base_op;
+  base_clean.is_lval = 0;
+
+  /* Rewrite the defining ADD into LOAD_INDEXED(base, #imm, scale=0). */
+  int lb = ir->iroperand_pool_count;
+  tcc_ir_pool_add(ir, IROP_NONE);
+  tcc_ir_pool_add(ir, IROP_NONE);
+  tcc_ir_pool_add(ir, IROP_NONE);
+  tcc_ir_pool_add(ir, IROP_NONE);
+  if (lb + 3 >= ir->iroperand_pool_capacity)
+    return 0;
+
+  IROperand idx_op = irop_make_imm32(0, imm, irop_get_btype(lea_dest));
+  IROperand scale_op = irop_make_imm32(0, 0, IROP_BTYPE_INT32);
+  dq->op = TCCIR_OP_LOAD_INDEXED;
+  dq->operand_base = lb;
+  ir->iroperand_pool[lb + 0] = lea_dest_new;
+  ir->iroperand_pool[lb + 1] = base_clean;
+  ir->iroperand_pool[lb + 2] = idx_op;
+  ir->iroperand_pool[lb + 3] = scale_op;
+
+  /* base's use record at this instruction stays valid: LOAD_INDEXED still reads it. */
+
+  /* Rewrite MLA's accum: clear is_lval so the MLA reads t_lea as a value. */
+  accum.is_lval = 0;
+  ir->iroperand_pool[mla_q->operand_base + 3] = accum;
+
+  return 1; /* 1 = IR rewritten; every bail-out above returns 0 */
+}
+
+/* ============================================================================
+ * ssa_gen_arm_fuse_store_src_through_add_imm
+ *
+ * Pattern:  t_lea = ADD(base, #imm);  STORE(V, *t_lea_DEREF)
+ *           where t_lea is single-use (only as the STORE's src deref).
+ * Result:   t_lea = LOAD_INDEXED(base, #imm, scale=0);  STORE(V, t_lea)
+ *
+ * This is the SRC-side mirror of fuse_store_through_add_imm (which handles
+ * *t_lea = val — t_lea as the STORE *destination* address).  Inlined
+ * helpers like check1 produce `V <- c->field [STORE]` patterns where the
+ * field-address LEA's only use is the STORE's deref source — a pure
+ * address use that should fuse to a single `ldr [base, #imm]`.  Mirrors
+ * fuse_mla_accum_through_add_imm: rewrites the LEA's slot to
+ * LOAD_INDEXED, then clears is_lval on the STORE's src.
+ *
+ * Skips 64-bit (LDRD alignment, see [feedback_lea_fusion_addr_only]) and
+ * requires the LEA's only use to be this STORE's src1 deref — same
+ * invariant as the MLA-accum variant.
+ * ============================================================================ */
+
+int ssa_gen_arm_fuse_store_src_through_add_imm(IRSSAOptCtx *ctx, int instr_idx)
+{
+  TCCIRState *ir = ctx->ir;
+  IRQuadCompact *store_q = &ir->compact_instructions[instr_idx];
+  if (store_q->op != TCCIR_OP_STORE)
+    return 0;
+
+  /* The STORE's source must be a plain deref of a TEMP vreg (the LEA). */
+  IROperand store_src = tcc_ir_op_get_src1(ir, store_q);
+  if (!store_src.is_lval || store_src.is_llocal || store_src.is_local || store_src.is_sym)
+    return 0;
+  if (store_src.tag != IROP_TAG_VREG)
+    return 0;
+
+  int store_btype = irop_get_btype(store_src);
+  if (store_btype == IROP_BTYPE_INT64 || store_btype == IROP_BTYPE_FLOAT64)
+    return 0;
+
+  int32_t lea_vr = irop_get_vreg(store_src);
+  if (lea_vr < 0 || TCCIR_DECODE_VREG_TYPE(lea_vr) != TCCIR_VREG_TYPE_TEMP)
+    return 0;
+
+  /* Single definition, single use, and that use is this STORE. */
+  IRSSAVregInfo *vi = ssa_opt_vinfo(ctx, lea_vr);
+  if (!vi || vi->def_count != 1 || vi->def_instr < 0 || vi->use_count != 1)
+    return 0;
+  if (vi->uses[0].kind != SSA_USE_INSTR || vi->uses[0].idx != instr_idx)
+    return 0;
+
+  IRQuadCompact *dq = &ir->compact_instructions[vi->def_instr];
+  if (dq->op != TCCIR_OP_ADD)
+    return 0;
+
+  /* ADD is commutative: accept the immediate on either side, but not both. */
+  IROperand a = tcc_ir_op_get_src1(ir, dq);
+  IROperand b = tcc_ir_op_get_src2(ir, dq);
+  IROperand base_op, imm_op;
+  if (a.tag == IROP_TAG_IMM32 && b.tag != IROP_TAG_IMM32) {
+    imm_op = a; base_op = b;
+  } else if (b.tag == IROP_TAG_IMM32 && a.tag != IROP_TAG_IMM32) {
+    imm_op = b; base_op = a;
+  } else {
+    return 0;
+  }
+  if (base_op.is_lval || base_op.tag != IROP_TAG_VREG)
+    return 0;
+
+  int32_t imm = irop_get_imm32(imm_op);
+  /* Range-test without negating: -imm is UB for INT32_MIN and wraps negative. */
+  if (imm < -4095 || imm > 4095)
+    return 0;
+
+  IROperand lea_dest = tcc_ir_op_get_dest(ir, dq);
+  /* Update btype to match the loaded value (the LEA dest was a pointer-typed
+   * INT32; after fusion it holds the loaded value). */
+  IROperand lea_dest_new = lea_dest;
+  lea_dest_new.btype = store_btype;
+
+  IROperand base_clean = base_op;
+  base_clean.is_lval = 0;
+
+  /* Reserve four fresh pool slots for the LOAD_INDEXED operands. */
+  int lb = ir->iroperand_pool_count;
+  tcc_ir_pool_add(ir, IROP_NONE);
+  tcc_ir_pool_add(ir, IROP_NONE);
+  tcc_ir_pool_add(ir, IROP_NONE);
+  tcc_ir_pool_add(ir, IROP_NONE);
+  if (lb + 3 >= ir->iroperand_pool_capacity)
+    return 0;
+
+  IROperand idx_op = irop_make_imm32(0, imm, store_btype);
+  IROperand scale_op = irop_make_imm32(0, 0, IROP_BTYPE_INT32);
+  dq->op = TCCIR_OP_LOAD_INDEXED;
+  dq->operand_base = lb;
+  ir->iroperand_pool[lb + 0] = lea_dest_new;
+  ir->iroperand_pool[lb + 1] = base_clean;
+  ir->iroperand_pool[lb + 2] = idx_op;
+  ir->iroperand_pool[lb + 3] = scale_op;
+
+  /* Clear is_lval on the STORE's src so the codegen reads t_lea as a value
+   * (the loaded data) instead of dereferencing it again. */
+  IROperand new_src = store_src;
+  new_src.is_lval = 0;
+  new_src.btype = store_btype;
+  tcc_ir_set_src1(ir, instr_idx, new_src);
+
+  return 1; /* 1 = IR rewritten; every bail-out above returns 0 */
+}
+
+/* ============================================================================
+ * Generator Table
+ * ============================================================================ */
+
+/* Combined dispatcher: the gen-table runner breaks after the first matching
+ * entry regardless of return value, so two gens for TCCIR_OP_SHL would never
+ * both get a chance.  Try LOAD_INDEXED first; if it doesn't fire, fall through
+ * to STORE_INDEXED. */
+static int ssa_gen_arm_fuse_shl_indexed(IRSSAOptCtx *ctx, int instr_idx)
+{
+  int r = ssa_gen_arm_fuse_shl_add_to_load_indexed(ctx, instr_idx);
+  if (r > 0)
+    return r;
+  return ssa_gen_arm_fuse_shl_add_to_store_indexed(ctx, instr_idx);
+}
+
+/* STORE dispatcher: try the dest-side address fusion first (the original
+ * "store through LEA"), then the src-side fusion that handles the inlined
+ * `V <- *(base + #imm) [STORE]` pattern.
*/
+static int ssa_gen_arm_fuse_store_add_imm_combined(IRSSAOptCtx *ctx, int instr_idx)
+{
+  int r = ssa_gen_arm_fuse_store_through_add_imm(ctx, instr_idx);
+  if (r > 0)
+    return r;
+  return ssa_gen_arm_fuse_store_src_through_add_imm(ctx, instr_idx);
+}
+
+/* MUL dispatcher: only the first table entry per opcode ever runs (see the
+ * SHL dispatcher), so a second TCCIR_OP_MUL row would be dead.  Try the MLA
+ * fusion first; if it doesn't fire, fall through to the shift reduction. */
+static int ssa_gen_arm_mul_combined(IRSSAOptCtx *ctx, int instr_idx)
+{
+  int r = ssa_gen_arm_fuse_mul_add_to_mla(ctx, instr_idx);
+  if (r > 0)
+    return r;
+  return ssa_gen_arm_reduce_mul_to_shift(ctx, instr_idx);
+}
+
+/* One row per opcode: { opcode, generator, name }.  The MUL row keeps the
+ * "arm_mla_fusion" name unchanged in case the name is keyed on elsewhere. */
+static const IRSSAOptGen ssa_gen_arm[] = {
+  { TCCIR_OP_MUL,   ssa_gen_arm_mul_combined,                   "arm_mla_fusion" },
+  { TCCIR_OP_SHL,   ssa_gen_arm_fuse_shl_indexed,               "arm_shl_indexed" },
+  { TCCIR_OP_LOAD,  ssa_gen_arm_fuse_load_through_add_imm,      "arm_load_add_imm" },
+  { TCCIR_OP_STORE, ssa_gen_arm_fuse_store_add_imm_combined,    "arm_store_add_imm" },
+  { TCCIR_OP_MLA,   ssa_gen_arm_fuse_mla_accum_through_add_imm, "arm_mla_accum_add_imm" },
+};
+
+/* Hand the ARM generator table and its entry count to the SSA opt engine. */
+void tcc_ir_ssa_opt_arm_register(void)
+{
+  tcc_ir_ssa_opt_register_target(ssa_gen_arm,
+                                 sizeof(ssa_gen_arm) / sizeof(ssa_gen_arm[0]));
+}
diff --git a/arch/arm/ssa_opt_arm.h b/arch/arm/ssa_opt_arm.h
new file mode 100644
index 00000000..e5b1f02c
--- /dev/null
+++ b/arch/arm/ssa_opt_arm.h
@@ -0,0 +1,53 @@
+/*
+ * TCC IR - SSA Target-Specific Optimization Generators (ARM Thumb-2)
+ *
+ * Copyright (c) 2025 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#ifndef TCC_IR_SSA_OPT_ARM_H
+#define TCC_IR_SSA_OPT_ARM_H
+
+#include "ssa_opt.h"
+
+/* ============================================================================
+ * ARM Thumb-2 Generators
+ *
+ * Each generator rewrites one target-specific instruction pattern.
+ * Named explicitly for the pattern they match, like thop_* builders.
+ * ============================================================================ */
+
+/* MUL + ADD → MLA: fuse single-use multiply into multiply-accumulate */
+int ssa_gen_arm_fuse_mul_add_to_mla(IRSSAOptCtx *ctx, int instr_idx);
+
+/* SHL + ADD + LOAD → LOAD_INDEXED: fuse array index computation */
+int ssa_gen_arm_fuse_shl_add_to_load_indexed(IRSSAOptCtx *ctx, int instr_idx);
+
+/* SHL + ADD + STORE → STORE_INDEXED: fuse array store computation */
+int ssa_gen_arm_fuse_shl_add_to_store_indexed(IRSSAOptCtx *ctx, int instr_idx);
+
+/* MUL → SHL: strength-reduce power-of-2 multiply to shift */
+int ssa_gen_arm_reduce_mul_to_shift(IRSSAOptCtx *ctx, int instr_idx);
+
+/* ADD(base, #imm) + LOAD → LOAD_INDEXED(base, #imm, scale=0):
+ * fuse base + small constant offset addressing into a single load. */
+int ssa_gen_arm_fuse_load_through_add_imm(IRSSAOptCtx *ctx, int instr_idx);
+
+/* ADD(base, #imm) + STORE → STORE_INDEXED(base, val, #imm, scale=0). */
+int ssa_gen_arm_fuse_store_through_add_imm(IRSSAOptCtx *ctx, int instr_idx);
+
+/* ADD(base, #imm) + MLA(accum = *t) → LOAD_INDEXED(base, #imm, scale=0) + MLA:
+ * fuse the accumulator's address computation into the load feeding the MLA. */
+int ssa_gen_arm_fuse_mla_accum_through_add_imm(IRSSAOptCtx *ctx, int instr_idx);
+
+/* ADD(base, #imm) + STORE(V, *t) → LOAD_INDEXED(base, #imm, scale=0) + STORE(V, t):
+ * source-side counterpart of the store-through-ADD fusion. */
+int ssa_gen_arm_fuse_store_src_through_add_imm(IRSSAOptCtx *ctx, int instr_idx);
+
+/* Register ARM generators with the SSA optimization engine */
+void tcc_ir_ssa_opt_arm_register(void);
+
+#endif /* TCC_IR_SSA_OPT_ARM_H */
diff --git a/arch/arm/thumb/Makefile b/arch/arm/thumb/Makefile
new file mode 100644
index 00000000..dc80f1cf
--- /dev/null
+++ b/arch/arm/thumb/Makefile
@@ -0,0 +1,31 @@
+# Thumb instruction set build
+#
+# Called from arch/arm/Makefile:
+#   make -C thumb TOP=... BUILD_DIR=... CC=... CFLAGS=... DEFINES=...
+#
+# Produces $(BUILD_DIR)/libthumb.a
+
+TOP ?= ../../..
+BUILD_DIR ?= .
+CC ?= gcc +AR ?= ar +CFLAGS ?= +DEFINES ?= + +SRCS = thumb.c thop_alu_imm.c thop_alu_reg.c thop_cmp.c thop_shift_imm.c thop_shift_reg.c thop_mem_reg.c thop_mem_unpriv.c thop_mem_exclusive.c thop_mem_imm.c thop_extend.c thop_rev.c thop_bitfield.c thop_vfp.c thop_system.c thop_branch.c thop_block.c thop_mul.c thop_dsp.c thop_tbb.c thop_ldrd.c thop_ldrex.c thop_ldaex.c thop_mrs.c thop_pld.c thop_mov.c thop_mvn.c thop_adr.c thop_ldr_literal.c +OBJS = $(addprefix $(BUILD_DIR)/, $(SRCS:.c=.o)) +LIB = $(BUILD_DIR)/libthumb.a + +all: $(LIB) + +$(LIB): $(OBJS) + $(AR) rcs $@ $^ + +$(BUILD_DIR)/%.o: %.c + @mkdir -p $(dir $@) + $(CC) -o $@ -c $< $(CFLAGS) $(DEFINES) -I$(TOP) -I$(TOP)/ir + +clean: + rm -f $(OBJS) $(LIB) + +.PHONY: all clean diff --git a/arch/arm/thumb/thop_adr.c b/arch/arm/thumb/thop_adr.c new file mode 100644 index 00000000..86bdc6db --- /dev/null +++ b/arch/arm/thumb/thop_adr.c @@ -0,0 +1,67 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_adr.h"
+#include "thumb.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * ADR — address to register
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* T1: ADR <Rd>, #<imm8> — rd low reg, imm scaled by 4, positive */
+static const thop_variant_shape SHAPE_ADR_T1 = {
+    .size = THOP_VARIANT_T16,
+    .rd_place = {8, 3},
+    .rd_con = REG_LOW_ONLY,
+    .imm = {.kind = IMM_RAW, .width = 8, .scale_log2 = 2},
+    .imm_place = {0, 8},
+    .feat = {.t16 = 1},
+};
+
+/* T3: ADR <Rd>, #<imm12> — positive, plain 12-bit */
+static const thop_variant_shape SHAPE_ADR_T3 = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {8, 4},
+    .rd_con = REG_NOT_PC,
+    .imm = {.kind = IMM_PACK_3_8_1, .width = 12},
+    .feat = {.t32 = 1},
+};
+
+/* T4: ADR <Rd>, #-<imm12> — negative offset */
+static const thop_variant_shape SHAPE_ADR_T4 = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {8, 4},
+    .rd_con = REG_NOT_PC,
+    .imm = {.kind = IMM_PACK_3_8_1, .width = 12, .is_signed = true},
+    .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_ADR_IMM, "adr", {&SHAPE_ADR_T1, 0xa000, NULL}, {&SHAPE_ADR_T3, 0xf20f0000, NULL},
+         {&SHAPE_ADR_T4, 0xf2af0000, NULL});
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Public wrappers
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* Emit ADR via the TH_ADR_IMM variant table; `imm` is forwarded as uint32_t
+ * and `encoding` is passed through as the requested encoding. */
+thumb_opcode th_adr_imm(uint32_t rd, int imm, thumb_enforce_encoding encoding)
+{
+  return thop_emit(TH_ADR_IMM.name, TH_ADR_IMM.variants, TH_ADR_IMM.variant_count,
+                   (thop_args){.rd = rd, .imm = (uint32_t)imm, .enc = encoding});
+}
diff --git a/arch/arm/thumb/thop_adr.h b/arch/arm/thumb/thop_adr.h
new file mode 100644
index 00000000..b2dd31f0
--- /dev/null
+++ b/arch/arm/thumb/thop_adr.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <stdint.h>
+
+#include
"thumb.h" + +thumb_opcode th_adr_imm(uint32_t rd, int imm, thumb_enforce_encoding encoding); diff --git a/arch/arm/thumb/thop_alu_imm.c b/arch/arm/thumb/thop_alu_imm.c new file mode 100644 index 00000000..ef8fea84 --- /dev/null +++ b/arch/arm/thumb/thop_alu_imm.c @@ -0,0 +1,177 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_alu_imm.h" + +/* ═══════════════════════════════════════════════════════════════════ + * ALU immediate — shared shapes (defined once in .rodata) + * + * Each shape describes field layout, constraints, immediate encoding, + * and feature requirements. Per-instruction variants only add the + * base opcode. 
+ * ═══════════════════════════════════════════════════════════════════ */ + +/* 16-bit: OP , # — rd==rn, low regs only */ +static const thop_variant_shape SHAPE_T16_IMM8 = { + .size = THOP_VARIANT_T16, + .rd_place = {8, 3}, + .rn_place = {8, 3}, + .rd_con = REG_LOW_ONLY | REG_EQ_RN, + .rn_con = REG_LOW_ONLY, + .imm = {.kind = IMM_RAW, .width = 8}, + .imm_place = {0, 8}, + .implicit_s = true, + .feat = {.t16 = 1}, +}; + +/* 16-bit: OP , , # — both low regs */ +static const thop_variant_shape SHAPE_T16_IMM3 = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rn_place = {3, 3}, + .rd_con = REG_LOW_ONLY, + .rn_con = REG_LOW_ONLY, + .imm = {.kind = IMM_RAW, .width = 3}, + .imm_place = {6, 3}, + .implicit_s = true, + .feat = {.t16 = 1}, +}; + +/* 32-bit: OP{S}.W , , # — modified immediate */ +static const thop_variant_shape SHAPE_T32_MOD_IMM = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rn_place = {16, 4}, + .rd_con = REG_NOT_PC, + .rn_con = REG_NOT_PC, + .imm = {.kind = IMM_PACK_CONST, .width = 12}, + .has_s_bit = 1, + .feat = {.t32 = 1, .mod_imm = 1}, +}; + +/* 32-bit: OPW , , # — plain 12-bit */ +static const thop_variant_shape SHAPE_T32_IMM12 = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rn_place = {16, 4}, + .rd_con = REG_NOT_PC, + .imm = {.kind = IMM_PACK_3_8_1, .width = 12}, + .feat = {.t32 = 1}, +}; + +/* 16-bit: ADD SP, SP, # — rd/rn implicit SP, imm scaled by 4 */ +static const thop_variant_shape SHAPE_T16_ADD_SP_IMM = { + .size = THOP_VARIANT_T16, + .rd_con = REG_SP_ONLY, + .rn_con = REG_SP_ONLY, + .imm = {.kind = IMM_RAW, .width = 7, .scale_log2 = 2}, + .imm_place = {0, 7}, + .feat = {.t16 = 1}, +}; + +/* 16-bit: ADD , SP, # — rd low reg, rn implicit SP, imm scaled by 4 */ +static const thop_variant_shape SHAPE_T16_ADD_SP_IMM8 = { + .size = THOP_VARIANT_T16, + .rd_place = {8, 3}, + .rd_con = REG_LOW_ONLY, + .rn_con = REG_SP_ONLY, + .imm = {.kind = IMM_RAW, .width = 8, .scale_log2 = 2}, + .imm_place = {0, 8}, + .feat = {.t16 = 1}, 
+}; + +/* 16-bit: SUB SP, SP, # — rd/rn implicit SP, imm scaled by 4 */ +static const thop_variant_shape SHAPE_T16_SUB_SP_IMM = { + .size = THOP_VARIANT_T16, + .rd_con = REG_SP_ONLY, + .rn_con = REG_SP_ONLY, + .imm = {.kind = IMM_RAW, .width = 7, .scale_log2 = 2}, + .imm_place = {0, 7}, + .feat = {.t16 = 1}, +}; + +/* Shorthand for variant initializers */ +#define V_IMM8(b) {&SHAPE_T16_IMM8, (b)} +#define V_IMM3(b) {&SHAPE_T16_IMM3, (b)} +#define V_MOD_IMM(b) {&SHAPE_T32_MOD_IMM, (b)} +#define V_IMM12(b) {&SHAPE_T32_IMM12, (b)} + +/* ═══════════════════════════════════════════════════════════════════ + * Function-generating macros + * ═══════════════════════════════════════════════════════════════════ */ + +#define THOP_ALU_IMM_FN(fn_name, table_id) \ + thumb_opcode fn_name(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, \ + thumb_enforce_encoding enc) \ + { \ + return thop_emit(table_id.name, table_id.variants, table_id.variant_count, \ + (thop_args){.rd = rd, .rn = rn, .imm = imm, .flags = flags, .enc = enc}); \ + } + +#define THOP_ALU_WIDE_FN(fn_name, base32) \ + thumb_opcode fn_name(uint32_t rd, uint32_t rn, uint32_t imm) \ + { \ + static const thop_variant _v[] = {V_IMM12(base32)}; \ + static const thop_table _t = {.name = #fn_name, .variants = _v, .variant_count = 1}; \ + return thop_emit(_t.name, _t.variants, _t.variant_count, \ + (thop_args){.rd = rd, .rn = rn, .imm = imm, .enc = ENFORCE_ENCODING_32BIT}); \ + } + +/* ═══════════════════════════════════════════════════════════════════ + * ADD/SUB — all four forms (T16 narrow + T32 wide) + * ═══════════════════════════════════════════════════════════════════ */ + +TH_TABLE(TH_ADD_IMM, "add", V_IMM8(0x3000), V_IMM3(0x1C00), {&SHAPE_T16_ADD_SP_IMM, 0xb000}, + {&SHAPE_T16_ADD_SP_IMM8, 0xa800}, V_MOD_IMM(0xF1000000), V_IMM12(0xF2000000)); +THOP_ALU_IMM_FN(th_add_imm, TH_ADD_IMM) +THOP_ALU_WIDE_FN(th_addw, 0xF2000000) + +TH_TABLE(TH_SUB_IMM, "sub", V_IMM8(0x3800), V_IMM3(0x1E00), 
{&SHAPE_T16_SUB_SP_IMM, 0xb080}, V_MOD_IMM(0xF1A00000), + V_IMM12(0xF2A00000)); +THOP_ALU_IMM_FN(th_sub_imm, TH_SUB_IMM) +THOP_ALU_WIDE_FN(th_subw, 0xF2A00000) + +/* ═══════════════════════════════════════════════════════════════════ + * T32-only ALU immediate (modified immediate only) + * ═══════════════════════════════════════════════════════════════════ */ + +TH_TABLE(TH_RSB_IMM, "rsb", V_MOD_IMM(0xF1C00000)); +THOP_ALU_IMM_FN(th_rsb_imm, TH_RSB_IMM) + +TH_TABLE(TH_ADC_IMM, "adc", V_MOD_IMM(0xF1400000)); +THOP_ALU_IMM_FN(th_adc_imm, TH_ADC_IMM) + +TH_TABLE(TH_SBC_IMM, "sbc", V_MOD_IMM(0xF1600000)); +THOP_ALU_IMM_FN(th_sbc_imm, TH_SBC_IMM) + +TH_TABLE(TH_AND_IMM, "and", V_MOD_IMM(0xF0000000)); +THOP_ALU_IMM_FN(th_and_imm, TH_AND_IMM) + +TH_TABLE(TH_BIC_IMM, "bic", V_MOD_IMM(0xF0200000)); +THOP_ALU_IMM_FN(th_bic_imm, TH_BIC_IMM) + +TH_TABLE(TH_ORR_IMM, "orr", V_MOD_IMM(0xF0400000)); +THOP_ALU_IMM_FN(th_orr_imm, TH_ORR_IMM) + +TH_TABLE(TH_ORN_IMM, "orn", V_MOD_IMM(0xF0600000)); +THOP_ALU_IMM_FN(th_orn_imm, TH_ORN_IMM) + +TH_TABLE(TH_EOR_IMM, "eor", V_MOD_IMM(0xF0800000)); +THOP_ALU_IMM_FN(th_eor_imm, TH_EOR_IMM) diff --git a/arch/arm/thumb/thop_alu_imm.h b/arch/arm/thumb/thop_alu_imm.h new file mode 100644 index 00000000..be7da259 --- /dev/null +++ b/arch/arm/thumb/thop_alu_imm.h @@ -0,0 +1,59 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+extern thop_table TH_ALU_IMM; /* NOTE(review): thop_alu_imm.c defines no TH_ALU_IMM — confirm a definition exists */
+
+thumb_opcode th_add_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags,
+                        thumb_enforce_encoding encoding);
+thumb_opcode th_addw(uint32_t rd, uint32_t rn, uint32_t imm);
+
+thumb_opcode th_sub_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags,
+                        thumb_enforce_encoding encoding);
+thumb_opcode th_subw(uint32_t rd, uint32_t rn, uint32_t imm);
+
+thumb_opcode th_rsb_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_adc_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_sbc_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_and_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_bic_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_orr_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_orn_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_eor_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags,
+                        thumb_enforce_encoding encoding);
diff --git a/arch/arm/thumb/thop_alu_reg.c b/arch/arm/thumb/thop_alu_reg.c
new file mode 100644
index 00000000..ae416b3d
--- /dev/null
+++ b/arch/arm/thumb/thop_alu_reg.c
@@ -0,0 +1,162 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ *
Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_alu_reg.h" + +/* ═══════════════════════════════════════════════════════════════════ + * ALU register — shared shapes + * + * T1: OP , , — 16-bit, all low, no shift + * T3: OP{S}.W , , {,shift} — 32-bit, with shift+S + * + * T2 (ADD , ) uses a split DN:Rd encoding that doesn't + * fit the generic bitfield model — handled via custom emit. 
+ * ═══════════════════════════════════════════════════════════════════ */ + +/* 16-bit: OP , , — all low regs, no shift */ +static const thop_variant_shape SHAPE_T16_REG3 = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rn_place = {3, 3}, + .rm_place = {6, 3}, + .rd_con = REG_LOW_ONLY, + .rn_con = REG_LOW_ONLY, + .rm_con = REG_LOW_ONLY, + .implicit_s = true, + .feat = {.t16 = 1}, +}; + +/* 32-bit: OP{S}.W , , {,shift} — with S bit and shift */ +static const thop_variant_shape SHAPE_T32_REG_SHIFT = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rn_place = {16, 4}, + .rm_place = {0, 4}, + .rd_con = REG_NOT_PC, + .rn_con = REG_NOT_PC, + .rm_con = REG_NOT_SP | REG_NOT_PC, + .has_s_bit = 1, + .shift_type_bits = {4, 2}, + .shift_imm2_bits = {6, 2}, + .shift_imm3_bits = {12, 3}, + .feat = {.t32 = 1}, +}; + +#define V_REG3(b) {&SHAPE_T16_REG3, (b)} +#define V_REGS(b) {&SHAPE_T32_REG_SHIFT, (b)} + +/* ═══════════════════════════════════════════════════════════════════ + * Generic wrapper + * ═══════════════════════════════════════════════════════════════════ */ + +static thumb_opcode thop_alu_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding enc, const thop_table *table) +{ + return thop_emit(table->name, table->variants, table->variant_count, + (thop_args){.rd = rd, .rn = rn, .rm = rm, .flags = flags, .shift = shift, .enc = enc}); +} + +/* ═══════════════════════════════════════════════════════════════════ + * Function-generating macros + * ═══════════════════════════════════════════════════════════════════ */ + +#define THOP_ALU_REG_FN(fn_name, table_id) \ + thumb_opcode fn_name(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, \ + thumb_enforce_encoding enc) \ + { \ + return thop_alu_reg(rd, rn, rm, flags, shift, enc, &table_id); \ + } + +/* ═══════════════════════════════════════════════════════════════════ + * ADD register — T1 + ADD-SP-T1 + T2 + T3 + 
* + * ADD-SP-T1 (ADD , SP, ) and T2 (ADD , ) share + * the same 0x4400 base — SP goes in the Rm encoding field via + * rn_place, Rdm via the DN:Rd split. + * ═══════════════════════════════════════════════════════════════════ */ + +/* ADD , SP, — rd==rm, rn==SP, DN:Rd split */ +static const thop_variant_shape SHAPE_T16_ADD_SP_REG = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .has_rd_hi = 1, + .rn_place = {3, 4}, + .rd_con = REG_EQ_RM, + .rn_con = REG_SP_ONLY, + .feat = {.t16 = 1}, +}; + +/* ADD , — rd==rn, any reg, no shift, no S, DN:Rd split */ +static const thop_variant_shape SHAPE_T16_ADD_T2 = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .has_rd_hi = 1, + .rn_place = {0, 3}, + .rm_place = {3, 4}, + .rd_con = REG_EQ_RN, + .feat = {.t16 = 1}, +}; + +#define V_ADD_SP_REG(b) {&SHAPE_T16_ADD_SP_REG, (b)} +#define V_ADD_T2(b) {&SHAPE_T16_ADD_T2, (b)} + +TH_TABLE(TH_ADD_REG, "add", V_REG3(0x1800), V_ADD_SP_REG(0x4400), V_ADD_T2(0x4400), V_REGS(0xEB000000)); +THOP_ALU_REG_FN(th_add_reg, TH_ADD_REG) + +TH_TABLE(TH_SUB_REG, "sub", V_REG3(0x1a00), V_REGS(0xEBA00000)); +THOP_ALU_REG_FN(th_sub_reg, TH_SUB_REG) + +TH_TABLE(TH_RSB_REG, "rsb", V_REGS(0xEBC00000)); +THOP_ALU_REG_FN(th_rsb_reg, TH_RSB_REG) + +/* 16-bit: OP , — rd==rn, all low, no shift (ADC, SBC, AND, ORR, EOR, BIC, etc.) 
*/ +static const thop_variant_shape SHAPE_T16_REG_RDN_RM = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rm_place = {3, 3}, + .rd_con = REG_LOW_ONLY | REG_EQ_RN, + .rn_con = REG_LOW_ONLY, + .rm_con = REG_LOW_ONLY, + .implicit_s = true, + .feat = {.t16 = 1}, +}; + +#define V_REG_RDN_RM(b) {&SHAPE_T16_REG_RDN_RM, (b)} + +TH_TABLE(TH_ADC_REG, "adc", V_REG_RDN_RM(0x4140), V_REGS(0xEB400000)); +THOP_ALU_REG_FN(th_adc_reg, TH_ADC_REG) + +TH_TABLE(TH_SBC_REG, "sbc", V_REG_RDN_RM(0x4180), V_REGS(0xEB600000)); +THOP_ALU_REG_FN(th_sbc_reg, TH_SBC_REG) + +TH_TABLE(TH_AND_REG, "and", V_REG_RDN_RM(0x4000), V_REGS(0xEA000000)); +THOP_ALU_REG_FN(th_and_reg, TH_AND_REG) + +TH_TABLE(TH_BIC_REG, "bic", V_REG_RDN_RM(0x4380), V_REGS(0xEA200000)); +THOP_ALU_REG_FN(th_bic_reg, TH_BIC_REG) + +TH_TABLE(TH_ORR_REG, "orr", V_REG_RDN_RM(0x4300), V_REGS(0xEA400000)); +THOP_ALU_REG_FN(th_orr_reg, TH_ORR_REG) + +TH_TABLE(TH_ORN_REG, "orn", V_REGS(0xEA600000)); +THOP_ALU_REG_FN(th_orn_reg, TH_ORN_REG) + +TH_TABLE(TH_EOR_REG, "eor", V_REG_RDN_RM(0x4040), V_REGS(0xEA800000)); +THOP_ALU_REG_FN(th_eor_reg, TH_EOR_REG) diff --git a/arch/arm/thumb/thop_alu_reg.h b/arch/arm/thumb/thop_alu_reg.h new file mode 100644 index 00000000..e9b25b17 --- /dev/null +++ b/arch/arm/thumb/thop_alu_reg.h @@ -0,0 +1,55 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+thumb_opcode th_add_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_adc_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_sbc_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_and_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_bic_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_orr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_orn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_eor_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_rsb_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+
+thumb_opcode th_sub_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
diff --git a/arch/arm/thumb/thop_bitfield.c b/arch/arm/thumb/thop_bitfield.c
new file mode 100644
index 00000000..ef3a5418
--- /dev/null
+++ b/arch/arm/thumb/thop_bitfield.c
@@ -0,0 +1,152 @@
+/*
+ * TCC - Tiny C Compiler
+
*
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_bitfield.h"
+
+#define USING_GLOBALS
+#include "tcc.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Bitfield / saturation — shared 32-bit shapes
+ *
+ * BFX instructions (bfc, bfi, sbfx) share a common skeleton:
+ *   - rd in bits [11:8], rn in bits [19:16]
+ *   - lsb is split into imm3[14:12] and imm2[7:6] by the engine
+ *   - the 5-bit payload (msb / width-1 / sat_imm) is passed as imm2
+ *
+ * SAT instructions (ssat, usat) reuse the shift_imm2/imm3 fields for
+ * the shift amount and have two variants (LSL / ASR) differing only
+ * in the base opcode (sh bit at position 21).
+ * ═══════════════════════════════════════════════════════════════════ */
+
+static const thop_variant_shape SHAPE_T32_BFX = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rd_con = REG_NOT_PC,
+  .rn_con = REG_ANY,
+  .imm = {.kind = IMM_RAW, .width = 5},
+  .split_imm2_place = {6, 2},
+  .split_imm3_place = {12, 3},
+  .imm2_place = {0, 5},
+  .feat = {.t32 = 1, .bfx = 1},
+};
+
+/* SSAT with LSL (or no shift) — base has sh=0 */
+static const thop_variant_shape SHAPE_T32_SSAT_LSL = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rd_con = REG_NOT_PC,
+  .shift_imm3_bits = {12, 3},
+  .shift_imm2_bits = {6, 2},
+  .shift_allowed = (1u << THUMB_SHIFT_NONE) | (1u << THUMB_SHIFT_LSL),
+  .imm2_place = {0, 5},
+  .feat = {.t32 = 1, .sat = 1},
+};
+
+/* SSAT with ASR — base has sh=1 */
+static const thop_variant_shape SHAPE_T32_SSAT_ASR = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rd_con = REG_NOT_PC,
+  .shift_imm3_bits = {12, 3},
+  .shift_imm2_bits = {6, 2},
+  .shift_allowed = (1u << THUMB_SHIFT_ASR),
+  .imm2_place = {0, 5},
+  .feat = {.t32 = 1, .sat = 1},
+};
+
+/* USAT with LSL (or no shift) — base has sh=0 */
+static const thop_variant_shape SHAPE_T32_USAT_LSL = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rd_con = REG_NOT_PC,
+  .shift_imm3_bits = {12, 3},
+  .shift_imm2_bits = {6, 2},
+  .shift_allowed = (1u << THUMB_SHIFT_NONE) | (1u << THUMB_SHIFT_LSL),
+  .imm2_place = {0, 5},
+  .feat = {.t32 = 1, .sat = 1},
+};
+
+/* USAT with ASR — base has sh=1 */
+static const thop_variant_shape SHAPE_T32_USAT_ASR = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rd_con = REG_NOT_PC,
+  .shift_imm3_bits = {12, 3},
+  .shift_imm2_bits = {6, 2},
+  .shift_allowed = (1u << THUMB_SHIFT_ASR),
+  .imm2_place = {0, 5},
+  .feat = {.t32 = 1, .sat = 1},
+};
+
+#define V_BFX(b) {&SHAPE_T32_BFX, (b)}
+#define V_SSAT_LSL(b) {&SHAPE_T32_SSAT_LSL, (b)}
+#define V_SSAT_ASR(b) {&SHAPE_T32_SSAT_ASR, (b)}
+#define V_USAT_LSL(b) {&SHAPE_T32_USAT_LSL, (b)}
+#define V_USAT_ASR(b) {&SHAPE_T32_USAT_ASR, (b)}
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Instruction tables
+ * ═══════════════════════════════════════════════════════════════════ */
+
+TH_TABLE(TH_BFC, "bfc", V_BFX(0xf36f0000));
+TH_TABLE(TH_BFI, "bfi", V_BFX(0xf3600000));
+TH_TABLE(TH_SBFX, "sbfx", V_BFX(0xf3400000));
+TH_TABLE(TH_SSAT, "ssat", V_SSAT_LSL(0xf3000000), V_SSAT_ASR(0xf3200000));
+TH_TABLE(TH_USAT, "usat", V_USAT_LSL(0xf3800000), V_USAT_ASR(0xf3a00000));
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Emit wrappers — each forwards to thop_emit with its table; .imm is the lsb, .imm2 the 5-bit payload
+ * ═══════════════════════════════════════════════════════════════════ */
+
+thumb_opcode th_bfc(uint32_t rd, uint32_t lsb, uint32_t width)
+{
+  return thop_emit(TH_BFC.name, TH_BFC.variants, TH_BFC.variant_count,
+                   (thop_args){.rd = rd, .rn = 0, .imm = lsb, .imm2 = lsb + width - 1}); /* payload = msb */
+}
+
+thumb_opcode th_bfi(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width)
+{
+  return thop_emit(TH_BFI.name, TH_BFI.variants, TH_BFI.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .imm = lsb, .imm2 = lsb + width - 1}); /* payload = msb */
+}
+
+thumb_opcode th_sbfx(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width)
+{
+  return thop_emit(TH_SBFX.name, TH_SBFX.variants, TH_SBFX.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .imm = lsb, .imm2 = width - 1}); /* payload = width-1 */
+}
+
+thumb_opcode th_ssat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift)
+{
+  return thop_emit(TH_SSAT.name, TH_SSAT.variants, TH_SSAT.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .imm2 = imm - 1, .shift = shift}); /* payload = imm-1 */
+}
+
+thumb_opcode th_usat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift)
+{
+  return thop_emit(TH_USAT.name, TH_USAT.variants, TH_USAT.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .imm2 = imm, .shift = shift}); /* payload = imm as given */
+}
diff --git a/arch/arm/thumb/thop_bitfield.h b/arch/arm/thumb/thop_bitfield.h
new file mode 100644
index 00000000..ddf7906c
--- /dev/null
+++ b/arch/arm/thumb/thop_bitfield.h
@@ -0,0 +1,31 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+thumb_opcode th_bfc(uint32_t rd, uint32_t lsb, uint32_t width);
+thumb_opcode th_bfi(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width);
+thumb_opcode th_sbfx(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width);
+thumb_opcode th_ssat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift);
+thumb_opcode th_usat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift);
diff --git a/arch/arm/thumb/thop_block.c b/arch/arm/thumb/thop_block.c
new file mode 100644
index 00000000..213824a4
--- /dev/null
+++ b/arch/arm/thumb/thop_block.c
@@ -0,0 +1,205 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_block.h"
+#include "thumb.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Block data transfer: PUSH, POP, LDM, STM, LDMDB, STMDB
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* ───── PUSH ───── */
+
+/* T1 narrow: push {reglist}, [lr] — raw reglist in bits [7:0], lr flag at bit 8 */
+static const thop_variant_shape SHAPE_PUSH_T1 = {
+  .size = THOP_VARIANT_T16,
+  .rm_raw_place = {0, 8}, /* raw register list in bits [7:0] */
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {8, 1}, /* LR flag at bit 8 */
+  .rm_con = REG_LOW_REGSET | REG_RM_BITS_NOT_LR_PC,
+  .feat = {.t16 = 1},
+};
+
+/* T2 wide: push {reglist} — register list in bits [12:0], LR flag at bit 14, SP/PC not allowed */
+static const thop_variant_shape SHAPE_PUSH_T2 = {
+  .size = THOP_VARIANT_T32,
+  .rm_place = {0, 13}, /* register list in bits [12:0] (r0-r12) */
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {14, 1}, /* LR/M flag at bit 14 */
+  .rm_con = REG_RM_BITS_NOT_LR_PC,
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_PUSH, "push", {&SHAPE_PUSH_T1, 0xb400, NULL}, {&SHAPE_PUSH_T2, 0xe92d0000, NULL});
+
+/* ───── POP ───── */
+
+/* T1 narrow: pop {reglist}, [pc] — raw reglist in bits [7:0], pc flag at bit 8 */
+static const thop_variant_shape SHAPE_POP_T1 = {
+  .size = THOP_VARIANT_T16,
+  .rm_raw_place = {0, 8}, /* raw register list in bits [7:0] */
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {8, 1}, /* PC flag at bit 8 */
+  .rm_con = REG_LOW_REGSET | REG_RM_BITS_NOT_LR_PC,
+  .feat = {.t16 = 1},
+};
+
+/* T2 wide: pop {reglist} — register list in bits [14:0], PC flag at bit 15, SP not allowed */
+static const thop_variant_shape SHAPE_POP_T2 = {
+  .size = THOP_VARIANT_T32,
+  .rm_place = {0, 15}, /* register list in bits [14:0] (r0-r12 + LR) */
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {15, 1}, /* PC/P flag at bit 15 */
+  .rm_con = REG_RM_BIT_NOT_SP,
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_POP, "pop", {&SHAPE_POP_T1, 0xbc00, NULL}, {&SHAPE_POP_T2, 0xe8bd0000, NULL});
+
+/* ───── LDM ───── */
+
+/* T1 narrow: ldm {rn}, {reglist}! — rn in bits [10:8], raw reglist in bits [7:0] */
+static const thop_variant_shape SHAPE_LDM_T1 = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {8, 3}, /* rn in bits [10:8] */
+  .rd_con = REG_LOW_ONLY,
+  .rm_raw_place = {0, 8}, /* raw register list in bits [7:0] */
+  .rm_con = REG_LOW_REGSET,
+  .feat = {.t16 = 1},
+};
+
+/* T3 wide: ldmia {rn}!, {reglist} — rn at [19:16], reglist at [15:0], writeback at [21] */
+static const thop_variant_shape SHAPE_LDM_T3 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {16, 4}, /* rn in bits [19:16] */
+  .rm_place = {0, 16}, /* register list in bits [15:0] (r0-r12, LR, PC) */
+  .rm_con = REG_RM_BIT_NOT_SP,
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {21, 1}, /* writeback bit at position 21 */
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_LDM, "ldm", {&SHAPE_LDM_T1, 0xc800, NULL}, {&SHAPE_LDM_T3, 0xe8900000, NULL});
+
+/* ───── STM ───── */
+
+/* T1 narrow: stm {rn}!, {reglist} — rn in bits [10:8], raw reglist in bits [7:0] */
+static const thop_variant_shape SHAPE_STM_T1 = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {8, 3}, /* rn in bits [10:8] */
+  .rd_con = REG_LOW_ONLY,
+  .rm_raw_place = {0, 8}, /* raw register list in bits [7:0] */
+  .rm_con = REG_LOW_REGSET,
+  .feat = {.t16 = 1},
+};
+
+/* T3 wide: stmia {rn}!, {reglist} — rn at [19:16], reglist at [14:0], writeback at [21] */
+static const thop_variant_shape SHAPE_STM_T3 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {16, 4}, /* rn in bits [19:16] */
+  .rm_place = {0, 15}, /* register list in bits [14:0] (r0-r12, LR) */
+  .rm_con = REG_RM_BIT_NOT_SP,
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {21, 1}, /* writeback bit at position 21 */
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_STM, "stm", {&SHAPE_STM_T1, 0xc000, NULL}, {&SHAPE_STM_T3, 0xe8800000, NULL});
+
+/* ───── LDMDB (T32) ───── */
+
+static const thop_variant_shape SHAPE_LDMDB = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {16, 4},
+  .rm_place = {0, 16}, /* register list in bits [15:0] (r0-r12, LR, PC) */
+  .rm_con = REG_RM_BIT_NOT_SP,
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {21, 1},
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_LDMDB, "ldmdb", {&SHAPE_LDMDB, 0xe9100000, NULL});
+
+/* ───── STMDB (T32) ───── */
+
+static const thop_variant_shape SHAPE_STMDB = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {16, 4},
+  .rm_place = {0, 15}, /* register list in bits [14:0] (r0-r12, LR) */
+  .rm_con = REG_RM_BIT_NOT_SP,
+  .imm = {.kind = IMM_RAW, .width = 1},
+  .imm_place = {21, 1},
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_STMDB, "stmdb", {&SHAPE_STMDB, 0xe9000000, NULL});
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Public wrappers — the LR / PC / writeback flag travels in .imm, the register list in .rm
+ * ═══════════════════════════════════════════════════════════════════ */
+
+thumb_opcode th_push(uint32_t regs)
+{
+  uint8_t lr = (regs >> R_LR) & 1; /* LR is encoded as a separate flag bit */
+  regs &= ~((1u << R_LR) | (1u << R_PC));
+  return thop_emit(TH_PUSH.name, TH_PUSH.variants, TH_PUSH.variant_count,
+                   (thop_args){.rm = regs, .imm = lr});
+}
+
+thumb_opcode th_pop(uint16_t regs)
+{
+  uint8_t pc = (regs >> R_PC) & 1; /* PC is encoded as a separate flag bit */
+  regs &= ~(1u << R_PC);
+  return thop_emit(TH_POP.name, TH_POP.variants, TH_POP.variant_count,
+                   (thop_args){.rm = regs, .imm = pc});
+}
+
+thumb_opcode th_ldm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding)
+{
+  if (rn == R_SP && writeback && encoding != ENFORCE_ENCODING_32BIT)
+    return th_pop(regset); /* ldm sp!, {...} is emitted as pop */
+  if (!writeback)
+    encoding = ENFORCE_ENCODING_32BIT; /* forms without writeback always use the wide variant */
+  regset &= ~(1u << rn); /* NOTE(review): rn is dropped from the list even without writeback — confirm intended */
+  return thop_emit(TH_LDM.name, TH_LDM.variants, TH_LDM.variant_count,
+                   (thop_args){.rd = rn, .rm = regset, .imm = writeback, .enc = encoding});
+}
+
+thumb_opcode th_stm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding)
+{
+  if (!writeback)
+    encoding = ENFORCE_ENCODING_32BIT; /* forms without writeback always use the wide variant */
+  regset &= ~(1u << rn); /* NOTE(review): rn is dropped from the list even without writeback — confirm intended */
+  return thop_emit(TH_STM.name, TH_STM.variants, TH_STM.variant_count,
+                   (thop_args){.rd = rn, .rm = regset, .imm = writeback, .enc = encoding});
+}
+
+thumb_opcode th_ldmdb(uint32_t rn, uint32_t reglist, uint32_t w)
+{
+  return thop_emit(TH_LDMDB.name, TH_LDMDB.variants, TH_LDMDB.variant_count,
+                   (thop_args){.rd = rn, .rm = reglist, .imm = w});
+}
+
+thumb_opcode th_stmdb(uint32_t rn, uint32_t reglist, uint32_t w, thumb_enforce_encoding encoding)
+{
+  (void)encoding; /* only a 32-bit variant exists in the table */
+  return thop_emit(TH_STMDB.name, TH_STMDB.variants, TH_STMDB.variant_count,
+                   (thop_args){.rd = rn, .rm = reglist, .imm = w});
+}
diff --git a/arch/arm/thumb/thop_block.h b/arch/arm/thumb/thop_block.h
new file mode 100644
index 00000000..8917cc55
--- /dev/null
+++ b/arch/arm/thumb/thop_block.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+thumb_opcode th_push(uint32_t regs);
+thumb_opcode th_pop(uint16_t regs);
+thumb_opcode th_ldm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding);
+thumb_opcode th_stm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding);
+thumb_opcode th_ldmdb(uint32_t rn, uint32_t reglist, uint32_t w);
+thumb_opcode th_stmdb(uint32_t rn, uint32_t reglist, uint32_t w, thumb_enforce_encoding encoding);
diff --git a/arch/arm/thumb/thop_branch.c b/arch/arm/thumb/thop_branch.c
new file mode 100644
index 00000000..ce29a85d
--- /dev/null
+++ b/arch/arm/thumb/thop_branch.c
@@ -0,0 +1,217 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free
software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_branch.h"
+#include "thumb.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Branch instructions
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* ───── BX (T16 only) ───── */
+
+static const thop_variant_shape SHAPE_BX = {
+  .size = THOP_VARIANT_T16,
+  .rm_place = {3, 4},
+  .feat = {.t16 = 1},
+};
+
+TH_TABLE(TH_BX, "bx", {&SHAPE_BX, 0x4700, NULL});
+
+/* ───── BL (T32 only) ───── */
+
+static thumb_opcode bl_t1_emit(uint32_t base, const thop_args *a)
+{
+  uint32_t val = a->imm; /* byte offset; bit 0 is not encoded */
+  uint32_t s = (val >> 24) & 1;
+  uint32_t imm10 = (val >> 12) & 0x3ff;
+  uint32_t j1 = (~((val >> 23) & 1) ^ s) & 1;
+  uint32_t j2 = (~((val >> 22) & 1) ^ s) & 1;
+  uint32_t imm11 = (val >> 1) & 0x7ff;
+  uint32_t hi = 0xf000 | (s << 10) | imm10;
+  uint32_t lo = 0xd000 | (j1 << 13) | (j2 << 11) | imm11;
+  uint32_t op = (hi << 16) | lo; /* base is unused: the opcode bits are built here */
+  return (thumb_opcode){.size = 4, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_BL_T1 = {
+  .size = THOP_VARIANT_T32,
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_BL_T1, "bl", {&SHAPE_BL_T1, 0, bl_t1_emit});
+
+/* ───── BLX (T16 reg) ───── */
+
+static const thop_variant_shape SHAPE_BLX_REG = {
+  .size = THOP_VARIANT_T16,
+  .rm_place = {3, 4},
+  .feat = {.t16 = 1},
+};
+
+TH_TABLE(TH_BLX_REG, "blx", {&SHAPE_BLX_REG, 0x4780, NULL});
+
+/* ───── B (conditional T16, T32, unconditional T32) ───── */
+
+static const thop_variant_shape SHAPE_B_COND_T16 = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {8, 4}, /* condition code travels in rd */
+  .imm = {.kind = IMM_RAW, .width = 8},
+  .imm_place = {0, 8},
+  .feat = {.t16 = 1},
+};
+
+TH_TABLE(TH_B_COND_T16, "b", {&SHAPE_B_COND_T16, 0xd000, NULL});
+
+static thumb_opcode b_t3_emit(uint32_t base, const thop_args *a)
+{
+  uint32_t imm = a->imm; /* fields are taken from bit 0 upward, unlike b_t4_emit which skips bit 0 */
+  uint32_t s = (imm >> 19) & 1;
+  uint32_t imm6 = (imm >> 11) & 0x3f;
+  uint32_t imm11 = imm & 0x7ff;
+  uint32_t j2 = (imm >> 18) & 1;
+  uint32_t j1 = (imm >> 17) & 1;
+  uint32_t a_field = (s << 10) | imm6;
+  uint32_t b_field = (j1 << 13) | (j2 << 11) | imm11;
+  uint32_t enc = (a_field << 16) | b_field;
+  uint32_t op = 0xf0008000 | (a->rd << 22) | enc; /* rd carries the condition code */
+  return (thumb_opcode){.size = 4, .opcode = op};
+}
+
+static thumb_opcode b_t4_emit(uint32_t base, const thop_args *a)
+{
+  uint32_t val = a->imm; /* byte offset; bit 0 is not encoded */
+  uint32_t s = (val >> 24) & 1;
+  uint32_t imm10 = (val >> 12) & 0x3ff;
+  uint32_t j1 = (~((val >> 23) & 1) ^ s) & 1;
+  uint32_t j2 = (~((val >> 22) & 1) ^ s) & 1;
+  uint32_t imm11 = (val >> 1) & 0x7ff;
+  uint32_t hi = 0xf000 | (s << 10) | imm10;
+  uint32_t lo = 0x9000 | (j1 << 13) | (j2 << 11) | imm11;
+  uint32_t op = (hi << 16) | lo;
+  return (thumb_opcode){.size = 4, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_B_T3 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {22, 4},
+  .feat = {.t32 = 1},
+};
+
+static const thop_variant_shape SHAPE_B_T4 = {
+  .size = THOP_VARIANT_T32,
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_B_T3, "b.w", {&SHAPE_B_T3, 0, b_t3_emit});
+TH_TABLE(TH_B_T4, "b.w", {&SHAPE_B_T4, 0, b_t4_emit});
+
+/* ───── B (T2 unconditional, 16-bit) ───── */
+
+static thumb_opcode b_t2_emit(uint32_t base, const thop_args *a)
+{
+  int32_t imm = (int32_t)a->imm;
+  int32_t i = imm >> 1; /* halfword count stored in the 11-bit field */
+  if (i <= 1023 && i >= -1024 && !(imm & 1)) /* full signed 11-bit range, even offsets only */
+  {
+    return (thumb_opcode){.size = 2, .opcode = base | (i & 0x7ff)};
+  }
+  return (thumb_opcode){.size = 0, .opcode = 0}; /* size 0: offset not encodable in this variant */
+}
+
+static const thop_variant_shape SHAPE_B_T2 = {
+  .size = THOP_VARIANT_T16,
+  .feat = {.t16 = 1},
+};
+
+TH_TABLE(TH_B_T2, "b", {&SHAPE_B_T2, 0xe000, b_t2_emit});
+
+/* ───── CBZ / CBNZ (T16 only) ───── */
+
+static thumb_opcode cbz_emit(uint32_t base, const thop_args *a)
+{
+  uint32_t val = a->imm;
+  uint32_t i = (val >> 5) & 1; /* offset bit 6 after dropping bit 0 */
+  uint32_t imm5 = (val >> 1) & 0x1f;
+  uint32_t op = base | (i << 9) | (imm5 << 3) | a->rd;
+  return (thumb_opcode){.size = 2, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_CBZ = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {0, 3},
+  .feat = {.t16 = 1, .cbz = 1},
+};
+
+TH_TABLE(TH_CBZ, "cbz", {&SHAPE_CBZ, 0xb100, cbz_emit});
+TH_TABLE(TH_CBNZ, "cbnz", {&SHAPE_CBZ, 0xb900, cbz_emit});
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Public wrappers — each forwards to thop_emit with the matching table
+ * ═══════════════════════════════════════════════════════════════════ */
+
+thumb_opcode th_bx_reg(uint16_t rm)
+{
+  return thop_emit(TH_BX.name, TH_BX.variants, TH_BX.variant_count, (thop_args){.rm = rm});
+}
+
+thumb_opcode th_bl_t1(uint32_t imm)
+{
+  return thop_emit(TH_BL_T1.name, TH_BL_T1.variants, TH_BL_T1.variant_count,
+                   (thop_args){.imm = imm});
+}
+
+thumb_opcode th_b_t1(uint32_t cond, uint32_t imm)
+{
+  return thop_emit(TH_B_COND_T16.name, TH_B_COND_T16.variants, TH_B_COND_T16.variant_count,
+                   (thop_args){.rd = cond, .imm = imm & 0xff}); /* imm is truncated to the 8-bit field */
+}
+
+thumb_opcode th_b_t3(uint32_t cond, uint32_t imm)
+{
+  return thop_emit(TH_B_T3.name, TH_B_T3.variants, TH_B_T3.variant_count,
+                   (thop_args){.rd = cond, .imm = imm});
+}
+
+thumb_opcode th_b_t4(int32_t imm)
+{
+  return thop_emit(TH_B_T4.name, TH_B_T4.variants, TH_B_T4.variant_count, (thop_args){.imm = (uint32_t)imm});
+}
+
+thumb_opcode th_b_t2(int32_t imm11)
+{
+  return thop_emit(TH_B_T2.name, TH_B_T2.variants, TH_B_T2.variant_count,
+                   (thop_args){.imm = (uint32_t)imm11});
+}
+
+thumb_opcode th_blx_reg(uint16_t rm)
+{
+  return thop_emit(TH_BLX_REG.name, TH_BLX_REG.variants, TH_BLX_REG.variant_count,
+                   (thop_args){.rm = rm});
+}
+
+thumb_opcode th_cbz(uint16_t rn, uint32_t imm, uint32_t nonzero)
+{
+  if (nonzero) /* non-zero selects CBNZ, zero selects CBZ */
+    return thop_emit(TH_CBNZ.name, TH_CBNZ.variants, TH_CBNZ.variant_count,
+                     (thop_args){.rd = rn, .imm = imm});
+  return thop_emit(TH_CBZ.name, TH_CBZ.variants, TH_CBZ.variant_count,
+                   (thop_args){.rd = rn, .imm = imm});
+}
diff --git a/arch/arm/thumb/thop_branch.h b/arch/arm/thumb/thop_branch.h
new file mode 100644
index 00000000..3ab26071
--- /dev/null
+++ b/arch/arm/thumb/thop_branch.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+thumb_opcode th_bx_reg(uint16_t rm);
+thumb_opcode th_bl_t1(uint32_t imm);
+thumb_opcode th_blx_reg(uint16_t rm);
+thumb_opcode th_b_t1(uint32_t cond, uint32_t imm);
+thumb_opcode th_b_t3(uint32_t cond, uint32_t imm);
+thumb_opcode th_b_t4(int32_t imm);
+thumb_opcode th_b_t2(int32_t imm11);
+thumb_opcode th_cbz(uint16_t rn, uint32_t imm, uint32_t nonzero);
diff --git a/arch/arm/thumb/thop_cmp.c b/arch/arm/thumb/thop_cmp.c
new file mode 100644
index 00000000..d2b02cfa
--- /dev/null
+++ b/arch/arm/thumb/thop_cmp.c
@@ -0,0 +1,170 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_cmp.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Compare/Test immediate — shared shapes
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* T1: CMP <Rn>, #<imm8> — 16-bit, rn low, imm8 raw */
+static const thop_variant_shape SHAPE_T16_CMP_IMM = {
+  .size = THOP_VARIANT_T16,
+  .rn_place = {8, 3},
+  .rn_con = REG_LOW_ONLY,
+  .imm = {.kind = IMM_RAW, .width = 8},
+  .imm_place = {0, 8},
+  .implicit_s = true,
+  .feat = {.t16 = 1},
+};
+
+/* T2/T1 (32-bit): CMP/CMN/TST/TEQ.W <Rn>, #<const> — modified imm, rd=0xF hardcoded */
+static const thop_variant_shape SHAPE_T32_CMP_IMM = {
+  .size = THOP_VARIANT_T32,
+  .rn_place = {16, 4},
+  .rn_con = REG_NOT_PC,
+  .imm = {.kind = IMM_PACK_CONST, .width = 12},
+  .implicit_s = true,
+  .feat = {.t32 = 1, .mod_imm = 1},
+};
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Compare/Test register — shared shapes
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* T1: CMP/CMN/TST <Rn>, <Rm> — 16-bit, both low, no shift */
+static const thop_variant_shape SHAPE_T16_CMP_REG = {
+  .size = THOP_VARIANT_T16,
+  .rn_place = {0, 3},
+  .rm_place = {3, 3},
+  .rn_con = REG_LOW_ONLY,
+  .rm_con = REG_LOW_ONLY,
+  .implicit_s = true,
+  .feat = {.t16 = 1},
+};
+
+/* T2: CMP <Rn>, <Rm> — rn any (not PC), rm any, no shift */
+static const thop_variant_shape SHAPE_T16_CMP_REG_T2 = {
+  .size = THOP_VARIANT_T16,
+  .rm_place = {3, 3},
+  .rn_con = REG_NOT_PC,
+  .rm_con = REG_ANY,
+  .implicit_s = true,
+  .feat = {.t16 = 1},
+};
+
+static thumb_opcode cmp_reg_t2_custom_emit(uint32_t base, const thop_args *a)
+{
+  const uint16_t N = (a->rn >> 3) & 0x1; /* top bit of rn is stored apart from its low three bits */
+  return (thumb_opcode){
+      .size = 2,
+      .opcode = base | (N << 7) | (a->rm << 3) | (a->rn & 0x7),
+  };
+}
+
+/* T3/T2 (32-bit): CMP/CMN/TST/TEQ.W <Rn>, <Rm> {,shift} — rd=0xF hardcoded */
+static const thop_variant_shape SHAPE_T32_CMP_REG = {
+  .size = THOP_VARIANT_T32,
+  .rn_place = {16, 4},
+  .rm_place = {0, 4},
+  .rn_con = REG_NOT_PC,
+  .rm_con = REG_NOT_SP | REG_NOT_PC,
+  .shift_type_bits = {4, 2},
+  .shift_imm2_bits = {6, 2},
+  .shift_imm3_bits = {12, 3},
+  .implicit_s = true,
+  .feat = {.t32 = 1},
+};
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Generic wrappers — pass rd = 0xf and forward to thop_emit with the given table
+ * ═══════════════════════════════════════════════════════════════════ */
+
+static thumb_opcode thop_cmp_imm(uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding enc,
+                                 const thop_table *table)
+{
+  return thop_emit(table->name, table->variants, table->variant_count,
+                   (thop_args){.rd = 0xf, .rn = rn, .imm = imm, .flags = flags, .enc = enc});
+}
+
+static thumb_opcode thop_cmp_reg(uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                                 thumb_enforce_encoding enc, const thop_table *table)
+{
+  return thop_emit(table->name, table->variants, table->variant_count,
+                   (thop_args){.rd = 0xf, .rn = rn, .rm = rm, .flags = flags, .shift = shift, .enc = enc});
+}
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Function-generating macros — define a public wrapper bound to one table
+ * ═══════════════════════════════════════════════════════════════════ */
+
+#define THOP_CMP_IMM_FN(fn_name, table_id) \
+  thumb_opcode fn_name(uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding enc) \
+  { \
+    return thop_cmp_imm(rn, imm, flags, enc, &table_id); \
+  }
+
+#define THOP_CMP_REG_FN(fn_name, table_id) \
+  thumb_opcode fn_name(uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, \
+                       thumb_enforce_encoding enc) \
+  { \
+    return thop_cmp_reg(rn, rm, flags, shift, enc, &table_id); \
+  }
+
+/* Shorthand variant initializers */
+#define V_CMP_IMM8(b) {&SHAPE_T16_CMP_IMM, (b)}
+#define V_CMP_IMM(b) {&SHAPE_T32_CMP_IMM, (b)}
+#define V_CMP_REG(b) {&SHAPE_T16_CMP_REG, (b)}
+#define V_CMP_REG_T2(b) {&SHAPE_T16_CMP_REG_T2, (b), cmp_reg_t2_custom_emit}
+#define V_CMP_REGS(b) {&SHAPE_T32_CMP_REG, (b)}
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Instruction tables
+ * ═══════════════════════════════════════════════════════════════════ */
+
+TH_TABLE(TH_CMP_IMM, "cmp", V_CMP_IMM8(0x2800), V_CMP_IMM(0xF1B00F00));
+THOP_CMP_IMM_FN(th_cmp_imm, TH_CMP_IMM)
+
+TH_TABLE(TH_CMN_IMM, "cmn", V_CMP_IMM(0xF1100F00));
+THOP_CMP_IMM_FN(th_cmn_imm, TH_CMN_IMM)
+
+TH_TABLE(TH_TST_IMM, "tst", V_CMP_IMM(0xF0100F00));
+THOP_CMP_IMM_FN(th_tst_imm, TH_TST_IMM)
+
+TH_TABLE(TH_TEQ_IMM, "teq", V_CMP_IMM(0xF0900F00));
+THOP_CMP_IMM_FN(th_teq_imm, TH_TEQ_IMM)
+
+TH_TABLE(TH_CMP_REG, "cmp", V_CMP_REG(0x4280), V_CMP_REG_T2(0x4500), V_CMP_REGS(0xEBB00F00));
+
+thumb_opcode th_cmp_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding)
+{
+  (void)rd; /* accepted but ignored: thop_cmp_reg always passes rd = 0xf */
+  return thop_cmp_reg(rn, rm, flags, shift, encoding, &TH_CMP_REG);
+}
+
+TH_TABLE(TH_CMN_REG, "cmn", V_CMP_REG(0x42C0), V_CMP_REGS(0xEB100F00));
+THOP_CMP_REG_FN(th_cmn_reg, TH_CMN_REG)
+
+TH_TABLE(TH_TST_REG, "tst", V_CMP_REG(0x4200), V_CMP_REGS(0xEA100F00));
+THOP_CMP_REG_FN(th_tst_reg, TH_TST_REG)
+
+TH_TABLE(TH_TEQ_REG, "teq", V_CMP_REGS(0xEA900F00));
+THOP_CMP_REG_FN(th_teq_reg, TH_TEQ_REG)
diff --git a/arch/arm/thumb/thop_cmp.h b/arch/arm/thumb/thop_cmp.h
new file mode 100644
index 00000000..574dc507
--- /dev/null
+++ b/arch/arm/thumb/thop_cmp.h
@@ -0,0 +1,79 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+/* ───── Compare / Test immediate ─────
+ *
+ * CMP/CMN/TST/TEQ only update flags; 32-bit forms hard-code Rd=0xF.
+ */
+thumb_opcode th_cmp_imm(uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding);
+
+/* Wrapper matching thumb_imm_handler_t for generic handler tables */
+static inline thumb_opcode th_cmp_imm_handler(uint32_t rd, uint32_t rn, uint32_t imm,
+                                              thumb_flags_behaviour flags, thumb_enforce_encoding enc)
+{
+  (void)rd;
+  return th_cmp_imm(rn, imm, flags, enc);
+}
+
+thumb_opcode th_cmn_imm(uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding);
+
+/* Wrapper matching thumb_imm_handler_t for generic handler tables */
+static inline thumb_opcode th_cmn_imm_handler(uint32_t rd, uint32_t rn, uint32_t imm,
+                                              thumb_flags_behaviour flags, thumb_enforce_encoding enc)
+{
+  (void)rd;
+  return th_cmn_imm(rn, imm, flags, enc);
+}
+
+thumb_opcode th_tst_imm(uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding);
+
+/* Wrapper matching thumb_imm_handler_t for generic handler tables */
+static inline thumb_opcode th_tst_imm_handler(uint32_t rd, uint32_t rn, uint32_t imm,
+                                              thumb_flags_behaviour flags, thumb_enforce_encoding enc)
+{
+  (void)rd;
+  return th_tst_imm(rn, imm, flags, enc);
+}
+
+thumb_opcode th_teq_imm(uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding);
+
+/* Wrapper matching thumb_imm_handler_t for generic handler tables */
+static inline thumb_opcode th_teq_imm_handler(uint32_t rd, uint32_t rn, uint32_t imm,
+                                              thumb_flags_behaviour flags, thumb_enforce_encoding enc)
+{
+  (void)rd;
+  return th_teq_imm(rn, imm, flags, enc);
+}
+
+/* ───── Compare / Test register ───── */
+thumb_opcode th_cmp_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+thumb_opcode th_cmn_reg(uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+thumb_opcode th_tst_reg(uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
+thumb_opcode th_teq_reg(uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding);
diff --git a/arch/arm/thumb/thop_dsp.c b/arch/arm/thumb/thop_dsp.c
new file mode 100644
index 00000000..3f5f90b7
--- /dev/null
+++ b/arch/arm/thumb/thop_dsp.c
@@ -0,0 +1,90 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_dsp.h"
+#include "thumb.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * DSP and SIMD instructions
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* ───── UADD8 / USUB8 / SEL (T32 only, ARMv7E-M / v8-M) ───── */
+
+static const thop_variant_shape SHAPE_DSP_REG3 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rm_place = {0, 4},
+  .feat = {.t32 = 1, .dsp = 1},
+};
+
+TH_TABLE(TH_UADD8, "uadd8", {&SHAPE_DSP_REG3, 0xfa80f040, NULL});
+TH_TABLE(TH_USUB8, "usub8", {&SHAPE_DSP_REG3, 0xfac0f040, NULL});
+TH_TABLE(TH_SEL, "sel", {&SHAPE_DSP_REG3, 0xfaa0f080, NULL});
+
+/* ───── PKHBT (T32 only) ───── */
+
+static thumb_opcode pkhbt_emit(uint32_t base, const thop_args *a)
+{
+  uint32_t shift_n = a->shift.value;
+  uint32_t tb = (a->shift.type == THUMB_SHIFT_ASR) ? 1 : 0; /* tb bit is set only for an ASR shift */
+  uint32_t op = base | (a->rd << 8) | (a->rn << 16) | (a->rm << 0) |
+                ((shift_n & 3) << 6) | (((shift_n >> 2) & 7) << 12) | (tb << 5);
+  return (thumb_opcode){.size = 4, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_PKH = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rm_place = {0, 4},
+  .shift_allowed = (1u << THUMB_SHIFT_LSL) | (1u << THUMB_SHIFT_ASR),
+  .feat = {.t32 = 1, .dsp = 1},
+};
+
+TH_TABLE(TH_PKHBT, "pkhbt", {&SHAPE_PKH, 0xeac00000, pkhbt_emit});
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Public wrappers — each forwards to thop_emit with the matching table
+ * ═══════════════════════════════════════════════════════════════════ */
+
+thumb_opcode th_uadd8(uint16_t rd, uint16_t rn, uint16_t rm)
+{
+  return thop_emit(TH_UADD8.name, TH_UADD8.variants, TH_UADD8.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm});
+}
+
+thumb_opcode th_usub8(uint16_t rd, uint16_t rn, uint16_t rm)
+{
+  return thop_emit(TH_USUB8.name, TH_USUB8.variants, TH_USUB8.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm});
+}
+
+thumb_opcode th_sel(uint16_t rd, uint16_t rn, uint16_t rm)
+{
+  return thop_emit(TH_SEL.name, TH_SEL.variants, TH_SEL.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm});
+}
+
+thumb_opcode th_pkhbt(uint32_t rd, uint32_t rn, uint32_t rm, thumb_shift shift)
+{
+  return thop_emit(TH_PKHBT.name, TH_PKHBT.variants, TH_PKHBT.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm, .shift = shift});
+}
diff --git a/arch/arm/thumb/thop_dsp.h b/arch/arm/thumb/thop_dsp.h
new file mode 100644
index 00000000..eb8d4a94
--- /dev/null
+++ b/arch/arm/thumb/thop_dsp.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+thumb_opcode th_uadd8(uint16_t rd, uint16_t rn, uint16_t rm);
+thumb_opcode th_usub8(uint16_t rd, uint16_t rn, uint16_t rm);
+thumb_opcode th_sel(uint16_t rd, uint16_t rn, uint16_t rm);
+thumb_opcode th_pkhbt(uint32_t rd, uint32_t rn, uint32_t rm, thumb_shift shift);
diff --git
a/arch/arm/thumb/thop_extend.c b/arch/arm/thumb/thop_extend.c new file mode 100644 index 00000000..6090d12b --- /dev/null +++ b/arch/arm/thumb/thop_extend.c @@ -0,0 +1,91 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_extend.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Extend instructions — shared shapes + * ═══════════════════════════════════════════════════════════════════ */ + +/* T1: , — 16-bit, rd/rm low, no rotation */ +static const thop_variant_shape SHAPE_T16_EXTEND = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rm_place = {3, 3}, + .rd_con = REG_LOW_ONLY, + .rm_con = REG_LOW_ONLY, + .imm = {.kind = IMM_RAW, .width = 0}, /* rotate must be 0 for T1 */ + .shift_allowed = (1u << THUMB_SHIFT_ROR), + .feat = {.t16 = 1}, +}; + +/* T2: , {, ROR #} — 32-bit, rotate in bits [5:4] */ +static const thop_variant_shape SHAPE_T32_EXTEND = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rm_place = {0, 4}, + .ra_place = {16, 4}, /* rm duplicated at bits [19:16] */ + .rd_con = REG_NOT_PC, + .imm = {.kind = IMM_RAW, .width = 2, .scale_log2 = 3}, + .imm_place = {4, 2}, + .shift_allowed = (1u << THUMB_SHIFT_ROR), + .feat = {.t32 = 1}, +}; + 
+#define V_EXTEND_T16(b) {&SHAPE_T16_EXTEND, (b)}
+#define V_EXTEND_T32(b) {&SHAPE_T32_EXTEND, (b)}
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Generic wrapper — packs the operands into a thop_args and hands the table's variants to thop_emit
+ * ═══════════════════════════════════════════════════════════════════ */
+
+static thumb_opcode thop_extend(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding enc,
+                                const thop_table *table)
+{
+    return thop_emit(table->name, table->variants, table->variant_count,
+                     (thop_args){.rd = rd,
+                                 .rm = rm,
+                                 .ra = rm, /* rm is passed a second time, as ra */
+                                 .imm = shift.value, /* the shift amount is also passed as the immediate */
+                                 .shift = shift,
+                                 .enc = enc});
+}
+
+#define THOP_EXTEND_FN(fn_name, table_id) \
+    thumb_opcode fn_name(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding enc) \
+    { \
+        return thop_extend(rd, rm, shift, enc, &table_id); \
+    }
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Instruction tables — one T16 and one T32 variant per mnemonic; THOP_EXTEND_FN defines the public th_* wrapper
+ * ═══════════════════════════════════════════════════════════════════ */
+
+TH_TABLE(TH_SXTB, "sxtb", V_EXTEND_T16(0xb240), V_EXTEND_T32(0xfa4ff080));
+THOP_EXTEND_FN(th_sxtb, TH_SXTB)
+
+TH_TABLE(TH_SXTH, "sxth", V_EXTEND_T16(0xb200), V_EXTEND_T32(0xfa0ff080));
+THOP_EXTEND_FN(th_sxth, TH_SXTH)
+
+TH_TABLE(TH_UXTB, "uxtb", V_EXTEND_T16(0xb2c0), V_EXTEND_T32(0xfa5ff080));
+THOP_EXTEND_FN(th_uxtb, TH_UXTB)
+
+TH_TABLE(TH_UXTH, "uxth", V_EXTEND_T16(0xb280), V_EXTEND_T32(0xfa1ff080));
+THOP_EXTEND_FN(th_uxth, TH_UXTH)
diff --git a/arch/arm/thumb/thop_extend.h b/arch/arm/thumb/thop_extend.h
new file mode 100644
index 00000000..d047ea0c
--- /dev/null
+++ b/arch/arm/thumb/thop_extend.h
@@ -0,0 +1,30 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include + +#include "thumb.h" + +thumb_opcode th_sxtb(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_sxth(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_uxtb(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_uxth(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); diff --git a/arch/arm/thumb/thop_ldaex.c b/arch/arm/thumb/thop_ldaex.c new file mode 100644 index 00000000..b6a43784 --- /dev/null +++ b/arch/arm/thumb/thop_ldaex.c @@ -0,0 +1,93 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_ldaex.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Load-Acquire / Store-Release exclusive (ARMv8-M) + * ═══════════════════════════════════════════════════════════════════ */ + +/* ───── LDAEX / LDAEXB / LDAEXH (T32, ARMv8-M) ───── */ + +static const thop_variant_shape SHAPE_LDAEX = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .rn_place = {16, 4}, + .feat = {.t32 = 1, .ldaex = 1}, +}; + +TH_TABLE(TH_LDAEX, "ldaex", {&SHAPE_LDAEX, 0xe8d00fef, NULL}); +TH_TABLE(TH_LDAEXB, "ldaexb", {&SHAPE_LDAEX, 0xe8d00fcf, NULL}); +TH_TABLE(TH_LDAEXH, "ldaexh", {&SHAPE_LDAEX, 0xe8d00fdf, NULL}); + +/* ───── STLEX / STLEXB / STLEXH (T32, ARMv8-M) ───── */ + +static const thop_variant_shape SHAPE_STLEX = { + .size = THOP_VARIANT_T32, + .rd_place = {0, 4}, + .rn_place = {16, 4}, + .rm_place = {12, 4}, + .feat = {.t32 = 1, .ldaex = 1}, +}; + +TH_TABLE(TH_STLEX, "stlex", {&SHAPE_STLEX, 0xe8c00fe0, NULL}); +TH_TABLE(TH_STLEXB, "stlexb", {&SHAPE_STLEX, 0xe8c00fc0, NULL}); +TH_TABLE(TH_STLEXH, "stlexh", {&SHAPE_STLEX, 0xe8c00fd0, NULL}); + +/* ═══════════════════════════════════════════════════════════════════ + * Public wrappers + * ═══════════════════════════════════════════════════════════════════ */ + +thumb_opcode th_ldaex(uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_LDAEX.name, TH_LDAEX.variants, TH_LDAEX.variant_count, + (thop_args){.rd = rt, .rn = rn}); +} + +thumb_opcode th_stlex(uint32_t rd, uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_STLEX.name, TH_STLEX.variants, TH_STLEX.variant_count, + (thop_args){.rd = rd, .rn = rn, .rm = rt}); +} + +thumb_opcode th_ldaexb(uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_LDAEXB.name, TH_LDAEXB.variants, 
TH_LDAEXB.variant_count, + (thop_args){.rd = rt, .rn = rn}); +} + +thumb_opcode th_ldaexh(uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_LDAEXH.name, TH_LDAEXH.variants, TH_LDAEXH.variant_count, + (thop_args){.rd = rt, .rn = rn}); +} + +thumb_opcode th_stlexb(uint32_t rd, uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_STLEXB.name, TH_STLEXB.variants, TH_STLEXB.variant_count, + (thop_args){.rd = rd, .rn = rn, .rm = rt}); +} + +thumb_opcode th_stlexh(uint32_t rd, uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_STLEXH.name, TH_STLEXH.variants, TH_STLEXH.variant_count, + (thop_args){.rd = rd, .rn = rn, .rm = rt}); +} diff --git a/arch/arm/thumb/thop_ldaex.h b/arch/arm/thumb/thop_ldaex.h new file mode 100644 index 00000000..476d77f6 --- /dev/null +++ b/arch/arm/thumb/thop_ldaex.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +#include "thumb.h" + +thumb_opcode th_ldaex(uint32_t rt, uint32_t rn); +thumb_opcode th_stlex(uint32_t rd, uint32_t rt, uint32_t rn); +thumb_opcode th_ldaexb(uint32_t rt, uint32_t rn); +thumb_opcode th_ldaexh(uint32_t rt, uint32_t rn); +thumb_opcode th_stlexb(uint32_t rd, uint32_t rt, uint32_t rn); +thumb_opcode th_stlexh(uint32_t rd, uint32_t rt, uint32_t rn); diff --git a/arch/arm/thumb/thop_ldr_literal.c b/arch/arm/thumb/thop_ldr_literal.c new file mode 100644 index 00000000..55125c7e --- /dev/null +++ b/arch/arm/thumb/thop_ldr_literal.c @@ -0,0 +1,76 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_ldr_literal.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * LDR (literal) — load from PC-relative address + * ═══════════════════════════════════════════════════════════════════ */ + +/* T1: LDR , [PC, #] — rt low reg, imm scaled by 4 */ +static const thop_variant_shape SHAPE_LDR_LIT_T1 = { + .size = THOP_VARIANT_T16, + .rd_place = {8, 3}, + .rd_con = REG_LOW_ONLY, + .imm = {.kind = IMM_RAW, .width = 8, .scale_log2 = 2}, + .imm_place = {0, 8}, + .feat = {.t16 = 1}, +}; + +/* T3/T4: LDR , [PC, #+/-] — 32-bit, rt != PC */ +static thumb_opcode ldr_literal_emit(uint32_t base, const thop_args *a) +{ + uint32_t rt = a->rd; + uint32_t imm = a->imm; + uint32_t add = a->rn; /* re-use rn to pass add/sub flag */ + + if (rt == R_PC) + return (thumb_opcode){.size = 0, .opcode = 0}; + + if (imm <= 0xfff) { + uint32_t ins = (0xf85f | ((add & 1) << 7)) << 16; + ins |= (rt & 0xf) << 12 | imm; + return (thumb_opcode){.size = 4, .opcode = ins}; + } + return (thumb_opcode){.size = 0, .opcode = 0}; +} + +static const thop_variant_shape SHAPE_LDR_LIT_T32 = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .rd_con = REG_NOT_PC, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_LDR_LITERAL, "ldr", + {&SHAPE_LDR_LIT_T1, 0x4800, NULL}, + {&SHAPE_LDR_LIT_T32, 0, ldr_literal_emit}); + +/* ═══════════════════════════════════════════════════════════════════ + * Public wrappers + * ═══════════════════════════════════════════════════════════════════ */ + +thumb_opcode th_ldr_literal(uint16_t rt, uint32_t imm, uint32_t add) +{ + return thop_emit(TH_LDR_LITERAL.name, TH_LDR_LITERAL.variants, TH_LDR_LITERAL.variant_count, + (thop_args){.rd = rt, 
.imm = imm, .rn = add}); +} + diff --git a/arch/arm/thumb/thop_ldr_literal.h b/arch/arm/thumb/thop_ldr_literal.h new file mode 100644 index 00000000..de150d6a --- /dev/null +++ b/arch/arm/thumb/thop_ldr_literal.h @@ -0,0 +1,7 @@ +#pragma once + +#include + +#include "thumb.h" + +thumb_opcode th_ldr_literal(uint16_t rt, uint32_t imm, uint32_t add); diff --git a/arch/arm/thumb/thop_ldrd.c b/arch/arm/thumb/thop_ldrd.c new file mode 100644 index 00000000..e84d40f8 --- /dev/null +++ b/arch/arm/thumb/thop_ldrd.c @@ -0,0 +1,72 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_ldrd.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * LDRD / STRD (dual-word load/store) + * ═══════════════════════════════════════════════════════════════════ */ + +/* ───── LDRD_imm (T32) ───── */ + +static thumb_opcode ldrd_imm_emit(uint32_t base, const thop_args *a) +{ + uint32_t imm = a->imm; + uint32_t index = (a->puw & 0x4) ? 1 : 0; + uint32_t add = (a->puw & 0x2) ? 1 : 0; + uint32_t wback = (a->puw & 0x1) ? 
1 : 0; + uint32_t rn = a->rn; + uint32_t rt = a->rd; + uint32_t rt2 = a->rm; + uint32_t op = base | (add << 23) | (index << 24) | (wback << 21) | + (rn << 16) | (rt << 12) | (rt2 << 8) | (imm & 0xff); + return (thumb_opcode){.size = 4, .opcode = op}; +} + +static const thop_variant_shape SHAPE_LDRD = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .rn_place = {16, 4}, + .rm_place = {8, 4}, + .imm = {.kind = IMM_RAW, .width = 8}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_LDRD_IMM, "ldrd", {&SHAPE_LDRD, 0xe8500000, ldrd_imm_emit}); +TH_TABLE(TH_STRD_IMM, "strd", {&SHAPE_LDRD, 0xe8400000, ldrd_imm_emit}); + +/* ═══════════════════════════════════════════════════════════════════ + * Public wrappers + * ═══════════════════════════════════════════════════════════════════ */ + +thumb_opcode th_ldrd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw) +{ + return thop_emit(TH_LDRD_IMM.name, TH_LDRD_IMM.variants, TH_LDRD_IMM.variant_count, + (thop_args){.rd = rt, .rm = rt2, .rn = rn, + .imm = (uint32_t)(imm >> 2), .puw = (uint8_t)puw}); +} + +thumb_opcode th_strd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw) +{ + return thop_emit(TH_STRD_IMM.name, TH_STRD_IMM.variants, TH_STRD_IMM.variant_count, + (thop_args){.rd = rt, .rm = rt2, .rn = rn, + .imm = (uint32_t)(imm >> 2), .puw = (uint8_t)puw}); +} diff --git a/arch/arm/thumb/thop_ldrd.h b/arch/arm/thumb/thop_ldrd.h new file mode 100644 index 00000000..ead8c9d7 --- /dev/null +++ b/arch/arm/thumb/thop_ldrd.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +#include "thumb.h" + +thumb_opcode th_ldrd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw); +thumb_opcode th_strd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw); diff --git a/arch/arm/thumb/thop_ldrex.c b/arch/arm/thumb/thop_ldrex.c new file mode 100644 index 00000000..74efed72 --- /dev/null +++ b/arch/arm/thumb/thop_ldrex.c @@ -0,0 +1,118 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 
Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_ldrex.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Load/store exclusive + * ═══════════════════════════════════════════════════════════════════ */ + +/* ───── LDREX (T32) ───── */ + +static const thop_variant_shape SHAPE_LDREX = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .rn_place = {16, 4}, + .imm = {.kind = IMM_RAW, .width = 8}, + .imm_place = {0, 8}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_LDREX, "ldrex", {&SHAPE_LDREX, 0xe8500f00, NULL}); + +/* ───── STREX (T32) ───── */ + +static const thop_variant_shape SHAPE_STREX = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rn_place = {12, 4}, + .rm_place = {16, 4}, + .imm = {.kind = IMM_RAW, .width = 8}, + .imm_place = {0, 8}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_STREX, "strex", {&SHAPE_STREX, 0xe8400000, NULL}); + +/* ───── LDREXB / LDREXH (T32) ───── */ + +static const thop_variant_shape SHAPE_LDREXB = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .rn_place = {16, 4}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_LDREXB, "ldrexb", {&SHAPE_LDREXB, 0xe8d00f4f, NULL}); +TH_TABLE(TH_LDREXH, "ldrexh", {&SHAPE_LDREXB, 0xe8d00f5f, NULL}); + +/* ───── STREXB / STREXH 
(T32) ───── */ + +static const thop_variant_shape SHAPE_STREXB = { + .size = THOP_VARIANT_T32, + .rd_place = {0, 4}, + .rn_place = {12, 4}, + .rm_place = {16, 4}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_STREXB, "strexb", {&SHAPE_STREXB, 0xe8c00f40, NULL}); +TH_TABLE(TH_STREXH, "strexh", {&SHAPE_STREXB, 0xe8c00f50, NULL}); + +/* ═══════════════════════════════════════════════════════════════════ + * Public wrappers + * ═══════════════════════════════════════════════════════════════════ */ + +thumb_opcode th_ldrex(uint32_t rt, uint32_t rn, int imm) +{ + return thop_emit(TH_LDREX.name, TH_LDREX.variants, TH_LDREX.variant_count, + (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)(imm >> 2)}); +} + +thumb_opcode th_strex(uint32_t rd, uint32_t rt, uint32_t rn, int imm) +{ + return thop_emit(TH_STREX.name, TH_STREX.variants, TH_STREX.variant_count, + (thop_args){.rd = rd, .rn = rt, .rm = rn, .imm = (uint32_t)(imm >> 2)}); +} + +thumb_opcode th_ldrexb(uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_LDREXB.name, TH_LDREXB.variants, TH_LDREXB.variant_count, + (thop_args){.rd = rt, .rn = rn}); +} + +thumb_opcode th_ldrexh(uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_LDREXH.name, TH_LDREXH.variants, TH_LDREXH.variant_count, + (thop_args){.rd = rt, .rn = rn}); +} + +thumb_opcode th_strexb(uint32_t rd, uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_STREXB.name, TH_STREXB.variants, TH_STREXB.variant_count, + (thop_args){.rd = rd, .rn = rt, .rm = rn}); +} + +thumb_opcode th_strexh(uint32_t rd, uint32_t rt, uint32_t rn) +{ + return thop_emit(TH_STREXH.name, TH_STREXH.variants, TH_STREXH.variant_count, + (thop_args){.rd = rd, .rn = rt, .rm = rn}); +} diff --git a/arch/arm/thumb/thop_ldrex.h b/arch/arm/thumb/thop_ldrex.h new file mode 100644 index 00000000..d5bc5d2d --- /dev/null +++ b/arch/arm/thumb/thop_ldrex.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +#include "thumb.h" + +thumb_opcode th_ldrex(uint32_t rt, uint32_t rn, int imm); +thumb_opcode th_strex(uint32_t 
rd, uint32_t rt, uint32_t rn, int imm); +thumb_opcode th_ldrexb(uint32_t rt, uint32_t rn); +thumb_opcode th_ldrexh(uint32_t rt, uint32_t rn); +thumb_opcode th_strexb(uint32_t rd, uint32_t rt, uint32_t rn); +thumb_opcode th_strexh(uint32_t rd, uint32_t rt, uint32_t rn); diff --git a/arch/arm/thumb/thop_mem_exclusive.c b/arch/arm/thumb/thop_mem_exclusive.c new file mode 100644 index 00000000..c316ebee --- /dev/null +++ b/arch/arm/thumb/thop_mem_exclusive.c @@ -0,0 +1,67 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_mem_exclusive.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Exclusive/acquire-release — shared shape (T32 only) + * ═══════════════════════════════════════════════════════════════════ */ + +static const thop_variant_shape SHAPE_T32_EXCLUSIVE = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .rn_place = {16, 4}, + .feat = {.t32 = 1}, +}; + +#define V_EXCLUSIVE(b) {&SHAPE_T32_EXCLUSIVE, (b)} + +static thumb_opcode thop_exclusive(uint32_t rt, uint32_t rn, const thop_table *table) +{ + return thop_emit(table->name, table->variants, table->variant_count, (thop_args){.rd = rt, .rn = rn}); +} + +#define THOP_EXCLUSIVE_FN(fn_name, table_id) \ + thumb_opcode fn_name(uint32_t rt, uint32_t rn) \ + { \ + return thop_exclusive(rt, rn, &table_id); \ + } + +/* ═══════════════════════════════════════════════════════════════════ + * Instruction tables + * ═══════════════════════════════════════════════════════════════════ */ + +TH_TABLE(TH_LDA, "lda", V_EXCLUSIVE(0xE8D00FAF)); +THOP_EXCLUSIVE_FN(th_lda, TH_LDA) + +TH_TABLE(TH_LDAB, "ldab", V_EXCLUSIVE(0xE8D00F8F)); +THOP_EXCLUSIVE_FN(th_ldab, TH_LDAB) + +TH_TABLE(TH_LDAH, "ldah", V_EXCLUSIVE(0xE8D00F9F)); +THOP_EXCLUSIVE_FN(th_ldah, TH_LDAH) + +TH_TABLE(TH_STL, "stl", V_EXCLUSIVE(0xE8C00FAF)); +THOP_EXCLUSIVE_FN(th_stl, TH_STL) + +TH_TABLE(TH_STLB, "stlb", V_EXCLUSIVE(0xE8C00F8F)); +THOP_EXCLUSIVE_FN(th_stlb, TH_STLB) + +TH_TABLE(TH_STLH, "stlh", V_EXCLUSIVE(0xE8C00F9F)); +THOP_EXCLUSIVE_FN(th_stlh, TH_STLH) diff --git a/arch/arm/thumb/thop_mem_exclusive.h b/arch/arm/thumb/thop_mem_exclusive.h new file mode 100644 index 00000000..35cf9315 --- /dev/null +++ b/arch/arm/thumb/thop_mem_exclusive.h @@ -0,0 +1,32 @@ +/* + * TCC - Tiny C Compiler + * + * 
Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include + +#include "thumb.h" + +thumb_opcode th_lda(uint32_t rt, uint32_t rn); +thumb_opcode th_ldab(uint32_t rt, uint32_t rn); +thumb_opcode th_ldah(uint32_t rt, uint32_t rn); +thumb_opcode th_stl(uint32_t rt, uint32_t rn); +thumb_opcode th_stlb(uint32_t rt, uint32_t rn); +thumb_opcode th_stlh(uint32_t rt, uint32_t rn); diff --git a/arch/arm/thumb/thop_mem_imm.c b/arch/arm/thumb/thop_mem_imm.c new file mode 100644 index 00000000..72a9aba8 --- /dev/null +++ b/arch/arm/thumb/thop_mem_imm.c @@ -0,0 +1,250 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define USING_GLOBALS
+#include "thop_mem_imm.h"
+#include "tcc.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Thumb load/store immediate-offset instructions — one variant shape per encoding form, one TH_TABLE per mnemonic
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* ───── T16 shapes (low registers, scaled offset, puw_fixed = 6) ───── */
+
+static const thop_variant_shape SHAPE_T16_MEM_IMM4 = {
+    .size = THOP_VARIANT_T16,
+    .rd_place = {0, 3}, .rn_place = {3, 3},
+    .rd_con = REG_LOW_ONLY, .rn_con = REG_LOW_ONLY,
+    .imm = {.kind = IMM_RAW, .width = 5, .scale_log2 = 2},
+    .imm_place = {6, 5},
+    .puw_fixed = 6,
+    .feat = {.t16 = 1},
+};
+
+static const thop_variant_shape SHAPE_T16_MEM_IMM0 = {
+    .size = THOP_VARIANT_T16,
+    .rd_place = {0, 3}, .rn_place = {3, 3},
+    .rd_con = REG_LOW_ONLY, .rn_con = REG_LOW_ONLY,
+    .imm = {.kind = IMM_RAW, .width = 5, .scale_log2 = 0},
+    .imm_place = {6, 5},
+    .puw_fixed = 6,
+    .feat = {.t16 = 1},
+};
+
+static const thop_variant_shape SHAPE_T16_MEM_IMM1 = {
+    .size = THOP_VARIANT_T16,
+    .rd_place = {0, 3}, .rn_place = {3, 3},
+    .rd_con = REG_LOW_ONLY, .rn_con = REG_LOW_ONLY,
+    .imm = {.kind = IMM_RAW, .width = 5, .scale_log2 = 1},
+    .imm_place = {6, 5},
+    .puw_fixed = 6,
+    .feat = {.t16 = 1},
+};
+
+static const thop_variant_shape SHAPE_T16_MEM_SP_IMM4 = {
+    .size = THOP_VARIANT_T16,
+    .rd_place = {8, 3},
+    .rd_con = REG_LOW_ONLY,
+    .rn_con = REG_SP_ONLY, /* SP is implicit in encoding */
+    .imm = {.kind = IMM_RAW, .width = 8, .scale_log2 = 2},
+    .imm_place = {0, 8},
+    .puw_fixed = 6,
+    .feat = {.t16 = 1},
+};
+
+/* ───── T32 positive-offset shapes (12-bit unscaled offset, puw_fixed = 6) ───── */
+
+static const thop_variant_shape SHAPE_T32_MEM_POS_ANY_NOTPC = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {12, 4}, .rn_place = {16, 4},
+    .rd_con = REG_ANY, .rn_con = REG_NOT_PC,
+    .imm = {.kind = IMM_RAW, .width = 12, .scale_log2 = 0},
+    .imm_place = {0, 12},
+    .puw_fixed = 6,
+    .feat = {.t32 = 1},
+};
+
+static const thop_variant_shape SHAPE_T32_MEM_POS_NOSP_NOTPC = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {12, 4}, .rn_place = {16, 4},
+    .rd_con = REG_NOT_SP, .rn_con = REG_NOT_PC,
+    .imm = {.kind = IMM_RAW, .width = 12, .scale_log2 = 0},
+    .imm_place = {0, 12},
+    .puw_fixed = 6,
+    .feat = {.t32 = 1},
+};
+
+static const thop_variant_shape SHAPE_T32_MEM_POS_NOSP_ANY = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {12, 4}, .rn_place = {16, 4},
+    .rd_con = REG_NOT_SP, .rn_con = REG_ANY,
+    .imm = {.kind = IMM_RAW, .width = 12, .scale_log2 = 0},
+    .imm_place = {0, 12},
+    .puw_fixed = 6,
+    .feat = {.t32 = 1},
+};
+
+/* ───── T32 PC-relative shapes (rn must be PC; puw_fixed = 6 for POS, 4 for NEG) ───── */
+
+static const thop_variant_shape SHAPE_T32_MEM_PC_POS = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {12, 4}, .rn_place = {16, 4},
+    .rd_con = REG_ANY, .rn_con = REG_PC_ONLY,
+    .imm = {.kind = IMM_RAW, .width = 12, .scale_log2 = 0},
+    .imm_place = {0, 12},
+    .puw_fixed = 6,
+    .feat = {.t32 = 1},
+};
+
+static const thop_variant_shape SHAPE_T32_MEM_PC_NEG = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {12, 4}, .rn_place = {16, 4},
+    .rd_con = REG_ANY, .rn_con = REG_PC_ONLY,
+    .imm = {.kind = IMM_RAW, .width = 12, .scale_log2 = 0},
+    .imm_place = {0, 12},
+    .puw_fixed = 4,
+    .feat = {.t32 = 1},
+};
+
+/* ───── T32 indexed shapes (PUW in bits [10:8]) ───── */
+
+static const thop_variant_shape SHAPE_T32_MEM_IDX_ANY = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {12, 4}, .rn_place = {16, 4},
+    .rd_con = REG_ANY, .rn_con = REG_ANY,
+    .imm = {.kind = IMM_RAW, .width = 8, .scale_log2 = 0},
+    .imm_place = {0, 8},
+    .puw_bits = {8, 3},
+    .feat = {.t32 = 1},
+};
+
+static const thop_variant_shape SHAPE_T32_MEM_IDX_NOSP = {
+    .size = THOP_VARIANT_T32,
+    .rd_place = {12, 4}, .rn_place = {16, 4},
+    .rd_con = REG_NOT_SP, .rn_con = REG_ANY,
+    .imm = {.kind = IMM_RAW, .width = 8, .scale_log2 = 0},
+    .imm_place = {0, 8},
+    .puw_bits = {8, 3},
+    .feat = {.t32 = 1},
+};
+
+/* ───── Tables (variant rows in the order written: T16 forms first, then T32) ───── */
+
+TH_TABLE(TH_LDR_IMM, "ldr",
+    {&SHAPE_T16_MEM_IMM4, 0x6800, NULL},
+    {&SHAPE_T16_MEM_SP_IMM4, 0x9800, NULL},
+    {&SHAPE_T32_MEM_POS_ANY_NOTPC, 0xf8d00000, NULL},
+    {&SHAPE_T32_MEM_PC_POS, 0xf8df0000, NULL},
+    {&SHAPE_T32_MEM_PC_NEG, 0xf85f0000, NULL},
+    {&SHAPE_T32_MEM_IDX_ANY, 0xf8500800, NULL});
+
+TH_TABLE(TH_LDRB_IMM, "ldrb",
+    {&SHAPE_T16_MEM_IMM0, 0x7800, NULL},
+    {&SHAPE_T32_MEM_POS_NOSP_NOTPC, 0xf8900000, NULL},
+    {&SHAPE_T32_MEM_PC_POS, 0xf89f0000, NULL},
+    {&SHAPE_T32_MEM_PC_NEG, 0xf81f0000, NULL},
+    {&SHAPE_T32_MEM_IDX_NOSP, 0xf8100800, NULL});
+
+TH_TABLE(TH_LDRH_IMM, "ldrh",
+    {&SHAPE_T16_MEM_IMM1, 0x8800, NULL},
+    {&SHAPE_T32_MEM_POS_NOSP_NOTPC, 0xf8b00000, NULL},
+    {&SHAPE_T32_MEM_PC_POS, 0xf8bf0000, NULL},
+    {&SHAPE_T32_MEM_PC_NEG, 0xf83f0000, NULL},
+    {&SHAPE_T32_MEM_IDX_NOSP, 0xf8300800, NULL});
+
+TH_TABLE(TH_LDRSB_IMM, "ldrsb",
+    {&SHAPE_T32_MEM_POS_NOSP_NOTPC, 0xf9900000, NULL},
+    {&SHAPE_T32_MEM_PC_POS, 0xf99f0000, NULL},
+    {&SHAPE_T32_MEM_PC_NEG, 0xf91f0000, NULL},
+    {&SHAPE_T32_MEM_IDX_NOSP, 0xf9100800, NULL});
+
+TH_TABLE(TH_LDRSH_IMM, "ldrsh",
+    {&SHAPE_T32_MEM_POS_NOSP_NOTPC, 0xf9b00000, NULL},
+    {&SHAPE_T32_MEM_PC_POS, 0xf9bf0000, NULL},
+    {&SHAPE_T32_MEM_PC_NEG, 0xf93f0000, NULL},
+    {&SHAPE_T32_MEM_IDX_NOSP, 0xf9300800, NULL});
+
+/* STR (immediate) has no literal form: a PC base is UNDEFINED in T32. Do not add PC-relative rows here —
+ * 0xf8df0000 / 0xf85f0000 are the LDR (literal) opcodes of TH_LDR_IMM, so such a row would assemble a load. */
+TH_TABLE(TH_STR_IMM, "str",
+    {&SHAPE_T16_MEM_IMM4, 0x6000, NULL},
+    {&SHAPE_T16_MEM_SP_IMM4, 0x9000, NULL},
+    {&SHAPE_T32_MEM_POS_ANY_NOTPC, 0xf8c00000, NULL},
+    {&SHAPE_T32_MEM_IDX_ANY, 0xf8400800, NULL}); /* NOTE(review): rn_con is REG_ANY in this shape; confirm a PC base is rejected elsewhere */
+
+TH_TABLE(TH_STRB_IMM, "strb",
+    {&SHAPE_T16_MEM_IMM0, 0x7000, NULL},
+    {&SHAPE_T32_MEM_POS_NOSP_ANY, 0xf8800000, NULL},
+    {&SHAPE_T32_MEM_IDX_NOSP, 0xf8000800, NULL});
+
+TH_TABLE(TH_STRH_IMM, "strh",
+    {&SHAPE_T16_MEM_IMM1, 0x8000, NULL},
+    {&SHAPE_T32_MEM_POS_NOSP_ANY, 0xf8a00000, NULL},
+    {&SHAPE_T32_MEM_IDX_NOSP, 0xf8200800, NULL});
+
+/* ───── Emit wrappers: one per table above; rt is passed as rd, imm is converted to uint32_t and puw to uint8_t ───── */
+
+thumb_opcode th_ldr_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{ /* NOTE(review): a negative imm converts to a large unsigned value; presumably callers pass a magnitude and pick the direction via puw — confirm */
+    return thop_emit(TH_LDR_IMM.name, TH_LDR_IMM.variants, TH_LDR_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
+
+thumb_opcode th_ldrb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{
+    return thop_emit(TH_LDRB_IMM.name, TH_LDRB_IMM.variants, TH_LDRB_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
+
+thumb_opcode th_ldrh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{
+    return thop_emit(TH_LDRH_IMM.name, TH_LDRH_IMM.variants, TH_LDRH_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
+
+thumb_opcode th_ldrsb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{
+    return thop_emit(TH_LDRSB_IMM.name, TH_LDRSB_IMM.variants, TH_LDRSB_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
+
+thumb_opcode th_ldrsh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{
+    return thop_emit(TH_LDRSH_IMM.name, TH_LDRSH_IMM.variants, TH_LDRSH_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
+
+thumb_opcode th_str_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{
+    return thop_emit(TH_STR_IMM.name, TH_STR_IMM.variants, TH_STR_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
+
+thumb_opcode th_strb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{
+    return thop_emit(TH_STRB_IMM.name, TH_STRB_IMM.variants, TH_STRB_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
+
+thumb_opcode th_strh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc)
+{
+    return thop_emit(TH_STRH_IMM.name, TH_STRH_IMM.variants, TH_STRH_IMM.variant_count,
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm, .puw = (uint8_t)puw, .enc = enc});
+}
diff --git a/arch/arm/thumb/thop_mem_imm.h b/arch/arm/thumb/thop_mem_imm.h
new file mode 100644
index 00000000..539ce674
--- /dev/null
+++ b/arch/arm/thumb/thop_mem_imm.h
@@ -0,0 +1,32 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include "thumb.h" + +thumb_opcode th_ldr_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +thumb_opcode th_ldrb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +thumb_opcode th_ldrh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +thumb_opcode th_ldrsb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +thumb_opcode th_ldrsh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +thumb_opcode th_str_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +thumb_opcode th_strb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +thumb_opcode th_strh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); diff --git a/arch/arm/thumb/thop_mem_reg.c b/arch/arm/thumb/thop_mem_reg.c new file mode 100644 index 00000000..5ccdcfc9 --- /dev/null +++ b/arch/arm/thumb/thop_mem_reg.c @@ -0,0 +1,97 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_mem_reg.h"
+
+#define USING_GLOBALS
+#include "tcc.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Load/Store register — shared shapes (one 16-bit and one 32-bit shape reused by all eight mnemonics)
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* T1: Rt, [Rn, Rm] — 16-bit, all low, no shift */
+static const thop_variant_shape SHAPE_T16_MEM_REG = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {0, 3}, /* Rt is carried in the rd slot of thop_args */
+  .rn_place = {3, 3},
+  .rm_place = {6, 3},
+  .rd_con = REG_LOW_ONLY,
+  .rn_con = REG_LOW_ONLY,
+  .rm_con = REG_LOW_ONLY,
+  .feat = {.t16 = 1},
+};
+
+/* T2/T3/T4 (32-bit): Rt, [Rn, Rm{, LSL #imm2}] — shift amount in imm2 bits [5:4] */
+static const thop_variant_shape SHAPE_T32_MEM_REG = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {12, 4},
+  .rn_place = {16, 4},
+  .rm_place = {0, 4},
+  .rd_con = REG_NOT_SP,
+  .rn_con = REG_NOT_PC,
+  .rm_con = REG_NOT_SP | REG_NOT_PC,
+  .shift_imm2_bits = {4, 2},
+  .shift_allowed = (1u << THUMB_SHIFT_LSL), /* LSL is the only shift type this shape permits */
+  .feat = {.t32 = 1},
+};
+
+#define V_MEM_REG_T1(b) {&SHAPE_T16_MEM_REG, (b)} /* table row: 16-bit shape + base opcode b, no third field */
+#define V_MEM_REG_T32(b) {&SHAPE_T32_MEM_REG, (b)} /* table row: 32-bit shape + base opcode b, no third field */
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Generic wrapper — defines fn_name(rt, rn, rm, shift, enc), which forwards to thop_emit() over table_id
+ * with rt passed as .rd and .flags fixed to FLAGS_BEHAVIOUR_NOT_IMPORTANT
+ * ═══════════════════════════════════════════════════════════════════ */
+
+#define THOP_MEM_REG_FN(fn_name, table_id) \
+  thumb_opcode fn_name(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding enc) \
+  { \
+    return thop_emit(table_id.name, table_id.variants, table_id.variant_count, \
+                     (thop_args){.rd = rt, .rn = rn, .rm = rm, .flags = FLAGS_BEHAVIOUR_NOT_IMPORTANT, \
+                                 .shift = shift, .enc = enc}); \
+  }
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Instruction tables — each mnemonic pairs a T16 base opcode with a T32 base opcode, then its wrapper
+ * ═══════════════════════════════════════════════════════════════════ */
+
+TH_TABLE(TH_LDR_REG, "ldr", V_MEM_REG_T1(0x5800), V_MEM_REG_T32(0xF8500000));
+THOP_MEM_REG_FN(th_ldr_reg, TH_LDR_REG)
+
+TH_TABLE(TH_LDRB_REG, "ldrb", V_MEM_REG_T1(0x5C00), V_MEM_REG_T32(0xF8100000));
+THOP_MEM_REG_FN(th_ldrb_reg, TH_LDRB_REG)
+
+TH_TABLE(TH_LDRH_REG, "ldrh", V_MEM_REG_T1(0x5A00), V_MEM_REG_T32(0xF8300000));
+THOP_MEM_REG_FN(th_ldrh_reg, TH_LDRH_REG)
+
+TH_TABLE(TH_LDRSB_REG, "ldrsb", V_MEM_REG_T1(0x5600), V_MEM_REG_T32(0xF9100000));
+THOP_MEM_REG_FN(th_ldrsb_reg, TH_LDRSB_REG)
+
+TH_TABLE(TH_LDRSH_REG, "ldrsh", V_MEM_REG_T1(0x5E00), V_MEM_REG_T32(0xF9300000));
+THOP_MEM_REG_FN(th_ldrsh_reg, TH_LDRSH_REG)
+
+TH_TABLE(TH_STR_REG, "str", V_MEM_REG_T1(0x5000), V_MEM_REG_T32(0xF8400000));
+THOP_MEM_REG_FN(th_str_reg, TH_STR_REG)
+
+TH_TABLE(TH_STRB_REG, "strb", V_MEM_REG_T1(0x5400), V_MEM_REG_T32(0xF8000000));
+THOP_MEM_REG_FN(th_strb_reg, TH_STRB_REG)
+
+TH_TABLE(TH_STRH_REG, "strh", V_MEM_REG_T1(0x5200), V_MEM_REG_T32(0xF8200000));
+THOP_MEM_REG_FN(th_strh_reg, TH_STRH_REG)
diff --git a/arch/arm/thumb/thop_mem_reg.h b/arch/arm/thumb/thop_mem_reg.h
new file mode 100644
index 00000000..3a7b3a8c
--- /dev/null
+++ b/arch/arm/thumb/thop_mem_reg.h
@@ -0,0 +1,34 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include + +#include "thumb.h" + +thumb_opcode th_ldr_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_ldrb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_ldrh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_ldrsb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_ldrsh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_str_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_strb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); +thumb_opcode th_strh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); diff --git a/arch/arm/thumb/thop_mem_unpriv.c b/arch/arm/thumb/thop_mem_unpriv.c new file mode 100644 index 00000000..d5133f14 --- /dev/null +++ b/arch/arm/thumb/thop_mem_unpriv.c @@ -0,0 +1,71 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_mem_unpriv.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Unprivileged load/store — shared shape (T32 only): Rt at [15:12], Rn at [19:16], raw 8-bit immediate at [7:0]
+ * ═══════════════════════════════════════════════════════════════════ */
+
+static const thop_variant_shape SHAPE_T32_MEM_UNPRIV = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {12, 4}, /* Rt is carried in the rd slot of thop_args */
+  .rn_place = {16, 4},
+  .imm = {.kind = IMM_RAW, .width = 8},
+  .imm_place = {0, 8},
+  .feat = {.t32 = 1}, /* NOTE(review): no rd_con/rn_con register constraints are declared for this shape */
+};
+
+#define V_MEM_UNPRIV(b) {&SHAPE_T32_MEM_UNPRIV, (b)} /* single table row: shared shape + base opcode b */
+
+#define THOP_MEM_UNPRIV_FN(fn_name, table_id) /* defines fn_name(rt, rn, imm) forwarding to thop_emit() */ \
+  thumb_opcode fn_name(uint32_t rt, uint32_t rn, int imm) \
+  { \
+    return thop_emit((table_id).name, (table_id).variants, (table_id).variant_count, \
+                     (thop_args){.rd = rt, .rn = rn, .imm = (uint32_t)imm}); \
+  }
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Instruction tables — one single-row table and one generated wrapper per mnemonic
+ * ═══════════════════════════════════════════════════════════════════ */
+
+TH_TABLE(TH_LDRT, "ldrt", V_MEM_UNPRIV(0xF8500E00));
+THOP_MEM_UNPRIV_FN(th_ldrt, TH_LDRT)
+
+TH_TABLE(TH_LDRBT, "ldrbt", V_MEM_UNPRIV(0xF8100E00));
+THOP_MEM_UNPRIV_FN(th_ldrbt, TH_LDRBT)
+
+TH_TABLE(TH_LDRHT, "ldrht", V_MEM_UNPRIV(0xF8300E00));
+THOP_MEM_UNPRIV_FN(th_ldrht, TH_LDRHT)
+
+TH_TABLE(TH_LDRSBT, "ldrsbt", V_MEM_UNPRIV(0xF9100E00));
+THOP_MEM_UNPRIV_FN(th_ldrsbt, TH_LDRSBT)
+
+TH_TABLE(TH_LDRSHT, "ldrsht", V_MEM_UNPRIV(0xF9300E00));
+THOP_MEM_UNPRIV_FN(th_ldrsht, TH_LDRSHT)
+
+TH_TABLE(TH_STRT, "strt", V_MEM_UNPRIV(0xF8400E00));
+THOP_MEM_UNPRIV_FN(th_strt, TH_STRT)
+
+TH_TABLE(TH_STRBT, "strbt", V_MEM_UNPRIV(0xF8000E00));
+THOP_MEM_UNPRIV_FN(th_strbt, TH_STRBT)
+
+TH_TABLE(TH_STRHT, "strht", V_MEM_UNPRIV(0xF8200E00));
+THOP_MEM_UNPRIV_FN(th_strht, TH_STRHT)
diff --git a/arch/arm/thumb/thop_mem_unpriv.h b/arch/arm/thumb/thop_mem_unpriv.h
new file mode 100644
index 00000000..f05d43f4
--- /dev/null
+++ b/arch/arm/thumb/thop_mem_unpriv.h
@@ -0,0 +1,34 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#pragma once
+
+#include /* NOTE(review): header name is missing here (looks stripped in extraction) — restore it, presumably stdint.h */
+
+#include "thumb.h"
+
+thumb_opcode th_ldrt(uint32_t rt, uint32_t rn, int imm);
+thumb_opcode th_ldrbt(uint32_t rt, uint32_t rn, int imm);
+thumb_opcode th_ldrht(uint32_t rt, uint32_t rn, int imm);
+thumb_opcode th_ldrsbt(uint32_t rt, uint32_t rn, int imm);
+thumb_opcode th_ldrsht(uint32_t rt, uint32_t rn, int imm);
+thumb_opcode th_strt(uint32_t rt, uint32_t rn, int imm);
+thumb_opcode th_strbt(uint32_t rt, uint32_t rn, int imm);
+thumb_opcode th_strht(uint32_t rt, uint32_t rn, int imm);
diff --git a/arch/arm/thumb/thop_mov.c b/arch/arm/thumb/thop_mov.c
new file mode 100644
index 00000000..ffd2220a
--- /dev/null
+++ b/arch/arm/thumb/thop_mov.c
@@ -0,0 +1,215 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by
the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define USING_GLOBALS
+#include "thop_mov.h"
+#include "thumb.h"
+#include "tcc.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * MOV — move (register, immediate, top-half)
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* ───── MOV register ───── */
+
+/* T1 high-register MOV: MOV Rd, Rm — no shift, no S */
+static const thop_variant_shape SHAPE_MOV_REG_T1_HIGH = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {0, 3},
+  .dn_rd_split = {0, 3}, /* NOTE(review): semantics of dn_rd_split are defined outside this file — confirm the high Rd bit placement */
+  .rm_place = {3, 4},
+  .rd_con = REG_NOT_PC,
+  .rm_con = REG_ANY,
+  .feat = {.t16 = 1},
+};
+
+/* T1 shift alias: LSL/LSR/ASR Rd, Rm, #imm — low regs, implicit S. Returns size 0 when the operands do not fit. */
+static thumb_opcode mov_reg_t1_shift_emit(uint32_t base, const thop_args *a)
+{
+  (void)base; /* the opcode is built from scratch below; the table row passes 0 */
+  if (a->rd < 8 && a->rm < 8 && a->shift.type != THUMB_SHIFT_RRX && a->shift.type != THUMB_SHIFT_ROR &&
+      ((a->flags == FLAGS_BEHAVIOUR_SET && !a->in_it_block) || (a->flags != FLAGS_BEHAVIOUR_SET && a->in_it_block)))
+  { /* accepted only when S is requested outside an IT block, or not requested inside one */
+    THOP_TRACE("%s %s, %s, #%u\n", th_shift_name(a->shift.type), th_reg_name(a->rd), th_reg_name(a->rm),
+               (unsigned)a->shift.value);
+    return (thumb_opcode){
+      .size = 2, /* NOTE(review): shift.value is not range-checked here; a value above 31 would spill into bit 11 */
+      .opcode = (0x0000 | (th_shift_value_to_sr_type(a->shift) << 11) | (a->shift.value << 6) | (a->rm << 3) | a->rd),
+    };
+  }
+  return (thumb_opcode){.size = 0, .opcode = 0};
+}
+
+static const thop_variant_shape SHAPE_MOV_REG_T1_SHIFT = {
+  .size = THOP_VARIANT_T16,
+  .shift_allowed = (1u << THUMB_SHIFT_LSL) | (1u << THUMB_SHIFT_LSR) | (1u << THUMB_SHIFT_ASR),
+  .has_s_bit = 1,
+  .feat = {.t16 = 1},
+};
+
+/* T3 wide MOV: MOV{S}.W Rd, Rm{, shift} */
+static const thop_variant_shape SHAPE_MOV_REG_T3 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4}, /* th_mov_reg never sets .rn; the base 0xea4f0000 already has bits [19:16] set */
+  .rm_place = {0, 4},
+  .has_s_bit = 1,
+  .shift_type_bits = {4, 2},
+  .shift_imm2_bits = {6, 2},
+  .shift_imm3_bits = {12, 3},
+  .shift_allowed = (1u << THUMB_SHIFT_LSL) | (1u << THUMB_SHIFT_LSR) | (1u << THUMB_SHIFT_ASR) |
+                   (1u << THUMB_SHIFT_ROR) | (1u << THUMB_SHIFT_RRX),
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_MOV_REG, "mov",
+         {&SHAPE_MOV_REG_T1_HIGH, 0x4600},
+         {&SHAPE_MOV_REG_T1_SHIFT, 0, mov_reg_t1_shift_emit},
+         {&SHAPE_MOV_REG_T3, 0xea4f0000});
+
+thumb_opcode th_mov_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding, bool in_it)
+{ /* register-controlled shifts go to th_mov_reg_shift with shift.value as rs; in_it is not forwarded on that path */
+  if (shift.mode == THUMB_SHIFT_REGISTER && shift.type != THUMB_SHIFT_NONE)
+    return th_mov_reg_shift(rd, rm, shift.value, flags, shift, encoding);
+
+  return thop_emit(TH_MOV_REG.name, TH_MOV_REG.variants, TH_MOV_REG.variant_count,
+                   (thop_args){.rd = rd, .rm = rm, .flags = flags, .shift = shift, .enc = encoding, .in_it_block = in_it});
+}
+
+/* ───── MOV immediate ───── */
+
+/* T1: MOVS Rd, #imm8 — low regs, implicit S, no BLOCK flags. Returns size 0 when the operands do not fit. */
+static thumb_opcode mov_imm_t1_emit(uint32_t base, const thop_args *a)
+{
+  if (a->rd <= 7 && a->imm <= 255 && a->flags != FLAGS_BEHAVIOUR_BLOCK)
+  {
+    THOP_TRACE("movs %s, #%u\n", th_reg_name(a->rd), (unsigned)a->imm);
+    return (thumb_opcode){.size = 2, .opcode = base | (a->rd << 8) | a->imm};
+  }
+  return (thumb_opcode){.size = 0, .opcode = 0};
+}
+
+static const thop_variant_shape SHAPE_MOV_IMM_T1 = {
+  .size = THOP_VARIANT_T16,
+  .implicit_s = true,
+  .feat = {.t16 = 1},
+};
+
+/* T3: MOV Rd, #const — modified immediate */
+static const thop_variant_shape SHAPE_MOV_IMM_T3 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rd_con = REG_NOT_SP | REG_NOT_PC,
+  .imm = {.kind = IMM_PACK_CONST},
+  .has_s_bit = 1,
+  .feat = {.t32 = 1, .mod_imm = 1},
+};
+
+/* T4: MOVW Rd, #imm16 */
+static const thop_variant_shape SHAPE_MOV_IMM_T4 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rd_con = REG_NOT_SP | REG_NOT_PC,
+  .imm = {.kind = IMM_PACK_3_8_1},
+  .feat = {.t32 = 1, .movw_movt = 1},
+};
+
+TH_TABLE(TH_MOV_IMM, "mov",
+         {&SHAPE_MOV_IMM_T1, 0x2000, mov_imm_t1_emit},
+         {&SHAPE_MOV_IMM_T3, 0xf04f0000},
+         {&SHAPE_MOV_IMM_T4, 0xf2400000});
+
+thumb_opcode th_mov_imm(uint16_t rd, uint32_t imm, thumb_flags_behaviour setflags, thumb_enforce_encoding encoding)
+{ /* forwards to thop_emit over TH_MOV_IMM (rows: T1 MOVS, T3 modified immediate, T4 MOVW) */
+  return thop_emit(TH_MOV_IMM.name, TH_MOV_IMM.variants, TH_MOV_IMM.variant_count,
+                   (thop_args){.rd = rd, .imm = imm, .flags = setflags, .enc = encoding});
+}
+
+/* ───── MOVT: same shape fields as the MOVW shape above, with base opcode 0xf2c00000 ───── */
+
+static const thop_variant_shape SHAPE_MOVT = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rd_con = REG_NOT_SP | REG_NOT_PC,
+  .imm = {.kind = IMM_PACK_3_8_1},
+  .feat = {.t32 = 1, .movw_movt = 1},
+};
+
+TH_TABLE(TH_MOVT, "movt", {&SHAPE_MOVT, 0xf2c00000});
+
+thumb_opcode th_movt(uint32_t rd, uint32_t imm16)
+{
+  return thop_emit(TH_MOVT.name, TH_MOVT.variants, TH_MOVT.variant_count,
+                   (thop_args){.rd = rd, .imm = imm16});
+}
+
+/* ───── MOV register-controlled shift ───── */
+
+/* T1: shift-op Rdm, Rs — low regs, rd==rm, shift amount register in .ra. Returns size 0 when not applicable. */
+static thumb_opcode mov_reg_shift_t1_emit(uint32_t base, const thop_args *a)
+{
+  (void)base; /* NOTE(review): a->flags and a->in_it_block are not consulted here, unlike mov_reg_t1_shift_emit */
+  if (a->rd == a->rm && a->rd < 8 && a->ra < 8 && a->enc != ENFORCE_ENCODING_32BIT && a->shift.type != THUMB_SHIFT_RRX)
+  {
+    return (thumb_opcode){
+      .size = 2,
+      .opcode = 0x4000 | (a->ra << 3) | (th_shift_type_to_op(a->shift) << 6) | a->rd,
+    };
+  }
+  return (thumb_opcode){.size = 0, .opcode = 0};
+}
+
+static const thop_variant_shape SHAPE_MOV_REG_SHIFT_T1 = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {0, 3},
+  .rm_place = {0, 3}, /* same field as rd: the 16-bit form has a single Rdm operand */
+  .ra_place = {3, 3},
+  .rd_con = REG_LOW_ONLY | REG_EQ_RM,
+  .rm_con = REG_LOW_ONLY,
+  .ra_con = REG_LOW_ONLY,
+  .implicit_s = 1,
+  .shift_allowed = (1u << THUMB_SHIFT_LSL) | (1u << THUMB_SHIFT_LSR) | (1u << THUMB_SHIFT_ASR) |
+                   (1u << THUMB_SHIFT_ROR),
+  .feat = {.t16 = 1},
+};
+
+static const thop_variant_shape SHAPE_MOV_REG_SHIFT_T3 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rm_place = {16, 4}, /* value being shifted */
+  .ra_place = {0, 4}, /* register holding the shift amount */
+  .has_s_bit = 1,
+  .shift_type_bits = {21, 2},
+  .shift_allowed = (1u << THUMB_SHIFT_LSL) | (1u << THUMB_SHIFT_LSR) | (1u << THUMB_SHIFT_ASR) |
+                   (1u << THUMB_SHIFT_ROR),
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_MOV_REG_SHIFT, "mov",
+         {&SHAPE_MOV_REG_SHIFT_T1, 0, mov_reg_shift_t1_emit},
+         {&SHAPE_MOV_REG_SHIFT_T3, 0xfa00f000});
+
+thumb_opcode th_mov_reg_shift(uint32_t rd, uint32_t rm, uint32_t rs, thumb_flags_behaviour flags, thumb_shift shift,
+                              thumb_enforce_encoding encoding)
+{ /* rs (shift-amount register) travels in the .ra slot of thop_args */
+  return thop_emit(TH_MOV_REG_SHIFT.name, TH_MOV_REG_SHIFT.variants, TH_MOV_REG_SHIFT.variant_count,
+                   (thop_args){.rd = rd, .rm = rm, .ra = rs, .flags = flags, .shift = shift, .enc = encoding});
+}
diff --git a/arch/arm/thumb/thop_mov.h b/arch/arm/thumb/thop_mov.h
new file mode 100644
index 00000000..525fb1cb
--- /dev/null
+++ b/arch/arm/thumb/thop_mov.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include /* NOTE(review): header name is missing here (looks stripped in extraction) — restore it */
+
+#include "thumb.h"
+
+thumb_opcode th_mov_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift,
+                        thumb_enforce_encoding encoding, bool in_it);
+
+thumb_opcode th_mov_imm(uint16_t rd, uint32_t imm, thumb_flags_behaviour setflags, thumb_enforce_encoding encoding);
+
+thumb_opcode th_movt(uint32_t rd, uint32_t imm16);
+
+thumb_opcode th_mov_reg_shift(uint32_t rd, uint32_t rm, uint32_t rs, thumb_flags_behaviour flags, thumb_shift shift,
+                              thumb_enforce_encoding encoding);
diff --git a/arch/arm/thumb/thop_mrs.c b/arch/arm/thumb/thop_mrs.c
new file mode 100644
index 00000000..e8b56987
--- /dev/null
+++ b/arch/arm/thumb/thop_mrs.c
@@ -0,0 +1,83 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * 
This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_mrs.h"
+#include "thumb.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Move to/from special register (MRS, MSR)
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* ───── MRS (T32 only): the special-register number (SYSm) travels in the .rm slot of thop_args ───── */
+
+static thumb_opcode mrs_emit(uint32_t base, const thop_args *a)
+{
+  uint32_t sysm = a->rm; /* NOTE(review): neither sysm nor rd is masked or range-checked in this function */
+  uint32_t rd = a->rd;
+  uint32_t op = base | (rd << 8) | sysm;
+  return (thumb_opcode){.size = 4, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_MRS = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rm_place = {0, 8},
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_MRS, "mrs", {&SHAPE_MRS, 0xf3ef8000, mrs_emit});
+
+/* ───── MSR (T32 only): source register in .rd, SYSm in .rm, 2-bit mask in .imm ───── */
+
+static thumb_opcode msr_emit(uint32_t base, const thop_args *a)
+{
+  uint32_t sysm = a->rm;
+  uint32_t rn = a->rd; /* th_msr passes its rn argument through the .rd slot */
+  uint32_t mask = a->imm; /* NOTE(review): placed at bit 10 without masking to 2 bits */
+  uint32_t op = base | (rn << 16) | (mask << 10) | sysm;
+  return (thumb_opcode){.size = 4, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_MSR = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {16, 4},
+  .rm_place = {0, 8},
+  .imm = {.kind = IMM_RAW, .width = 2},
+  .imm2_place = {10, 2}, /* NOTE(review): the unprivileged load/store shape uses .imm_place for a raw immediate — confirm imm2_place is intended */
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_MSR, "msr", {&SHAPE_MSR, 0xf3808000, msr_emit});
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Public wrappers — map the caller's operands onto thop_args slots and forward to thop_emit()
+ * ═══════════════════════════════════════════════════════════════════ */
+
+thumb_opcode th_mrs(uint32_t rd, uint32_t sysm)
+{
+  return thop_emit(TH_MRS.name, TH_MRS.variants, TH_MRS.variant_count,
+                   (thop_args){.rd = rd, .rm = sysm});
+}
+
+thumb_opcode th_msr(uint32_t specreg, uint32_t rn, uint32_t mask)
+{
+  return thop_emit(TH_MSR.name, TH_MSR.variants, TH_MSR.variant_count,
+                   (thop_args){.rd = rn, .rm = specreg, .imm = mask});
+}
diff --git a/arch/arm/thumb/thop_mrs.h b/arch/arm/thumb/thop_mrs.h
new file mode 100644
index 00000000..2792681a
--- /dev/null
+++ b/arch/arm/thumb/thop_mrs.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include /* NOTE(review): header name is missing here (looks stripped in extraction) — restore it, presumably stdint.h */
+
+#include "thumb.h"
+
+thumb_opcode th_mrs(uint32_t rd, uint32_t sysm);
+thumb_opcode th_msr(uint32_t specreg, uint32_t rn, uint32_t mask);
diff --git a/arch/arm/thumb/thop_mul.c b/arch/arm/thumb/thop_mul.c
new file mode 100644
index 00000000..3dece491
--- /dev/null
+++ b/arch/arm/thumb/thop_mul.c
@@ -0,0 +1,164 @@
+/*
+ * TCC - Tiny C Compiler
+ *
+ * Copyright (c) 2026 Mateusz Stadnik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "thop_mul.h"
+#include "thumb.h"
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Multiply, divide, and long multiply
+ * ═══════════════════════════════════════════════════════════════════ */
+
+/* ───── MUL (T16: lo regs only, Rd == Rm, flag-setting MULS; T32: any regs) ───── */
+
+static thumb_opcode mul_t16_emit(uint32_t base, const thop_args *a)
+{ /* th_mul passes its rn operand in the .rm slot for this path, so bits [5:3] receive rn */
+  uint32_t op = base | ((a->rd & 7) << 0) | ((a->rm & 7) << 3);
+  return (thumb_opcode){.size = 2, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_MUL_T16 = {
+  .size = THOP_VARIANT_T16,
+  .rd_place = {0, 3},
+  .rm_place = {3, 3},
+  .feat = {.t16 = 1},
+};
+
+static const thop_variant_shape SHAPE_MUL_T32 = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rm_place = {0, 4},
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_MUL_T16, "muls", {&SHAPE_MUL_T16, 0x4340, mul_t16_emit});
+TH_TABLE(TH_MUL_T32, "mul", {&SHAPE_MUL_T32, 0xfb00f000, NULL});
+
+/* ───── MLA / MLS (T32 only): one shared shape, accumulator register in .ra at bits [15:12] ───── */
+
+static const thop_variant_shape SHAPE_MLA = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rm_place = {0, 4},
+  .ra_place = {12, 4},
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_MLA, "mla", {&SHAPE_MLA, 0xfb000000, NULL});
+TH_TABLE(TH_MLS, "mls", {&SHAPE_MLA, 0xfb000010, NULL});
+
+/* ───── UMULL / UMLAL / SMULL / SMLAL (T32 only): RdHi in .rd, RdLo in .ra ───── */
+
+static thumb_opcode long_mul_emit(uint32_t base, const thop_args *a)
+{ /* RdHi at [11:8], Rn at [19:16], Rm at [3:0], RdLo at [15:12] */
+  uint32_t op = base | (a->rd << 8) | (a->rn << 16) | (a->rm << 0) | (a->ra << 12);
+  return (thumb_opcode){.size = 4, .opcode = op};
+}
+
+static const thop_variant_shape SHAPE_LONG_MUL = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {0, 4}, /* NOTE(review): rn/rm placements here are swapped relative to long_mul_emit, which does the encoding */
+  .rm_place = {16, 4},
+  .ra_place = {12, 4},
+  .feat = {.t32 = 1},
+};
+
+TH_TABLE(TH_UMULL, "umull", {&SHAPE_LONG_MUL, 0xfba00000, long_mul_emit});
+TH_TABLE(TH_UMLAL, "umlal", {&SHAPE_LONG_MUL, 0xfbe00000, long_mul_emit});
+TH_TABLE(TH_SMULL, "smull", {&SHAPE_LONG_MUL, 0xfb800000, long_mul_emit});
+TH_TABLE(TH_SMLAL, "smlal", {&SHAPE_LONG_MUL, 0xfbc00000, long_mul_emit});
+
+/* ───── SDIV / UDIV (T32 only): gated on the .div feature bit ───── */
+
+static const thop_variant_shape SHAPE_DIV = {
+  .size = THOP_VARIANT_T32,
+  .rd_place = {8, 4},
+  .rn_place = {16, 4},
+  .rm_place = {0, 4},
+  .feat = {.t32 = 1, .div = 1},
+};
+
+TH_TABLE(TH_UDIV, "udiv", {&SHAPE_DIV, 0xfbb0f0f0, NULL});
+TH_TABLE(TH_SDIV, "sdiv", {&SHAPE_DIV, 0xfb90f0f0, NULL});
+
+/* ═══════════════════════════════════════════════════════════════════
+ * Public wrappers — map the caller's operands onto thop_args slots and forward to thop_emit()
+ * ═══════════════════════════════════════════════════════════════════ */
+
+thumb_opcode th_mul(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags,
+                    thumb_enforce_encoding encoding)
+{
+  /* 16-bit MULS updates the flags, so it must not be chosen when the caller asks for them to be preserved. */
+  if (flags == FLAGS_BEHAVIOUR_BLOCK || encoding == ENFORCE_ENCODING_32BIT || rd > 7 || rm > 7 || rn > 7 || rd != rm)
+    return thop_emit(TH_MUL_T32.name, TH_MUL_T32.variants, TH_MUL_T32.variant_count,
+                     (thop_args){.rd = rd, .rm = rm, .rn = rn});
+  return thop_emit(TH_MUL_T16.name, TH_MUL_T16.variants, TH_MUL_T16.variant_count,
+                   (thop_args){.rd = rd, .rm = rn});
+}
+
+thumb_opcode th_mla(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra)
+{
+  return thop_emit(TH_MLA.name, TH_MLA.variants, TH_MLA.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm, .ra = ra});
+}
+
+thumb_opcode th_mls(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra)
+{
+  return thop_emit(TH_MLS.name, TH_MLS.variants, TH_MLS.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm, .ra = ra});
+}
+
+thumb_opcode th_umull(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm)
+{
+  return thop_emit(TH_UMULL.name, TH_UMULL.variants, TH_UMULL.variant_count,
+                   (thop_args){.rd = rdhi, .rn = rn, .rm = rm, .ra = rdlo});
+}
+
+thumb_opcode th_umlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm)
+{
+  return thop_emit(TH_UMLAL.name, TH_UMLAL.variants, TH_UMLAL.variant_count,
+                   (thop_args){.rd = rdhi, .rn = rn, .rm = rm, .ra = rdlo});
+}
+
+thumb_opcode th_smull(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm)
+{
+  return thop_emit(TH_SMULL.name, TH_SMULL.variants, TH_SMULL.variant_count,
+                   (thop_args){.rd = rdhi, .rn = rn, .rm = rm, .ra = rdlo});
+}
+
+thumb_opcode th_smlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm)
+{
+  return thop_emit(TH_SMLAL.name, TH_SMLAL.variants, TH_SMLAL.variant_count,
+                   (thop_args){.rd = rdhi, .rn = rn, .rm = rm, .ra = rdlo});
+}
+
+thumb_opcode th_udiv(uint16_t rd, uint16_t rn, uint16_t rm)
+{
+  return thop_emit(TH_UDIV.name, TH_UDIV.variants, TH_UDIV.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm});
+}
+
+thumb_opcode th_sdiv(uint16_t rd, uint16_t rn, uint16_t rm)
+{
+  return thop_emit(TH_SDIV.name, TH_SDIV.variants, TH_SDIV.variant_count,
+                   (thop_args){.rd = rd, .rn = rn, .rm = rm});
+}
diff --git a/arch/arm/thumb/thop_mul.h b/arch/arm/thumb/thop_mul.h
new file mode 100644
index 00000000..65e81fca
--- /dev/null
+++ b/arch/arm/thumb/thop_mul.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include <stdint.h>
+
+#include "thumb.h"
+
+thumb_opcode th_mul(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags,
+                    thumb_enforce_encoding encoding);
+thumb_opcode th_mla(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra);
+thumb_opcode th_mls(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra);
+thumb_opcode th_umull(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm);
+thumb_opcode th_umlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm);
+thumb_opcode th_smull(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm);
+thumb_opcode th_smlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm);
+thumb_opcode th_udiv(uint16_t rd, uint16_t rn, uint16_t rm);
+thumb_opcode th_sdiv(uint16_t rd, uint16_t rn, uint16_t rm);
diff --git a/arch/arm/thumb/thop_mvn.c
b/arch/arm/thumb/thop_mvn.c new file mode 100644 index 00000000..1dac0050 --- /dev/null +++ b/arch/arm/thumb/thop_mvn.c @@ -0,0 +1,102 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_mvn.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * MVN — move NOT + * ═══════════════════════════════════════════════════════════════════ */ + +/* ───── MVN register ───── */ + +/* T1: MVN , — rd==rn, low regs, implicit S */ +static const thop_variant_shape SHAPE_MVN_REG_T1 = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rm_place = {3, 3}, + .rd_con = REG_LOW_ONLY | REG_EQ_RN, + .rn_con = REG_LOW_ONLY, + .rm_con = REG_LOW_ONLY, + .implicit_s = true, + .feat = {.t16 = 1}, +}; + +/* T3: MVN{S}.W , {,shift} */ +static const thop_variant_shape SHAPE_MVN_REG_T3 = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rn_place = {16, 4}, + .rm_place = {0, 4}, + .rd_con = REG_NOT_PC, + .rn_con = REG_NOT_PC, + .rm_con = REG_NOT_SP | REG_NOT_PC, + .has_s_bit = 1, + .shift_type_bits = {4, 2}, + .shift_imm2_bits = {6, 2}, + .shift_imm3_bits = {12, 3}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_MVN_REG, "mvn", + {&SHAPE_MVN_REG_T1, 0x43c0, NULL}, + 
{&SHAPE_MVN_REG_T3, 0xea6f0000, NULL}); +/* Internal MVN (register) emitter: forwards every operand to thop_emit with the TH_MVN_REG variant table. */ +static thumb_opcode thop_mvn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding enc) +{ + return thop_emit(TH_MVN_REG.name, TH_MVN_REG.variants, TH_MVN_REG.variant_count, + (thop_args){.rd = rd, .rn = rn, .rm = rm, .flags = flags, .shift = shift, .enc = enc}); +} +/* Public MVN (register) entry point; delegates unchanged to thop_mvn_reg. */ +thumb_opcode th_mvn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding encoding) +{ + return thop_mvn_reg(rd, rn, rm, flags, shift, encoding); +} + +/* ───── MVN immediate ───── */ + +/* T3: MVN <Rd>, #<const> — modified immediate only, always 32-bit. Returns a zero-size opcode when th_pack_const yields 0 for a non-zero immediate. */ +static thumb_opcode mvn_imm_emit(uint32_t base, const thop_args *a) +{ + uint32_t S = (a->flags == FLAGS_BEHAVIOUR_SET) ? 1 : 0; + uint32_t packed = th_pack_const(a->imm); + if (packed == 0 && a->imm != 0) + return (thumb_opcode){.size = 0, .opcode = 0}; + return (thumb_opcode){.size = 4, .opcode = base | (S << 20) | (a->rd << 8) | packed}; +} + +static const thop_variant_shape SHAPE_MVN_IMM_T3 = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .has_s_bit = 1, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_MVN_IMM, "mvn", {&SHAPE_MVN_IMM_T3, 0xf06f0000, mvn_imm_emit}); +/* Public MVN (immediate) entry point; rm is accepted but ignored. */ +thumb_opcode th_mvn_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, + thumb_enforce_encoding encoding) +{ + (void)rm; + return thop_emit(TH_MVN_IMM.name, TH_MVN_IMM.variants, TH_MVN_IMM.variant_count, + (thop_args){.rd = rd, .imm = imm, .flags = flags, .enc = encoding}); +} diff --git a/arch/arm/thumb/thop_mvn.h b/arch/arm/thumb/thop_mvn.h new file mode 100644 index 00000000..9f94f3a3 --- /dev/null +++ b/arch/arm/thumb/thop_mvn.h @@ -0,0 +1,10 @@ +#pragma once + +#include <stdint.h> + +#include "thumb.h" + +thumb_opcode th_mvn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding encoding); +thumb_opcode th_mvn_imm(uint32_t rd, uint32_t rm, uint32_t
imm, thumb_flags_behaviour flags, + thumb_enforce_encoding encoding); diff --git a/arch/arm/thumb/thop_pld.c b/arch/arm/thumb/thop_pld.c new file mode 100644 index 00000000..05029bbf --- /dev/null +++ b/arch/arm/thumb/thop_pld.c @@ -0,0 +1,107 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_pld.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Preload instructions (PLD, PLI) + * + * PLD/PLI have two distinct T32 encodings for positive vs negative + * immediates (T1 vs T2) with different opcode bases and immediate + * widths, so the wrappers encode directly rather than going through + * thop_emit. 
+ * ═══════════════════════════════════════════════════════════════════ */ +/* PLD (literal): U (bit 23) is cleared for a negative offset; the offset magnitude is ORed in unmasked, so it must fit in 12 bits. */ +thumb_opcode th_pld_literal(int imm) +{ + int u = 1; + if (imm < 0) { + u = 0; + imm = -imm; + } + return (thumb_opcode){ + .size = 4, + .opcode = 0xf81ff000 | u << 23 | imm, + }; +} +/* PLD/PLDW (immediate): imm >= 0 uses the imm12 form, imm < 0 the imm8 form; w is the W bit, which the T32 encoding places at bit 21 (bit 22 belongs to the load size field). Immediates are not range-checked. */ +thumb_opcode th_pld_imm(uint32_t rn, uint32_t w, int imm) +{ + if (imm >= 0) { + return (thumb_opcode){ + .size = 4, + .opcode = 0xf890f000 | (w & 1u) << 21 | rn << 16 | imm, + }; + } + imm = -imm; + return (thumb_opcode){ + .size = 4, + .opcode = 0xf810fc00 | (w & 1u) << 21 | rn << 16 | imm, + }; +} +/* PLD/PLDW (register): rm at bits [3:0], shift.value at bits [5:4] (unmasked), W at bit 21. The shift.type normalisation below does not reach the encoding. */ +thumb_opcode th_pld_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift) +{ + if (shift.type == THUMB_SHIFT_NONE) + shift.type = THUMB_SHIFT_LSL; + return (thumb_opcode){ + .size = 4, + .opcode = 0xf810f000 | (w & 1u) << 21 | rn << 16 | rm | shift.value << 4, + }; +} +/* PLI (literal): same U-bit / magnitude handling as th_pld_literal, on the PLI base opcode. */ +thumb_opcode th_pli_literal(int imm) +{ + int u = 1; + if (imm < 0) { + u = 0; + imm = -imm; + } + return (thumb_opcode){ + .size = 4, + .opcode = 0xf91ff000 | u << 23 | imm, + }; +} +/* PLI (immediate): imm12 form for imm >= 0, imm8 form otherwise. NOTE(review): PLI defines no W field, yet w is ORed in at bit 22 — callers should pass 0. */ +thumb_opcode th_pli_imm(uint32_t rn, uint32_t w, int imm) +{ + if (imm >= 0) { + return (thumb_opcode){ + .size = 4, + .opcode = 0xf990f000 | w << 22 | rn << 16 | imm, + }; + } + imm = -imm; + return (thumb_opcode){ + .size = 4, + .opcode = 0xf910fc00 | w << 22 | rn << 16 | imm, + }; +} +/* PLI (register): rm at bits [3:0], shift.value at bits [5:4]. NOTE(review): as above, w should be 0 because PLI has no W field. */ +thumb_opcode th_pli_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift) +{ + if (shift.type == THUMB_SHIFT_NONE) + shift.type = THUMB_SHIFT_LSL; + return (thumb_opcode){ + .size = 4, + .opcode = 0xf910f000 | w << 22 | rn << 16 | rm | shift.value << 4, + }; +} diff --git a/arch/arm/thumb/thop_pld.h b/arch/arm/thumb/thop_pld.h new file mode 100644 index 00000000..3f3b996a --- /dev/null +++ b/arch/arm/thumb/thop_pld.h @@ -0,0 +1,12 @@ +#pragma once + +#include <stdint.h> + +#include "thumb.h" + +thumb_opcode th_pld_literal(int imm); +thumb_opcode th_pld_imm(uint32_t rn, uint32_t w, int imm); +thumb_opcode th_pld_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift); +thumb_opcode
th_pli_literal(int imm); +thumb_opcode th_pli_imm(uint32_t rn, uint32_t w, int imm); +thumb_opcode th_pli_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift); diff --git a/arch/arm/thumb/thop_rev.c b/arch/arm/thumb/thop_rev.c new file mode 100644 index 00000000..1c17fe3b --- /dev/null +++ b/arch/arm/thumb/thop_rev.c @@ -0,0 +1,101 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_rev.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Reverse / bit-reverse — shared shapes + * ═══════════════════════════════════════════════════════════════════ */ + +/* T1: <op> <Rd>, <Rm> — 16-bit, rd/rm low; rd at bits [2:0], rm at bits [5:3] */ +static const thop_variant_shape SHAPE_T16_REV = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rm_place = {3, 3}, + .rd_con = REG_LOW_ONLY, + .rm_con = REG_LOW_ONLY, + .feat = {.t16 = 1}, +}; + +/* T2: <op>.W <Rd>, <Rm> — 32-bit, rm duplicated at bits [19:16] through the ra slot */ +static const thop_variant_shape SHAPE_T32_REV = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rm_place = {0, 4}, + .ra_place = {16, 4}, + .rd_con = REG_NOT_PC, + .feat = {.t32 = 1}, +}; + +/* T2 only (no 16-bit variant): rbit; same field layout as SHAPE_T32_REV plus the clz_rbit feature flag */ +static const thop_variant_shape SHAPE_T32_RBIT = { + .size = THOP_VARIANT_T32, +
.rd_place = {8, 4}, + .rm_place = {0, 4}, + .ra_place = {16, 4}, + .rd_con = REG_NOT_PC, + .feat = {.t32 = 1, .clz_rbit = 1}, +}; +/* Variant-entry initializers: shape pointer plus base opcode only. */ +#define V_REV_T16(b) {&SHAPE_T16_REV, (b)} +#define V_REV_T32(b) {&SHAPE_T32_REV, (b)} +#define V_RBIT_T32(b) {&SHAPE_T32_RBIT, (b)} + +/* ═══════════════════════════════════════════════════════════════════ + * Generic wrapper + * ═══════════════════════════════════════════════════════════════════ */ +/* Shared emitter for the reverse family: rm is also passed as ra so that shapes with an ra_place encode it a second time. */ +static thumb_opcode thop_rev(uint32_t rd, uint32_t rm, thumb_enforce_encoding enc, const thop_table *table) +{ + return thop_emit(table->name, table->variants, table->variant_count, + (thop_args){.rd = rd, .rm = rm, .ra = rm, .enc = enc}); +} + +/* ═══════════════════════════════════════════════════════════════════ + * Instruction tables + * ═══════════════════════════════════════════════════════════════════ */ + +TH_TABLE(TH_REV, "rev", V_REV_T16(0xba00), V_REV_T32(0xfa90f080)); +/* REV: emitted through TH_REV, which lists a 16-bit and a 32-bit variant. */ +thumb_opcode th_rev(uint32_t rd, uint32_t rm, thumb_enforce_encoding enc) +{ + return thop_rev(rd, rm, enc, &TH_REV); +} + +TH_TABLE(TH_REV16, "rev16", V_REV_T16(0xba40), V_REV_T32(0xfa90f090)); +/* REV16: emitted through TH_REV16, which lists a 16-bit and a 32-bit variant. */ +thumb_opcode th_rev16(uint32_t rd, uint32_t rm, thumb_enforce_encoding enc) +{ + return thop_rev(rd, rm, enc, &TH_REV16); +} + +TH_TABLE(TH_REVSH, "revsh", V_REV_T16(0xbac0), V_REV_T32(0xfa90f0b0)); +/* REVSH: emitted through TH_REVSH, which lists a 16-bit and a 32-bit variant. */ +thumb_opcode th_revsh(uint32_t rd, uint32_t rm, thumb_enforce_encoding enc) +{ + return thop_rev(rd, rm, enc, &TH_REVSH); +} + +TH_TABLE(TH_RBIT, "rbit", V_RBIT_T32(0xfa90f0a0)); +/* RBIT: only a 32-bit variant is listed, and no encoding preference is passed. */ +thumb_opcode th_rbit(uint32_t rd, uint32_t rm) +{ + return thop_rev(rd, rm, ENFORCE_ENCODING_NONE, &TH_RBIT); +} diff --git a/arch/arm/thumb/thop_rev.h b/arch/arm/thumb/thop_rev.h new file mode 100644 index 00000000..43686072 --- /dev/null +++ b/arch/arm/thumb/thop_rev.h @@ -0,0 +1,30 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General
Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include <stdint.h> + +#include "thumb.h" +/* Byte-reverse and bit-reverse opcode builders; th_rbit takes no encoding selector. */ +thumb_opcode th_rev(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding); +thumb_opcode th_rev16(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding); +thumb_opcode th_revsh(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding); +thumb_opcode th_rbit(uint32_t rd, uint32_t rm); diff --git a/arch/arm/thumb/thop_shift_imm.c b/arch/arm/thumb/thop_shift_imm.c new file mode 100644 index 00000000..4950783c --- /dev/null +++ b/arch/arm/thumb/thop_shift_imm.c @@ -0,0 +1,93 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_shift_imm.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Shift immediate — shared shapes + * ═══════════════════════════════════════════════════════════════════ */ + +/* T1: <op>S <Rd>, <Rm>, #<imm5> — 16-bit, rd/rm low, imm5 raw. + * LSL/LSR/ASR share this shape; shift type encoded in bits [12:11]. + */ +static const thop_variant_shape SHAPE_T16_SHIFT_IMM = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rm_place = {3, 3}, + .rd_con = REG_LOW_ONLY, + .rm_con = REG_LOW_ONLY, + .imm = {.kind = IMM_RAW, .width = 5}, + .imm_place = {6, 5}, + .shift_type_bits = {11, 2}, + .implicit_s = true, + .feat = {.t16 = 1}, +}; + +/* T3: MOV{S}.W <Rd>, <Rm>, <shift> #<imm5> — 32-bit, shift immediate split over the imm3/imm2 fields */ +static const thop_variant_shape SHAPE_T32_SHIFT_IMM = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rm_place = {0, 4}, + .rd_con = REG_NOT_PC, + .rm_con = REG_NOT_PC, + .has_s_bit = 1, + .shift_imm3_bits = {12, 3}, + .shift_imm2_bits = {6, 2}, + .shift_type_bits = {4, 2}, + .feat = {.t32 = 1}, +}; +/* Variant-entry initializers: shape pointer plus base opcode only. */ +#define V_SHIFT_IMM16(b) {&SHAPE_T16_SHIFT_IMM, (b)} +#define V_SHIFT_IMM32(b) {&SHAPE_T32_SHIFT_IMM, (b)} + +/* ═══════════════════════════════════════════════════════════════════ + * Generic wrapper + * ═══════════════════════════════════════════════════════════════════ */ +/* Shared emitter: forwards the operands, the raw immediate and the prepared thumb_shift to thop_emit for the given table. */ +static thumb_opcode thop_shift_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, + thumb_enforce_encoding enc, thumb_shift shift, const thop_table *table) +{ + return thop_emit(table->name, table->variants, table->variant_count, + (thop_args){.rd = rd, .rm = rm, .imm = imm, .flags = flags, .shift = shift, .enc = enc}); +} + +#define THOP_SHIFT_IMM_FN(fn_name, table_id, shift_type) \ + thumb_opcode fn_name(uint32_t rd, uint32_t rm, uint32_t
imm, thumb_flags_behaviour flags, \ + thumb_enforce_encoding enc) \ + { \ + thumb_shift shift = {.type = shift_type, .value = imm, .mode = THUMB_SHIFT_IMMEDIATE}; \ + return thop_shift_imm(rd, rm, imm, flags, enc, shift, &table_id); \ + } + +/* ═══════════════════════════════════════════════════════════════════ + * Instruction tables + * ═══════════════════════════════════════════════════════════════════ */ +/* All four tables use the same base opcodes; the per-function shift type set by THOP_SHIFT_IMM_FN is what differs (presumably encoded by thop_emit through shift_type_bits). */ +TH_TABLE(TH_LSL_IMM, "lsl", V_SHIFT_IMM16(0x0000), V_SHIFT_IMM32(0xEA4F0000)); +THOP_SHIFT_IMM_FN(th_lsl_imm, TH_LSL_IMM, THUMB_SHIFT_LSL) + +TH_TABLE(TH_LSR_IMM, "lsr", V_SHIFT_IMM16(0x0000), V_SHIFT_IMM32(0xEA4F0000)); +THOP_SHIFT_IMM_FN(th_lsr_imm, TH_LSR_IMM, THUMB_SHIFT_LSR) + +TH_TABLE(TH_ASR_IMM, "asr", V_SHIFT_IMM16(0x0000), V_SHIFT_IMM32(0xEA4F0000)); +THOP_SHIFT_IMM_FN(th_asr_imm, TH_ASR_IMM, THUMB_SHIFT_ASR) +/* ROR: only a 32-bit variant is listed. */ +TH_TABLE(TH_ROR_IMM, "ror", V_SHIFT_IMM32(0xEA4F0000)); +THOP_SHIFT_IMM_FN(th_ror_imm, TH_ROR_IMM, THUMB_SHIFT_ROR) diff --git a/arch/arm/thumb/thop_shift_imm.h b/arch/arm/thumb/thop_shift_imm.h new file mode 100644 index 00000000..6727efc2 --- /dev/null +++ b/arch/arm/thumb/thop_shift_imm.h @@ -0,0 +1,30 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include <stdint.h> + +#include "thumb.h" +/* Shift-by-immediate opcode builders; imm is the shift amount. */ +thumb_opcode th_lsl_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); +thumb_opcode th_lsr_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); +thumb_opcode th_asr_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); +thumb_opcode th_ror_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); diff --git a/arch/arm/thumb/thop_shift_reg.c b/arch/arm/thumb/thop_shift_reg.c new file mode 100644 index 00000000..2d4e3f8a --- /dev/null +++ b/arch/arm/thumb/thop_shift_reg.c @@ -0,0 +1,98 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_shift_reg.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Shift register — shared shapes + * ═══════════════════════════════════════════════════════════════════ */ + +/* T1: <op>S <Rdn>, <Rm> — 16-bit, all low, rd==rn, no shift field */ +static const thop_variant_shape SHAPE_T16_SHIFT_REG = { + .size = THOP_VARIANT_T16, + .rd_place = {0, 3}, + .rm_place = {3, 3}, + .rd_con = REG_LOW_ONLY | REG_EQ_RN, + .rn_con = REG_LOW_ONLY, + .rm_con = REG_LOW_ONLY, + .implicit_s = true, + .feat = {.t16 = 1}, +}; + +/* T2/T3 (32-bit): <op>{S}.W <Rd>, <Rn>, <Rm> — no shift field; rd/rn are constrained REG_NOT_PC only, rm is REG_NOT_SP | REG_NOT_PC */ +static const thop_variant_shape SHAPE_T32_SHIFT_REG = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rn_place = {16, 4}, + .rm_place = {0, 4}, + .rd_con = REG_NOT_PC, + .rn_con = REG_NOT_PC, + .rm_con = REG_NOT_SP | REG_NOT_PC, + .has_s_bit = 1, + .feat = {.t32 = 1}, +}; +/* Custom T16 packer: rm at bits [5:3], low three bits of rd at bits [2:0]; rn is not encoded. */ +static thumb_opcode shift_reg_t1_emit(uint32_t base, const thop_args *a) +{ + return (thumb_opcode){ + .size = 2, + .opcode = base | (a->rm << 3) | (a->rd & 0x7), + }; +} +/* Variant-entry initializers; the three T1 macros are identical and all use shift_reg_t1_emit. */ +#define V_LSL_REG_T1(b) {&SHAPE_T16_SHIFT_REG, (b), shift_reg_t1_emit} +#define V_LSR_REG_T1(b) {&SHAPE_T16_SHIFT_REG, (b), shift_reg_t1_emit} +#define V_ASR_REG_T1(b) {&SHAPE_T16_SHIFT_REG, (b), shift_reg_t1_emit} +#define V_SHIFT_REG32(b) {&SHAPE_T32_SHIFT_REG, (b)} + +/* ═══════════════════════════════════════════════════════════════════ + * Generic wrapper + * ═══════════════════════════════════════════════════════════════════ */ + +static thumb_opcode thop_shift_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, + thumb_enforce_encoding enc, const thop_table *table) +{ + return thop_emit(table->name, table->variants, table->variant_count, + (thop_args){.rd = rd,
.rn = rn, .rm = rm, .flags = flags, .enc = enc}); +} +/* Stamps out a public wrapper for one table; the shift argument is accepted but discarded. */ +#define THOP_SHIFT_REG_FN(fn_name, table_id) \ + thumb_opcode fn_name(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, \ + thumb_enforce_encoding enc) \ + { \ + (void)shift; \ + return thop_shift_reg(rd, rn, rm, flags, enc, &table_id); \ + } + +/* ═══════════════════════════════════════════════════════════════════ + * Instruction tables + * ═══════════════════════════════════════════════════════════════════ */ + +TH_TABLE(TH_LSL_REG, "lsl", V_LSL_REG_T1(0x4080), V_SHIFT_REG32(0xFA00F000)); +THOP_SHIFT_REG_FN(th_lsl_reg, TH_LSL_REG) + +TH_TABLE(TH_LSR_REG, "lsr", V_LSR_REG_T1(0x40C0), V_SHIFT_REG32(0xFA20F000)); +THOP_SHIFT_REG_FN(th_lsr_reg, TH_LSR_REG) + +TH_TABLE(TH_ASR_REG, "asr", V_ASR_REG_T1(0x4100), V_SHIFT_REG32(0xFA40F000)); +THOP_SHIFT_REG_FN(th_asr_reg, TH_ASR_REG) +/* ROR: only a 32-bit variant is listed. */ +TH_TABLE(TH_ROR_REG, "ror", V_SHIFT_REG32(0xFA60F000)); +THOP_SHIFT_REG_FN(th_ror_reg, TH_ROR_REG) diff --git a/arch/arm/thumb/thop_shift_reg.h b/arch/arm/thumb/thop_shift_reg.h new file mode 100644 index 00000000..8987e2b6 --- /dev/null +++ b/arch/arm/thumb/thop_shift_reg.h @@ -0,0 +1,34 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include <stdint.h> + +#include "thumb.h" +/* Shift-by-register opcode builders; rm holds the shift amount register. */ +thumb_opcode th_lsl_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding encoding); +thumb_opcode th_lsr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding encoding); +thumb_opcode th_asr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding encoding); +thumb_opcode th_ror_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding encoding); diff --git a/arch/arm/thumb/thop_system.c b/arch/arm/thumb/thop_system.c new file mode 100644 index 00000000..f9fe6ded --- /dev/null +++ b/arch/arm/thumb/thop_system.c @@ -0,0 +1,256 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_system.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * System hints, barriers, exceptions, status + * ═══════════════════════════════════════════════════════════════════ */ + +/* ───── Hint instructions (NOP, SEV, WFE, WFI, YIELD) ───── */ +/* 16-bit hint shape: 4-bit raw immediate at bits [7:4]. */ +static const thop_variant_shape SHAPE_HINT_T16 = { + .size = THOP_VARIANT_T16, + .imm = {.kind = IMM_RAW, .width = 4}, + .imm_place = {4, 4}, + .feat = {.t16 = 1}, +}; +/* 32-bit hint shape: 4-bit raw immediate at bits [3:0]. */ +static const thop_variant_shape SHAPE_HINT_T32 = { + .size = THOP_VARIANT_T32, + .imm = {.kind = IMM_RAW, .width = 4}, + .imm_place = {0, 4}, + .feat = {.t32 = 1}, +}; +/* One single-variant table per hint and per width; each base opcode already carries the hint number. */ +TH_TABLE(TH_NOP_T16, "nop", {&SHAPE_HINT_T16, 0xbf00, NULL}); +TH_TABLE(TH_NOP_T32, "nop.w", {&SHAPE_HINT_T32, 0xf3af8000, NULL}); + +TH_TABLE(TH_SEV_T16, "sev", {&SHAPE_HINT_T16, 0xbf40, NULL}); +TH_TABLE(TH_SEV_T32, "sev.w", {&SHAPE_HINT_T32, 0xf3af8004, NULL}); + +TH_TABLE(TH_WFE_T16, "wfe", {&SHAPE_HINT_T16, 0xbf20, NULL}); +TH_TABLE(TH_WFE_T32, "wfe.w", {&SHAPE_HINT_T32, 0xf3af8002, NULL}); + +TH_TABLE(TH_WFI_T16, "wfi", {&SHAPE_HINT_T16, 0xbf30, NULL}); +TH_TABLE(TH_WFI_T32, "wfi.w", {&SHAPE_HINT_T32, 0xf3af8003, NULL}); + +TH_TABLE(TH_YIELD_T16, "yield", {&SHAPE_HINT_T16, 0xbf10, NULL}); +TH_TABLE(TH_YIELD_T32, "yield.w", {&SHAPE_HINT_T32, 0xf3af8001, NULL}); + +/* ───── SVC / BKPT (T16 only, imm8) ───── */ +/* Shared by SVC and BKPT: 8-bit raw immediate at bits [7:0]. */ +static const thop_variant_shape SHAPE_IMM8_T16 = { + .size = THOP_VARIANT_T16, + .imm = {.kind = IMM_RAW, .width = 8}, + .imm_place = {0, 8}, + .feat = {.t16 = 1}, +}; + +TH_TABLE(TH_SVC, "svc", {&SHAPE_IMM8_T16, 0xdf00, NULL}); +TH_TABLE(TH_BKPT, "bkpt", {&SHAPE_IMM8_T16, 0xbe00, NULL}); + +/* ───── UDF (T16 imm8, T32 imm12+imm4) ───── */ + +static const thop_variant_shape SHAPE_UDF_T16
= { + .size = THOP_VARIANT_T16, + .imm = {.kind = IMM_RAW, .width = 8}, + .imm_place = {0, 8}, + .feat = {.t16 = 1}, +}; +/* 32-bit UDF: 12-bit raw immediate at bits [11:0] plus a second field (imm2) at bits [19:16]. */ +static const thop_variant_shape SHAPE_UDF_T32 = { + .size = THOP_VARIANT_T32, + .imm = {.kind = IMM_RAW, .width = 12}, + .imm_place = {0, 12}, + .imm2_place = {16, 4}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_UDF_T16, "udf", {&SHAPE_UDF_T16, 0xde00, NULL}); +TH_TABLE(TH_UDF_T32, "udf.w", {&SHAPE_UDF_T32, 0xf7f0a000, NULL}); + +/* ───── CPS (T16 only) ───── */ +/* The rd slot is a 1-bit field at bit 4; imm sits at bit 0 and imm2 at bit 1. */ +static const thop_variant_shape SHAPE_CPS = { + .size = THOP_VARIANT_T16, + .rd_place = {4, 1}, + .imm = {.kind = IMM_RAW, .width = 1}, + .imm_place = {0, 1}, + .imm2_place = {1, 1}, + .feat = {.t16 = 1}, +}; + +TH_TABLE(TH_CPS, "cps", {&SHAPE_CPS, 0xb660, NULL}); + +/* ───── CLREX, CSDB, DMB, DSB, ISB, SSBB (T32 only) ───── */ +/* Barrier shape: 4-bit raw option at bits [3:0]. */ +static const thop_variant_shape SHAPE_BARRIER = { + .size = THOP_VARIANT_T32, + .imm = {.kind = IMM_RAW, .width = 4}, + .imm_place = {0, 4}, + .feat = {.t32 = 1}, +}; +/* Shape for T32 instructions that take no operands. */ +static const thop_variant_shape SHAPE_NOARG_T32 = { + .size = THOP_VARIANT_T32, + .feat = {.t32 = 1}, +}; +/* SSBB reuses the DSB base opcode (0xf3bf8f40) but has no option field. */ +TH_TABLE(TH_CLREX, "clrex", {&SHAPE_NOARG_T32, 0xf3bf8f2f, NULL}); +TH_TABLE(TH_CSDB, "csdb", {&SHAPE_NOARG_T32, 0xf3af8014, NULL}); +TH_TABLE(TH_DMB, "dmb", {&SHAPE_BARRIER, 0xf3bf8f50, NULL}); +TH_TABLE(TH_DSB, "dsb", {&SHAPE_BARRIER, 0xf3bf8f40, NULL}); +TH_TABLE(TH_ISB, "isb", {&SHAPE_BARRIER, 0xf3bf8f60, NULL}); +TH_TABLE(TH_SSBB, "ssbb", {&SHAPE_NOARG_T32, 0xf3bf8f40, NULL}); + +/* ───── IT (T16 only) ───── */ +/* The rd slot carries a 4-bit field at bits [7:4]; imm is a 4-bit field at bits [3:0]. */ +static const thop_variant_shape SHAPE_IT = { + .size = THOP_VARIANT_T16, + .rd_place = {4, 4}, + .imm = {.kind = IMM_RAW, .width = 4}, + .imm_place = {0, 4}, + .feat = {.t16 = 1}, +}; + +TH_TABLE(TH_IT, "it", {&SHAPE_IT, 0xbf00, NULL}); + +/* ───── CLZ (T32 only) ───── */ +/* Custom packer: rm is written at both bits [19:16] and [3:0], rd at bits [11:8]. */ +static thumb_opcode clz_emit(uint32_t base, const thop_args *a) +{ + uint32_t op = base | (a->rm << 16) | (a->rd << 8) | a->rm; + return (thumb_opcode){.size = 4, .opcode = op}; +} + +static const
thop_variant_shape SHAPE_CLZ = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rm_place = {16, 4}, + .feat = {.t32 = 1, .clz_rbit = 1}, +}; + +TH_TABLE(TH_CLZ, "clz", {&SHAPE_CLZ, 0xfab0f080, clz_emit}); + +/* ═══════════════════════════════════════════════════════════════════ + * Public wrappers + * ═══════════════════════════════════════════════════════════════════ */ +/* Hint wrappers: the 32-bit table is used only for ENFORCE_ENCODING_32BIT, every other value selects the 16-bit table. Arguments are zero-initialised with {0} because an empty {} initializer is not ISO C11. */ +thumb_opcode th_nop(thumb_enforce_encoding encoding) +{ + if (encoding == ENFORCE_ENCODING_32BIT) + return thop_emit(TH_NOP_T32.name, TH_NOP_T32.variants, TH_NOP_T32.variant_count, (thop_args){0}); + return thop_emit(TH_NOP_T16.name, TH_NOP_T16.variants, TH_NOP_T16.variant_count, (thop_args){0}); +} + +thumb_opcode th_sev(thumb_enforce_encoding encoding) +{ + if (encoding == ENFORCE_ENCODING_32BIT) + return thop_emit(TH_SEV_T32.name, TH_SEV_T32.variants, TH_SEV_T32.variant_count, (thop_args){0}); + return thop_emit(TH_SEV_T16.name, TH_SEV_T16.variants, TH_SEV_T16.variant_count, (thop_args){0}); +} + +thumb_opcode th_wfe(thumb_enforce_encoding encoding) +{ + if (encoding == ENFORCE_ENCODING_32BIT) + return thop_emit(TH_WFE_T32.name, TH_WFE_T32.variants, TH_WFE_T32.variant_count, (thop_args){0}); + return thop_emit(TH_WFE_T16.name, TH_WFE_T16.variants, TH_WFE_T16.variant_count, (thop_args){0}); +} + +thumb_opcode th_wfi(thumb_enforce_encoding encoding) +{ + if (encoding == ENFORCE_ENCODING_32BIT) + return thop_emit(TH_WFI_T32.name, TH_WFI_T32.variants, TH_WFI_T32.variant_count, (thop_args){0}); + return thop_emit(TH_WFI_T16.name, TH_WFI_T16.variants, TH_WFI_T16.variant_count, (thop_args){0}); +} + +thumb_opcode th_yield(thumb_enforce_encoding encoding) +{ + if (encoding == ENFORCE_ENCODING_32BIT) + return thop_emit(TH_YIELD_T32.name, TH_YIELD_T32.variants, TH_YIELD_T32.variant_count, (thop_args){0}); + return thop_emit(TH_YIELD_T16.name, TH_YIELD_T16.variants, TH_YIELD_T16.variant_count, (thop_args){0}); +} + +thumb_opcode th_svc(uint32_t imm) +{ + return thop_emit(TH_SVC.name, TH_SVC.variants,
TH_SVC.variant_count, (thop_args){.imm = imm}); +} + +thumb_opcode th_bkpt(uint32_t imm) +{ + return thop_emit(TH_BKPT.name, TH_BKPT.variants, TH_BKPT.variant_count, (thop_args){.imm = imm}); +} +/* UDF: 16-bit form when 32-bit is not forced and imm fits in 8 bits; otherwise the 32-bit form with imm split into its low 12 bits (imm) and the next 4 bits (imm2). */ +thumb_opcode th_udf(uint32_t imm, thumb_enforce_encoding encoding) +{ + if (encoding != ENFORCE_ENCODING_32BIT && imm <= 0xff) + return thop_emit(TH_UDF_T16.name, TH_UDF_T16.variants, TH_UDF_T16.variant_count, (thop_args){.imm = imm}); + return thop_emit(TH_UDF_T32.name, TH_UDF_T32.variants, TH_UDF_T32.variant_count, + (thop_args){.imm = imm & 0xfff, .imm2 = (imm >> 12) & 0xf}); +} +/* CPS: enable goes in the rd slot, f in imm and i in imm2. */ +thumb_opcode th_cps(uint32_t enable, uint32_t i, uint32_t f) +{ + return thop_emit(TH_CPS.name, TH_CPS.variants, TH_CPS.variant_count, + (thop_args){.rd = enable, .imm = f, .imm2 = i}); +} +/* Operand-less wrappers use a (void) parameter list and a {0} argument struct so they are valid ISO C11. */ +thumb_opcode th_clrex(void) +{ + return thop_emit(TH_CLREX.name, TH_CLREX.variants, TH_CLREX.variant_count, (thop_args){0}); +} + +thumb_opcode th_csdb(void) +{ + return thop_emit(TH_CSDB.name, TH_CSDB.variants, TH_CSDB.variant_count, (thop_args){0}); +} + +thumb_opcode th_dmb(uint32_t option) +{ + return thop_emit(TH_DMB.name, TH_DMB.variants, TH_DMB.variant_count, (thop_args){.imm = option}); +} + +thumb_opcode th_dsb(uint32_t option) +{ + return thop_emit(TH_DSB.name, TH_DSB.variants, TH_DSB.variant_count, (thop_args){.imm = option}); +} + +thumb_opcode th_isb(uint32_t option) +{ + return thop_emit(TH_ISB.name, TH_ISB.variants, TH_ISB.variant_count, (thop_args){.imm = option}); +} + +thumb_opcode th_ssbb(void) +{ + return thop_emit(TH_SSBB.name, TH_SSBB.variants, TH_SSBB.variant_count, (thop_args){0}); +} +/* IT: cond goes in the rd slot, mask in imm. */ +thumb_opcode th_it(uint16_t cond, uint16_t mask) +{ + return thop_emit(TH_IT.name, TH_IT.variants, TH_IT.variant_count, + (thop_args){.rd = cond, .imm = mask}); +} + +thumb_opcode th_clz(uint32_t rd, uint32_t rm) +{ + return thop_emit(TH_CLZ.name, TH_CLZ.variants, TH_CLZ.variant_count, (thop_args){.rd = rd, .rm = rm}); +} diff --git a/arch/arm/thumb/thop_system.h b/arch/arm/thumb/thop_system.h new file mode
100644 index 00000000..9855ce8d --- /dev/null +++ b/arch/arm/thumb/thop_system.h @@ -0,0 +1,23 @@ +#pragma once + +#include <stdint.h> + +#include "thumb.h" +/* System, hint and barrier opcode builders; operand-less ones are declared with (void) so they are real prototypes. */ +thumb_opcode th_nop(thumb_enforce_encoding encoding); +thumb_opcode th_sev(thumb_enforce_encoding encoding); +thumb_opcode th_wfe(thumb_enforce_encoding encoding); +thumb_opcode th_wfi(thumb_enforce_encoding encoding); +thumb_opcode th_yield(thumb_enforce_encoding encoding); +thumb_opcode th_svc(uint32_t imm); +thumb_opcode th_bkpt(uint32_t imm); +thumb_opcode th_udf(uint32_t imm, thumb_enforce_encoding encoding); +thumb_opcode th_cps(uint32_t enable, uint32_t i, uint32_t f); +thumb_opcode th_clrex(void); +thumb_opcode th_csdb(void); +thumb_opcode th_dmb(uint32_t option); +thumb_opcode th_dsb(uint32_t option); +thumb_opcode th_isb(uint32_t option); +thumb_opcode th_ssbb(void); +thumb_opcode th_it(uint16_t cond, uint16_t mask); +thumb_opcode th_clz(uint32_t rd, uint32_t rm); diff --git a/arch/arm/thumb/thop_tbb.c b/arch/arm/thumb/thop_tbb.c new file mode 100644 index 00000000..dc9a0330 --- /dev/null +++ b/arch/arm/thumb/thop_tbb.c @@ -0,0 +1,80 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_tbb.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Table branch (TBB, TBH) and TT instructions + * ═══════════════════════════════════════════════════════════════════ */ + +/* ───── TBB / TBH (T32 only, ARMv7-M / v8-M) ───── */ +/* Shared by TBB and TBH: rn at bits [19:16], rm at bits [3:0]. */ +static const thop_variant_shape SHAPE_TBB = { + .size = THOP_VARIANT_T32, + .rn_place = {16, 4}, + .rm_place = {0, 4}, + .feat = {.t32 = 1, .tbb_tbh = 1}, +}; + +TH_TABLE(TH_TBB, "tbb", {&SHAPE_TBB, 0xe8d0f000, NULL}); +TH_TABLE(TH_TBH, "tbh", {&SHAPE_TBB, 0xe8d0f010, NULL}); + +/* ───── TT / TTT / TTA / TTAT (T32 only, ARMv8-M) ───── */ +/* Custom packer: rn at bits [19:16], rd at bits [11:8]; a non-zero imm sets the A bit and a non-zero imm2 sets the T bit. */ +static thumb_opcode tt_emit(uint32_t base, const thop_args *a) +{ + uint32_t op = base | (a->rn << 16) | (a->rd << 8); + if (a->imm) { + op |= 0x0080; /* A bit (bit 7) */ + } + if (a->imm2) { + op |= 0x0040; /* T bit (bit 6) */ + } + return (thumb_opcode){.size = 4, .opcode = op}; +} + +static const thop_variant_shape SHAPE_TT = { + .size = THOP_VARIANT_T32, + .rd_place = {8, 4}, + .rn_place = {16, 4}, + .feat = {.t32 = 1}, +}; + +TH_TABLE(TH_TT, "tt", {&SHAPE_TT, 0xe840f000, tt_emit}); + +/* ═══════════════════════════════════════════════════════════════════ + * Public wrappers + * ═══════════════════════════════════════════════════════════════════ */ +/* Emits TBH when h is non-zero and TBB otherwise; rn and rm are forwarded unchanged. */ +thumb_opcode th_tbb(uint32_t rn, uint32_t rm, uint32_t h) +{ + if (h) + return thop_emit(TH_TBH.name, TH_TBH.variants, TH_TBH.variant_count, + (thop_args){.rn = rn, .rm = rm}); + return thop_emit(TH_TBB.name, TH_TBB.variants, TH_TBB.variant_count, + (thop_args){.rn = rn, .rm = rm}); +} +/* TT family: a and t travel as imm and imm2, which tt_emit turns into the A and T bits. */ +thumb_opcode th_tt(uint32_t rd, uint32_t rn, uint32_t a, uint32_t t) +{ + return thop_emit(TH_TT.name, TH_TT.variants, TH_TT.variant_count, + (thop_args){.rd = rd, .rn = rn,
.imm = a, .imm2 = t}); +} diff --git a/arch/arm/thumb/thop_tbb.h b/arch/arm/thumb/thop_tbb.h new file mode 100644 index 00000000..c36bce19 --- /dev/null +++ b/arch/arm/thumb/thop_tbb.h @@ -0,0 +1,8 @@ +#pragma once + +#include <stdint.h> + +#include "thumb.h" +/* Opcode builders: th_tbb emits TBH when h is non-zero and TBB otherwise; th_tt forwards a and t as the A and T selector bits of TT. */ +thumb_opcode th_tbb(uint32_t rn, uint32_t rm, uint32_t h); +thumb_opcode th_tt(uint32_t rd, uint32_t rn, uint32_t a, uint32_t t); diff --git a/arch/arm/thumb/thop_vfp.c b/arch/arm/thumb/thop_vfp.c new file mode 100644 index 00000000..2bed87ed --- /dev/null +++ b/arch/arm/thumb/thop_vfp.c @@ -0,0 +1,476 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "thop_vfp.h" +#include "thumb.h" + +/* ═══════════════════════════════════════════════════════════════════ + * VFP custom emit helpers — handle D:Vd / N:Vn / M:Vm split encoding + * ═══════════════════════════════════════════════════════════════════ */ +/* Single-precision split: bit 0 of reg goes to *D, bits 4:1 go to *V. */ +static void vfp_pack_sp(uint32_t reg, uint32_t *D, uint32_t *V) +{ + *D = reg & 1; + *V = (reg >> 1) & 0xf; +} +/* Double-precision split: bit 4 of reg goes to *D, bits 3:0 go to *V. */ +static void vfp_pack_dp(uint32_t reg, uint32_t *D, uint32_t *V) +{ + *D = (reg >> 4) & 1; + *V = reg & 0xf; +} + +/* 3-register arithmetic (vadd, vsub, vmul, vdiv). Bit 8 of base selects double-precision register packing; the result places D at bit 22, Vn at bit 16, Vd at bit 12, N at bit 7, M at bit 5 and Vm at bit 0. */ +static thumb_opcode vfp_arith3_emit(uint32_t base, const thop_args *a) +{ + uint32_t D, Vd, N, Vn, M, Vm; + if (base & (1u << 8)) + { + vfp_pack_dp(a->rd, &D, &Vd); + vfp_pack_dp(a->rn, &N, &Vn); + vfp_pack_dp(a->rm, &M, &Vm); + } + else + { + vfp_pack_sp(a->rd, &D, &Vd); + vfp_pack_sp(a->rn, &N, &Vn); + vfp_pack_sp(a->rm, &M, &Vm); + } + uint32_t op = base | (D << 22) | (Vn << 16) | (Vd << 12) | (N << 7) | (M << 5) | Vm; + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* 2-register arithmetic / compare (vneg, vcmp, vcmpe). Same packing rule as the 3-register form, but only rd (D:Vd) and rm (M:Vm) are encoded. */ +static thumb_opcode vfp_arith2_emit(uint32_t base, const thop_args *a) +{ + uint32_t D, Vd, M, Vm; + if (base & (1u << 8)) + { + vfp_pack_dp(a->rd, &D, &Vd); + vfp_pack_dp(a->rm, &M, &Vm); + } + else + { + vfp_pack_sp(a->rd, &D, &Vd); + vfp_pack_sp(a->rm, &M, &Vm); + } + uint32_t op = base | (D << 22) | (Vd << 12) | (M << 5) | Vm; + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* Register move (vmov_register). The body is the same computation as vfp_arith2_emit: rd and rm packed according to bit 8 of base. */ +static thumb_opcode vmov_reg_emit(uint32_t base, const thop_args *a) +{ + uint32_t D, Vd, M, Vm; + if (base & (1u << 8)) + { + vfp_pack_dp(a->rd, &D, &Vd); + vfp_pack_dp(a->rm, &M, &Vm); + } + else + { + vfp_pack_sp(a->rd, &D, &Vd); + vfp_pack_sp(a->rm, 
&M, &Vm); + } + uint32_t op = base | (D << 22) | (Vd << 12) | (M << 5) | Vm; + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* Push / pop (vpush, vpop). a->imm is a register bitmask; bit 8 of base selects the doubleword form. */ +static thumb_opcode vfp_pushpop_emit(uint32_t base, const thop_args *a) +{ + uint32_t regs = a->imm; + uint32_t is_doubleword = (base >> 8) & 1; + /* first_register becomes the index of the lowest set bit of regs (stays 0 when regs is empty); register_count becomes the number of set bits. */ + int first_register = 0; + int register_count = 0; + for (int i = 0; i < 32; i++) + { + if (regs & (1u << i)) + { + first_register = i; + break; + } + } + for (int i = 0; i < 32; i++) + { + if (regs & (1u << i)) + register_count++; + } + /* Doubleword form: D is bit 4 and Vd bits 3:0 of the first register, and the count is doubled; otherwise D is bit 0 and Vd bits 4:1. */ + uint32_t D, Vd; + if (is_doubleword) + { + D = (first_register >> 4) & 1; + Vd = first_register & 0xf; + register_count <<= 1; + } + else + { + D = first_register & 1; + Vd = (first_register >> 1) & 0xf; + } + /* The low 8 bits of the opcode carry the (possibly doubled) register count. */ + uint32_t op = base | (D << 22) | (Vd << 12) | (register_count & 0xff); + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* VMOV between GPR and single-precision VFP register: a->rd goes to bit 12, a->rn is split into Vn (bit 16) and N (bit 7), a->imm2 goes to bit 20. */ +static thumb_opcode vmov_gp_sp_emit(uint32_t base, const thop_args *a) +{ + uint32_t Vn = (a->rn >> 1) & 0xf; + uint32_t N = a->rn & 1; + uint32_t op = base | (a->imm2 << 20) | (a->rd << 12) | (Vn << 16) | (N << 7); + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* VMOV between two GPRs and double-precision VFP register: a->rd goes to bit 12, a->rn to bit 16, a->rm is split into M (bit 5) and Vm (bits 3:0), a->imm2 goes to bit 20. */ +static thumb_opcode vmov_2gp_dp_emit(uint32_t base, const thop_args *a) +{ + uint32_t M = (a->rm >> 4) & 1; + uint32_t Vm = a->rm & 0xf; + uint32_t op = base | (a->imm2 << 20) | (a->rn << 16) | (a->rd << 12) | (M << 5) | Vm; + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* VCVT float-to-double and double-to-float. vcvt_fd_emit splits rd as a double-precision register and rm as a single-precision register. */ +static thumb_opcode vcvt_fd_emit(uint32_t base, const thop_args *a) +{ + uint32_t D = (a->rd >> 4) & 1; + uint32_t Vd = a->rd & 0xf; + uint32_t M = a->rm & 1; + uint32_t Vm = (a->rm >> 1) & 0xf; + uint32_t op = base | (D << 22) | (Vd << 12) | (M << 5) | Vm; + return (thumb_opcode){.size = 4, .opcode = op}; +} +/* vcvt_df_emit is the mirror case: rd split as single precision, rm as double precision. */ +static thumb_opcode vcvt_df_emit(uint32_t base, const thop_args *a) +{ 
+ uint32_t D = a->rd & 1; + uint32_t Vd = (a->rd >> 1) & 0xf; + uint32_t M = (a->rm >> 4) & 1; + uint32_t Vm = a->rm & 0xf; + uint32_t op = base | (D << 22) | (Vd << 12) | (M << 5) | Vm; + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* VCVT between floating-point and integer. A non-zero a->imm means fp -> int and is also ORed in at bit 16; bit 7 is set when converting fp -> int or when a->imm2 is set; bit 8 of base (sz) selects the double-precision variant. */ +static thumb_opcode vcvt_fp_int_emit(uint32_t base, const thop_args *a) +{ + uint32_t sz = (base >> 8) & 1; + uint32_t is_fp_to_int = (a->imm != 0); + uint32_t op_bit = is_fp_to_int | a->imm2; + uint32_t D, Vd, M, Vm; + /* Destination register packing. */ + if (is_fp_to_int) + { /* fp -> int: destination is always Sd */ + vfp_pack_sp(a->rd, &D, &Vd); + } + else + { /* int -> fp: destination is Sd (sz=0) or Dd (sz=1) */ + if (sz == 0) + vfp_pack_sp(a->rd, &D, &Vd); + else + vfp_pack_dp(a->rd, &D, &Vd); + } + /* Source register packing. */ + if (is_fp_to_int && sz == 1) + { /* fp -> int with double source */ + vfp_pack_dp(a->rm, &M, &Vm); + } + else + { /* source is Sm in all other cases */ + vfp_pack_sp(a->rm, &M, &Vm); + } + /* Bit 6 (0x40) is always set in the result. */ + uint32_t op = base | (D << 22) | (Vd << 12) | (a->imm << 16) | (op_bit << 7) | 0x40 | (M << 5) | Vm; + return (thumb_opcode){.size = 4, .opcode = op}; +} + +/* ═══════════════════════════════════════════════════════════════════ + * Shared shapes + * ═══════════════════════════════════════════════════════════════════ */ +/* Generic T32 shapes with no register fields: the custom emitters do all register packing; they differ only in the required FP feature (vfp_sp or vfp_dp). */ +static const thop_variant_shape SHAPE_VFP_SP = { + .size = THOP_VARIANT_T32, + .feat = {.t32 = 1, .vfp_sp = 1}, +}; + +static const thop_variant_shape SHAPE_VFP_DP = { + .size = THOP_VARIANT_T32, + .feat = {.t32 = 1, .vfp_dp = 1}, +}; +/* GPR <-> Sn move: rd at bit 12, one-bit imm2 at bit 20. */ +static const thop_variant_shape SHAPE_VMOVGPSP = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .imm2_place = {20, 1}, + .feat = {.t32 = 1, .vfp_sp = 1}, +}; +/* Two GPRs <-> Dm move: rd at bit 12, rn at bit 16, one-bit imm2 at bit 20. */ +static const thop_variant_shape SHAPE_VMOV2GPDP = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .rn_place = {16, 4}, + .imm2_place = {20, 1}, + .feat = {.t32 = 1, .vfp_dp = 1}, +}; +/* VMRS: only rd (bit 12) is encoded. */ +static const thop_variant_shape SHAPE_VMRS = { + .size = THOP_VARIANT_T32, + .rd_place = {12, 4}, + .feat = {.t32 = 
1, .vfp_sp = 1}, +}; +/* Both float<->double conversion shapes require vfp_dp and leave all register packing to their custom emitters. */ +static const thop_variant_shape SHAPE_VCVT_FD = { + .size = THOP_VARIANT_T32, + .feat = {.t32 = 1, .vfp_dp = 1}, +}; + +static const thop_variant_shape SHAPE_VCVT_DF = { + .size = THOP_VARIANT_T32, + .feat = {.t32 = 1, .vfp_dp = 1}, +}; +/* fp <-> int conversion shapes: raw 4-bit immediate at bit 16, one-bit imm2 at bit 7, one puw bit at bit 8; the SP and DP shapes differ only in the required FP feature. */ +static const thop_variant_shape SHAPE_VCVT_FP_INT_SP = { + .size = THOP_VARIANT_T32, + .imm = {.kind = IMM_RAW, .width = 4}, + .imm_place = {16, 4}, + .imm2_place = {7, 1}, + .puw_bits = {8, 1}, + .feat = {.t32 = 1, .vfp_sp = 1}, +}; + +static const thop_variant_shape SHAPE_VCVT_FP_INT_DP = { + .size = THOP_VARIANT_T32, + .imm = {.kind = IMM_RAW, .width = 4}, + .imm_place = {16, 4}, + .imm2_place = {7, 1}, + .puw_bits = {8, 1}, + .feat = {.t32 = 1, .vfp_dp = 1}, +}; + +/* ═══════════════════════════════════════════════════════════════════ + * THOP tables + * ═══════════════════════════════════════════════════════════════════ */ +/* In every SP/DP pair below the DP base opcode differs from the SP one by bit 8 (0x100), which the custom emitters test to choose register packing. */ +/* VADD.F32 / VADD.F64 */ +TH_TABLE(TH_VADD_F_SP, "vadd.f32", {&SHAPE_VFP_SP, 0xee300a00, vfp_arith3_emit}); +TH_TABLE(TH_VADD_F_DP, "vadd.f64", {&SHAPE_VFP_DP, 0xee300b00, vfp_arith3_emit}); + +/* VSUB.F32 / VSUB.F64 */ +TH_TABLE(TH_VSUB_F_SP, "vsub.f32", {&SHAPE_VFP_SP, 0xee300a40, vfp_arith3_emit}); +TH_TABLE(TH_VSUB_F_DP, "vsub.f64", {&SHAPE_VFP_DP, 0xee300b40, vfp_arith3_emit}); + +/* VMUL.F32 / VMUL.F64 */ +TH_TABLE(TH_VMUL_F_SP, "vmul.f32", {&SHAPE_VFP_SP, 0xee200a00, vfp_arith3_emit}); +TH_TABLE(TH_VMUL_F_DP, "vmul.f64", {&SHAPE_VFP_DP, 0xee200b00, vfp_arith3_emit}); + +/* VDIV.F32 / VDIV.F64 */ +TH_TABLE(TH_VDIV_F_SP, "vdiv.f32", {&SHAPE_VFP_SP, 0xee800a00, vfp_arith3_emit}); +TH_TABLE(TH_VDIV_F_DP, "vdiv.f64", {&SHAPE_VFP_DP, 0xee800b00, vfp_arith3_emit}); + +/* VNEG.F32 / VNEG.F64 */ +TH_TABLE(TH_VNEG_F_SP, "vneg.f32", {&SHAPE_VFP_SP, 0xeeb10a40, vfp_arith2_emit}); +TH_TABLE(TH_VNEG_F_DP, "vneg.f64", {&SHAPE_VFP_DP, 0xeeb10b40, vfp_arith2_emit}); + +/* VCMP.F32 / VCMP.F64 */ +TH_TABLE(TH_VCMP_F_SP, "vcmp.f32", {&SHAPE_VFP_SP, 0xeeb40a40, vfp_arith2_emit}); 
+TH_TABLE(TH_VCMP_F_DP, "vcmp.f64", {&SHAPE_VFP_DP, 0xeeb40b40, vfp_arith2_emit}); + +/* VPUSH SP / DP. NOTE(review): the DP table uses SHAPE_VFP_SP, so only the vfp_sp feature is required for vpush.f64; presumably intentional, confirm. */ +TH_TABLE(TH_VPUSH_SP, "vpush.f32", {&SHAPE_VFP_SP, 0xed2d0a00, vfp_pushpop_emit}); +TH_TABLE(TH_VPUSH_DP, "vpush.f64", {&SHAPE_VFP_SP, 0xed2d0b00, vfp_pushpop_emit}); + +/* VPOP SP / DP. NOTE(review): as with VPUSH, the DP table uses SHAPE_VFP_SP; confirm. */ +TH_TABLE(TH_VPOP_SP, "vpop.f32", {&SHAPE_VFP_SP, 0xecbd0a00, vfp_pushpop_emit}); +TH_TABLE(TH_VPOP_DP, "vpop.f64", {&SHAPE_VFP_SP, 0xecbd0b00, vfp_pushpop_emit}); + +/* VMOV register SP / DP */ +TH_TABLE(TH_VMOV_REG_SP, "vmov.f32", {&SHAPE_VFP_SP, 0xeeb00a40, vmov_reg_emit}); +TH_TABLE(TH_VMOV_REG_DP, "vmov.f64", {&SHAPE_VFP_DP, 0xeeb00b40, vmov_reg_emit}); + +/* VMOV between GPR and SP VFP register */ +TH_TABLE(TH_VMOV_GP_SP, "vmov.gp_sp", {&SHAPE_VMOVGPSP, 0xee000a10, vmov_gp_sp_emit}); + +/* VMOV between two GPRs and DP VFP register */ +TH_TABLE(TH_VMOV_2GP_DP, "vmov.2gp_dp", {&SHAPE_VMOV2GPDP, 0xec400b10, vmov_2gp_dp_emit}); + +/* VMRS: no custom emitter, so only the shape fields (rd at bit 12) are applied to the base opcode. */ +TH_TABLE(TH_VMRS, "vmrs", {&SHAPE_VMRS, 0xeef10a10, NULL}); + +/* VCVT.F64.F32 (SP -> DP) */ +TH_TABLE(TH_VCVT_FD, "vcvt.f64.f32", {&SHAPE_VCVT_FD, 0xeeb70ac0, vcvt_fd_emit}); + +/* VCVT.F32.F64 (DP -> SP) */ +TH_TABLE(TH_VCVT_DF, "vcvt.f32.f64", {&SHAPE_VCVT_DF, 0xeeb70bc0, vcvt_df_emit}); + +/* VCVT fp/int SP / DP */ +TH_TABLE(TH_VCVT_FP_INT_SP, "vcvt.fp_int.f32", {&SHAPE_VCVT_FP_INT_SP, 0xeeb80a40, vcvt_fp_int_emit}); +TH_TABLE(TH_VCVT_FP_INT_DP, "vcvt.fp_int.f64", {&SHAPE_VCVT_FP_INT_DP, 0xeeb80b40, vcvt_fp_int_emit}); + +/* ═══════════════════════════════════════════════════════════════════ + * Public wrappers + * ═══════════════════════════════════════════════════════════════════ */ +/* Arithmetic wrappers: sz == 0 selects the single-precision table, any other value the double-precision table; register numbers are passed through unchanged. */ +thumb_opcode th_vadd_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) +{ + if (sz == 0) + return thop_emit(TH_VADD_F_SP.name, TH_VADD_F_SP.variants, TH_VADD_F_SP.variant_count, + (thop_args){.rd = vd, .rn = vn, .rm = vm}); + return thop_emit(TH_VADD_F_DP.name, TH_VADD_F_DP.variants, TH_VADD_F_DP.variant_count, + (thop_args){.rd = vd, .rn = 
vn, .rm = vm}); +} +/* VSUB: sz == 0 emits the F32 form, otherwise the F64 form. */ +thumb_opcode th_vsub_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) +{ + if (sz == 0) + return thop_emit(TH_VSUB_F_SP.name, TH_VSUB_F_SP.variants, TH_VSUB_F_SP.variant_count, + (thop_args){.rd = vd, .rn = vn, .rm = vm}); + return thop_emit(TH_VSUB_F_DP.name, TH_VSUB_F_DP.variants, TH_VSUB_F_DP.variant_count, + (thop_args){.rd = vd, .rn = vn, .rm = vm}); +} +/* VMUL: sz == 0 emits the F32 form, otherwise the F64 form. */ +thumb_opcode th_vmul_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) +{ + if (sz == 0) + return thop_emit(TH_VMUL_F_SP.name, TH_VMUL_F_SP.variants, TH_VMUL_F_SP.variant_count, + (thop_args){.rd = vd, .rn = vn, .rm = vm}); + return thop_emit(TH_VMUL_F_DP.name, TH_VMUL_F_DP.variants, TH_VMUL_F_DP.variant_count, + (thop_args){.rd = vd, .rn = vn, .rm = vm}); +} +/* VDIV: sz == 0 emits the F32 form, otherwise the F64 form. */ +thumb_opcode th_vdiv_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) +{ + if (sz == 0) + return thop_emit(TH_VDIV_F_SP.name, TH_VDIV_F_SP.variants, TH_VDIV_F_SP.variant_count, + (thop_args){.rd = vd, .rn = vn, .rm = vm}); + return thop_emit(TH_VDIV_F_DP.name, TH_VDIV_F_DP.variants, TH_VDIV_F_DP.variant_count, + (thop_args){.rd = vd, .rn = vn, .rm = vm}); +} +/* VNEG: two-operand form (vd, vm); sz == 0 emits the F32 form, otherwise the F64 form. */ +thumb_opcode th_vneg_f(uint32_t vd, uint32_t vm, uint32_t sz) +{ + if (sz == 0) + return thop_emit(TH_VNEG_F_SP.name, TH_VNEG_F_SP.variants, TH_VNEG_F_SP.variant_count, + (thop_args){.rd = vd, .rm = vm}); + return thop_emit(TH_VNEG_F_DP.name, TH_VNEG_F_DP.variants, TH_VNEG_F_DP.variant_count, + (thop_args){.rd = vd, .rm = vm}); +} +/* VCMP: the two compared registers are passed as rd and rm; sz == 0 emits the F32 form, otherwise the F64 form. */ +thumb_opcode th_vcmp_f(uint32_t vd, uint32_t vm, uint32_t sz) +{ + if (sz == 0) + return thop_emit(TH_VCMP_F_SP.name, TH_VCMP_F_SP.variants, TH_VCMP_F_SP.variant_count, + (thop_args){.rd = vd, .rm = vm}); + return thop_emit(TH_VCMP_F_DP.name, TH_VCMP_F_DP.variants, TH_VCMP_F_DP.variant_count, + (thop_args){.rd = vd, .rm = vm}); +} +/* VPUSH: regs is forwarded as .imm (a register bitmask consumed by vfp_pushpop_emit); is_doubleword == 0 selects the SP table, otherwise the DP table. */ +thumb_opcode th_vpush(uint32_t regs, uint32_t is_doubleword) +{ + if (is_doubleword == 0) + return thop_emit(TH_VPUSH_SP.name, TH_VPUSH_SP.variants, TH_VPUSH_SP.variant_count, (thop_args){.imm = 
regs}); + return thop_emit(TH_VPUSH_DP.name, TH_VPUSH_DP.variants, TH_VPUSH_DP.variant_count, (thop_args){.imm = regs}); +} +/* VPOP: same argument convention as th_vpush (regs is a register bitmask passed as .imm). */ +thumb_opcode th_vpop(uint32_t regs, uint32_t is_doubleword) +{ + if (is_doubleword == 0) + return thop_emit(TH_VPOP_SP.name, TH_VPOP_SP.variants, TH_VPOP_SP.variant_count, (thop_args){.imm = regs}); + return thop_emit(TH_VPOP_DP.name, TH_VPOP_DP.variants, TH_VPOP_DP.variant_count, (thop_args){.imm = regs}); +} +/* VMOV between two VFP registers: sz == 0 emits the F32 form, otherwise the F64 form. */ +thumb_opcode th_vmov_register(uint16_t vd, uint16_t vm, uint32_t sz) +{ + if (sz == 0) + return thop_emit(TH_VMOV_REG_SP.name, TH_VMOV_REG_SP.variants, TH_VMOV_REG_SP.variant_count, + (thop_args){.rd = vd, .rm = vm}); + return thop_emit(TH_VMOV_REG_DP.name, TH_VMOV_REG_DP.variants, TH_VMOV_REG_DP.variant_count, + (thop_args){.rd = vd, .rm = vm}); +} +/* VMOV between a GPR (rt, passed as rd) and a single-precision register (sn, passed as rn); to_arm_register is passed as imm2, which the emitter places at bit 20. */ +thumb_opcode th_vmov_gp_sp(uint16_t rt, uint16_t sn, uint16_t to_arm_register) +{ + return thop_emit(TH_VMOV_GP_SP.name, TH_VMOV_GP_SP.variants, TH_VMOV_GP_SP.variant_count, + (thop_args){.rd = rt, .rn = sn, .imm2 = to_arm_register}); +} +/* VMOV between two GPRs (rt as rd, rt2 as rn) and a double-precision register (dm as rm); to_arm_register is passed as imm2. */ +thumb_opcode th_vmov_2gp_dp(uint16_t rt, uint16_t rt2, uint16_t dm, uint16_t to_arm_register) +{ + return thop_emit(TH_VMOV_2GP_DP.name, TH_VMOV_2GP_DP.variants, TH_VMOV_2GP_DP.variant_count, + (thop_args){.rd = rt, .rn = rt2, .rm = dm, .imm2 = to_arm_register}); +} +/* VMRS with destination GPR rt. */ +thumb_opcode th_vmrs(uint16_t rt) +{ + return thop_emit(TH_VMRS.name, TH_VMRS.variants, TH_VMRS.variant_count, (thop_args){.rd = rt}); +} +/* VCVT.F64.F32: the emitter packs vd as a double-precision and vm as a single-precision register number. */ +thumb_opcode th_vcvt_float_to_double(uint32_t vd, uint32_t vm) +{ + return thop_emit(TH_VCVT_FD.name, TH_VCVT_FD.variants, TH_VCVT_FD.variant_count, + (thop_args){.rd = vd, .rm = vm}); +} +/* VCVT.F32.F64: the emitter packs vd as a single-precision and vm as a double-precision register number. */ +thumb_opcode th_vcvt_double_to_float(uint32_t vd, uint32_t vm) +{ + return thop_emit(TH_VCVT_DF.name, TH_VCVT_DF.variants, TH_VCVT_DF.variant_count, + (thop_args){.rd = vd, .rm = vm}); +} +/* VCVT between fp and integer: opc is passed as .imm, op as .imm2; is_double == 0 selects the SP table (puw = 0), otherwise the DP table (puw = 1). */ +thumb_opcode th_vcvt_fp_int(uint32_t vd, uint32_t vm, uint32_t opc, uint32_t is_double, uint32_t op) +{ + if (is_double == 0) + return thop_emit(TH_VCVT_FP_INT_SP.name, 
TH_VCVT_FP_INT_SP.variants, TH_VCVT_FP_INT_SP.variant_count, + (thop_args){.rd = vd, .rm = vm, .imm = opc, .imm2 = op, .puw = 0}); + return thop_emit(TH_VCVT_FP_INT_DP.name, TH_VCVT_FP_INT_DP.variants, TH_VCVT_FP_INT_DP.variant_count, + (thop_args){.rd = vd, .rm = vm, .imm = opc, .imm2 = op, .puw = 1}); +} +/* Dispatches a conversion by type name ("s32", "u32", "f32", "f64"). fp -> int uses opc 0x4 for an unsigned and 0x5 for a signed destination with op = 1; int -> fp uses opc 0 with op = 0 for an unsigned and 1 for a signed source. Unsupported pairs return {.size = 0, .opcode = 0}. */ +thumb_opcode th_vcvt_convert(uint32_t vd, uint32_t vm, const char *dest_type, const char *src_type) +{ + if ((strcmp(dest_type, "s32") == 0 || strcmp(dest_type, "u32") == 0) && strcmp(src_type, "f32") == 0) + { + int is_unsigned = strcmp(dest_type, "u32") == 0; + return th_vcvt_fp_int(vd, vm, is_unsigned ? 0x4 : 0x5, 0, 1); + } + else if ((strcmp(dest_type, "s32") == 0 || strcmp(dest_type, "u32") == 0) && strcmp(src_type, "f64") == 0) + { + int is_unsigned = strcmp(dest_type, "u32") == 0; + return th_vcvt_fp_int(vd, vm, is_unsigned ? 0x4 : 0x5, 1, 1); + } + else if ((strcmp(dest_type, "f32") == 0 || strcmp(dest_type, "f64") == 0) && + (strcmp(src_type, "s32") == 0 || strcmp(src_type, "u32") == 0)) + { + int dst_is_double = strcmp(dest_type, "f64") == 0; + int is_unsigned = strcmp(src_type, "u32") == 0; + return th_vcvt_fp_int(vd, vm, 0, dst_is_double, is_unsigned ? 
0 : 1); + } + else if (strcmp(dest_type, "f64") == 0 && strcmp(src_type, "f32") == 0) + { + return th_vcvt_float_to_double(vd / 2, vm); + } + else if (strcmp(dest_type, "f32") == 0 && strcmp(src_type, "f64") == 0) + { + return th_vcvt_double_to_float(vd, vm / 2); + } + return (thumb_opcode){.size = 0, .opcode = 0}; +} diff --git a/arch/arm/thumb/thop_vfp.h b/arch/arm/thumb/thop_vfp.h new file mode 100644 index 00000000..549c01b2 --- /dev/null +++ b/arch/arm/thumb/thop_vfp.h @@ -0,0 +1,22 @@ +#pragma once + +#include <stdint.h> + +#include "thumb.h" +/* VFP opcode builders. For the *_f wrappers and th_vmov_register, sz == 0 selects the F32 form and any other value the F64 form; th_vpush / th_vpop take a register bitmask; th_vcvt_convert selects a conversion from the type names "s32", "u32", "f32" and "f64". */ +thumb_opcode th_vadd_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); +thumb_opcode th_vsub_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); +thumb_opcode th_vmul_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); +thumb_opcode th_vdiv_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); +thumb_opcode th_vneg_f(uint32_t vd, uint32_t vm, uint32_t sz); +thumb_opcode th_vcmp_f(uint32_t vd, uint32_t vm, uint32_t sz); +thumb_opcode th_vpush(uint32_t regs, uint32_t is_doubleword); +thumb_opcode th_vpop(uint32_t regs, uint32_t is_doubleword); +thumb_opcode th_vmov_register(uint16_t vd, uint16_t vm, uint32_t sz); +thumb_opcode th_vmov_gp_sp(uint16_t rt, uint16_t sn, uint16_t to_arm_register); +thumb_opcode th_vmov_2gp_dp(uint16_t rt, uint16_t rt2, uint16_t dm, uint16_t to_arm_register); +thumb_opcode th_vmrs(uint16_t rt); +thumb_opcode th_vcvt_float_to_double(uint32_t vd, uint32_t vm); +thumb_opcode th_vcvt_double_to_float(uint32_t vd, uint32_t vm); +thumb_opcode th_vcvt_fp_int(uint32_t vd, uint32_t vm, uint32_t opc, uint32_t is_double, uint32_t op); +thumb_opcode th_vcvt_convert(uint32_t vd, uint32_t vm, const char *dest_type, const char *src_type); diff --git a/arch/arm/thumb/thumb.c b/arch/arm/thumb/thumb.c new file mode 100644 index 00000000..f1833c25 --- /dev/null +++ b/arch/arm/thumb/thumb.c @@ -0,0 +1,557 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free 
software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define USING_GLOBALS +#include "thumb.h" +#include "tcc.h" + +/* ═══════════════════════════════════════════════════════════════════ + * Thumb feature profiles, extensions, and FPU bundles + * ═══════════════════════════════════════════════════════════════════ */ + +/* ───── Profile definitions ───── */ +/* Each profile is the set of core feature bits enabled for one -march base name (see the name table in thop_feats_from_march). */ +static const thop_feat THOP_PROFILE_ARMV6M_CORE = {.t16 = 1}; + +static const thop_feat THOP_PROFILE_ARMV7M_CORE = {.t16 = 1, + .t32 = 1, + .it = 1, + .mod_imm = 1, + .movw_movt = 1, + .bfx = 1, + .clz_rbit = 1, + .tbb_tbh = 1, + .cbz = 1, + .sat = 1, + .div = 1}; +/* Same bits as the ARMv7-M profile plus dsp. */ +static const thop_feat THOP_PROFILE_ARMV7EM_CORE = {.t16 = 1, + .t32 = 1, + .it = 1, + .mod_imm = 1, + .movw_movt = 1, + .bfx = 1, + .clz_rbit = 1, + .tbb_tbh = 1, + .cbz = 1, + .sat = 1, + .div = 1, + .dsp = 1}; +/* Baseline profile: t16 plus movw_movt, cbz and ldaex only (t32 is not set). */ +static const thop_feat THOP_PROFILE_ARMV8M_BASE_CORE = {.t16 = 1, .movw_movt = 1, .cbz = 1, .ldaex = 1}; +/* Same bits as the ARMv7E-M profile plus ldaex and fp_armv8; also the profile returned when no -march string is given. */ +static const thop_feat THOP_PROFILE_ARMV8M_MAIN_CORE = {.t16 = 1, + .t32 = 1, + .it = 1, + .mod_imm = 1, + .movw_movt = 1, + .bfx = 1, + .clz_rbit = 1, + .tbb_tbh = 1, + .cbz = 1, + .sat = 1, + .div = 1, + .dsp = 1, + .ldaex = 1, + .fp_armv8 = 1}; +/* Same bits as the ARMv8-M mainline profile plus lob. */ +static const thop_feat THOP_PROFILE_ARMV81M_MAIN_CORE = {.t16 = 1, + .t32 = 1, + .it = 1, + .mod_imm = 1, + .movw_movt = 
1, + .bfx = 1, + .clz_rbit = 1, + .tbb_tbh = 1, + .cbz = 1, + .sat = 1, + .div = 1, + .dsp = 1, + .ldaex = 1, + .fp_armv8 = 1, + .lob = 1}; + +/* ───── Optional extension bundles (currently disabled: every definition below is commented out) ───── */ + +// static const thop_feat THOP_EXT_CMSE = {.sec = 1, .sec_tt = 1}; +// static const thop_feat THOP_EXT_PACBTI = {.pacbti = 1}; +// static const thop_feat THOP_EXT_CDE = {.cde = 1}; +// static const thop_feat THOP_EXT_MVE_INT = {.mve_int = 1}; +// static const thop_feat THOP_EXT_MVE_FP = {.mve_int = 1, .mve_fp = 1, .fp16 = 1}; + +/* ───── FPU bundles: feature bits contributed by each -mfpu name (mapped in thop_feats_from_mfpu) ───── */ + +static const thop_feat THOP_FPU_NONE = {0}; +static const thop_feat THOP_FPU_VFPV4_SP_D16 = {.vfp_sp = 1}; +static const thop_feat THOP_FPU_FPV5_SP_D16 = {.vfp_sp = 1, .fp_armv8 = 1}; +static const thop_feat THOP_FPU_FPV5_D16 = {.vfp_sp = 1, .vfp_dp = 1, .fp_armv8 = 1}; +static const thop_feat THOP_FPU_FPV5_D32 = {.vfp_sp = 1, .vfp_dp = 1, .fp_armv8 = 1, .fp_dp_d32 = 1}; +static const thop_feat THOP_FPU_FP_ARMV8_FULL = {.vfp_sp = 1, .vfp_dp = 1, .fp_armv8 = 1, .fp_dp_d32 = 1, .fp16 = 1}; + +/* ───── Resolve helpers ───── */ +/* Parses a -march string of the form base[+ext[+ext...]]; a NULL string yields the ARMv8-M mainline profile. */ +static thop_feat thop_feats_from_march(const char *s) +{ + if (!s) + return THOP_PROFILE_ARMV8M_MAIN_CORE; + + const char *plus = strchr(s, '+'); + size_t base_len = plus ? 
(size_t)(plus - s) : strlen(s); + /* Base architecture names; the text before the first plus sign must match one of them exactly (same length, same bytes). */ + static const struct { + const char *name; + const thop_feat *feat; + } archs[] = { + {"armv6-m", &THOP_PROFILE_ARMV6M_CORE}, + {"armv7-m", &THOP_PROFILE_ARMV7M_CORE}, + {"armv7e-m", &THOP_PROFILE_ARMV7EM_CORE}, + {"armv8-m.base", &THOP_PROFILE_ARMV8M_BASE_CORE}, + {"armv8-m.main", &THOP_PROFILE_ARMV8M_MAIN_CORE}, + {"armv8.1-m.main", &THOP_PROFILE_ARMV81M_MAIN_CORE}, + }; + /* Unknown base name: report through tcc_error and return the empty feature set. */ + thop_feat feat = {0}; + bool found = false; + for (size_t i = 0; i < sizeof(archs) / sizeof(archs[0]); i++) { + if (strlen(archs[i].name) == base_len && !strncmp(s, archs[i].name, base_len)) { + feat = *archs[i].feat; + found = true; + break; + } + } + if (!found) { + tcc_error("unknown -march=%s", s); + return feat; + } + /* Walk the extension list; each known extension sets its feature bits, an unknown one only produces a warning. */ + while (plus && *plus == '+') { + const char *ext = plus + 1; + const char *next = strchr(ext, '+'); + size_t ext_len = next ? (size_t)(next - ext) : strlen(ext); + /* Length is compared first so that a short name never matches a prefix of a longer one (fp vs fp.dp, mve vs mve.fp). */ + if (ext_len == 3 && !strncmp(ext, "dsp", 3)) + feat.dsp = 1; + else if (ext_len == 3 && !strncmp(ext, "fpu", 3)) + feat.vfp_sp = 1; + else if (ext_len == 2 && !strncmp(ext, "fp", 2)) + feat.vfp_sp = 1; + else if (ext_len == 5 && !strncmp(ext, "fp.dp", 5)) { + feat.vfp_sp = 1; + feat.vfp_dp = 1; + } else if (ext_len == 3 && !strncmp(ext, "mve", 3)) + feat.mve_int = 1; + else if (ext_len == 6 && !strncmp(ext, "mve.fp", 6)) { + feat.mve_int = 1; + feat.mve_fp = 1; + } else if (ext_len == 6 && !strncmp(ext, "pacbti", 6)) + feat.pacbti = 1; + else if (ext_len == 3 && !strncmp(ext, "sec", 3)) + feat.sec = 1; + else if (ext_len == 3 && !strncmp(ext, "lob", 3)) + feat.lob = 1; + else + tcc_warning("ignoring unknown -march extension '+%.*s'", (int)ext_len, ext); + /* next is NULL after the last extension, which ends the loop. */ + plus = next; + } + + return feat; +} +/* Maps an -mfpu name to its FPU feature bundle; NULL and "none" give the empty bundle. */ +static thop_feat thop_feats_from_mfpu(const char *s) +{ + if (!s || !strcmp(s, "none")) + return THOP_FPU_NONE; + if (!strcmp(s, "vfpv4-sp-d16") || !strcmp(s, "fpv4-sp-d16")) + return THOP_FPU_VFPV4_SP_D16; + if (!strcmp(s, "fpv5-sp-d16")) + return THOP_FPU_FPV5_SP_D16; + if 
(!strcmp(s, "fpv5-d16")) + return THOP_FPU_FPV5_D16; + if (!strcmp(s, "fpv5-d32")) + return THOP_FPU_FPV5_D32; + if (!strcmp(s, "fp-armv8-full")) + return THOP_FPU_FP_ARMV8_FULL; + tcc_error("unknown -mfpu=%s", s); + return THOP_FPU_NONE; +} +/* Builds the target feature set: the -march profile, ORed with extra_feat_bits, ORed with the -mfpu bundle when mfpu is non-NULL. Reports through tcc_error when mve_fp is set without vfp_sp, or when sec / sec_tt is set without t32 or movw_movt. */ +thop_feat thumb_resolve_features(const char *march, const char *mfpu, uint64_t extra_feat_bits) +{ + thop_feat feat = thop_feats_from_march(march); + feat = thop_feat_or(feat, thop_feat_from_bits(extra_feat_bits)); + if (mfpu) + feat = thop_feat_or(feat, thop_feats_from_mfpu(mfpu)); + /* Consistency checks on the combined set. */ + if (feat.mve_fp && !feat.vfp_sp) + tcc_error("-mextension=mve.fp requires an FP unit (-mfpu=…)"); + if ((feat.sec || feat.sec_tt) && !(feat.t32 || feat.movw_movt)) + tcc_error("-mcmse requires a mainline or v8-M baseline profile"); + + return feat; +} + +/* Resolve only the FP-unit feature bits for a given -mfpu / .fpu name. + Unlike thumb_resolve_features(), this does not fold in any core/profile + features, so callers can OR the result into an already-resolved target + feature set. Used by the assembler's `.fpu` directive. Errors on an + unknown name. */ +thop_feat thumb_resolve_fpu(const char *mfpu) +{ + return thop_feats_from_mfpu(mfpu); +} + +/* ═══════════════════════════════════════════════════════════════════ + * thop_emit — generic Thumb instruction encoding engine + * + * Walks variants narrow→wide, returns the first whose constraints + * all pass. Returns {.size=0} if no variant matches. 
+ * ═══════════════════════════════════════════════════════════════════ */ +/* thop_emit_error: diagnostic path only. Prints missing-feature details to stderr when at least one variant needs features absent from arm_target_dependent.feat, traces the first failing constraint of every variant through THOP_TRACE, and always returns {.size = 0, .opcode = 0}. NOTE(review): the banner above describes thop_emit, which is not defined in this part of the file; presumably thop_emit calls this function when no variant matches, confirm. */ +thumb_opcode thop_emit_error(const char *name, const thop_variant *table, size_t n, thop_args a) +{ + const thop_feat target_feat = arm_target_dependent.feat; + /* First pass: does any variant require a feature the target lacks? */ + bool has_feat_mismatch = false; + for (size_t i = 0; i < n; i++) { + if (!thop_feat32_subset(table[i].shape->feat, target_feat)) { + has_feat_mismatch = true; + break; + } + } + /* Feature mismatches are reported unconditionally on stderr, one line per affected variant (numbered T1, T2, ...). */ + if (has_feat_mismatch) { + fprintf(stderr, "thop_emit: '%s': no variant matched (%zu candidates)\n", name, n); + for (size_t i = 0; i < n; i++) { + const thop_variant_shape *s = table[i].shape; + if (!thop_feat32_subset(s->feat, target_feat)) { + char missing[256]; + thop_feat_describe_missing(thop_feat32_widen(s->feat), target_feat, missing, sizeof missing); + fprintf(stderr, " T%d: missing features: %s\n", (int)i + 1, missing); + } + } + } + /* The remaining output goes through THOP_TRACE: first the arguments, then one REJECT line per variant. */ + THOP_TRACE("thop_emit: no variant matched (%zu candidates)\n", n); + THOP_TRACE(" args: rd=%s rn=%s rm=%s ra=%s imm=0x%x imm2=0x%x\n", + th_reg_name(a.rd), th_reg_name(a.rn), + th_reg_name(a.rm), th_reg_name(a.ra), + (unsigned)a.imm, (unsigned)a.imm2); + THOP_TRACE(" flags=%d enc=%d shift=%s #%u puw=%u in_it=%d\n", + a.flags, a.enc, th_shift_name(a.shift.type), + (unsigned)a.shift.value, (unsigned)a.puw, a.in_it_block); + /* Per-variant pass. */ + for (size_t i = 0; i < n; i++) + { + const thop_variant *v = &table[i]; + const thop_variant_shape *s = v->shape; + /* Header of the REJECT line: variant number, base opcode and encoding size. */ + THOP_TRACE(" T%d (base=0x%x, %s): REJECT ", + (int)i + 1, v->base, s->size == THOP_VARIANT_T16 ? 
"T16" : "T32"); + /* A variant whose required features are missing is reported as such and skips the constraint chain. */ + if (!thop_feat32_subset(s->feat, target_feat)) { + THOP_TRACE("target features mismatch\n"); + continue; + } + /* The else-if chain below reports only the first constraint that fails for this variant: forced encoding size, register constraints, register equality, S-flag rules, shift rules, puw and immediate, in that order. */ + if (a.enc == ENFORCE_ENCODING_16BIT && s->size != THOP_VARIANT_T16) + THOP_TRACE("encoding forced T16 but variant is T32\n"); + else if (a.enc == ENFORCE_ENCODING_32BIT && s->size != THOP_VARIANT_T32) + THOP_TRACE("encoding forced T32 but variant is T16\n"); + else if ((s->rd_place.width || s->rd_con) && !thop_reg_ok(a.rd, s->rd_con)) + THOP_TRACE("%s (%s) constraint failed\n", "rd", th_reg_name(a.rd)); + else if ((s->rn_place.width || s->rn_con) && !thop_reg_ok(a.rn, s->rn_con)) + THOP_TRACE("%s (%s) constraint failed\n", "rn", th_reg_name(a.rn)); + else if ((s->rm_place.width || s->rm_con) && !thop_reg_ok(a.rm, s->rm_con)) + THOP_TRACE("%s (%s) constraint failed\n", "rm", th_reg_name(a.rm)); + else if ((s->ra_place.width || s->ra_con) && !thop_reg_ok(a.ra, s->ra_con)) + THOP_TRACE("%s (%s) constraint failed\n", "ra", th_reg_name(a.ra)); + else if ((s->rd_con & REG_EQ_RN) && a.rd != a.rn) + THOP_TRACE("%s (%s) must equal %s (%s)\n", "rd", th_reg_name(a.rd), "rn", th_reg_name(a.rn)); + else if ((s->rd_con & REG_EQ_RM) && a.rd != a.rm) + THOP_TRACE("%s (%s) must equal %s (%s)\n", "rd", th_reg_name(a.rd), "rm", th_reg_name(a.rm)); + else if (a.flags == FLAGS_BEHAVIOUR_SET && !s->has_s_bit && !s->implicit_s) + THOP_TRACE("needs S flag but variant has no s_bit\n"); + else if (s->forbid_s_in_it && a.in_it_block && a.flags == FLAGS_BEHAVIOUR_SET) + THOP_TRACE("S flag forbidden inside IT block\n"); + else if (s->implicit_s && a.in_it_block) + THOP_TRACE("implicit_s variant forbidden inside IT block\n"); + else if (a.shift.type != THUMB_SHIFT_NONE && (!s->shift_type_bits.width && !s->shift_imm2_bits.width && !s->shift_imm3_bits.width && s->shift_allowed == 0)) + THOP_TRACE("shift requested but variant has no shift fields\n"); + else if (a.shift.type != THUMB_SHIFT_NONE && s->shift_allowed != 0 && !(s->shift_allowed & (1u << a.shift.type))) + 
THOP_TRACE("shift type not in allowed mask\n"); + else if (s->puw_bits.width == 0 && s->puw_fixed != 0 && a.puw != s->puw_fixed) + THOP_TRACE("puw mismatch\n"); + else if (s->imm.kind != IMM_NONE) { + uint32_t tmp; + if (!thop_try_imm(s, a.imm, &tmp)) + THOP_TRACE("immediate doesn't fit encoding\n"); + else + THOP_TRACE("unknown immediate mismatch\n"); + } + else if (v->custom) + THOP_TRACE("custom emitter returned 0\n"); + else + THOP_TRACE("unknown\n"); + } + /* The function never produces an encoding. */ + return (thumb_opcode){.size = 0, .opcode = 0}; +} + +/* ═══════════════════════════════════════════════════════════════════ + * Utility functions (moved from arm-thumb-opcodes.c) + * ═══════════════════════════════════════════════════════════════════ */ +/* Traces a 16-bit register mask as a comma-separated list in braces, lowest register first. The (void)first cast presumably silences an unused-variable warning when THOP_TRACE expands to nothing; confirm. */ +void th_trace_regset(uint16_t regs) +{ + int first = 1; + (void)first; + THOP_TRACE("{"); + for (unsigned r = 0; r < 16; ++r) + { + if (regs & (1u << r)) + { + THOP_TRACE("%s%s", first ? "" : ",", th_reg_name(r)); + first = 0; + } + } + THOP_TRACE("}"); +} +/* Traces the shift part of an operand: nothing for THUMB_SHIFT_NONE, ", rrx" for RRX, otherwise the shift name followed by a register name (register mode) or #value. */ +void th_trace_shift_suffix(thumb_shift shift) +{ + if (shift.type == THUMB_SHIFT_NONE) + return; + if (shift.type == THUMB_SHIFT_RRX) + { + THOP_TRACE(", rrx"); + return; + } + if (shift.mode == THUMB_SHIFT_REGISTER) + THOP_TRACE(", %s %s", th_shift_name(shift.type), th_reg_name(shift.value)); + else + THOP_TRACE(", %s #%u", th_shift_name(shift.type), (unsigned)shift.value); +} +/* Scatters a byte offset into S (bit 26), imm10 (bits 25:16), J1 (bit 13), J2 (bit 11) and imm11 (bits 10:0). S is bit 24 of imm; J1 and J2 are the inverted XOR of S with bits 23 and 22; bit 0 of imm is dropped. */ +uint32_t th_packimm_10_11_0(uint32_t imm) +{ + const uint32_t imm11 = (imm >> 1) & 0x7ff; + const uint32_t imm10 = (imm >> 12) & 0x3ff; + const uint32_t s = (imm >> 24) & 1; + const uint32_t j1 = ~((imm >> 23) ^ s) & 1; + const uint32_t j2 = ~((imm >> 22) ^ s) & 1; + return (s << 26) | (imm10 << 16) | (j1 << 13) | (j2 << 11) | imm11; +} +/* Scatters a 16-bit value into imm4 (bits 19:16), i (bit 26), imm3 (bits 14:12) and imm8 (bits 7:0), taken from bits 15:12, 11, 10:8 and 7:0 of imm. */ +uint32_t th_packimm_3_8_1(uint32_t imm) +{ + const uint32_t imm8 = imm & 0xff; + const uint32_t imm3 = (imm >> 8) & 0x7; + const uint32_t i = (imm >> 11) & 1; + const uint32_t imm4 = (imm >> 12) & 0xf; + return (i << 26) | (imm4 << 16) | (imm3 << 12) | imm8; +} + +typedef struct 
ThPackConstCacheEntry +{ + uint32_t imm; + uint32_t packed; + uint8_t valid; +} ThPackConstCacheEntry; + +#define TH_PACK_CONST_CACHE_SIZE 64 /* YASOS: 256 -> 64 saves ~2.3 KiB .bss; pure + perf cache (miss => recompute const pack). */ +static ThPackConstCacheEntry th_pack_const_cache[TH_PACK_CONST_CACHE_SIZE]; +/* Packs a 32-bit constant into modified-immediate form. Results are memoised in a direct-mapped cache indexed by an XOR-fold of imm. The four byte-replication patterns are tried first, then an 8-bit value with its top bit set positioned anywhere from bit 31 down to bit 8. When nothing matches, packed stays 0, which is also the packing of imm == 0, so callers presumably compare against imm to detect failure; confirm. */ +uint32_t th_pack_const(uint32_t imm) +{ + const uint32_t idx = (imm ^ (imm >> 9) ^ (imm >> 17) ^ (imm >> 25)) & (TH_PACK_CONST_CACHE_SIZE - 1); + ThPackConstCacheEntry *cache = &th_pack_const_cache[idx]; + uint32_t packed; + + if (cache->valid && cache->imm == imm) + return cache->packed; + + // 00000000 00000000 00000000 abcdefgh + if ((imm & 0xffffff00) == 0) + { + packed = imm; + } + // 00000000 abcdefgh 00000000 abcdefgh + else if (!(imm & 0xff00ff00) && (imm >> 16) == (imm & 0xff)) + { + packed = (1 << 12) | (imm & 0xff); + } + // abcdefgh 00000000 abcdefgh 00000000 + else if (!(imm & 0x00ff00ff) && ((imm >> 16) & 0xff00) == (imm & 0xff00)) + { + packed = (2 << 12) | ((imm >> 8) & 0xff); + } + // abcdefgh abcdefgh abcdefgh abcdefgh + else if ((imm & 0xffff) == ((imm >> 16) & 0xffff) && ((imm >> 8) & 0xff) == (imm & 0xff)) + { + packed = (3 << 12) | (imm & 0xff); + } + else + { + packed = 0; + for (uint32_t i = 8, j = 0; i <= 0x1F; i++, j++) + { + uint32_t mask = 0xFF000000 >> j; + uint32_t one = 0x80000000 >> j; + + if ((imm & one) == one && (imm & ~mask) == 0) + { + uint32_t _i = i >> 4; + uint32_t imm3 = (i >> 1) & 7; + uint32_t a = i & 1; + uint32_t bcdefgh = (imm >> (24 - j)) & 0x7f; + + packed = (_i << 26) | (imm3 << 12) | (a << 7) | bcdefgh; + break; + } + } + } + cache->imm = imm; + cache->packed = packed; + cache->valid = 1; + return packed; +} + +uint32_t th_encbranch_b_t3(uint32_t imm) +{ + const uint32_t s = (imm >> 19) & 1; + const uint32_t imm6 = (imm >> 11) & 0x3f; + const uint32_t imm11 = imm & 0x7ff; + const uint32_t j2 = (imm >> 18) & 1; + const uint32_t j1 = (imm >> 17) & 1; + const uint32_t a = (s << 10) | imm6; + const uint32_t b 
= (j1 << 13) | (j2 << 11) | imm11; + return (a << 16) | b; +} + +uint32_t th_encbranch(int pos, int addr) +{ + TRACE("th_encbranch pos: 0x%x, addr: 0x%x", pos, addr); + return addr - pos - 4; +} + +uint32_t th_encbranch_8(int pos, int addr) +{ + addr = (addr - pos - 4) >> 1; + if (addr > 127 || addr < -128) + { + tcc_error("compiler_error: th_encbranch_8 too far address: %i\n", addr); + return 0; + } + return addr & 0xff; +} + +/* Halfword offset from pos + 4, returned as an 11-bit two's-complement field. Accepts the full signed 11-bit range -1024..1023 (inclusive upper bound, same form as the check in th_encbranch_8); anything outside is reported through tcc_error. */ +uint32_t th_encbranch_11(int pos, int addr) +{ + addr = (addr - pos - 4) >> 1; + if (addr > 1023 || addr < -1024) + { + tcc_error("compiler_error: th_encbranch_11 too far address: %i\n", addr); + return 0; + } + return addr & 0x7ff; +} + +uint32_t th_encbranch_20(int pos, int addr) +{ + addr = (addr - pos - 4) >> 1; + TRACE("th_encbranch_20 pos %x addr %x\n", pos, addr); + return addr; +} + +uint32_t th_shift_type_to_op(thumb_shift shift) +{ + switch (shift.type) + { + case THUMB_SHIFT_ASR: + return 4; + case THUMB_SHIFT_LSL: + return 2; + case THUMB_SHIFT_LSR: + return 3; + case THUMB_SHIFT_ROR: + return 7; + default: + tcc_error("compiler_error: 'th_shift_type_to_op', unknown shift type %d\n", shift.type); + return 0; + } +} + +uint32_t th_shift_value_to_sr_type(thumb_shift shift) +{ + switch (shift.type) + { + case THUMB_SHIFT_NONE: + case THUMB_SHIFT_LSL: + return 0; + case THUMB_SHIFT_LSR: + return 1; + case THUMB_SHIFT_ASR: + return 2; + case THUMB_SHIFT_ROR: + case THUMB_SHIFT_RRX: + return 3; + }; + return 0; +} + +thumb_opcode th_generic_op_reg_shift_with_status(uint32_t op, uint32_t rd, uint32_t rn, uint32_t rm, + thumb_flags_behaviour flags, thumb_shift shift) +{ + int s = 0; + const int sr = th_shift_value_to_sr_type(shift); + const int imm2 = shift.value & 0x3; + const int imm3 = (shift.value >> 2) & 0x7; + if (flags == FLAGS_BEHAVIOUR_SET) + s = 1; + + /* Guard against invalid register values (e.g., -1 or PREG_SPILLED) */ + if (rd > 15 || rn > 15 || rm > 15) + { + tcc_error("compiler_error: 
'th_generic_op_reg_shift_with_status' invalid register: rd=%d, rn=%d, rm=%d (op=0x%x)\n", + rd, rn, rm, op); + } + + return (thumb_opcode){ + .size = 4, + .opcode = (op << 16) | (rn << 16) | (rd << 8) | rm | (sr << 4) | (imm2 << 6) | (imm3 << 12) | (s << 20), + }; +} + +// Thumb ELF management +// Start of T32 instructions +void th_sym_t() +{ + const int info = ELFW(ST_INFO)(STB_LOCAL, STT_NOTYPE); + set_elf_sym(symtab_section, ind, 0, info, 0, 1, "$t"); +} + +// Start of data +void th_sym_d() +{ + const int info = ELFW(ST_INFO)(STB_LOCAL, STT_NOTYPE); + set_elf_sym(symtab_section, ind, 0, info, 0, 1, "$d"); +} diff --git a/arch/arm/thumb/thumb.h b/arch/arm/thumb/thumb.h new file mode 100644 index 00000000..94690307 --- /dev/null +++ b/arch/arm/thumb/thumb.h @@ -0,0 +1,785 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2026 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#pragma once + +#include +#include +#include + +#include "log.h" + +static inline const char *th_reg_name(uint32_t r) +{ + static const char *names[] = { + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc", + }; + static char buf[16]; + if (r < 16) + return names[r]; + snprintf(buf, sizeof buf, "r%u", r); + return buf; +} + +static inline const char *th_shift_name(int type) +{ + switch (type) + { + case 0: + return "none"; + case 1: + return "rrx"; + case 2: + return "lsl"; + case 3: + return "lsr"; + case 4: + return "asr"; + case 5: + return "ror"; + default: + return "?shift"; + } +} + +#if TCC_LOG_THOP +#define THOP_TRACE(...) fprintf(stderr, __VA_ARGS__) +#else +#define THOP_TRACE(...) \ + do \ + { \ + } while (0) +#endif + +#if TCC_LOG_THUMB +#define LOG(...) LOG_THUMB(__VA_ARGS__) +#define TRACE(...) LOG_THUMB(__VA_ARGS__) +#else +#define LOG(...) \ + do \ + { \ + } while (0) +#define TRACE(...) 
\ + do \ + { \ + } while (0) +#endif + +/* Integer ceiling of x / d. Every argument use is parenthesized so expression operands expand with the intended precedence; d is still expanded twice. */ +#define ceil_div(x, d) (((x) + ((d) - 1)) / (d)) + +#define R0 0 +#define R1 1 +#define R2 2 +#define R3 3 +#define R4 4 +#define R5 5 +#define R6 6 +#define R7 7 +#define R8 8 +#define R9 9 +#define R10 10 +#define R11 11 +#define R12 12 +#define R_IP R12 +#define R_SP 13 +#define R_LR 14 +#define R_PC 15 + +#define R_FP R7 + +typedef enum +{ + FLAGS_BEHAVIOUR_NOT_IMPORTANT = 0, + FLAGS_BEHAVIOUR_SET = 1, + FLAGS_BEHAVIOUR_BLOCK = 2, +} thumb_flags_behaviour; + +typedef enum +{ + ENFORCE_ENCODING_NONE = 0, + ENFORCE_ENCODING_16BIT = 1, + ENFORCE_ENCODING_32BIT = 2, +} thumb_enforce_encoding; + +typedef struct thumb_opcode +{ + uint8_t size; + uint32_t opcode; +} thumb_opcode; + +typedef enum thumb_shift_type +{ + THUMB_SHIFT_NONE, + THUMB_SHIFT_RRX, + THUMB_SHIFT_LSL, + THUMB_SHIFT_LSR, + THUMB_SHIFT_ASR, + THUMB_SHIFT_ROR, +} thumb_shift_type; + +typedef enum thumb_shift_mode +{ + THUMB_SHIFT_IMMEDIATE, + THUMB_SHIFT_REGISTER, +} thumb_shift_mode; + +typedef struct thumb_shift +{ + thumb_shift_type type; + uint32_t value; + thumb_shift_mode mode; +} thumb_shift; + +static const thumb_shift _thumb_shift_default_val = {THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; +#define THUMB_SHIFT_DEFAULT _thumb_shift_default_val + +typedef struct +{ + /* ───── implemented now (bits 0-15) ───── */ + uint64_t t16 : 1; /* 16-bit Thumb-1 (all profiles) */ + uint64_t t32 : 1; /* 32-bit Thumb-2 wide encodings */ + uint64_t it : 1; /* IT blocks */ + uint64_t mod_imm : 1; /* th_pack_const modified imm */ + uint64_t movw_movt : 1; /* movw/movt 16-bit imm moves */ + uint64_t dsp : 1; /* sel, uadd8, usub8, pkhbt, qadd, … */ + uint64_t sat : 1; /* ssat/usat */ + uint64_t div : 1; /* udiv/sdiv */ + uint64_t bfx : 1; /* bfi, bfc, sbfx, ubfx */ + uint64_t clz_rbit : 1; /* clz, rbit */ + uint64_t ldaex : 1; /* lda/stl acquire/release (v8) */ + uint64_t vfp_sp : 1; /* single-precision FP */ + uint64_t vfp_dp : 1; /* double-precision FP */ + uint64_t tbb_tbh : 
1; /* tbb/tbh table branches */ + uint64_t cbz : 1; /* cbz/cbnz */ + uint64_t hwdiv_t16 : 1; /* reserved for narrow div forms */ + + /* ───── reserved / future enablers (bits 16-31) ───── */ + uint64_t sec : 1; /* sg, bxns, blxns */ + uint64_t sec_tt : 1; /* tt, ttt, tta, ttat */ + uint64_t lob : 1; /* low-overhead-branch: wls, dls, le… */ + uint64_t pacbti : 1; /* pac, aut, pacg, autg, bti */ + uint64_t cde : 1; /* custom datapath: cx{1,2,3}, vcx… */ + uint64_t ras : 1; /* reliability / esb */ + uint64_t fp16 : 1; /* half-precision FP */ + uint64_t fp_armv8 : 1; /* vrint*, vsel, vmaxnm, vminnm */ + uint64_t fp_dp_d32 : 1; /* 32 double registers (d16..d31) */ + uint64_t mve_int : 1; /* integer MVE */ + uint64_t mve_fp : 1; /* FP MVE */ + uint64_t cache_maint : 1; /* dc, ic cache maintenance forms */ + uint64_t debug : 1; /* bkpt variants, hlt, dbg imm */ + uint64_t coproc : 1; /* mcr/mrc/mcrr/mrrc/cdp */ + uint64_t lrcpc : 1; /* load-acquire RCpc forms */ + uint64_t unpriv_ls : 1; /* ldrt/strt family */ + + /* bits 32-47: architect's playground — reserved without commitment */ + uint64_t reserved_arch : 16; + /* bits 48-63: vendor / compiler-specific feature flags */ + uint64_t reserved_vendor : 16; +} thop_feat; + +_Static_assert(sizeof(thop_feat) == sizeof(uint64_t), "thop_feat must pack into 64 bits"); + +typedef struct +{ + uint32_t t16 : 1; + uint32_t t32 : 1; + uint32_t it : 1; + uint32_t mod_imm : 1; + uint32_t movw_movt : 1; + uint32_t dsp : 1; + uint32_t sat : 1; + uint32_t div : 1; + uint32_t bfx : 1; + uint32_t clz_rbit : 1; + uint32_t ldaex : 1; + uint32_t vfp_sp : 1; + uint32_t vfp_dp : 1; + uint32_t tbb_tbh : 1; + uint32_t cbz : 1; + uint32_t hwdiv_t16 : 1; + uint32_t sec : 1; + uint32_t sec_tt : 1; + uint32_t lob : 1; + uint32_t pacbti : 1; + uint32_t cde : 1; + uint32_t ras : 1; + uint32_t fp16 : 1; + uint32_t fp_armv8 : 1; + uint32_t fp_dp_d32 : 1; + uint32_t mve_int : 1; + uint32_t mve_fp : 1; + uint32_t cache_maint : 1; + uint32_t debug : 1; + 
uint32_t coproc : 1; + uint32_t lrcpc : 1; + uint32_t unpriv_ls : 1; +} thop_feat32; + +_Static_assert(sizeof(thop_feat32) == sizeof(uint32_t), "thop_feat32 must pack into 32 bits"); + +static inline uint32_t thop_feat32_bits(thop_feat32 f) +{ + uint32_t b; + memcpy(&b, &f, sizeof b); + return b; +} + +thop_feat thumb_resolve_features(const char *march, const char *mfpu, uint64_t extra_feat_bits); + +/* Resolve only the FP-unit feature bits for a -mfpu / .fpu name (no core + features). Used by the `.fpu` assembler directive. */ +thop_feat thumb_resolve_fpu(const char *mfpu); + +/* ───── Backend-owned target-dependent config ───── + * + * Forward-declared as `struct target_dependent_config` in tcc.h; generic + * code sees only the pointer. The full shape (feature mask, TrustZone + * flag, -mcpu= name) is ARM-private and lives here. */ + +struct target_dependent_config +{ + const char *mcpu_name; + thop_feat feat; + bool is_secure_tz; +}; + +extern struct target_dependent_config arm_target_dependent; + +typedef enum +{ + IMM_NONE, + IMM_RAW, /* plain N-bit value, optional scale */ + IMM_PACK_CONST, /* ARMv7-M modified-immediate (th_pack_const) */ + IMM_PACK_3_8_1, /* scattered 12-bit (movw/adr) */ + IMM_PACK_10_11_0, /* branch encoding */ + IMM_SIGNED_PUW, /* load/store with P/U/W bits */ +} imm_kind; + +typedef struct +{ + uint16_t kind : 3; /* imm_kind (6 values) */ + uint16_t width : 4; /* max bits of the *user* value */ + uint16_t scale_log2 : 2; /* 0=byte, 1=half, 2=word */ + uint16_t is_signed : 1; +} imm_spec; + +_Static_assert(sizeof(imm_spec) == 2, "imm_spec must pack into 16 bits"); + +typedef enum +{ + REG_ANY = 0, + REG_LOW_ONLY = 1 << 0, /* R0..R7 */ + REG_NOT_SP = 1 << 1, + REG_NOT_PC = 1 << 2, + REG_NOT_LR = 1 << 3, + REG_EQ_RN = 1 << 4, /* rd must equal rn (e.g. T1 add_imm) */ + REG_EQ_RM = 1 << 5, /* rd must equal rm (e.g. 
T1 add_sp_reg) */ + REG_SP_ONLY = 1 << 6, /* must be SP (r13) */ + REG_PC_ONLY = 1 << 7, /* must be PC (r15) */ + /* ── bitmask-field constraints (applied to rm when used as reglist) ── */ + REG_LOW_REGSET = 1 << 8, /* only bits [7:0] may be set in rm */ + REG_RM_BIT_NOT_SP = 1 << 9, /* bit 13 of rm must NOT be set */ + REG_RM_BITS_NOT_LR_PC = 1 << 10, /* bits 14,15 of rm must NOT be set */ +} reg_mask; + +/* Where an operand lands in the final 16/32-bit word */ +typedef struct +{ + uint8_t shift; /* LSB position */ + uint8_t width; /* bit width; 0 = field unused */ +} bitfield; + +typedef enum thop_variant_size +{ + THOP_VARIANT_NONE = 0, + THOP_VARIANT_T16 = 2, + THOP_VARIANT_T32 = 4, +} thop_variant_size; + +typedef struct +{ + thop_feat32 feat; + + imm_spec imm; + bitfield rd_place, rn_place, rm_place, ra_place; + bitfield imm_place; + bitfield shift_type_bits; /* e.g. [5:4] in T3 */ + bitfield shift_imm2_bits; /* [7:6] */ + bitfield shift_imm3_bits; /* [14:12] */ + bitfield imm2_place; + bitfield split_imm2_place; /* places (a.imm >> 0) & 0x3 */ + bitfield split_imm3_place; /* places (a.imm >> 2) & 0x7 */ + bitfield puw_bits; + bitfield rm_raw_place; /* place raw rm value at this position (not ARM-encoded) */ + bitfield dn_rd_split; /* DN:Rd split (T1 high-register MOV) — Rd low bits at shift/width, D computed from rd>>3 */ + + uint16_t rd_con, rn_con, rm_con, ra_con; + + uint16_t size : 3; /* thop_variant_size (0, 2, 4) */ + uint16_t shift_allowed : 6; /* bitmask of THUMB_SHIFT_* */ + uint16_t puw_fixed : 3; /* when puw_bits.width==0, match only this */ + uint16_t has_s_bit : 1; /* s_bit always at position 20 when set */ + uint16_t implicit_s : 1; /* T16 always sets flags */ + uint16_t forbid_s_in_it : 1; + uint16_t has_rd_hi : 1; /* rd_hi_place always {7, 1} when set */ +} thop_variant_shape; + +_Static_assert(sizeof(thop_variant_shape) == 44, "thop_variant_shape"); + +typedef struct thop_args thop_args; +typedef thumb_opcode (*thop_custom_emit)(uint32_t 
base, const thop_args *a); + +typedef struct +{ + const thop_variant_shape *shape; + uint32_t base; + thop_custom_emit custom; +} thop_variant; + +typedef struct thop_table +{ + const char *name; + const thop_variant *variants; + size_t variant_count; +} thop_table; + +#define TH_TABLE(id, mnemonic, ...) \ + static const thop_variant id##_VARIANTS[] = {__VA_ARGS__}; \ + static const thop_table id = { \ + .name = mnemonic, \ + .variants = id##_VARIANTS, \ + .variant_count = sizeof(id##_VARIANTS) / sizeof(id##_VARIANTS[0]), \ + } + +/* ───── Emit engine ───── */ + +struct thop_args +{ + uint32_t rd, rn, rm, ra; + uint32_t imm; + uint32_t imm2; + thumb_shift shift; + thumb_flags_behaviour flags; + thumb_enforce_encoding enc; + bool in_it_block; + uint8_t puw; + uint8_t exclude_bit; /* clear this bit from rm before rm_raw_place placement */ +}; + +/* ───── Utility declarations (defined in thumb.c) ───── */ + +uint32_t th_packimm_10_11_0(uint32_t imm); +uint32_t th_packimm_3_8_1(uint32_t imm); + +uint32_t th_pack_const(uint32_t imm); +uint32_t th_encbranch_b_t3(uint32_t imm); + +uint32_t th_encbranch(int pos, int addr); +uint32_t th_encbranch_8(int pos, int addr); +uint32_t th_encbranch_11(int pos, int addr); +uint32_t th_encbranch_20(int pos, int addr); + +void th_sym_t(); +void th_sym_d(); + +void th_trace_regset(uint16_t regs); +void th_trace_shift_suffix(thumb_shift shift); + +uint32_t th_shift_type_to_op(thumb_shift shift); +uint32_t th_shift_value_to_sr_type(thumb_shift shift); + +thumb_opcode th_generic_op_reg_shift_with_status(uint32_t op, uint32_t rd, uint32_t rn, uint32_t rm, + thumb_flags_behaviour setflags, thumb_shift shift); + +thumb_opcode thop_emit_error(const char *name, const thop_variant *table, size_t n, thop_args a); + +/* Bulk helpers — type-pun through memcpy (defined behaviour, the + compiler folds it away). Used for profile composition and the + engine's subset test; single-capability checks use named fields. 
*/ +static inline uint64_t thop_feat_bits(thop_feat f) +{ + uint64_t b; + memcpy(&b, &f, sizeof b); + return b; +} + +static inline thop_feat thop_feat_from_bits(uint64_t b) +{ + thop_feat f; + memcpy(&f, &b, sizeof f); + return f; +} + +static inline thop_feat thop_feat_or(thop_feat a, thop_feat b) +{ + return thop_feat_from_bits(thop_feat_bits(a) | thop_feat_bits(b)); +} + +static inline bool thop_feat_subset(thop_feat sub, thop_feat sup) +{ + uint64_t s = thop_feat_bits(sub); + return (s & thop_feat_bits(sup)) == s; +} + +static inline thop_feat thop_feat32_widen(thop_feat32 f32) +{ + return thop_feat_from_bits((uint64_t)thop_feat32_bits(f32)); +} + +static inline bool thop_feat32_subset(thop_feat32 sub, thop_feat sup) +{ + uint32_t s = thop_feat32_bits(sub); + return (s & (uint32_t)thop_feat_bits(sup)) == s; +} + +static inline const char *thop_feat_bit_name(int bit) +{ + static const char *names[] = { + [0] = "t16", [1] = "t32", [2] = "it", [3] = "mod_imm", [4] = "movw_movt", + [5] = "dsp", [6] = "sat", [7] = "div", [8] = "bfx", [9] = "clz_rbit", + [10] = "ldaex", [11] = "vfp_sp", [12] = "vfp_dp", [13] = "tbb_tbh", [14] = "cbz", + [15] = "hwdiv_t16", [16] = "sec", [17] = "sec_tt", [18] = "lob", [19] = "pacbti", + [20] = "cde", [21] = "ras", [22] = "fp16", [23] = "fp_armv8", [24] = "fp_dp_d32", + [25] = "mve_int", [26] = "mve_fp", [27] = "cache_maint", [28] = "debug", [29] = "coproc", + [30] = "lrcpc", [31] = "unpriv_ls", + }; + if (bit >= 0 && bit < (int)(sizeof(names) / sizeof(names[0])) && names[bit]) + return names[bit]; + return "?"; +} + +static inline const char *thop_feat_hint(int bit) +{ + switch (bit) + { + case 11: + return "enable with -mfpu=fpv4-sp-d16 or -mfpu=fpv5-sp-d16"; + case 12: + return "enable with -mfpu=fpv5-d16"; + case 22: + return "enable with -mfpu that supports fp16"; + case 23: + return "requires ARMv8-M FP extensions"; + default: + return NULL; + } +} + +static inline void thop_feat_describe_missing(thop_feat need, thop_feat have, 
char *buf, size_t bufsz) +{ + uint64_t missing = thop_feat_bits(need) & ~thop_feat_bits(have); + size_t pos = 0; + /* Guard is pos + 1 < bufsz rather than pos < bufsz - 1: the subtraction wraps when bufsz == 0, which would let later snprintf calls run with a wrapped size past buf. For bufsz >= 1 both forms are equivalent. */ + for (int i = 0; i < 64 && missing && pos + 1 < bufsz; i++) + { + if (!(missing & (1ull << i))) + continue; + missing &= ~(1ull << i); + const char *name = thop_feat_bit_name(i); + const char *hint = thop_feat_hint(i); + int n; + if (hint) + n = snprintf(buf + pos, bufsz - pos, "%s%s (%s)", pos ? ", " : "", name, hint); + else + n = snprintf(buf + pos, bufsz - pos, "%s%s", pos ? ", " : "", name); + if (n > 0) + pos += (size_t)n; + } + if (pos == 0 && bufsz > 0) + buf[0] = '\0'; +} + +static inline __attribute__((always_inline)) bool thop_reg_ok(uint32_t reg, reg_mask con) +{ + if ((con & REG_LOW_ONLY) && reg > 7) + return false; + if ((con & REG_NOT_SP) && reg == 13) + return false; + if ((con & REG_NOT_PC) && reg == 15) + return false; + if ((con & REG_NOT_LR) && reg == 14) + return false; + if ((con & REG_SP_ONLY) && reg != 13) + return false; + if ((con & REG_PC_ONLY) && reg != 15) + return false; + return true; +} + +static inline __attribute__((always_inline)) uint32_t thop_place(uint32_t val, bitfield bf) +{ + if (bf.width == 0) + return 0; + return (val & ((1u << bf.width) - 1)) << bf.shift; +} + +static inline __attribute__((always_inline)) bool thop_try_imm(const thop_variant_shape *s, uint32_t imm, + uint32_t *out_bits) +{ + const imm_spec *spec = &s->imm; + *out_bits = 0; + + if (spec->kind == IMM_NONE) + return imm == 0; + + uint32_t scaled = imm; + + if (spec->is_signed) + { + int32_t simm = (int32_t)scaled; + if (simm >= 0) + return false; + scaled = (uint32_t)(-simm); + } + + if (spec->scale_log2 > 0) + { + uint32_t mask = (1u << spec->scale_log2) - 1; + if (scaled & mask) + return false; + scaled >>= spec->scale_log2; + } + + switch (spec->kind) + { + case IMM_RAW: + if (scaled >= (1u << spec->width)) + return false; + *out_bits = thop_place(scaled, s->imm_place); + return true; + + case IMM_PACK_CONST: + { + uint32_t packed = 
th_pack_const(imm); + if (!packed && imm != 0) + return false; + *out_bits = packed; + return true; + } + + case IMM_PACK_3_8_1: + if (spec->width ? (scaled >= (1u << spec->width)) : (scaled > 0xFFFF)) + return false; + *out_bits = th_packimm_3_8_1(scaled); + return true; + + case IMM_PACK_10_11_0: + *out_bits = th_packimm_10_11_0(imm); + return true; + + default: + return false; + } +} + +static inline __attribute__((always_inline)) thumb_opcode thop_emit(const char *name, const thop_variant *table, size_t n, thop_args a) +{ + const thop_feat target_feat = arm_target_dependent.feat; + + for (size_t i = 0; i < n; i++) + { + const thop_variant *v = &table[i]; + const thop_variant_shape *s = v->shape; + + if (!thop_feat32_subset(s->feat, target_feat)) + { + THOP_TRACE("%s: variant %zu skipped (feature mismatch)\n", name ? name : "?unknown?", i); + continue; + } + + if (a.enc == ENFORCE_ENCODING_16BIT && s->size != THOP_VARIANT_T16) + { + THOP_TRACE("%s: variant %zu skipped (encoding T%d, requested T16)\n", name ? name : "?unknown?", i, (int)s->size); + continue; + } + if (a.enc == ENFORCE_ENCODING_32BIT && s->size != THOP_VARIANT_T32) + { + THOP_TRACE("%s: variant %zu skipped (encoding T%d, requested T32)\n", name ? name : "?unknown?", i, (int)s->size); + continue; + } + + if ((s->rd_place.width || s->rd_con) && !thop_reg_ok(a.rd, s->rd_con)) + { + THOP_TRACE("%s: variant %zu skipped (rd=%u constraint failed)\n", name ? name : "?unknown?", i, a.rd); + continue; + } + if ((s->rn_place.width || s->rn_con) && !thop_reg_ok(a.rn, s->rn_con)) + { + THOP_TRACE("%s: variant %zu skipped (rn=%u constraint failed)\n", name ? name : "?unknown?", i, a.rn); + continue; + } + if ((s->rm_place.width || s->rm_con) && !thop_reg_ok(a.rm, s->rm_con)) + { + THOP_TRACE("%s: variant %zu skipped (rm=%u constraint failed)\n", name ? 
name : "?unknown?", i, a.rm); + continue; + } + if ((s->ra_place.width || s->ra_con) && !thop_reg_ok(a.ra, s->ra_con)) + { + THOP_TRACE("%s: variant %zu skipped (ra=%u constraint failed)\n", name ? name : "?unknown?", i, a.ra); + continue; + } + + if ((s->rd_con & REG_EQ_RN) && a.rd != a.rn) + { + THOP_TRACE("%s: variant %zu skipped (rd==rn required, got rd=%u rn=%u)\n", name ? name : "?unknown?", i, a.rd, a.rn); + continue; + } + if ((s->rd_con & REG_EQ_RM) && a.rd != a.rm) + { + THOP_TRACE("%s: variant %zu skipped (rd==rm required, got rd=%u rm=%u)\n", name ? name : "?unknown?", i, a.rd, a.rm); + continue; + } + + if (a.flags == FLAGS_BEHAVIOUR_SET && !s->has_s_bit && !s->implicit_s) + { + THOP_TRACE("%s: variant %zu skipped (flags SET but no s-bit)\n", name ? name : "?unknown?", i); + continue; + } + if (a.flags == FLAGS_BEHAVIOUR_BLOCK && s->implicit_s) + { + THOP_TRACE("%s: variant %zu skipped (implicit S-bit conflicts with BLOCK)\n", name ? name : "?unknown?", i); + continue; + } + if (s->forbid_s_in_it && a.in_it_block && a.flags == FLAGS_BEHAVIOUR_SET) + { + THOP_TRACE("%s: variant %zu skipped (S-bit forbidden in IT block)\n", name ? name : "?unknown?", i); + continue; + } + if (s->implicit_s && a.in_it_block) + { + THOP_TRACE("%s: variant %zu skipped (implicit S-bit not allowed in IT block)\n", name ? name : "?unknown?", i); + continue; + } + + if (a.shift.type != THUMB_SHIFT_NONE) + { + bool has_shift_fields = s->shift_type_bits.width || s->shift_imm2_bits.width || s->shift_imm3_bits.width; + if (!has_shift_fields && s->shift_allowed == 0) + { + THOP_TRACE("%s: variant %zu skipped (no shift support, type=%u)\n", name ? name : "?unknown?", i, a.shift.type); + continue; + } + if (s->shift_allowed != 0 && !(s->shift_allowed & (1u << a.shift.type))) + { + THOP_TRACE("%s: variant %zu skipped (shift type %u not allowed)\n", name ? 
name : "?unknown?", i, a.shift.type); + continue; + } + } + + if (s->puw_bits.width == 0 && s->puw_fixed != 0 && a.puw != s->puw_fixed) + { + THOP_TRACE("%s: variant %zu skipped (puw=%u, expected fixed=%u)\n", name ? name : "?unknown?", i, a.puw, s->puw_fixed); + continue; + } + + uint32_t imm_bits = 0; + if (s->imm.kind != IMM_NONE) + { + if (!thop_try_imm(s, a.imm, &imm_bits)) + { + THOP_TRACE("%s: variant %zu skipped (immediate %u invalid for this encoding)\n", name ? name : "?unknown?", i, a.imm); + continue; + } + } + + /* ── bitmask-field pre-processing ── */ + if ((s->rm_con & REG_LOW_REGSET) && (a.rm & ~0xff)) + { + THOP_TRACE("%s: variant %zu skipped (regset bits [15:8] set, got 0x%x)\n", name ? name : "?unknown?", i, a.rm); + continue; + } + if ((s->rm_con & REG_RM_BIT_NOT_SP) && (a.rm & (1u << 13))) + { + THOP_TRACE("%s: variant %zu skipped (SP not allowed in reglist)\n", name ? name : "?unknown?", i); + continue; + } + if ((s->rm_con & REG_RM_BITS_NOT_LR_PC) && (a.rm & ((1u << 14) | (1u << 15)))) + { + THOP_TRACE("%s: variant %zu skipped (LR/PC not allowed in reglist)\n", name ? name : "?unknown?", i); + continue; + } + + /* exclude_bit: clear specified bit from rm before raw placement */ + uint32_t rm_for_place = a.rm; + if (s->rm_raw_place.width && a.exclude_bit) + rm_for_place &= ~(1u << a.exclude_bit); + + if (v->custom) + { + thumb_opcode r = v->custom(v->base, &a); + if (r.size) + { + THOP_TRACE("%s: custom T%d base=0x%x → 0x%x\n", name ? 
name : "?unknown?", (int)i + 1, v->base, r.opcode); + return r; + } + continue; + } + + uint32_t op = v->base; + op |= thop_place(a.rd, s->rd_place); + if (s->has_rd_hi) + op |= thop_place(a.rd >> s->rd_place.width, (bitfield){7, 1}); + op |= thop_place(a.rn, s->rn_place); + op |= thop_place(a.rm, s->rm_place); + op |= thop_place(a.ra, s->ra_place); + op |= imm_bits; + + /* ── DN:Rd split (T1 high-register MOV) ── */ + if (s->dn_rd_split.width) + { + uint32_t dn = (a.rd >> 3) & 1; + op |= thop_place(dn, (bitfield){7, 1}); + op |= thop_place(a.rd & ((1u << s->dn_rd_split.width) - 1), s->dn_rd_split); + } + + if (s->has_s_bit && a.flags == FLAGS_BEHAVIOUR_SET) + op |= (1u << 20); + + if (s->shift_type_bits.width) + { + uint32_t sr = th_shift_value_to_sr_type(a.shift); + op |= thop_place(sr, s->shift_type_bits); + } + if (s->shift_imm2_bits.width) + op |= thop_place(a.shift.value & 0x3, s->shift_imm2_bits); + if (s->shift_imm3_bits.width) + op |= thop_place((a.shift.value >> 2) & 0x7, s->shift_imm3_bits); + + if (s->imm2_place.width) + op |= thop_place(a.imm2, s->imm2_place); + if (s->split_imm2_place.width) + op |= thop_place(a.imm & 0x3, s->split_imm2_place); + if (s->split_imm3_place.width) + op |= thop_place((a.imm >> 2) & 0x7, s->split_imm3_place); + + if (s->puw_bits.width) + op |= thop_place(a.puw & 0x7, s->puw_bits); + + /* ── raw register list placement ── */ + if (s->rm_raw_place.width) + op |= thop_place(rm_for_place, s->rm_raw_place); + + THOP_TRACE("%s: matched T%d base=0x%x → 0x%x\n", name ? name : "?unknown?", (int)i + 1, v->base, op); + + return (thumb_opcode){.size = s->size, .opcode = op}; + } + + THOP_TRACE("%s: ERROR no variant matched! (tried %zu variants)\n", name ? 
name : "?unknown?", n); + return thop_emit_error(name, table, n, a); +} diff --git a/arch/fpu/arm/fpv5-d16.c b/arch/fpu/arm/fpv5-d16.c new file mode 100644 index 00000000..4b6a17b1 --- /dev/null +++ b/arch/fpu/arm/fpv5-d16.c @@ -0,0 +1,53 @@ +/* + * TCC - Tiny C Compiler + * + * Copyright (c) 2025 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "tcc.h" + +#include "arch/fpu/arm/fpv5-d16.h" +#include "tccir.h" + +const FloatingPointConfig arm_fpv5_d16_fpu_config = { + .reg_size = 8, + .reg_count = 16, + .stack_align = 8, + .has_fadd = 1, + .has_fsub = 1, + .has_fmul = 1, + .has_fdiv = 1, + .has_fcmp = 1, + .has_ftof = 1, + .has_itof = 1, + .has_ftod = 1, + .has_ftoi = 1, + .has_dadd = 1, + .has_dsub = 1, + .has_dmul = 1, + .has_ddiv = 1, + .has_dcmp = 1, + .has_dtof = 1, + .has_itod = 1, + .has_dtoi = 1, + .has_ltod = 0, + .has_ltof = 0, + .has_dtol = 0, + .has_ftol = 0, + .has_fneg = 1, + .has_dneg = 1, +}; diff --git a/arch/armv8m.c b/arch/fpu/arm/fpv5-d16.h similarity index 79% rename from arch/armv8m.c rename to arch/fpu/arm/fpv5-d16.h index 101ced66..1aef7462 100644 --- a/arch/armv8m.c +++ b/arch/fpu/arm/fpv5-d16.h @@ -18,15 +18,9 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include "tcc.h" +#pragma once 
-#include "arm-thumb-opcodes.h" +#define USING_GLOBALS +#include "tcc.h" -ArchitectureConfig architecture_config = { - .pointer_size = 4, - .stack_align = 8, - .reg_size = 4, - .parameter_registers = 4, - .has_fpu = 0, - .static_chain_reg = 10, -}; +/* Declaration only: the initialized object lives in fpv5-d16.c. `extern` keeps this header from emitting a tentative definition in every includer. */ +extern const FloatingPointConfig arm_fpv5_d16_fpu_config; diff --git a/arm-link.c b/arm-link.c index 46b72db9..dd222b2b 100644 --- a/arm-link.c +++ b/arm-link.c @@ -1,4 +1,8 @@ -#include "arm-thumb-opcodes.h" +#include "arch/arm/thumb/thumb.h" +#include "arch/arm/thumb/thop_alu_reg.h" +#include "arch/arm/thumb/thop_branch.h" +#include "arch/arm/thumb/thop_cmp.h" +#include "arch/arm/thumb/thop_mem_imm.h" #include "tcc.h" #ifdef NEED_RELOC_TYPE @@ -16,6 +20,7 @@ ST_FUNC int code_reloc(int reloc_type) case R_ARM_REL32: case R_ARM_GOTPC: case R_ARM_GOTOFF: + case R_ARM_RODATA_OFF: case R_ARM_GOT32: case R_ARM_GOT_PREL: case R_ARM_COPY: @@ -84,6 +89,9 @@ ST_FUNC int gotplt_entry_type(int reloc_type) case R_ARM_GOTPC: case R_ARM_GOTOFF: + case R_ARM_RODATA_OFF: + /* RODATA_OFF needs the GOT to exist (for the reserved rodata anchor slot) + * but no per-symbol GOT entry — same as GOTOFF. 
*/ return BUILD_GOT_ONLY; case R_ARM_GOT32: @@ -140,6 +148,9 @@ ST_FUNC void relocate_plt(TCCState *s1) if (!s1->plt) return; + if (!thop_feat_bits(arm_target_dependent.feat)) + arm_init(s1); + p = s1->plt->data; p_end = p + s1->plt->data_offset; p += 32; @@ -191,7 +202,7 @@ ST_FUNC void relocate_plt(TCCState *s1) // get address of the symbol // load the address of the symbol write_thumb_instruction(p + 10, th_ldr_imm(R_IP, R_IP, 0, 6, ENFORCE_ENCODING_NONE)); - write_thumb_instruction(p + 14, th_cmp_imm(0, R_IP, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_32BIT)); + write_thumb_instruction(p + 14, th_cmp_imm(R_IP, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_32BIT)); // if 0 then call resolver, else move one instruction further write_thumb_instruction(p + 18, th_b_t1(1, 0)); write_thumb_instruction(p + 22, th_bx_reg(R_IP)); @@ -229,9 +240,7 @@ ST_FUNC void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, { int x, is_thumb, is_call, h, blx_avail, is_bl, th_ko; x = read32le(ptr) & 0xffffff; -#ifdef DEBUG_RELOC - printf("reloc %d: x=0x%x val=0x%x ", type, x, val); -#endif + LOG_RELOC("reloc %d: x=0x%x val=0x%x ", type, x, val); write32le(ptr, read32le(ptr) & 0xff000000); if (x & 0x800000) x -= 0x1000000; @@ -241,9 +250,7 @@ ST_FUNC void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, is_bl = read32le(ptr) >> 24 == 0xeb; is_call = (type == R_ARM_CALL || (type == R_ARM_PC24 && is_bl)); x += val - addr; -#ifdef DEBUG_RELOC - printf(" newx=0x%x name=%s\n", x, (char *)symtab_section->link->data + sym->st_name); -#endif + LOG_RELOC(" newx=0x%x name=%s", x, (char *)symtab_section->link->data + sym->st_name); h = x & 2; th_ko = (x & 3) && (!blx_avail || !is_call); if (th_ko || x >= 0x2000000 || x < -0x2000000) @@ -571,6 +578,11 @@ ST_FUNC void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, case R_ARM_GOTOFF: add32le(ptr, val - s1->got->sh_addr); return; + case R_ARM_RODATA_OFF: + /* Offset of the symbol within .rodata: 
anchor (rodata runtime base, from + * the reserved GOT slot) + this value = the symbol's address. */ + add32le(ptr, val - rodata_section->sh_addr); + return; case R_ARM_GOT32: /* we load the got offset */ write32le(ptr, get_sym_attr(s1, sym_index, 0)->got_offset); @@ -601,7 +613,7 @@ ST_FUNC void relocate(TCCState *s1, ElfW_Rel *rel, int type, unsigned char *ptr, /* do nothing */ return; default: - fprintf(stderr, "FIXME: handle reloc type %d at %x [%p] to %x\n", type, (unsigned)addr, ptr, (unsigned)val); + LOG_RELOC("FIXME: handle reloc type %d at %x [%p] to %x", type, (unsigned)addr, ptr, (unsigned)val); return; } } diff --git a/arm-thumb-asm.c b/arm-thumb-asm.c index 203032f2..626a3325 100644 --- a/arm-thumb-asm.c +++ b/arm-thumb-asm.c @@ -27,7 +27,32 @@ #include #include -#include "arm-thumb-opcodes.h" +#include "arch/arm/thumb/thop_adr.h" +#include "arch/arm/thumb/thop_alu_imm.h" +#include "arch/arm/thumb/thop_alu_reg.h" +#include "arch/arm/thumb/thop_bitfield.h" +#include "arch/arm/thumb/thop_block.h" +#include "arch/arm/thumb/thop_branch.h" +#include "arch/arm/thumb/thop_cmp.h" +#include "arch/arm/thumb/thop_dsp.h" +#include "arch/arm/thumb/thop_extend.h" +#include "arch/arm/thumb/thop_ldaex.h" +#include "arch/arm/thumb/thop_ldrd.h" +#include "arch/arm/thumb/thop_ldrex.h" +#include "arch/arm/thumb/thop_mem_exclusive.h" +#include "arch/arm/thumb/thop_mem_imm.h" +#include "arch/arm/thumb/thop_mem_reg.h" +#include "arch/arm/thumb/thop_mem_unpriv.h" +#include "arch/arm/thumb/thop_mov.h" +#include "arch/arm/thumb/thop_mrs.h" +#include "arch/arm/thumb/thop_mul.h" +#include "arch/arm/thumb/thop_mvn.h" +#include "arch/arm/thumb/thop_pld.h" +#include "arch/arm/thumb/thop_rev.h" +#include "arch/arm/thumb/thop_system.h" +#include "arch/arm/thumb/thop_tbb.h" +#include "arch/arm/thumb/thop_vfp.h" +#include "arch/arm/thumb/thumb.h" #include "tcc.h" #include "tccir.h" @@ -732,6 +757,19 @@ ST_FUNC void asm_clobber(uint8_t *clobber_regs, const char *str) clobber_regs[reg] = 
1; } +/* Handle the `.fpu ` assembler directive. Like GNU as, this enables + the FP-unit instruction encodings (vpush/vldr/…) for the remainder of the + translation unit, independent of the -mfpu used to build the object. This + lets FPU-agnostic assembly (e.g. a context-switch routine that saves the FP + register file only when CONTROL.FPCA is set) still assemble the FP opcodes. + The features are OR'd into the live target set so the core profile from + -march/-mcpu is preserved. Errors on an unknown FPU name. */ +ST_FUNC void tcc_asm_set_fpu(const char *name) +{ + thop_feat fpu = thumb_resolve_fpu(name); + arm_target_dependent.feat = thop_feat_or(arm_target_dependent.feat, fpu); +} + static int asm_parse_vfp_regvar(int t, int double_precision) { if (double_precision) @@ -1547,13 +1585,13 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh { if (token == TOK_ASM_addw) { - return th_add_sp_imm_t4(ops[0].reg, ops[2].e.v, setflags, encoding); + return th_addw(ops[0].reg, R_SP, ops[2].e.v); } - return th_add_sp_imm(ops[0].reg, ops[2].e.v, setflags, encoding); + return th_add_imm(ops[0].reg, R_SP, ops[2].e.v, setflags, encoding); } if (token == TOK_ASM_addw) { - return th_add_imm_t4(ops[0].reg, ops[1].reg, ops[2].e.v); + return th_addw(ops[0].reg, ops[1].reg, ops[2].e.v); } if (token == TOK_ASM_add && thumb_conditional_scope == 0) @@ -1568,10 +1606,6 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh if (thumb_operand_is_register(ops[2].type)) { - if (ops[1].reg == R_SP) - { - return th_add_sp_reg(ops[0].reg, ops[2].reg, setflags, encoding, shift); - } return th_add_reg(ops[0].reg, ops[1].reg, ops[2].reg, setflags, shift, encoding); } } @@ -1599,7 +1633,7 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh { if (thumb_operand_is_immediate(ops[2].type)) { - return th_cmp_imm(0, ops[1].reg, ops[2].e.v, FLAGS_BEHAVIOUR_SET, encoding); + return th_cmp_imm(ops[1].reg, 
ops[2].e.v, FLAGS_BEHAVIOUR_SET, encoding); } return th_cmp_reg(0, ops[1].reg, ops[2].reg, FLAGS_BEHAVIOUR_SET, shift, encoding); } @@ -1609,7 +1643,7 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh if (thumb_operand_is_immediate(ops[2].type)) { - return th_cmn_imm(ops[1].reg, ops[2].e.v); + return th_cmn_imm(ops[1].reg, ops[2].e.v, FLAGS_BEHAVIOUR_SET, encoding); } if (thumb_operand_is_register(ops[2].type)) @@ -1618,7 +1652,7 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh { encoding = ENFORCE_ENCODING_32BIT; } - return th_cmn_reg(ops[1].reg, ops[2].reg, shift, encoding); + return th_cmn_reg(ops[1].reg, ops[2].reg, FLAGS_BEHAVIOUR_SET, shift, encoding); } } case TOK_ASM_eors: @@ -1726,13 +1760,13 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh { if (token == TOK_ASM_subw) { - return th_sub_sp_imm_t3(ops[0].reg, ops[2].e.v, setflags, encoding); + return th_subw(ops[0].reg, R_SP, ops[2].e.v); } - return th_sub_sp_imm(ops[0].reg, ops[2].e.v, setflags, encoding); + return th_sub_imm(ops[0].reg, R_SP, ops[2].e.v, setflags, encoding); } if (token == TOK_ASM_subw) { - return th_sub_imm_t4(ops[0].reg, ops[1].reg, ops[2].e.v); + return th_subw(ops[0].reg, ops[1].reg, ops[2].e.v); } if (token == TOK_ASM_sub && thumb_conditional_scope == 0) @@ -1749,7 +1783,7 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh { if (ops[1].reg == R_SP) { - return th_sub_sp_reg(ops[0].reg, ops[2].reg, setflags, shift, encoding); + return th_sub_reg(ops[0].reg, R_SP, ops[2].reg, setflags, shift, encoding); } return th_sub_reg(ops[0].reg, ops[1].reg, ops[2].reg, setflags, shift, encoding); } @@ -1759,13 +1793,19 @@ thumb_opcode thumb_generate_opcode_for_data_processing(int token, thumb_shift sh case TOK_ASM_sxth: return th_sxth(ops[1].reg, ops[2].reg, shift, encoding); case TOK_ASM_teq: - return th_teq(ops[1].reg, ops[2].e.v); + return 
th_teq_imm(ops[1].reg, ops[2].e.v, FLAGS_BEHAVIOUR_SET, encoding); case TOK_ASM_tst: if (thumb_operand_is_register(ops[2].type)) - return th_tst_reg(ops[1].reg, ops[2].reg, shift, encoding); - return th_tst_imm(ops[1].reg, ops[2].e.v); + return th_tst_reg(ops[1].reg, ops[2].reg, FLAGS_BEHAVIOUR_SET, shift, encoding); + return th_tst_imm(ops[1].reg, ops[2].e.v, FLAGS_BEHAVIOUR_SET, encoding); case TOK_ASM_udiv: return th_udiv(ops[0].reg, ops[1].reg, ops[2].reg); + case TOK_ASM_uadd8: + return th_uadd8(ops[0].reg, ops[1].reg, ops[2].reg); + case TOK_ASM_usub8: + return th_usub8(ops[0].reg, ops[1].reg, ops[2].reg); + case TOK_ASM_sel: + return th_sel(ops[0].reg, ops[1].reg, ops[2].reg); case TOK_ASM_uxtb: return th_uxtb(ops[1].reg, ops[2].reg, shift, encoding); case TOK_ASM_uxth: @@ -1822,7 +1862,7 @@ static thumb_opcode thumb_single_memory_transfer_literal_opcode(TCCState *s1, in case TOK_ASM_ldrb: return th_ldrb_imm(op0.reg, R_PC, jump_addr, puw, encoding); case TOK_ASM_ldrd: - return th_ldrd_imm(op0.reg, op1.reg, R_PC, jump_addr, puw, encoding); + return th_ldrd_imm(op0.reg, op1.reg, R_PC, jump_addr, puw); case TOK_ASM_ldrh: return th_ldrh_imm(op0.reg, R_PC, jump_addr, puw, encoding); case TOK_ASM_ldrsb: @@ -1830,7 +1870,7 @@ static thumb_opcode thumb_single_memory_transfer_literal_opcode(TCCState *s1, in case TOK_ASM_ldrsh: return th_ldrsh_imm(op0.reg, R_PC, jump_addr, puw, encoding); case TOK_ASM_strd: - return th_strd_imm(op0.reg, op1.reg, R_PC, jump_addr, puw, encoding); + return th_strd_imm(op0.reg, op1.reg, R_PC, jump_addr, puw); }; return (thumb_opcode){0, 0}; } @@ -2119,7 +2159,7 @@ static void thumb_single_memory_transfer_opcode(TCCState *s1, int token) thumb_emit_opcode(th_ldrb_imm(ops[0].reg, ops[1].reg, imm, puw, encoding)); return; case TOK_ASM_ldrd: - thumb_emit_opcode(th_ldrd_imm(ops[0].reg, op2reg.reg, ops[1].reg, imm, puw, encoding)); + thumb_emit_opcode(th_ldrd_imm(ops[0].reg, op2reg.reg, ops[1].reg, imm, puw)); return; case TOK_ASM_ldrex: 
thumb_emit_opcode(th_ldrex(ops[0].reg, ops[1].reg, imm)); @@ -2146,7 +2186,7 @@ static void thumb_single_memory_transfer_opcode(TCCState *s1, int token) thumb_emit_opcode(th_strb_imm(ops[0].reg, ops[1].reg, imm, puw, encoding)); return; case TOK_ASM_strd: - thumb_emit_opcode(th_strd_imm(ops[0].reg, op2reg.reg, ops[1].reg, imm, puw, encoding)); + thumb_emit_opcode(th_strd_imm(ops[0].reg, op2reg.reg, ops[1].reg, imm, puw)); return; case TOK_ASM_strex: thumb_emit_opcode(th_strex(ops[0].reg, op2reg.reg, ops[1].reg, imm)); @@ -3413,6 +3453,9 @@ ST_FUNC void asm_opcode(TCCState *s1, int token) case TOK_ASM_teq: case TOK_ASM_tst: case TOK_ASM_udiv: + case TOK_ASM_uadd8: + case TOK_ASM_usub8: + case TOK_ASM_sel: case TOK_ASM_uxtb: case TOK_ASM_uxth: return thumb_data_processing_opcode(s1, token); diff --git a/arm-thumb-callsite.c b/arm-thumb-callsite.c index 90fb70f7..00f9e10f 100644 --- a/arm-thumb-callsite.c +++ b/arm-thumb-callsite.c @@ -11,13 +11,6 @@ #include "tcctype.h" #include -/* Debug output for callsite processing - disabled by default - * Enable with: -DCALLSITE_DEBUG_ENABLED or #define CALLSITE_DEBUG_ENABLED */ -#ifdef CALLSITE_DEBUG_ENABLED -#define CALLSITE_DEBUG(...) fprintf(stderr, __VA_ARGS__) -#else -#define CALLSITE_DEBUG(...) ((void)0) -#endif void thumb_free_call_sites(void) { @@ -96,7 +89,7 @@ ThumbGenCallSite *thumb_get_call_site_for_id(int call_id) int thumb_build_call_layout_from_ir(TCCIRState *ir, int call_idx, int call_id, int argc_hint, TCCAbiCallLayout *layout, IROperand **out_args, MachineOperand **out_mops) { - CALLSITE_DEBUG("[CALLSITE] thumb_build_call_layout_from_ir: call_idx=%d call_id=%d argc_hint=%d total_insns=%d\n", + LOG_CALLSITE("thumb_build_call_layout_from_ir: call_idx=%d call_id=%d argc_hint=%d total_insns=%d", call_idx, call_id, argc_hint, ir ? 
ir->next_instruction_index : -1); if (!ir || !layout || call_idx < 0) return -1; @@ -129,7 +122,7 @@ int thumb_build_call_layout_from_ir(TCCIRState *ir, int call_idx, int call_id, i { const IROperand src2 = tcc_ir_get_src2(ir, j); int param_call_id = irop_is_none(src2) ? -1 : TCCIR_DECODE_CALL_ID((uint32_t)src2.u.imm32); - CALLSITE_DEBUG("[CALLSITE] legacy scan j=%d: FUNCPARAMVAL param_call_id=%d (want %d) param_idx=%d\n", j, + LOG_CALLSITE("legacy scan j=%d: FUNCPARAMVAL param_call_id=%d (want %d) param_idx=%d", j, param_call_id, call_id, irop_is_none(src2) ? -1 : (int)TCCIR_DECODE_PARAM_IDX((uint32_t)src2.u.imm32)); if (param_call_id == call_id) @@ -141,7 +134,7 @@ int thumb_build_call_layout_from_ir(TCCIRState *ir, int call_idx, int call_id, i } } argc = max_arg_index + 1; - CALLSITE_DEBUG("[CALLSITE] legacy scan result: max_arg_index=%d argc=%d\n", max_arg_index, argc); + LOG_CALLSITE("legacy scan result: max_arg_index=%d argc=%d", max_arg_index, argc); } if (argc <= 0) @@ -183,7 +176,7 @@ int thumb_build_call_layout_from_ir(TCCIRState *ir, int call_idx, int call_id, i mops = (MachineOperand *)tcc_mallocz(sizeof(MachineOperand) * argc); } - CALLSITE_DEBUG("[CALLSITE] scanning backwards from call_idx=%d for call_id=%d argc=%d\n", call_idx, call_id, argc); + LOG_CALLSITE("scanning backwards from call_idx=%d for call_id=%d argc=%d", call_idx, call_id, argc); int found_count = 0; for (int j = call_idx - 1; j >= 0 && found_count < argc; --j) { @@ -194,7 +187,7 @@ int thumb_build_call_layout_from_ir(TCCIRState *ir, int call_idx, int call_id, i int param_call_id = !irop_is_none(src2) ? TCCIR_DECODE_CALL_ID((uint32_t)src2.u.imm32) : -1; int param_idx_raw = !irop_is_none(src2) ? 
(int)TCCIR_DECODE_PARAM_IDX((uint32_t)src2.u.imm32) : -1; (void)param_idx_raw; /* only used by CALLSITE_DEBUG */ - CALLSITE_DEBUG("[CALLSITE] j=%d FUNCPARAMVAL param_call_id=%d param_idx=%d (want call_id=%d)\n", j, + LOG_CALLSITE("j=%d FUNCPARAMVAL param_call_id=%d param_idx=%d (want call_id=%d)", j, param_call_id, param_idx_raw, call_id); if (param_call_id == call_id) { @@ -202,7 +195,7 @@ int thumb_build_call_layout_from_ir(TCCIRState *ir, int call_idx, int call_id, i int param_idx = TCCIR_DECODE_PARAM_IDX((uint32_t)src2.u.imm32); if (param_idx >= 0 && param_idx < argc && !found[param_idx]) { - CALLSITE_DEBUG("[CALLSITE] recording arg[%d] btype=%d is_64bit=%d\n", param_idx, src1_irop.btype, + LOG_CALLSITE("recording arg[%d] btype=%d is_64bit=%d", param_idx, src1_irop.btype, irop_is_64bit(src1_irop)); /* Collect IROperand if requested */ if (args) @@ -268,11 +261,11 @@ int thumb_build_call_layout_from_ir(TCCIRState *ir, int call_idx, int call_id, i } } - CALLSITE_DEBUG("[CALLSITE] scan complete: found_count=%d argc=%d\n", found_count, argc); + LOG_CALLSITE("scan complete: found_count=%d argc=%d", found_count, argc); /* Verify all parameters were found */ for (int i = 0; i < argc; ++i) { - CALLSITE_DEBUG("[CALLSITE] arg[%d]: found=%d\n", i, found[i]); + LOG_CALLSITE("arg[%d]: found=%d", i, found[i]); if (!found[i]) { tcc_error("compiler_error: missing FUNCPARAMVAL for call_id=%d arg=%d", call_id, i); diff --git a/arm-thumb-defs.h b/arm-thumb-defs.h index dfff3d7b..1d72c9c2 100644 --- a/arm-thumb-defs.h +++ b/arm-thumb-defs.h @@ -116,6 +116,13 @@ enum /* Pointer size, in bytes */ #define PTR_SIZE 4 +/* YASOS RELRO shared-.rodata anchor: a reserved GOT slot (index 3, just after + * the 3 dummy/_DYNAMIC slots) holding the runtime base of the shared .rodata + * segment. Each GOT entry is PTR_SIZE*2 bytes, so the anchor is at byte offset + * 24 from the GOT base (R9). Codegen loads it with ldr [R9, #24]. 
*/ +#define YAFF_RODATA_ANCHOR_GOT_INDEX 3 +#define YAFF_RODATA_ANCHOR_GOT_OFFSET (YAFF_RODATA_ANCHOR_GOT_INDEX * PTR_SIZE * 2) + /* Long double size and alignment, in bytes */ #ifdef TCC_ARM_VFP #define LDOUBLE_SIZE 8 diff --git a/arm-thumb-gen.c b/arm-thumb-gen.c index 8a8b2040..791f93df 100644 --- a/arm-thumb-gen.c +++ b/arm-thumb-gen.c @@ -41,8 +41,11 @@ #define CONFIG_TCC_CPUVER 5 #endif +#include "arch/arm/arm.h" +#include "arch/arm/ssa_opt_arm.h" #include "arm-thumb-defs.h" #include "ir/opt.h" +#include "tcc-chained-hash.h" #include "tcc.h" #include "tccir.h" #include "tccls.h" @@ -85,6 +88,7 @@ enum Armv8mRegisters #define USING_GLOBALS #include "tcc.h" +#include #include /* Target ABI hook: AAPCS-like argument assignment for ARM (R0-R3 + stack). @@ -120,12 +124,38 @@ ST_FUNC int tcc_gen_machine_abi_assign_call_args(const TCCAbiArgDesc *args, int } #include "arch/fpu/arm/fpv5-sp-d16.h" -#include "arm-thumb-opcodes.h" +#include "arch/fpu/arm/fpv5-d16.h" +#include "arch/arm/thumb/thumb.h" +#include "arch/arm/thumb/thop_adr.h" +#include "arch/arm/thumb/thop_alu_imm.h" +#include "arch/arm/thumb/thop_alu_reg.h" +#include "arch/arm/thumb/thop_block.h" +#include "arch/arm/thumb/thop_branch.h" +#include "arch/arm/thumb/thop_cmp.h" +#include "arch/arm/thumb/thop_extend.h" +#include "arch/arm/thumb/thop_ldr_literal.h" +#include "arch/arm/thumb/thop_ldrd.h" +#include "arch/arm/thumb/thop_mem_imm.h" +#include "arch/arm/thumb/thop_mem_reg.h" +#include "arch/arm/thumb/thop_mov.h" +#include "arch/arm/thumb/thop_mul.h" +#include "arch/arm/thumb/thop_mvn.h" +#include "arch/arm/thumb/thop_pld.h" +#include "arch/arm/thumb/thop_shift_imm.h" +#include "arch/arm/thumb/thop_shift_reg.h" +#include "arch/arm/thumb/thop_system.h" #include int load_word_from_base(int ir, int base, int fc, int sign); +static inline thumb_flags_behaviour flags_safe(void) +{ + if (tcc_state->ir && tcc_state->ir->codegen_flags_live) + return FLAGS_BEHAVIOUR_BLOCK; + return FLAGS_BEHAVIOUR_NOT_IMPORTANT; 
+} + /* Helper to validate a Sym pointer - returns NULL if invalid/unusable for relocation */ static inline Sym *validate_sym_for_reloc(Sym *sym) { @@ -133,17 +163,10 @@ static inline Sym *validate_sym_for_reloc(Sym *sym) return NULL; /* Type descriptors (SYM_FIELD) should not be used for relocations */ if (sym->v & SYM_FIELD) - { - fprintf(stderr, "[TCC-DIAG] validate_sym_for_reloc: sym->v=0x%x has SYM_FIELD, c=%d\n", sym->v, sym->c); return NULL; - } /* Symbols with c < 0 are not properly registered */ if (sym->c < 0) - { - const char *name = get_tok_str(sym->v & ~SYM_FIELD, NULL); - fprintf(stderr, "[TCC-DIAG] validate_sym_for_reloc: sym '%s' has c=%d (<0)\n", name ? name : "?", sym->c); return NULL; - } return sym; } @@ -177,6 +200,7 @@ ST_DATA const int reg_classes[NB_REGS] = { enum float_abi float_abi; unsigned char text_and_data_separation; +unsigned char allow_r9_write; unsigned char pic; int offset_to_args = 0; @@ -186,17 +210,36 @@ thumb_flags_behaviour g_setflags = FLAGS_BEHAVIOUR_SET; uint32_t caller_saved_registers; uint32_t pushed_registers; int allocated_stack_size; +int epilogue_stack_dealloc; /* total SUB SP amount to restore in epilogue (includes alignment pad) */ int callee_push_size = 0; /* bytes pushed BELOW FP in two-phase push */ uint32_t callee_saved_regs = 0; /* register mask for second push (below FP) */ int vararg_push_size = 0; /* bytes pushed for variadic r0-r3 save (16 or 0) */ -/* Adjust a local/spill frame offset when two-phase push is active and - * callee-saved regs are pushed below FP. Only adjusts negative non-param - * offsets (locals/spills); positive and param offsets are unchanged. */ +/* Adjust a local/spill frame offset. + * + * When FP is used with two-phase push: adjusts by callee_push_size (regs + * pushed below FP). + * + * When FP is omitted: converts FP-relative negative offsets to SP-relative + * positive offsets. 
The alignment pad sits at the top of the SUB SP region + * (right below pushed regs), so locals are addressed relative to + * allocated_stack_size (without pad): + * FP + frame_offset = SP + allocated_stack_size + frame_offset. */ static inline int fp_adjust_local_offset(int frame_offset, int is_param) { - if (!is_param && frame_offset < 0 && callee_push_size > 0) + if (is_param) + return frame_offset; + + if (!tcc_state->need_frame_pointer && frame_offset <= 0) + { + /* Convert FP-relative (negative) to SP-relative (positive). + * FP + frame_offset = SP + allocated_stack_size + frame_offset. */ + return allocated_stack_size + frame_offset; + } + + if (frame_offset < 0 && callee_push_size > 0) return frame_offset - callee_push_size; + return frame_offset; } @@ -211,25 +254,49 @@ static uint32_t scratch_global_exclude = 0; * Size 128 since same register can be pushed multiple times for complex ops like * function calls with many arguments. */ static int scratch_push_stack[128]; +static int scratch_push_type[128]; /* 1 = PUSH, 2 = STR to scratch area */ static int scratch_push_count = 0; +/* Flag: set to 1 when a real-run (non-dry-run) scratch PUSH is emitted. + * Used by codegen to detect when FP omission caused SP-corrupting pushes + * and trigger recompilation with FP enabled. */ +static int real_run_scratch_push_detected = 0; + +/* Tail-call flag: when set, the next gcall_or_jump_mop emits B (branch) + * instead of BL (branch-with-link), and post-call cleanup is skipped. */ +static int tail_call_pending = 0; + +/* Current slot index within the scratch save area (0-based). + * Incremented on save, decremented on restore. 
*/ +static int scratch_save_slot = 0; + /* Debug tracking: current IR opcode being processed (set by codegen.c) */ int g_debug_current_op = -1; int is_valid_opcode(thumb_opcode op); int ot(thumb_opcode op); int ot_check(thumb_opcode op); +static int ot_check_mov_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding enc, bool in_it); +static int ot_check_ldr_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +static int ot_check_str_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc); +static void mov_equiv_reset_all(void); +static void imm_cache_reset_all(void); +static void imm_cache_invalidate_reg(int reg); +ST_FUNC void tcc_gen_machine_strldr_cache_reset(void); +ST_FUNC void tcc_gen_machine_imm_cache_reset(void); static void thumb_require_materialized_reg(const char *ctx, const char *operand, int reg); static bool thumb_is_hw_reg(int reg); static int get_struct_base_addr_mop(const MachineOperand *mop, int default_reg); +static int find_call_scratch(uint32_t extra_exclude, uint32_t arg_move_dst_mask); int th_has_immediate_value(int r); int load_word_from_base(int ir, int base, int fc, int sign); int th_patch_call(int t, int a); /* Structure to track scratch register allocation with potential save/restore */ typedef struct ScratchRegAlloc { - int reg : 30; /* The allocated scratch register (range 0-15 for ARM) */ - uint32_t saved : 1; /* Whether the register was pushed to stack (real emit only) */ + int reg : 29; /* The allocated scratch register (range 0-15 for ARM) */ + uint32_t saved : 2; /* 0=not saved, 1=PUSH to stack, 2=STR to scratch area */ uint32_t would_save : 1; /* Whether a push was needed (set in both dry-run and real emit) */ } ScratchRegAlloc; @@ -261,8 +328,8 @@ static int resolve_chain_base(TCCIRState *ir, int ci, uint32_t exclude_regs, Scr /* Start from R10 (points to immediate parent's FP) */ thumb_shift no_shift = {THUMB_SHIFT_NONE, 0, 
THUMB_SHIFT_IMMEDIATE}; - ot_check(th_mov_reg(out_scratch->reg, architecture_config.static_chain_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, no_shift, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(out_scratch->reg, architecture_config.static_chain_reg, flags_safe(), no_shift, + ENFORCE_ENCODING_NONE, false); for (int hop = 1; hop < depth; hop++) { @@ -292,6 +359,48 @@ typedef struct ScratchRegAllocs typedef thumb_opcode (*thumb_imm_handler_t)(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags_behaviour, thumb_enforce_encoding enforce_encoding); + +/* Dispatch an imm_handler call through a direct call instead of an indirect + * (function pointer) call. Same workaround as thumb_call_reg_handler: the + * cross-compiler miscompiles indirect calls that combine an sret return + * (thumb_opcode is 8 bytes) with stack-passed arguments — the callee reads + * garbage for the 5th/6th parameters (flags/enc), so e.g. the high-half SBCS + * of a 64-bit CMP silently loses its S bit. Comparing the pointer and + * branching to a direct call makes the cross emit correct argument passing. 
*/ +static thumb_opcode thumb_call_imm_handler(thumb_imm_handler_t fn, uint32_t rd, uint32_t rn, uint32_t imm, + thumb_flags_behaviour flags, thumb_enforce_encoding encoding) +{ + if (fn == th_add_imm) + return th_add_imm(rd, rn, imm, flags, encoding); + if (fn == th_sub_imm) + return th_sub_imm(rd, rn, imm, flags, encoding); + if (fn == th_adc_imm) + return th_adc_imm(rd, rn, imm, flags, encoding); + if (fn == th_sbc_imm) + return th_sbc_imm(rd, rn, imm, flags, encoding); + if (fn == th_cmp_imm_handler) + return th_cmp_imm_handler(rd, rn, imm, flags, encoding); + if (fn == th_lsl_imm) + return th_lsl_imm(rd, rn, imm, flags, encoding); + if (fn == th_lsr_imm) + return th_lsr_imm(rd, rn, imm, flags, encoding); + if (fn == th_asr_imm) + return th_asr_imm(rd, rn, imm, flags, encoding); + if (fn == th_ror_imm) + return th_ror_imm(rd, rn, imm, flags, encoding); + if (fn == th_orr_imm) + return th_orr_imm(rd, rn, imm, flags, encoding); + if (fn == th_and_imm) + return th_and_imm(rd, rn, imm, flags, encoding); + if (fn == th_eor_imm) + return th_eor_imm(rd, rn, imm, flags, encoding); + if (fn == th_bic_imm) + return th_bic_imm(rd, rn, imm, flags, encoding); + if (fn == th_orn_imm) + return th_orn_imm(rd, rn, imm, flags, encoding); + /* Unreachable for known handlers — fallback to direct call. */ + return fn(rd, rn, imm, flags, encoding); +} int store_word_to_base(int ir, int base, int fc, int sign); static ScratchRegAlloc th_offset_to_reg_ex(int off, int sign, uint32_t exclude_regs); @@ -311,6 +420,7 @@ typedef struct MachineCodegenContext static int g_insn_scratch_allocs = 0; /* total scratch allocs this instruction */ static uint16_t g_insn_scratch_saves = 0; /* registers that required PUSH this instruction */ + /* Allocate a scratch register for the current instruction. * excl: bitmask of registers that must not be chosen. * The allocation is recorded in ctx so mach_release_all() can free it. 
*/ @@ -351,6 +461,16 @@ static int mach_ensure_in_reg(MachineCodegenContext *ctx, const MachineOperand * { switch (op->kind) { + case MACH_OP_NONE: + /* Unresolved operand: vreg has no register allocation (dead path, + * uninitialized variable, etc.). Return a scratch register loaded + * with zero — the value is undefined but we must not crash. */ + { + int r = mach_alloc_scratch(ctx, excl); + tcc_machine_load_constant(r, PREG_REG_NONE, 0, 0, NULL); + return r; + } + case MACH_OP_REG: if (!op->needs_deref) return op->u.reg.r0; @@ -398,24 +518,26 @@ static int mach_ensure_in_reg(MachineCodegenContext *ctx, const MachineOperand * case MACH_OP_SYMBOL: { - int r = mach_alloc_scratch(ctx, excl); Sym *raw_sym = op->u.sym.sym; Sym *sym = raw_sym ? validate_sym_for_reloc(raw_sym) : NULL; if (!op->needs_deref) { /* Load symbol address (with addend baked in). */ + int r = mach_alloc_scratch(ctx, excl); tcc_machine_load_constant(r, PREG_REG_NONE, op->u.sym.addend, 0, sym); + return r; } else { /* Load symbol address into a scratch base reg, then dereference. */ + int r = mach_alloc_scratch(ctx, excl); int base = mach_alloc_scratch(ctx, excl | (1u << (uint32_t)r)); tcc_machine_load_constant(base, PREG_REG_NONE, 0, 0, sym); const int32_t addend = op->u.sym.addend; load_from_base(r, PREG_REG_NONE, op->btype, (int)op->is_unsigned, addend < 0 ? (int)(-addend) : (int)addend, addend < 0 ? 
1 : 0, (uint32_t)base); + return r; } - return r; } case MACH_OP_PARAM_STACK: @@ -466,7 +588,8 @@ static int mach_ensure_imm_or_reg(MachineCodegenContext *ctx, const MachineOpera if (op->kind == MACH_OP_IMM && imm_handler) { const uint32_t imm_val = (uint32_t)op->u.imm.val; - if (ot(imm_handler((uint32_t)dest_reg, (uint32_t)src1_reg, imm_val, flags, ENFORCE_ENCODING_NONE))) + if (ot(thumb_call_imm_handler(imm_handler, (uint32_t)dest_reg, (uint32_t)src1_reg, imm_val, flags, + ENFORCE_ENCODING_NONE))) { *imm_emitted = true; return PREG_REG_NONE; @@ -520,8 +643,8 @@ static void mach_writeback_dest(const MachineOperand *op, int reg) if (!op->needs_deref) { if (reg != op->u.reg.r0 && op->u.reg.r0 != (int)PREG_REG_NONE) - ot_check(th_mov_reg((uint32_t)op->u.reg.r0, (uint32_t)reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)op->u.reg.r0, (uint32_t)reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } else { @@ -613,8 +736,8 @@ void tcc_gen_mach_load_to_reg(int dest_reg, const MachineOperand *op) if (!op->needs_deref) { if (op->u.reg.r0 != dest_reg) - ot_check(th_mov_reg((uint32_t)dest_reg, (uint32_t)op->u.reg.r0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, - THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dest_reg, (uint32_t)op->u.reg.r0, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); return; } /* Register-indirect: r0 is an address, load [r0] into dest_reg. 
*/ @@ -697,8 +820,8 @@ void tcc_gen_mach_load_to_reg(int dest_reg, const MachineOperand *op) MachineCodegenContext ctx = {{}, 0}; int r = mach_ensure_in_reg(&ctx, op, (1u << (uint32_t)dest_reg)); if (r != dest_reg) - ot_check(th_mov_reg((uint32_t)dest_reg, (uint32_t)r, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dest_reg, (uint32_t)r, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); mach_release_all(&ctx); return; } @@ -735,75 +858,89 @@ static ThumbLiteralPoolEntry *dry_run_literal_pool = NULL; static int dry_run_literal_pool_count = 0; static int dry_run_literal_pool_size = 0; -/* Hash table for O(1) literal pool lookups instead of O(n) linear search. - * Key: (sym, imm), Value: index into literal pool array. - * Using open addressing with linear probing. */ -#define LITERAL_POOL_HASH_SIZE 256 /* Power of 2 for fast modulo */ -typedef struct LiteralPoolHashEntry +/* Literal pool dedup uses the same bucket+chain scheme as TinyCC's ELF hashes. + * We only hash entries created through th_literal_pool_find_or_allocate(), so + * plain th_literal_pool_allocate() users stay distinct. 
*/ +#define LITERAL_POOL_HASH_BUCKET_COUNT 512 +#define LITERAL_POOL_LOOKUP_CACHE_SIZE 16 + +typedef struct LiteralPoolLookupCacheEntry { Sym *sym; int64_t imm; - int pool_index; /* Index into literal pool array, or -1 if empty */ - int valid; /* 1 if this slot contains a valid entry, 0 if empty */ -} LiteralPoolHashEntry; + int pool_index; + uint32_t hash; + int valid; +} LiteralPoolLookupCacheEntry; + +typedef struct LiteralPoolLookupCache +{ + LiteralPoolLookupCacheEntry entries[LITERAL_POOL_LOOKUP_CACHE_SIZE]; +} LiteralPoolLookupCache; -static LiteralPoolHashEntry literal_pool_hash[LITERAL_POOL_HASH_SIZE]; -static LiteralPoolHashEntry dry_run_literal_pool_hash[LITERAL_POOL_HASH_SIZE]; +static TCCChainedHash literal_pool_hash; +static LiteralPoolLookupCache literal_pool_last_lookup; static inline uint32_t literal_pool_hash_func(Sym *sym, int64_t imm) { - /* Simple hash combining pointer and immediate value */ - uint64_t h = (uint64_t)(uintptr_t)sym; - h ^= (uint64_t)imm; - h ^= h >> 33; - h *= 0xff51afd7ed558ccdULL; - h ^= h >> 33; - return (uint32_t)(h & (LITERAL_POOL_HASH_SIZE - 1)); + /* 32-bit hash to avoid expensive 64-bit multiply on Cortex-M */ + uint32_t h = (uint32_t)(uintptr_t)sym; + h ^= (uint32_t)imm; + h ^= (uint32_t)((uint64_t)imm >> 32); + h ^= h >> 16; + h *= 0x45d9f3bU; + h ^= h >> 16; + return h; } -static void literal_pool_hash_clear(LiteralPoolHashEntry *hash) +static void literal_pool_hash_clear(TCCChainedHash *hash) { - for (int i = 0; i < LITERAL_POOL_HASH_SIZE; i++) - { - hash[i].valid = 0; - hash[i].pool_index = -1; - } + tcc_chained_hash_clear(hash); } -static int literal_pool_hash_find(LiteralPoolHashEntry *hash, Sym *sym, int64_t imm) +static void literal_pool_lookup_cache_clear(LiteralPoolLookupCache *cache) { - uint32_t idx = literal_pool_hash_func(sym, imm); - for (int i = 0; i < LITERAL_POOL_HASH_SIZE; i++) - { - uint32_t probe = (idx + i) & (LITERAL_POOL_HASH_SIZE - 1); - if (!hash[probe].valid) - { - return -1; /* Empty slot - 
not found */ - } - if (hash[probe].sym == sym && hash[probe].imm == imm) - { - return hash[probe].pool_index; - } - } - return -1; /* Table full, not found */ + memset(cache, 0, sizeof(*cache)); +} + +static inline int literal_pool_lookup_cache_find(LiteralPoolLookupCache *cache, uint32_t full_hash, Sym *sym, + int64_t imm) +{ + LiteralPoolLookupCacheEntry *entry = &cache->entries[full_hash & (LITERAL_POOL_LOOKUP_CACHE_SIZE - 1)]; + if (entry->valid && entry->hash == full_hash && entry->sym == sym && entry->imm == imm) + return entry->pool_index; + return -1; } -static void literal_pool_hash_insert(LiteralPoolHashEntry *hash, Sym *sym, int64_t imm, int pool_index) +static inline void literal_pool_lookup_cache_insert(LiteralPoolLookupCache *cache, uint32_t full_hash, Sym *sym, + int64_t imm, int pool_index) { - uint32_t idx = literal_pool_hash_func(sym, imm); - for (int i = 0; i < LITERAL_POOL_HASH_SIZE; i++) + LiteralPoolLookupCacheEntry *entry = &cache->entries[full_hash & (LITERAL_POOL_LOOKUP_CACHE_SIZE - 1)]; + entry->sym = sym; + entry->imm = imm; + entry->pool_index = pool_index; + entry->hash = full_hash; + entry->valid = 1; +} + +static inline int literal_pool_hash_find(TCCChainedHash *hash, ThumbLiteralPoolEntry *pool, uint32_t full_hash, + Sym *sym, int64_t imm) +{ + uint32_t slot = tcc_chained_hash_bucket_head(hash, full_hash); + while (slot) { - uint32_t probe = (idx + i) & (LITERAL_POOL_HASH_SIZE - 1); - if (!hash[probe].valid) - { - hash[probe].sym = sym; - hash[probe].imm = imm; - hash[probe].pool_index = pool_index; - hash[probe].valid = 1; - return; - } + int pool_index = (int)tcc_chained_hash_slot_to_index(slot); + if (tcc_chained_hash_entry_hash(hash, (uint32_t)pool_index) == full_hash && pool[pool_index].sym == sym && + pool[pool_index].imm == imm) + return pool_index; + slot = tcc_chained_hash_next_slot(hash, slot); } - /* Table full - this shouldn't happen with reasonable pool sizes */ + return -1; +} + +static inline void 
literal_pool_hash_insert(TCCChainedHash *hash, uint32_t full_hash, int pool_index) +{ + tcc_chained_hash_insert_head(hash, full_hash, (uint32_t)pool_index); } static void dry_run_init(void) @@ -945,7 +1082,8 @@ static int branch_fits_t2(int offset) static void branch_opt_init(void) { branch_opt_state.branch_count = 0; - branch_opt_state.optimization_enabled = 0; /* Disabled: dry-run addresses diverge from real pass */ + branch_opt_state.optimization_enabled = + 0; /* Dry-run analysis disabled: use real-time backward branch narrowing instead */ branch_opt_state.code_size_reduction = 0; if (!branch_opt_state.branches) { @@ -954,8 +1092,8 @@ static void branch_opt_init(void) } } -/* Record a branch for later optimization analysis */ -static void branch_opt_record(int ir_index, int source_addr, int target_ir, int is_conditional) +/* Record a branch for later optimization analysis (used by dry-run analysis path) */ +static void __attribute__((unused)) branch_opt_record(int ir_index, int source_addr, int target_ir, int is_conditional) { if (!branch_opt_state.optimization_enabled) return; @@ -1088,18 +1226,13 @@ static void branch_opt_analyze(uint32_t *ir_to_code_mapping, int mapping_size) } } -#ifdef DEBUG_BRANCH_OPT - fprintf(stderr, - "[BRANCH_OPT] %d branches, %d converted to 16-bit, " - "%d bytes saved, %d iterations\n", - branch_opt_state.branch_count, branch_opt_state.code_size_reduction / 2, branch_opt_state.code_size_reduction, - iterations); -#endif + LOG_BRANCH_OPT("%d branches, %d converted to 16-bit, %d bytes saved, %d iterations", branch_opt_state.branch_count, + branch_opt_state.code_size_reduction / 2, branch_opt_state.code_size_reduction, iterations); } /* Lookup encoding decision for a given IR index */ -/* Local version that returns the enum type */ -static BranchEncoding branch_opt_get_encoding(int ir_index) +/* Local version that returns the enum type (used by dry-run analysis path) */ +static BranchEncoding __attribute__((unused)) 
branch_opt_get_encoding(int ir_index) { for (int i = 0; i < branch_opt_state.branch_count; i++) { @@ -1122,6 +1255,34 @@ ST_FUNC void tcc_gen_machine_branch_opt_init(void) branch_opt_init(); } +/* Reset the MOV-coalescing and STR->LDR redundant-reload caches. Called + * at IR instruction boundaries because any IR op can be the target of a + * branch from elsewhere: arriving via jump, the runtime register and + * memory state is not what the emission-order state would predict, so + * cross-IR matching is unsafe. Within a single IR op the backend emits + * straight-line code and both peepholes are sound. */ +ST_FUNC void tcc_gen_machine_mov_coalesce_reset(void) +{ + mov_equiv_reset_all(); + tcc_gen_machine_strldr_cache_reset(); +} + +/* Reset only the MOV-coalescing register-equivalence cache. Unlike the + * STR->LDR memory cache, the GPR value-equivalence cache stays sound across + * straight-line IR-op boundaries: every instruction the backend emits passes + * through the ot() updater, which invalidates the destination register (and + * a `bl`/unknown opcode triggers a full reset, covering call clobbers). So + * the only place a reset is genuinely required is a real control-flow merge: + * arriving at a branch target, the emission-order equivalences from the + * fall-through predecessor do not describe the register state on the + * jumped-from path. codegen.c therefore calls this only at jump targets, + * letting cross-IR `mov` chains (e.g. a soft-float call result copied to its + * home pair and then to the next call's argument pair) coalesce away. 
*/ +ST_FUNC void tcc_gen_machine_mov_equiv_reset(void) +{ + mov_equiv_reset_all(); +} + /* Public interface for dry-run code generation */ ST_FUNC void tcc_gen_machine_dry_run_init(void) { @@ -1138,8 +1299,9 @@ ST_FUNC void tcc_gen_machine_dry_run_start(void) dry_run_literal_pool = tcc_malloc(dry_run_literal_pool_size * sizeof(ThumbLiteralPoolEntry)); } dry_run_literal_pool_count = 0; - /* Clear the dry-run hash table */ - literal_pool_hash_clear(dry_run_literal_pool_hash); + /* Clear the shared hash table for dry-run pass */ + literal_pool_hash_clear(&literal_pool_hash); + literal_pool_lookup_cache_clear(&literal_pool_last_lookup); /* Save thumb_gen_state before dry-run */ thumb_gen_state_snapshot_save(&dry_run_snapshot); /* Reset state that should start fresh for dry-run */ @@ -1149,6 +1311,7 @@ ST_FUNC void tcc_gen_machine_dry_run_start(void) thumb_gen_state.cached_global_reg = PREG_NONE; thumb_gen_state.function_argument_count = 0; /* call_sites_by_id - don't modify, just track that we saved it */ + imm_cache_reset_all(); } ST_FUNC void tcc_gen_machine_dry_run_end(void) @@ -1156,6 +1319,11 @@ ST_FUNC void tcc_gen_machine_dry_run_end(void) dry_run_state.active = 0; /* Restore thumb_gen_state after dry-run */ thumb_gen_state_snapshot_restore(&dry_run_snapshot); + imm_cache_reset_all(); + /* Clear the literal pool hash table so that stale dry-run indices + * don't cause real-pass entries to be misidentified as shared. */ + literal_pool_hash_clear(&literal_pool_hash); + literal_pool_lookup_cache_clear(&literal_pool_last_lookup); /* Note: we keep dry_run_literal_pool allocated for reuse */ } @@ -1182,7 +1350,16 @@ ST_FUNC void tcc_gen_machine_reset_scratch_state(void) * NEVER be used as a scratch register. Permanently exclude it. */ scratch_global_exclude = text_and_data_separation ? 
(1u << R9) : 0; scratch_push_count = 0; + scratch_save_slot = 0; memset(scratch_push_stack, 0, sizeof(scratch_push_stack)); + memset(scratch_push_type, 0, sizeof(scratch_push_type)); + real_run_scratch_push_detected = 0; +} + +/* Returns 1 if any scratch PUSH was emitted during the real run. */ +ST_FUNC int tcc_gen_machine_real_run_had_scratch_push(void) +{ + return real_run_scratch_push_detected; } /* Per-instruction scratch tracking (Phase 3 constraint collection). @@ -1220,10 +1397,7 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) ScratchRegAlloc result = {0}; TCCIRState *ir = tcc_state->ir; -#ifdef ARM_THUMB_DEBUG_SCRATCH - fprintf(stderr, "[SCRATCH] get_scratch_reg: input_exclude=0x%x global_exclude=0x%x\n", exclude_regs, - scratch_global_exclude); -#endif + LOG_SCRATCH("get_scratch_reg: input_exclude=0x%x global_exclude=0x%x", exclude_regs, scratch_global_exclude); exclude_regs |= scratch_global_exclude; @@ -1238,9 +1412,7 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) /* Never use SP or PC as scratch registers. */ if (reg == R_SP || reg == R_PC) goto no_free_reg; -#ifdef ARM_THUMB_DEBUG_SCRATCH - fprintf(stderr, "[SCRATCH] -> returning reg=%d (free) exclude=0x%x\n", reg, exclude_regs); -#endif + LOG_SCRATCH("-> returning reg=%d (free) exclude=0x%x", reg, exclude_regs); result.reg = reg; result.saved = 0; /* Update global exclude so subsequent calls won't return the same register. @@ -1249,6 +1421,34 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) scratch_global_exclude |= (1u << reg); return result; } + + /* Fallback path: a callee-saved register R4-R11 already pushed by the + * prolog AND not live at this instruction can be used as scratch for + * free. The prolog/epilog save/restore makes the clobber invisible to + * the caller. 
Gated by !dry_run_active so dry-run never sees a value + * different from real-run: pushed_registers is only valid after the + * prolog has actually run, which is real-run. R7 (FP) is reserved. + * R9 is reserved as GOT base when text_and_data_separation is on. */ + if (!dry_run_state.active && pushed_registers && reg == PREG_NONE) + { + uint32_t reserved = (1u << R_FP); + if (tcc_state->text_and_data_separation) + reserved |= (1u << 9); + uint32_t live = tcc_ls_compute_live_regs(&ir->ls, ir->codegen_instruction_idx); + if (ir->ls.live_regs_by_instruction && ir->codegen_instruction_idx >= 0 && + ir->codegen_instruction_idx < ir->ls.live_regs_by_instruction_size) + live |= ir->ls.live_regs_by_instruction[ir->codegen_instruction_idx]; + uint32_t candidate = pushed_registers & 0x0FF0u & ~exclude_regs & ~live & ~reserved; + if (candidate) + { + int sreg = (int)__builtin_ctz(candidate); + LOG_SCRATCH("-> returning reg=%d (pre-pushed callee-saved, dead here) exclude=0x%x", sreg, exclude_regs); + result.reg = sreg; + result.saved = 0; + scratch_global_exclude |= (1u << sreg); + return result; + } + } } int reg_to_save = -1; @@ -1271,28 +1471,26 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) return result; } - /* No free register found - we need to save one to the stack */ - /* Prefer R_IP (R12) as it's the inter-procedure scratch register */ - if (!(exclude_regs & (1 << R_IP))) + /* No free register found - we need to save one to the stack. + * Prefer R0-R3: PUSH/POP and most ALU ops use 16-bit Thumb encoding, + * whereas R_IP (R12) forces 32-bit encoding for every instruction. 
*/ + for (int r = 0; r <= 3; ++r) { - reg_to_save = R_IP; + if (!(exclude_regs & (1 << r))) + { + reg_to_save = r; + break; + } } - else if (ir && ir->leaffunc && !(exclude_regs & (1 << R_LR))) + + if (reg_to_save < 0 && ir && ir->leaffunc && !(exclude_regs & (1 << R_LR))) { - /* R_IP is excluded, try R_LR if we're in a leaf function */ reg_to_save = R_LR; } - else + + if (reg_to_save < 0 && !(exclude_regs & (1 << R_IP))) { - /* Try R0-R3 */ - for (int r = 0; r <= 3; ++r) - { - if (!(exclude_regs & (1 << r))) - { - reg_to_save = r; - break; - } - } + reg_to_save = R_IP; } if (reg_to_save < 0) @@ -1314,9 +1512,7 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) } /* No free register found - save one to the stack */ -#ifdef ARM_THUMB_DEBUG_SCRATCH - fprintf(stderr, "[SCRATCH] WARNING: no free scratch register! Saving r%d to stack\n", reg_to_save); -#endif + LOG_SCRATCH("WARNING: no free scratch register! Saving r%d to stack", reg_to_save); /* Dry run: record what we would push, but don't emit */ if (dry_run_state.active) @@ -1330,7 +1526,29 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) return result; } + /* When FP is omitted, use STR to the pre-reserved scratch save area instead + * of PUSH, to avoid moving SP (which would break SP-relative addressing). 
*/ + if (!tcc_state->need_frame_pointer && ir && ir->scratch_save_size > 0 && + scratch_save_slot < (ir->scratch_save_size / 4)) + { + int frame_offset = ir->scratch_save_base + (scratch_save_slot * 4); + int sp_offset = allocated_stack_size + frame_offset; + if (!store_word_to_base(reg_to_save, R_SP, sp_offset, 0)) + tcc_error("compiler_error: scratch save STR failed (offset %d)", sp_offset); + result.reg = reg_to_save; + result.saved = 2; /* 2 = saved to scratch area (not PUSH) */ + result.would_save = 1; + if (scratch_push_count < 128) + { + scratch_push_type[scratch_push_count] = 2; + scratch_push_stack[scratch_push_count++] = reg_to_save; + } + scratch_save_slot++; + return result; + } + ot_check(th_push(1 << reg_to_save)); + real_run_scratch_push_detected = 1; result.reg = reg_to_save; result.saved = 1; result.would_save = 1; /* Phase 3: push was needed */ @@ -1338,6 +1556,7 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) * lists pops in register-number order, not stack order. 
*/ if (scratch_push_count < 128) { + scratch_push_type[scratch_push_count] = 1; scratch_push_stack[scratch_push_count++] = reg_to_save; } else @@ -1353,6 +1572,8 @@ static ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) /* Restore a scratch register if it was saved */ static void restore_scratch_reg(ScratchRegAlloc *alloc) { + if (alloc->saved) + imm_cache_invalidate_reg(alloc->reg); /* Dry run: don't emit pop, just update tracking */ if (dry_run_state.active) { @@ -1373,7 +1594,22 @@ static void restore_scratch_reg(ScratchRegAlloc *alloc) return; } - if (alloc->saved) + if (alloc->saved == 2) + { + /* Saved to scratch area (FP omitted path): restore via LDR */ + TCCIRState *ir = tcc_state->ir; + if (scratch_save_slot > 0) + scratch_save_slot--; + int frame_offset = ir->scratch_save_base + (scratch_save_slot * 4); + int sp_offset = allocated_stack_size + frame_offset; + if (!load_word_from_base(alloc->reg, R_SP, sp_offset, 0)) + tcc_error("compiler_error: scratch restore LDR failed (offset %d)", sp_offset); + alloc->saved = 0; + if (scratch_push_count > 0 && scratch_push_stack[scratch_push_count - 1] == alloc->reg) + scratch_push_count--; + scratch_global_exclude &= ~(1u << alloc->reg); + } + else if (alloc->saved == 1) { /* We MUST restore in strict LIFO order. * An out-of-order POP corrupts SP (and can crash under QEMU). 
@@ -1392,19 +1628,13 @@ static void restore_scratch_reg(ScratchRegAlloc *alloc) { if (scratch_push_count > 0) { -#ifdef ARM_THUMB_DEBUG_SCRATCH - fprintf(stderr, - "[SCRATCH] WARNING: restore_scratch_reg out of order; deferring POP " - "reg=%d (top=%d)\n", - alloc->reg, scratch_push_stack[scratch_push_count - 1]); -#endif + LOG_SCRATCH("WARNING: restore_scratch_reg out of order; deferring POP " + "reg=%d (top=%d)", + alloc->reg, scratch_push_stack[scratch_push_count - 1]); } else { -#ifdef ARM_THUMB_DEBUG_SCRATCH - fprintf(stderr, "[SCRATCH] WARNING: restore_scratch_reg with empty push stack; deferring POP reg=%d\n", - alloc->reg); -#endif + LOG_SCRATCH("WARNING: restore_scratch_reg with empty push stack; deferring POP reg=%d", alloc->reg); } return; } @@ -1426,19 +1656,32 @@ static void restore_all_pushed_scratch_regs(void) if (dry_run_state.active) { scratch_push_count = 0; + scratch_save_slot = 0; scratch_global_exclude = text_and_data_separation ? (1u << R9) : 0; return; } - /* Pop in reverse order - ARM POP with register lists pops in register-number - * order, so we must issue individual POPs in reverse push order */ + /* Restore in reverse order */ for (int i = scratch_push_count - 1; i >= 0; i--) { int reg = scratch_push_stack[i]; -#ifdef ARM_THUMB_DEBUG_SCRATCH - fprintf(stderr, "[SCRATCH] auto-restoring r%d (push order %d)\n", reg, i); -#endif - ot_check(th_pop(1 << reg)); + LOG_SCRATCH("auto-restoring r%d (push order %d, type %d)", reg, i, scratch_push_type[i]); + if (scratch_push_type[i] == 2) + { + /* Saved to scratch area: restore via LDR */ + TCCIRState *ir = tcc_state->ir; + if (scratch_save_slot > 0) + scratch_save_slot--; + int frame_offset = ir->scratch_save_base + (scratch_save_slot * 4); + int sp_offset = allocated_stack_size + frame_offset; + if (!load_word_from_base(reg, R_SP, sp_offset, 0)) + tcc_error("compiler_error: scratch auto-restore LDR failed (offset %d)", sp_offset); + } + else + { + /* Saved via PUSH: restore via POP */ + 
ot_check(th_pop(1 << reg)); + } } scratch_push_count = 0; /* Also reset global exclude for next IR instruction. @@ -1524,7 +1767,7 @@ int ot_check(thumb_opcode op) { if (!is_valid_opcode(op)) { - fprintf(stderr, "[ot_check FAIL] opcode=0x%x ind=0x%x ir_op=%d\n", op.opcode, (unsigned)ind, g_debug_current_op); + LOG_SCRATCH("ot_check FAIL: opcode=0x%x ind=0x%x ir_op=%d", op.opcode, (unsigned)ind, g_debug_current_op); tcc_error("compiler_error: received invalid opcode: 0x%x\n", op.opcode); } return ot(op); @@ -1544,6 +1787,10 @@ void tcc_ir_spill_cache_clear(SpillCache *cache) { cache->entries[i].valid = 0; } + cache->last_emit_kind = 0; + cache->last_emit_ind = 0; + cache->last_emit_reg = 0; + cache->last_emit_offset = 0; } void tcc_ir_spill_cache_record(SpillCache *cache, int reg, int offset) @@ -1619,6 +1866,397 @@ ST_FUNC void gen_fill_nops(int bytes) } } +/* --------------------------------------------------------------------------- + * Redundant MOV reg coalescing + * + * Tracks which physical registers currently hold the same value and drops + * `MOV Rd, Rm` when Rd is already known to equal Rm. Equivalence classes + * are represented by each register's class-representative in mov_equiv[]; + * two registers are equal iff their representatives match. + * + * Updates: + * MOV Rd, Rm -> mov_equiv[Rd] := mov_equiv[Rm] (Rd joins Rm) + * any other write to Rc -> mov_equiv[Rc] := Rc (Rc new class) + * unclassified opcode -> full reset (conservative) + * + * Only applies to plain `MOV Rd, Rm` with no shift and no flag-setting (the + * T1 16-bit encoding and the no-shift/no-flags T2 32-bit encoding). Any + * shifted / flag-setting MOV reads flags or transforms Rm and is left alone. + * --------------------------------------------------------------------------- */ + +static uint8_t mov_equiv[16]; + +/* Count of conditional instructions still pending inside an IT/ITx/ITxy/ITxyz + * block. 
While this is non-zero the opcode stream seen by ot() is + * conditionally executed; cache updates must treat destinations as "may or + * may not be written", not as guaranteed assignments. */ +static int mov_equiv_it_pending; + +/* Immediate-value cache: tracks the last pure-integer constant loaded into + * each register by tcc_machine_load_constant (no symbol involved). Persists + * across IR instruction boundaries so consecutive STORE instructions that + * materialise the same constant can skip the redundant MOV. Reset at jump + * targets and function calls. */ +/* Per-register materialisation cache. `sym == NULL` means the register holds + * the plain constant `value`; `sym != NULL` means it holds the address of that + * symbol plus addend `value` (so a later reference to the same global address + * can skip the redundant literal-pool load). Invalidated per-register on every + * clobbering emit and at IR boundaries, just like the constant cache. */ +static struct { int64_t value; Sym *sym; uint8_t valid; } imm_cache[16]; + +static void imm_cache_reset_all(void) +{ + for (int i = 0; i < 16; i++) + { + imm_cache[i].valid = 0; + imm_cache[i].sym = NULL; + } +} + +static void imm_cache_invalidate_reg(int reg) +{ + if (reg >= 0 && reg < 16) + imm_cache[reg].valid = 0; +} + +static void mov_equiv_reset_all(void) +{ + for (int i = 0; i < 16; i++) + mov_equiv[i] = (uint8_t)i; + mov_equiv_it_pending = 0; +} + +/* Decode the IT instruction (Thumb-2 16-bit, opcode 0xBF) and + * return the number of instructions that will execute conditionally after + * it — 1..4 depending on which bit of the mask is lowest-set. Returns 0 + * when the opcode is not an IT (mask == 0 is a plain NOP/hint). */ +static int mov_equiv_it_block_length(thumb_opcode op) +{ + if (op.size != 2) + return 0; + uint16_t hw = (uint16_t)(op.opcode & 0xFFFF); + if ((hw & 0xFF00) != 0xBF00) + return 0; + uint16_t mask = hw & 0x0F; + if (mask == 0) + return 0; /* NOP-hint encodings (NOP, YIELD, WFE, ...) 
*/ + if (mask & 0x1) + return 4; + if (mask & 0x2) + return 3; + if (mask & 0x4) + return 2; + return 1; /* mask & 0x8 */ +} + +static void mov_equiv_invalidate_reg(int reg) +{ + if (reg < 0 || reg >= 16) + return; + /* Any other register whose representative was `reg` becomes independent + * of reg's new (unknown) value. Give each such register its own class. */ + uint8_t old_rep = mov_equiv[reg]; + for (int i = 0; i < 16; i++) + { + if (i != reg && mov_equiv[i] == old_rep) + mov_equiv[i] = (uint8_t)i; + } + mov_equiv[reg] = (uint8_t)reg; +} + +static void mov_equiv_record_mov(int rd, int rm) +{ + if (rd < 0 || rd >= 16 || rm < 0 || rm >= 16) + { + mov_equiv_reset_all(); + return; + } + /* First invalidate Rd's old equivalences (Rd stops being equal to whatever + * it was before), then merge into Rm's class. */ + mov_equiv_invalidate_reg(rd); + mov_equiv[rd] = mov_equiv[rm]; +} + +/* Emit `MOV Rd, Rm` unless the register-equivalence cache already says + * Rd currently holds the same value as Rm, in which case the MOV is a + * no-op and nothing is emitted. Only the no-shift / no-flag-set forms + * participate in coalescing (identical to decode_mov_reg_plain); any + * caller passing a shift, setting flags, or using IT-conditional forms + * always emits through ot_check so that the semantics of those MOVs is + * preserved. */ +static int ot_check_mov_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding enc, bool in_it) +{ + const int coalesceable = (flags != FLAGS_BEHAVIOUR_SET) && !in_it && (shift.type == THUMB_SHIFT_NONE) && (rd < 16) && + (rm < 16) && thumb_gen_state.generating_function; + if (coalesceable && (rd == rm || mov_equiv[rd] == mov_equiv[rm])) + { + /* Elided at the call site: ot() is never reached, so ind and code_size + * only reflect instructions that really got emitted. The cache is + * already consistent (rd is equal to rm), so no update is needed. 
*/ + return 0; + } + thumb_opcode mov_op = th_mov_reg(rd, rm, flags, shift, enc, in_it); + return ot_check(mov_op); +} + +/* Return 1 if `op` is a plain MOV Rd, Rm with no shift and no flag set, + * filling *rd_out / *rm_out. Accepts the 16-bit T1 high-register form and + * the 32-bit T2 form when shift/flags are zero. */ +static int decode_mov_reg_plain(thumb_opcode op, int *rd_out, int *rm_out) +{ + if (op.size == 2) + { + uint16_t hw = (uint16_t)(op.opcode & 0xFFFF); + /* T1 MOV high-register: 0100 0110 D Rm4 Rd3 */ + if ((hw & 0xFF00) == 0x4600) + { + int rd = ((hw >> 4) & 0x08) | (hw & 0x07); + int rm = (hw >> 3) & 0x0F; + *rd_out = rd; + *rm_out = rm; + return 1; + } + return 0; + } + if (op.size == 4) + { + uint16_t hi = (uint16_t)((op.opcode >> 16) & 0xFFFF); + uint16_t lo = (uint16_t)(op.opcode & 0xFFFF); + /* T2 MOV register, no shift, no flag set: EA4F 00. + * Opcode layout (ARM ARM): 11101010 0100 1111 | 0 imm3 Rd imm2 type Rm + * For plain MOV (no shift): imm3 = 0, imm2 = 0, type = 00 (LSL). + * S bit distinguishes MOV/MOVS: hi[20] = 0 for MOV, 1 for MOVS. */ + if (hi == 0xEA4F && (lo & 0x70F0) == 0) + { + int rd = (lo >> 8) & 0x0F; + int rm = lo & 0x0F; + *rd_out = rd; + *rm_out = rm; + return 1; + } + return 0; + } + return 0; +} + +/* --------------------------------------------------------------------------- + * STR -> LDR redundant-reload peephole + * + * Tracks recent immediate-offset STR Rt, [Rn, #imm] emissions and skips the + * subsequent LDR Rt, [Rn, #imm] at the call site when Rt is still known to + * hold the stored value. The cache is reset at every IR instruction + * boundary (via tcc_gen_machine_mov_coalesce_reset, same hook as plan C) + * so that cross-IR equivalences cannot be exploited — any IR op may be a + * branch target, and the runtime register/memory state on an entry-by-jump + * path is not what the emission-order state predicts. + * + * Only puw == 6 (P=1, U=1, W=0 — no writeback) STR/LDR forms are tracked. 
+ * The classifier below recognises the T1 16-bit, T2 16-bit SP-relative, and + * T3 32-bit encodings (those that cover the common stack-spill path). + * --------------------------------------------------------------------------- */ + +typedef struct StrLdrCacheEntry +{ + uint8_t valid; + uint8_t rt; + uint8_t rn; + uint8_t size; /* 2 or 4 */ + int imm; + uint32_t puw; +} StrLdrCacheEntry; + +#define STRLDR_CACHE_CAPACITY 8 +static StrLdrCacheEntry strldr_cache[STRLDR_CACHE_CAPACITY]; +static int strldr_cache_count; + +ST_FUNC void tcc_gen_machine_strldr_cache_reset(void) +{ + strldr_cache_count = 0; +} + +ST_FUNC void tcc_gen_machine_imm_cache_reset(void) +{ + imm_cache_reset_all(); +} + +ST_FUNC void tcc_gen_machine_imm_cache_invalidate_live(uint32_t live_mask) +{ + for (int i = 0; i < 16; i++) { + if (live_mask & (1u << i)) + imm_cache[i].valid = 0; + } +} + +/* Invalidate entries where the given register is either the stored value + * (Rt) or the base register (Rn). Called when a subsequent instruction + * writes to that register. */ +static void strldr_cache_invalidate_reg(int reg) +{ + for (int i = 0; i < strldr_cache_count; i++) + { + StrLdrCacheEntry *e = &strldr_cache[i]; + if (e->valid && (e->rt == reg || e->rn == reg)) + e->valid = 0; + } +} + +static void strldr_cache_record_str(int rt, int rn, int imm, uint32_t puw, int size) +{ + if (puw != 6) + { + tcc_gen_machine_strldr_cache_reset(); + return; + } + /* Overwriting the same slot invalidates any prior cache entry for it. 
*/ + for (int i = 0; i < strldr_cache_count; i++) + { + StrLdrCacheEntry *e = &strldr_cache[i]; + if (e->valid && e->rn == rn && e->imm == imm) + e->valid = 0; + } + if (strldr_cache_count >= STRLDR_CACHE_CAPACITY) + { + tcc_gen_machine_strldr_cache_reset(); + } + StrLdrCacheEntry *e = &strldr_cache[strldr_cache_count++]; + e->valid = 1; + e->rt = (uint8_t)rt; + e->rn = (uint8_t)rn; + e->imm = imm; + e->puw = puw; + e->size = (uint8_t)size; +} + +/* Return 1 when a matching unclobbered STR entry exists that makes this + * LDR redundant. Matches on all fields so a 16-bit LDR won't be elided + * against a 32-bit STR (and vice versa) — the encodings might pick + * different scale semantics. */ +static int strldr_cache_try_match_ldr(int rt, int rn, int imm, uint32_t puw, int size) +{ + if (puw != 6) + return 0; + for (int i = 0; i < strldr_cache_count; i++) + { + StrLdrCacheEntry *e = &strldr_cache[i]; + if (!e->valid) + continue; + if (e->rt == rt && e->rn == rn && e->imm == imm && e->puw == puw && e->size == size) + return 1; + } + return 0; +} + +/* Decode T1/T2/T3 STR/LDR immediate-offset forms with no writeback. + * Returns 1 and fills outputs when the opcode matches, 0 otherwise. + * *is_str_out is 1 for STR, 0 for LDR. */ +static int decode_str_ldr_imm(thumb_opcode op, int *is_str_out, int *rt_out, int *rn_out, int *imm_out, + uint32_t *puw_out) +{ + if (op.size == 2) + { + uint16_t hw = (uint16_t)(op.opcode & 0xFFFF); + /* T1: 0b01100 = STR, 0b01101 = LDR (imm5 word-scaled, rn<8, rt<8). */ + if ((hw & 0xF000) == 0x6000) + { + int is_ldr = (hw >> 11) & 1; + *is_str_out = !is_ldr; + *rt_out = hw & 0x7; + *rn_out = (hw >> 3) & 0x7; + *imm_out = ((hw >> 6) & 0x1F) << 2; + *puw_out = 6; + return 1; + } + /* T2: SP-relative. 0b10010 = STR, 0b10011 = LDR (imm8 word-scaled). 
*/ + if ((hw & 0xF000) == 0x9000) + { + int is_ldr = (hw >> 11) & 1; + *is_str_out = !is_ldr; + *rt_out = (hw >> 8) & 0x7; + *rn_out = R_SP; + *imm_out = (hw & 0xFF) << 2; + *puw_out = 6; + return 1; + } + /* STRB/LDRB imm5: 0111 0xxx (STR) / 0111 1xxx (LDR). */ + if ((hw & 0xF000) == 0x7000) + { + *is_str_out = !((hw >> 11) & 1); + *rt_out = hw & 0x7; + *rn_out = (hw >> 3) & 0x7; + *imm_out = (hw >> 6) & 0x1F; + *puw_out = 6; + return 1; + } + /* STRH/LDRH imm5: 1000 0xxx (STR) / 1000 1xxx (LDR). */ + if ((hw & 0xF000) == 0x8000) + { + *is_str_out = !((hw >> 11) & 1); + *rt_out = hw & 0x7; + *rn_out = (hw >> 3) & 0x7; + *imm_out = ((hw >> 6) & 0x1F) << 1; + *puw_out = 6; + return 1; + } + return 0; + } + if (op.size == 4) + { + uint16_t hi = (uint16_t)((op.opcode >> 16) & 0xFFFF); + uint16_t lo = (uint16_t)(op.opcode & 0xFFFF); + /* T3: STR/LDR variants with imm12 (byte/half/word): hi[22:21]=size, + * hi[20]=L. 0xF88x=STRB.W, 0xF89x=LDRB.W, 0xF8Ax=STRH.W, + * 0xF8Bx=LDRH.W, 0xF8Cx=STR.W, 0xF8Dx=LDR.W. */ + if ((hi & 0xFF80) == 0xF880) + { + int is_ldr = (hi >> 4) & 1; + int rn = hi & 0xF; + if (rn == 0xF) + return 0; /* PC-relative literal load; skip. */ + *is_str_out = !is_ldr; + *rn_out = rn; + *rt_out = (lo >> 12) & 0xF; + *imm_out = lo & 0xFFF; + *puw_out = 6; + return 1; + } + return 0; + } + return 0; +} + +/* Emit LDR Rt, [Rn, #imm] unless the STR-cache already knows Rt still + * holds [Rn+imm] from an unclobbered earlier STR, in which case emission + * is skipped entirely. ot() is never called in the elided path, so `ind` + * and code_size only advance for real emissions — same contract as the + * MOV coalescing helper. 
*/ +static int ot_check_ldr_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc) +{ + thumb_opcode ins = th_ldr_imm(rt, rn, imm, puw, enc); + if (thumb_gen_state.generating_function && puw == 6 && ins.size != 0 && + strldr_cache_try_match_ldr((int)rt, (int)rn, imm, puw, ins.size)) + { + /* Redundant reload: Rt still holds [Rn+imm] from an earlier STR that + * has not been clobbered. No emission, no cache update needed — the + * existing entry remains accurate. */ + return 0; + } + return ot_check(ins); +} + +/* Emit STR Rt, [Rn, #imm]. Always emits (STR cannot be elided); the + * cache-record side effect happens inside ot() once the opcode is + * classified, so there is nothing extra to do here other than go through + * the standard ot_check path. Kept as a dedicated helper only for + * symmetry with ot_check_ldr_imm — callers use it so future refinements + * (e.g. dropping a dead store that follows another store to the same + * slot) can land in one place. */ +static int ot_check_str_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding enc) +{ + thumb_opcode ins = th_str_imm(rt, rn, imm, puw, enc); + return ot_check(ins); +} + static uint32_t mapcc(int cc) { /* In most places we carry high-level TOK_* comparisons (TOK_EQ, TOK_LT, ...). 
@@ -1685,12 +2323,17 @@ static void th_literal_pool_init() tcc_free(thumb_gen_state.literal_pool); } thumb_gen_state.literal_pool = tcc_mallocz(sizeof(ThumbLiteralPoolEntry) * thumb_gen_state.literal_pool_size); + if (!literal_pool_hash.buckets) + tcc_chained_hash_init(&literal_pool_hash, LITERAL_POOL_HASH_BUCKET_COUNT, thumb_gen_state.literal_pool_size); + else + tcc_chained_hash_reserve(&literal_pool_hash, thumb_gen_state.literal_pool_size); thumb_gen_state.generating_function = 0; thumb_gen_state.code_size = 0; thumb_gen_state.cached_global_sym = NULL; thumb_gen_state.cached_global_reg = PREG_NONE; /* Clear the hash table for O(1) lookups */ - literal_pool_hash_clear(literal_pool_hash); + literal_pool_hash_clear(&literal_pool_hash); + literal_pool_lookup_cache_clear(&literal_pool_last_lookup); } const FloatingPointConfig arm_soft_fpu_config = { @@ -1716,6 +2359,23 @@ const FloatingPointConfig arm_soft_fpu_config = { .has_dtoi = 0, }; +static const char *arm_fpu_type_to_mfpu_str(unsigned char fpu_type) +{ + switch (fpu_type) + { + case ARM_FPU_FPV4_SP_D16: + return "fpv4-sp-d16"; + case ARM_FPU_FPV5_SP_D16: + return "fpv5-sp-d16"; + case ARM_FPU_FPV5_D16: + return "fpv5-d16"; + case ARM_FPU_NONE: + return "none"; + default: + return NULL; + } +} + const FloatingPointConfig *arm_determine_fpu_config(struct TCCState *s) { if (s->fpu_type == 0 || s->fpu_type == ARM_FPU_NONE) @@ -1725,8 +2385,11 @@ const FloatingPointConfig *arm_determine_fpu_config(struct TCCState *s) switch (s->fpu_type) { + case ARM_FPU_FPV4_SP_D16: case ARM_FPU_FPV5_SP_D16: return &arm_fpv5_sp_d16_fpu_config; + case ARM_FPU_FPV5_D16: + return &arm_fpv5_d16_fpu_config; default: fprintf(stderr, "unsupported FPU type: %d for ARM architecture", s->fpu_type); exit(1); @@ -1736,6 +2399,9 @@ const FloatingPointConfig *arm_determine_fpu_config(struct TCCState *s) ST_FUNC void arm_init(struct TCCState *s) { + tcc_ir_ssa_opt_arm_register(); + arm_target_init(s->march_str, 
arm_fpu_type_to_mfpu_str(s->fpu_type), NULL, 0); + float_type.t = VT_FLOAT; double_type.t = VT_DOUBLE; func_float_type.t = VT_FUNC; @@ -1752,8 +2418,8 @@ ST_FUNC void arm_init(struct TCCState *s) (1 << ARM_R5) | (1 << ARM_R6) | (1 << ARM_R8) | (1 << ARM_R10) | (1 << ARM_R11) | (1 << ARM_R12); - s->registers_for_allocator = 11; - caller_saved_registers = (1 << ARM_R0) | (1 << ARM_R1) | (1 << ARM_R2) | (1 << ARM_R3); + s->registers_for_allocator = 13; /* r0-r12: ip is caller-saved, available for allocation */ + caller_saved_registers = (1 << ARM_R0) | (1 << ARM_R1) | (1 << ARM_R2) | (1 << ARM_R3) | (1 << ARM_R12); /* On yasos with no-pic-data-is-text-relative, R9 holds the GOT base and is * caller-saved: callees (compiled by other toolchains) may clobber it, so @@ -1778,7 +2444,6 @@ ST_FUNC void arm_init(struct TCCState *s) if (!s->pic && !s->text_and_data_separation) { s->registers_map_for_allocator |= (1 << ARM_R9); - s->registers_for_allocator += 1; } /* Always reserve R7 (FP) and never allocate it as a general register. 
@@ -1794,9 +2459,14 @@ ST_FUNC void arm_deinit(struct TCCState *s) { (void)s; tcc_free(thumb_gen_state.literal_pool); + tcc_free(dry_run_literal_pool); + tcc_chained_hash_destroy(&literal_pool_hash); thumb_gen_state.literal_pool = NULL; + dry_run_literal_pool = NULL; thumb_gen_state.literal_pool_size = 0; thumb_gen_state.literal_pool_count = 0; + dry_run_literal_pool_size = 0; + dry_run_literal_pool_count = 0; thumb_gen_state.generating_function = 0; thumb_gen_state.code_size = 0; thumb_gen_state.cached_global_sym = NULL; @@ -1890,13 +2560,11 @@ static void th_literal_pool_generate(void) if (need_align) { /* align to 4 bytes after branch */ - thumb_opcode nop = - th_mov_reg(R0, R0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); - o(nop.opcode & 0xffff); + ot_check(th_nop(ENFORCE_ENCODING_16BIT)); } /* Array to store the output position of each unique literal */ - int *literal_positions = tcc_malloc(pool_count * sizeof(int)); + int *literal_positions = tcc_mallocz(pool_count * sizeof(int)); th_sym_d(); @@ -2048,7 +2716,8 @@ static void th_literal_pool_generate(void) thumb_gen_state.code_size = 0; generating_pool = 0; /* Clear the hash table after flushing pool */ - literal_pool_hash_clear(literal_pool_hash); + literal_pool_hash_clear(&literal_pool_hash); + literal_pool_lookup_cache_clear(&literal_pool_last_lookup); } static void th_literal_pool_reserve_upcoming_bytes(int upcoming_bytes) @@ -2073,23 +2742,118 @@ int is_valid_opcode(thumb_opcode op) * Returns the destination register number if it can be decoded, or -1. * Only checks data-processing / move / load instructions, NOT push/pop/stm/ldm * (those legitimately reference R9 for save/restore around calls). */ -static int thumb_decode_dest_reg(thumb_opcode op) +/* Detect instructions that set flags only and write no GPR — CMP, CMN, TST, + * TEQ in all their common Thumb-1 / Thumb-2 encodings. 
thumb_decode_dest_reg + * returns -1 for these, which would otherwise trigger the conservative + * full-cache reset in ot(). Recognising them keeps the mov-equiv and + * imm-in-reg caches alive across a CMP, which lets a follow-up redundant + * load_immediate elide. */ +static int thumb_op_is_pure_flag_setter(thumb_opcode op) { uint32_t w = op.opcode; - if (op.size == 2) { uint16_t hw = (uint16_t)(w & 0xFFFF); - /* 16-bit MOV (high registers): 0100 0110 D Rm4 Rd3 - * Bits [15:8]=0x46, D=bit7 of lower byte, Rd3=bits[2:0] */ - if ((hw >> 8) == 0x46) - return ((hw >> 4) & 0x08) | (hw & 0x07); - /* 16-bit ADD (high registers): 0100 0100 D Rm4 Rd3 */ - if ((hw >> 8) == 0x44) - return ((hw >> 4) & 0x08) | (hw & 0x07); - /* 16-bit CMP (high registers): 0100 0101 — no dest write, skip */ - /* Low-register forms (R0-R7 only) can't reach R9 */ - return -1; + /* T1 16-bit CMP imm8 (low regs): 00101 Rd3 iiii iiii (0x28-0x2F) */ + if ((hw & 0xF800) == 0x2800) + return 1; + /* T1 16-bit CMP reg (low regs): 0100 0010 10Rm3 Rn3 (0x4280) */ + if ((hw & 0xFFC0) == 0x4280) + return 1; + /* T1 16-bit TST reg (low regs): 0100 0010 00Rm3 Rn3 (0x4200) */ + if ((hw & 0xFFC0) == 0x4200) + return 1; + /* T2 16-bit CMP/CMN reg (high regs): 0100 0101 D Rm4 Rn3 (0x4500) */ + if ((hw & 0xFF00) == 0x4500) + return 1; + return 0; + } + if (op.size == 4) + { + uint16_t hi = (uint16_t)(w >> 16); + uint16_t lo = (uint16_t)(w & 0xFFFF); + /* Thumb-2 data-processing (modified immediate), Rd=PC encodes CMP/CMN/ + * TST/TEQ. hi encoding: 1111 0i01 0xxx nnnn (op bits [24:21] = 0x4=TST, + * 0x8=CMN, 0xD=CMP, 0x0=TST/AND-S — table varies; the canonical "no-write" + * marker is lo[11:8] == 0xF (Rd = PC). */ + if ((hi & 0xFA00) == 0xF000 && (lo & 0x8000) == 0 && ((lo >> 8) & 0xF) == 0xF) + return 1; + /* Thumb-2 data-processing (plain binary immediate): same Rd=PC marker. 
*/ + if ((hi & 0xFA00) == 0xF200 && (lo & 0x8000) == 0 && ((lo >> 8) & 0xF) == 0xF) + return 1; + /* Thumb-2 data-processing (shifted register): hi pattern 1110 101x xxxx + * nnnn, lo[15] == 0, lo[11:8] == 0xF (Rd = PC) marks the flag-setter + * variant (CMP.W reg, CMN.W reg, TST.W reg, TEQ.W reg). */ + if ((hi & 0xFE00) == 0xEA00 && (lo & 0x8000) == 0 && ((lo >> 8) & 0xF) == 0xF) + return 1; + return 0; + } + return 0; +} + +static int thumb_decode_dest_reg(thumb_opcode op) +{ + uint32_t w = op.opcode; + + if (op.size == 2) + { + uint16_t hw = (uint16_t)(w & 0xFFFF); + + /* 16-bit shift-immediate / add / subtract: 000xx ... Rd3. Covers + * LSL/LSR/ASR(imm) and ADD/SUB(reg or imm3); every encoding writes the + * low-register Rd in bits [2:0]. */ + if ((hw & 0xE000) == 0x0000) + return hw & 0x07; + + /* 16-bit MOV/CMP/ADD/SUB (8-bit immediate): 001 op2 Rd3 imm8. + * op2==01 is CMP (writes no GPR — leave to the flag-setter path); + * MOV/ADD/SUB write Rd in bits [10:8]. */ + if ((hw & 0xE000) == 0x2000) + { + if (((hw >> 11) & 0x03) == 0x01) + return -1; + return (hw >> 8) & 0x07; + } + + /* 16-bit data-processing (register): 010000 op4 Rm3 Rd3, Rd in bits [2:0]. + * TST(8), CMP(10), CMN(11) write no GPR. */ + if ((hw & 0xFC00) == 0x4000) + { + int op4 = (hw >> 6) & 0x0F; + if (op4 == 0x8 || op4 == 0xA || op4 == 0xB) + return -1; + return hw & 0x07; + } + + /* 16-bit MOV (high registers): 0100 0110 D Rm4 Rd3 + * Bits [15:8]=0x46, D=bit7 of lower byte, Rd3=bits[2:0] */ + if ((hw >> 8) == 0x46) + return ((hw >> 4) & 0x08) | (hw & 0x07); + /* 16-bit ADD (high registers): 0100 0100 D Rm4 Rd3 */ + if ((hw >> 8) == 0x44) + return ((hw >> 4) & 0x08) | (hw & 0x07); + /* 16-bit CMP (high registers) 0x45 and BX/BLX 0x47: no single-GPR dest. */ + + /* 16-bit LDR (literal): 01001 Rt3 imm8, Rt in bits [10:8]. */ + if ((hw & 0xF800) == 0x4800) + return (hw >> 8) & 0x07; + + /* 16-bit LDR (SP-relative): 1001 1 Rt3 imm8, Rt in bits [10:8]. + * (0x9000 is the STR form — no GPR dest.) 
*/ + if ((hw & 0xF800) == 0x9800) + return (hw >> 8) & 0x07; + + /* 16-bit ADR / ADD (SP plus immediate): 1010 x Rd3 imm8, Rd in bits [10:8]. */ + if ((hw & 0xF000) == 0xA000) + return (hw >> 8) & 0x07; + + /* 16-bit sign/zero extend (SXTH/SXTB/UXTH/UXTB): 1011 0010 oo Rm3 Rd3. */ + if ((hw & 0xFF00) == 0xB200) + return hw & 0x07; + + /* Remaining low-register and memory forms either don't write a single GPR + * or are decoded by decode_str_ldr_imm before reaching here. */ + return -1; } if (op.size == 4) @@ -2120,6 +2884,16 @@ static int thumb_decode_dest_reg(thumb_opcode op) return (lo >> 8) & 0x0F; if ((hi & 0xFBF0) == 0xF2C0 && (lo & 0x8000) == 0) /* MOVT */ return (lo >> 8) & 0x0F; + /* Thumb-2 data-processing (shifted register): 1110 101x xxxx nnnn | + * 0iii dddd iitt mmmm. Rd = lo[11:8]; Rd==PC (0xF) marks the flag-setter + * variant (CMP.W/CMN.W/TST.W/TEQ.W — no GPR write). */ + if ((hi & 0xFE00) == 0xEA00 && (lo & 0x8000) == 0) + { + int rd = (lo >> 8) & 0x0F; + if (rd != 0x0F) + return rd; + return -1; + } } return -1; @@ -2130,9 +2904,18 @@ int ot(thumb_opcode op) if (op.size == 0) return op.size; + /* DEBUG: emit-stream trace for the 90_struct miscompile. Same compiler + + * identical stable allocation ⇒ device and QEMU emit identical opcode streams + * up to the silicon-divergent branch; diffing this trace pinpoints the first + * differing emitted instruction (and its IR index). Real-run only. */ + if (!dry_run_state.active && funcname && + !strcmp((const char *)funcname, "test_init_struct_from_struct") && tcc_state && tcc_state->ir) + fprintf(stderr, "EMIT i=%d ind=0x%x op=0x%x sz=%d\n", tcc_state->ir->codegen_instruction_idx, (unsigned)ind, + (unsigned)op.opcode, op.size); + /* Detect instructions that write to R9 when it's reserved for GOT pointer. * Exclude push/pop/stmdb/ldmia which legitimately save/restore R9. 
*/ - if (text_and_data_separation) + if (text_and_data_separation && !allow_r9_write) { int dest = thumb_decode_dest_reg(op); if (dest == R9) @@ -2142,6 +2925,149 @@ int ot(thumb_opcode op) } } + /* Update the MOV-coalescing register-equivalence cache and the STR->LDR + * redundant-reload cache based on what is about to be emitted. This only + * tracks state — no elision happens here; elision is performed at the + * call sites via ot_check_mov_reg / ot_check_ldr_imm so that ot()'s + * return value remains the real emitted size and downstream jump/offset + * accounting never sees a phantom emission. + * + * IT blocks: instructions inside an IT/ITx/ITxy/ITxyz are conditionally + * executed. Their writes are therefore not guaranteed, so destination + * registers must be invalidated rather than recorded as equivalences. */ + if (thumb_gen_state.generating_function) + { + if (mov_equiv_it_pending > 0) + { + /* Conditional instruction: pessimistically drop anything this op + * might write, and never record new equivalences. Treat STR/LDR + * the same way — their effect is gated on the IT condition. */ + int mv_rd = -1, mv_rm = -1; + if (decode_mov_reg_plain(op, &mv_rd, &mv_rm)) + { + mov_equiv_invalidate_reg(mv_rd); + strldr_cache_invalidate_reg(mv_rd); + imm_cache_invalidate_reg(mv_rd); + } + else if (thumb_op_is_pure_flag_setter(op)) + { + /* CMP/CMN/TST/TEQ — no GPR clobber even under predication. */ + } + else + { + int dest = thumb_decode_dest_reg(op); + if (dest >= 0) + { + mov_equiv_invalidate_reg(dest); + strldr_cache_invalidate_reg(dest); + imm_cache_invalidate_reg(dest); + } + else + { + mov_equiv_reset_all(); + tcc_gen_machine_strldr_cache_reset(); + imm_cache_reset_all(); + } + } + mov_equiv_it_pending--; + } + else + { + int it_len = mov_equiv_it_block_length(op); + if (it_len > 0) + { + /* IT itself writes no GPR; start the conditional window. 
*/ + mov_equiv_it_pending = it_len; + } + else + { + int mv_rd = -1, mv_rm = -1; + int sl_is_str = 0, sl_rt = 0, sl_rn = 0, sl_imm = 0; + uint32_t sl_puw = 0; + if (decode_str_ldr_imm(op, &sl_is_str, &sl_rt, &sl_rn, &sl_imm, &sl_puw)) + { + if (sl_is_str) + { + /* STR does not write a register; record the store for + * redundant-reload matching. MOV-equiv is unaffected. */ + strldr_cache_record_str(sl_rt, sl_rn, sl_imm, sl_puw, op.size); + } + else + { + /* LDR writes Rt: invalidate both caches for that register. + * If the call-site helper ran the match it would have + * elided without reaching ot(); so if we get here, this LDR + * is actually emitting and genuinely clobbers Rt. */ + mov_equiv_invalidate_reg(sl_rt); + strldr_cache_invalidate_reg(sl_rt); + imm_cache_invalidate_reg(sl_rt); + } + } + else if (decode_mov_reg_plain(op, &mv_rd, &mv_rm)) + { + mov_equiv_record_mov(mv_rd, mv_rm); + strldr_cache_invalidate_reg(mv_rd); + imm_cache_invalidate_reg(mv_rd); + } + else if (op.size == 4 && + (((op.opcode >> 16) & 0xFE40) == 0xE840)) + { + /* LDRD/STRD (Thumb-2): encoded as 1110 100P U1W0 nnnn (STRD) or + * 1110 100P U1W1 nnnn (LDRD). Bit 20 (high-halfword bit 4) + * distinguishes load (1) vs store (0). + * + * STRD writes no GPR — only memory. LDRD writes both Rt and Rt2 + * (low-halfword bits [15:12] and [11:8] respectively). Either way + * the rest of the GPR-equivalence cache is unaffected, so don't + * fall through to the "unknown opcode → reset everything" path + * which destroys upstream coalescing wins. */ + if ((op.opcode >> 20) & 1) + { + /* LDRD: invalidate Rt and Rt2 (writeback to Rn is rare here and + * already covered by the writeback handling — for the typical + * STRD imm with W=0 used by the codegen we don't touch Rn). 
*/ + int rt = (int)((op.opcode >> 12) & 0xF); + int rt2 = (int)((op.opcode >> 8) & 0xF); + mov_equiv_invalidate_reg(rt); + mov_equiv_invalidate_reg(rt2); + strldr_cache_invalidate_reg(rt); + strldr_cache_invalidate_reg(rt2); + imm_cache_invalidate_reg(rt); + imm_cache_invalidate_reg(rt2); + } + /* STRD: no GPR write, leave the mov_equiv cache alone. */ + } + else if (thumb_op_is_pure_flag_setter(op)) + { + /* CMP/CMN/TST/TEQ write only the flags — no GPR clobber, no + * cache invalidation needed. */ + } + else + { + int dest = thumb_decode_dest_reg(op); + if (dest >= 0) + { + mov_equiv_invalidate_reg(dest); + strldr_cache_invalidate_reg(dest); + imm_cache_invalidate_reg(dest); + } + else + { + mov_equiv_reset_all(); + tcc_gen_machine_strldr_cache_reset(); + imm_cache_reset_all(); + } + } + } + } + } + else + { + mov_equiv_reset_all(); + tcc_gen_machine_strldr_cache_reset(); + imm_cache_reset_all(); + } + /* Dry run: don't emit actual opcodes, but still track code size and * handle literal pool generation to ensure code addresses match real pass. 
*/ if (dry_run_state.active) @@ -2245,6 +3171,14 @@ int decbranch(int pos) xa = ret + pos + 4; } + else if ((xa & 0xf500) == 0xb100) + { + /* CBZ/CBNZ encoding: offset = (i:imm5) * 2, forward only */ + uint32_t i_bit = (xa >> 9) & 1; + uint32_t imm5 = (xa >> 3) & 0x1f; + uint32_t imm6 = (i_bit << 5) | imm5; + xa = (int)(imm6 * 2) + pos + 4; + } else { tcc_error("internal error: decbranch unknown encoding pos 0x%x, inst: 0x%x\n", pos, xa); @@ -2258,9 +3192,9 @@ static thumb_opcode th_generic_mov_imm(uint32_t r, int imm) { if (imm < 0) { - return th_mvn_imm(r, 0, -imm - 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + return th_mvn_imm(r, 0, -imm - 1, flags_safe(), ENFORCE_ENCODING_NONE); } - return th_mov_imm(r, imm, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + return th_mov_imm(r, imm, flags_safe(), ENFORCE_ENCODING_NONE); } static ScratchRegAlloc th_offset_to_reg_ex(int off, int sign, uint32_t exclude_regs) { @@ -2278,7 +3212,7 @@ static ScratchRegAlloc th_offset_to_reg_ex(int off, int sign, uint32_t exclude_r } if (sign) - ot_check(th_rsb_imm(rr, rr, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_rsb_imm(rr, rr, 0, flags_safe(), ENFORCE_ENCODING_NONE)); return alloc; } @@ -2326,6 +3260,22 @@ int th_patch_call(int t, int a) x[0] |= enc >> 16; x[1] |= enc; } + else if ((*x & 0xf500) == 0xb100) + { + /* CBZ/CBNZ: 16-bit, forward-only, range 0-126 bytes. + * CBZ base = 0xb100, CBNZ base = 0xb900; both match (x & 0xf500) == 0xb100 + * since bit 11 (0x0800) is not in the mask. 
+ * Encoding: op | (i << 9) | (imm5 << 3) | Rn + * where offset = (i:imm5) * 2 */ + int offset = a - (lt + 4); /* PC-relative, Thumb PC = insn + 4 */ + if (offset < 0 || offset > 126 || (offset & 1)) + tcc_error("compiler_error: CBZ/CBNZ target out of range: offset=%d", offset); + uint32_t imm6 = (uint32_t)offset >> 1; + uint32_t i_bit = (imm6 >> 5) & 1; + uint32_t imm5 = imm6 & 0x1f; + *x &= 0xfd07; /* Keep base opcode, NZ bit, and Rn */ + *x |= (uint16_t)((i_bit << 9) | (imm5 << 3)); + } else tcc_error("compiler_error: unhandled branch type in th_patch_call for: t: " "0x%x, a: 0x%x, x: 0x%x 0x%x\n", @@ -2334,37 +3284,50 @@ int th_patch_call(int t, int a) return t; } -static void gadd_sp(int val) +/* Add a value to SP. When the immediate doesn't fit the ADD/SUB SP encoding, + * a scratch register is needed. scratch_reg selects which one: + * >= 0 : use that specific physical register (caller guarantees it's free) + * < 0 : default to R_IP (safe in prologue/epilogue where R0-R3 hold args) + */ +static void gadd_sp_ex(int val, int scratch_reg) { if (val == 0) return; + if (scratch_reg < 0) + scratch_reg = R_IP; + if (val > 0) { - thumb_opcode add_imm = th_add_sp_imm(R_SP, (uint32_t)val, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + thumb_opcode add_imm = th_add_imm(R_SP, R_SP, (uint32_t)val, flags_safe(), ENFORCE_ENCODING_NONE); if (is_valid_opcode(add_imm)) { ot(add_imm); return; } - /* Large adjustment: materialize value into IP and add via register form. 
*/ - load_full_const(R_IP, PREG_NONE, (uint32_t)val, 0); - ot_check(th_add_sp_reg(R_SP, R_IP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE, THUMB_SHIFT_DEFAULT)); + load_full_const(scratch_reg, PREG_NONE, (uint32_t)val, 0); + ot_check( + th_add_reg(R_SP, R_SP, scratch_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); return; } /* val < 0 */ const uint32_t sub = (uint32_t)(-val); - thumb_opcode sub_imm = th_sub_sp_imm(R_SP, sub, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + thumb_opcode sub_imm = th_sub_imm(R_SP, R_SP, sub, flags_safe(), ENFORCE_ENCODING_NONE); if (is_valid_opcode(sub_imm)) { ot(sub_imm); return; } - load_full_const(R_IP, PREG_NONE, (uint32_t)sub, 0); - ot_check(th_sub_sp_reg(R_SP, R_IP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + load_full_const(scratch_reg, PREG_NONE, (uint32_t)sub, 0); + ot_check(th_sub_reg(R_SP, R_SP, scratch_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); +} + +static void gadd_sp(int val) +{ + gadd_sp_ex(val, -1); } void ggoto(void) @@ -2387,6 +3350,34 @@ ST_FUNC void tcc_gen_machine_indirect_jump_mop(MachineOperand src, TccIrOp op) mach_release_all(&ctx); } +/* Returns the number of bytes emitted by tcc_gen_machine_switch_table_mop for + * a table with the given number of entries. Used by the dry-run pass in + * codegen.c so that branch-offset analysis is accurate without the backend + * having to emit any real instructions. */ +ST_FUNC int tcc_gen_machine_switch_table_dry_run_size(int num_entries) +{ + /* Layout: LSL.W(4) + ADD(2) + LDR.W(4) + ADD(2) + BX(2) = 14 bytes preamble + * + 4 bytes per table entry (32-bit signed PC-relative offsets). */ + return 14 + num_entries * 4; +} + +/* Force any pending literal pool to be flushed before a region of + * `upcoming_bytes` is emitted, if leaving the pool pending that long would + * push its load out of range. 
Public wrapper so codegen.c can reserve + * space symmetrically in both the dry-run and real-run passes. + * + * The SWITCH_TABLE dispatch needs this: its preamble (LSL/ADD/LDR/ADD/BX) + * must be emitted atomically — a literal-pool flush in the middle relocates + * the terminal `ADD Rt, PC; BX Rt` past the pool (bridged by a B.W), which + * invalidates the `ref_point == table_start` assumption that the switch- + * table offset backpatch in codegen.c relies on, producing a wild jump. + * Flushing the pool up front (in both passes, so dry-run size estimates and + * real-run addresses stay consistent) keeps the preamble + table contiguous. */ +ST_FUNC void tcc_gen_machine_reserve_pool_bytes(int upcoming_bytes) +{ + th_literal_pool_reserve_upcoming_bytes(upcoming_bytes); +} + /* MOP variant: accepts a MachineOperand for the index register. */ ST_FUNC void tcc_gen_machine_switch_table_mop(MachineOperand src, TCCIRSwitchTable *table, TCCIRState *ir, int ir_idx) { @@ -2400,13 +3391,14 @@ ST_FUNC void tcc_gen_machine_switch_table_mop(MachineOperand src, TCCIRSwitchTab if (!thumb_is_hw_reg(index_reg)) tcc_error("internal error: SWITCH_TABLE index not in a hardware register (mop)"); - /* Reuse index_reg as scratch - it's dead after SWITCH_TABLE (terminator). */ - int rt = index_reg; + /* Use R_IP as scratch to avoid clobbering index_reg, which may still be + live at the switch targets (SSA can place the loop counter directly here). 
*/ + int rt = R_IP; - ot_check(th_lsl_imm(rt, index_reg, 2, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_32BIT)); - ot_check(th_add_reg(rt, rt, R_PC, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(rt, rt, 6, 6, ENFORCE_ENCODING_32BIT)); - ot_check(th_add_reg(rt, rt, R_PC, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_lsl_imm(rt, index_reg, 2, flags_safe(), ENFORCE_ENCODING_32BIT)); + ot_check(th_add_reg(rt, rt, R_PC, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(rt, rt, 6, 6, ENFORCE_ENCODING_32BIT); + ot_check(th_add_reg(rt, rt, R_PC, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); ot_check(th_bx_reg(rt)); int table_start = ind; @@ -2421,6 +3413,77 @@ ST_FUNC void tcc_gen_machine_switch_table_mop(MachineOperand src, TCCIRSwitchTab mach_release_all(&ctx); } +/* SWITCH_LOAD: data-table dispatch that loads values[index] into dest. + * + * Layout (uniform 14-byte preamble): + * + * LSL.W rt, index, #2 (4 bytes) + * ADD rt, rt, pc (2 bytes) ; PC=preamble_start+8 + * LDR.W ip, [rt, #6] (4 bytes) ; load table[index] -> ip + * B.W skip (4 bytes) ; jump past the table + * (4*N bytes) + * skip: + * [optional STR/MOV ip -> dest] ; only if dest is spilled, + * ; emitted by the IR-level + * ; ASSIGN that follows. + * + * The fixed loaded register is R_IP (same as SWITCH_TABLE's scratch); the + * IR-level optimization wraps SWITCH_LOAD with an ASSIGN that places IP into + * the real dest, so we don't need a separate spill path here. + * + * SYMREF entries emit R_ARM_ABS32 relocations at their table slots; the + * linker fills in the absolute symbol address. + */ +/* SWITCH_LOAD dispatch size: literal-pool LDR (4 bytes, T2 encoding for + * R_IP) + indexed shifted LDR.W (4 bytes). The table itself lives in + * .rodata and contributes no .text bytes. 
*/ +ST_FUNC int tcc_gen_machine_switch_load_dry_run_size(int num_entries) +{ + (void)num_entries; + return 8; +} + +ST_FUNC void tcc_gen_machine_switch_load_mop(MachineOperand src, MachineOperand dest, TCCIRSwitchValueTable *vtab, + TCCIRState *ir, int ir_idx) +{ + (void)ir_idx; + (void)ir; + + TRACE("'tcc_gen_machine_switch_load_mop' vt_id=%d entries=%d\n", (int)(vtab - ir->switch_value_tables), + vtab->num_entries); + + if (!vtab->rodata_sym) + tcc_error("internal error: SWITCH_LOAD table has no rodata symbol (switch_to_data should have allocated it)"); + + MachineCodegenContext ctx = {0}; + /* Keep the index out of R_IP, which we clobber with the table base below. */ + int index_reg = mach_ensure_in_reg(&ctx, &src, (1u << (uint32_t)R_IP)); + if (!thumb_is_hw_reg(index_reg)) + tcc_error("internal error: SWITCH_LOAD index not in a hardware register"); + + /* Resolve the destination register. The switch_to_data optimization tries to + * keep the SWITCH_LOAD dest in a hardware register, but under high register + * pressure the allocator can spill it (or it may be an lvalue store). Rather + * than bail out, allocate a scratch via mach_get_dest_reg() and store it back + * with mach_writeback_dest() afterwards. Exclude index_reg and R_IP — both + * are read by the indexed load below. */ + uint32_t dest_excl = (1u << (uint32_t)index_reg) | (1u << (uint32_t)R_IP); + int dest_reg = mach_get_dest_reg(&ctx, &dest, dest_excl); + + /* Load the table's base address from the literal pool into IP. */ + _lfc_sym = vtab->rodata_sym; + load_full_const(R_IP, PREG_NONE, 0, 0); + + /* dest = table[index] via LDR.W dest, [ip, index, LSL #2]. */ + thumb_shift shift = {THUMB_SHIFT_LSL, 2, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_ldr_reg((uint32_t)dest_reg, (uint32_t)R_IP, (uint32_t)index_reg, shift, ENFORCE_ENCODING_32BIT)); + + /* If the dest was a spill slot or lvalue, write the loaded value back. 
*/ + mach_writeback_dest(&dest, dest_reg); + + mach_release_all(&ctx); +} + void gsym_addr(int t, int a) { TRACE("'gsym_addr' %.8x branch target: %.8x\n", t, a); @@ -2446,7 +3509,7 @@ ST_FUNC void gen_vla_alloc(CType *type, int align) int r = gv(RC_INT); /* r = SP - r */ - ot_check(th_sub_reg(r, R_SP, r, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_reg(r, R_SP, r, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); if (align < 8) align = 8; @@ -2456,7 +3519,7 @@ ST_FUNC void gen_vla_alloc(CType *type, int align) if (align > 1) { /* Try immediate BIC first; if it doesn't encode, fall back to register mask. */ - if (!ot(th_bic_imm(r, r, (uint32_t)(align - 1), FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE))) + if (!ot(th_bic_imm(r, r, (uint32_t)(align - 1), flags_safe(), ENFORCE_ENCODING_NONE))) { ScratchRegAlloc mask_alloc = get_scratch_reg_with_save(1u << r); int mask_reg = mask_alloc.reg; @@ -2464,7 +3527,7 @@ ST_FUNC void gen_vla_alloc(CType *type, int align) { load_full_const(mask_reg, PREG_NONE, LFC_SPLIT(align - 1)); } - ot_check(th_bic_reg(r, r, mask_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_bic_reg(r, r, mask_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); if (mask_alloc.saved) { ot_check(th_pop(1u << mask_reg)); @@ -2473,7 +3536,7 @@ ST_FUNC void gen_vla_alloc(CType *type, int align) } /* SP = r */ - ot_check(th_mov_reg(R_SP, r, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_SP, r, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); vpop(); } @@ -2487,9 +3550,12 @@ ST_FUNC void gen_vla_sp_save(int addr) int off = fp_adjust_local_offset(addr, 0 /* not param */); int sign = (off < 0) ? 1 : 0; int abs_off = sign ? -off : off; + const int base_reg = tcc_state->need_frame_pointer ? 
R_FP : R_SP; - ot_check(th_mov_reg(R_IP, R_SP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); - th_store32_imm_or_reg_ex(R_IP, R_FP, abs_off, sign, 0); + ScratchRegAlloc vla_sc = get_scratch_reg_with_save(0); + ot_check_mov_reg(vla_sc.reg, R_SP, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + th_store32_imm_or_reg_ex(vla_sc.reg, base_reg, abs_off, sign, 0); + restore_scratch_reg(&vla_sc); } ST_FUNC void gen_vla_sp_restore(int addr) @@ -2501,9 +3567,12 @@ ST_FUNC void gen_vla_sp_restore(int addr) int off = fp_adjust_local_offset(addr, 0 /* not param */); int sign = (off < 0) ? 1 : 0; int abs_off = sign ? -off : off; + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; - load_from_base(R_IP, PREG_REG_NONE, IROP_BTYPE_INT32, 0, abs_off, sign, R_FP); - ot_check(th_mov_reg(R_SP, R_IP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ScratchRegAlloc vla_sc = get_scratch_reg_with_save(0); + load_from_base(vla_sc.reg, PREG_REG_NONE, IROP_BTYPE_INT32, 0, abs_off, sign, base_reg); + ot_check_mov_reg(R_SP, vla_sc.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + restore_scratch_reg(&vla_sc); } int load_ushort_from_base(int ir, int base, int fc, int sign) @@ -2541,6 +3610,190 @@ int store_word_to_base(int ir, int base, int fc, int sign) return ot(ins); } +/* Returns 1 if a 64-bit access at (sym + addend) is guaranteed 4-byte aligned + * (so LDRD/STRD is safe). Conservative: only allows natural alignment for + * non-struct, non-packed symbols, plus any explicit alignment >= 4. 
*/ +static int sym_is_4_byte_aligned_for_64bit(Sym *sym, int32_t addend) +{ + if (!sym) + return 0; + if ((addend & 3) != 0) + return 0; + if (sym->a.packed) + return 0; + if (sym->a.aligned >= 3) /* explicit alignment 2^(n-1) >= 4 */ + return 1; + if (sym->a.aligned > 0) /* explicit 1 or 2 byte alignment — not safe */ + return 0; + /* sym->a.aligned == 0: rely on the declared type's natural alignment. + * Structs/unions may be packed-wrapped; reject conservatively. Native + * scalars (long long, double, pointer) have natural alignment >= 4. */ + int btype = sym->type.t & VT_BTYPE; + if (btype == VT_STRUCT) + return 0; + return 1; +} + +/* Try to emit STRD Rt, Rt2, [base, #±abs_off] for a 64-bit paired store. + * Constraints (Thumb-2 STRD imm T1): + * - Rt != Rt2 + * - Rt, Rt2 in r0..r12 or r14 (not SP, not PC) + * - abs_off 4-byte aligned and <= 1020 + * Returns 1 on success, 0 if the caller must fall back to two 32-bit stores. */ +static int try_strd_pair(int lo_reg, int hi_reg, int base, int abs_off, int sign) +{ + if ((unsigned)base > 15) + return 0; + if ((abs_off & 3) != 0 || abs_off > 1020) + return 0; + if (lo_reg < 0 || lo_reg > R_LR || lo_reg == R_SP) + return 0; + if (hi_reg < 0 || hi_reg > R_LR || hi_reg == R_SP) + return 0; + const uint32_t puw = sign ? 4u : 6u; + ot_check(th_strd_imm((uint32_t)lo_reg, (uint32_t)hi_reg, (uint32_t)base, abs_off, puw)); + return 1; +} + +/* Mirror of try_strd_pair for LDRD. Same register and offset constraints; + * the caller is responsible for guaranteeing 4-byte alignment of the target + * address (stack, or a symbol that passes sym_is_4_byte_aligned_for_64bit). */ +static int try_ldrd_pair(int lo_reg, int hi_reg, int base, int abs_off, int sign) +{ + if ((abs_off & 3) != 0 || abs_off > 1020) + return 0; + if (lo_reg < 0 || lo_reg > R_LR || lo_reg == R_SP) + return 0; + if (hi_reg < 0 || hi_reg > R_LR || hi_reg == R_SP) + return 0; + if (lo_reg == hi_reg) + return 0; + const uint32_t puw = sign ? 
4u : 6u; + ot_check(th_ldrd_imm((uint32_t)lo_reg, (uint32_t)hi_reg, (uint32_t)base, abs_off, puw)); + return 1; +} + +/* Emit a single STR to a spill slot. Used by the codegen STRD pairing logic + * to flush a pending store when pairing wasn't possible. */ +ST_FUNC void tcc_gen_machine_store_spill(int src_reg, int32_t spill_offset) +{ + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; + int adj = fp_adjust_local_offset(spill_offset, 0); + int sign = (adj < 0); + int abs_off = sign ? -adj : adj; + ot_check_str_imm((uint32_t)src_reg, (uint32_t)base_reg, + abs_off, sign ? 4u : 6u, ENFORCE_ENCODING_NONE); +} + +/* Try to emit STRD for two 32-bit values to adjacent spill slots. + * off1 must be the lower offset (off1 + 4 == off2). + * Returns 1 on success, 0 if STRD constraints not met. */ +ST_FUNC int tcc_gen_machine_try_strd_spill(int reg1, int32_t off1, int reg2, int32_t off2) +{ + if (off1 + 4 != off2) + return 0; + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; + int adj = fp_adjust_local_offset(off1, 0); + int sign = (adj < 0); + int abs_off = sign ? -adj : adj; + return try_strd_pair(reg1, reg2, base_reg, abs_off, sign); +} + +/* Try to emit LDRD for two 32-bit values from adjacent spill slots. + * off1 must be the lower offset (off1 + 4 == off2). + * Returns 1 on success, 0 if LDRD constraints not met. */ +ST_FUNC int tcc_gen_machine_try_ldrd_spill(int reg1, int32_t off1, int reg2, int32_t off2) +{ + if (off1 + 4 != off2) + return 0; + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; + int adj = fp_adjust_local_offset(off1, 0); + int sign = (adj < 0); + int abs_off = sign ? -adj : adj; + return try_ldrd_pair(reg1, reg2, base_reg, abs_off, sign); +} + +/* Try to emit LDRD/STRD for two 32-bit values from adjacent offsets off a + * generic base register (not FP/SP). Used by the LOAD_INDEXED/STORE_INDEXED + * pairing peephole. 
`off` is the lower offset (caller has verified + * off + 4 fits within the same access range). Returns 1 on success. */ +ST_FUNC int tcc_gen_machine_try_ldrd_base(int reg1, int reg2, int base_reg, int32_t off) +{ + int sign = (off < 0); + int abs_off = sign ? -off : off; + return try_ldrd_pair(reg1, reg2, base_reg, abs_off, sign); +} + +ST_FUNC int tcc_gen_machine_try_strd_base(int reg1, int reg2, int base_reg, int32_t off) +{ + int sign = (off < 0); + int abs_off = sign ? -off : off; + return try_strd_pair(reg1, reg2, base_reg, abs_off, sign); +} + +ST_FUNC int tcc_gen_machine_try_strd_imm_spill(int64_t val1, int64_t val2, + int32_t off1, int32_t off2) +{ + if (off1 + 4 != off2) + return 0; + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; + int adj = fp_adjust_local_offset(off1, 0); + int sign = (adj < 0); + int abs_off = sign ? -adj : adj; + if ((abs_off & 3) != 0 || abs_off > 1020) + return 0; + + MachineCodegenContext ctx = {0}; + MachineOperand op1 = {.kind = MACH_OP_IMM, .u.imm.val = val1}; + int r1 = mach_ensure_in_reg(&ctx, &op1, 0); + int r2; + if (val1 == val2) { + r2 = r1; + } else { + MachineOperand op2 = {.kind = MACH_OP_IMM, .u.imm.val = val2}; + r2 = mach_ensure_in_reg(&ctx, &op2, (1u << (uint32_t)r1)); + } + if (r1 == R_SP || r2 == R_SP) { + mach_release_all(&ctx); + return 0; + } + const uint32_t puw = sign ? 4u : 6u; + ot_check(th_strd_imm((uint32_t)r1, (uint32_t)r2, (uint32_t)base_reg, abs_off, puw)); + mach_release_all(&ctx); + return 1; +} + +ST_FUNC int tcc_gen_machine_try_strd_imm_base(int64_t val1, int64_t val2, + int base_reg, int32_t off) +{ + int sign = (off < 0); + int abs_off = sign ? 
-off : off; + if ((unsigned)base_reg > 15) + return 0; + if ((abs_off & 3) != 0 || abs_off > 1020) + return 0; + + uint32_t excl = (1u << (uint32_t)base_reg); + MachineCodegenContext ctx = {0}; + MachineOperand op1 = {.kind = MACH_OP_IMM, .u.imm.val = val1}; + int r1 = mach_ensure_in_reg(&ctx, &op1, excl); + int r2; + if (val1 == val2) { + r2 = r1; + } else { + MachineOperand op2 = {.kind = MACH_OP_IMM, .u.imm.val = val2}; + r2 = mach_ensure_in_reg(&ctx, &op2, excl | (1u << (uint32_t)r1)); + } + if (r1 == R_SP || r2 == R_SP) { + mach_release_all(&ctx); + return 0; + } + const uint32_t puw = sign ? 4u : 6u; + ot_check(th_strd_imm((uint32_t)r1, (uint32_t)r2, (uint32_t)base_reg, abs_off, puw)); + mach_release_all(&ctx); + return 1; +} + ST_FUNC int tcc_machine_can_encode_stack_offset_for_reg(int frame_offset, int dest_reg) { /* Check if frame_offset can be directly encoded in ldr/str instructions @@ -2576,6 +3829,19 @@ ST_FUNC void tcc_machine_load_spill_slot(int dest_reg, int frame_offset) /* Adjust for callee-saved gap below FP (spill slots are always locals) */ frame_offset = fp_adjust_local_offset(frame_offset, 0); + + /* Peephole: if the previous emit was a STR or LDR of the same register to/from + * the same slot AND no other instruction has been emitted since, the value is + * already in dest_reg — skip the redundant load. */ + TCCIRState *ir = tcc_state ? tcc_state->ir : NULL; + if (ir && ir->spill_cache.last_emit_kind != 0 && + ir->spill_cache.last_emit_ind == ind && + ir->spill_cache.last_emit_reg == dest_reg && + ir->spill_cache.last_emit_offset == frame_offset) + { + return; + } + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; const int sign = (frame_offset < 0); const int abs_offset = sign ? 
-frame_offset : frame_offset; @@ -2587,6 +3853,14 @@ ST_FUNC void tcc_machine_load_spill_slot(int dest_reg, int frame_offset) ot_check(th_ldr_reg(dest_reg, base_reg, rr, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&rr_alloc); } + + if (ir) + { + ir->spill_cache.last_emit_kind = 2; /* LDR */ + ir->spill_cache.last_emit_ind = ind; + ir->spill_cache.last_emit_reg = (int8_t)dest_reg; + ir->spill_cache.last_emit_offset = frame_offset; + } } ST_FUNC void tcc_machine_store_spill_slot(int src_reg, int frame_offset) @@ -2621,6 +3895,15 @@ ST_FUNC void tcc_machine_store_spill_slot(int src_reg, int frame_offset) ot_check(th_str_reg(src_reg, base_reg, rr, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&rr_alloc); } + + TCCIRState *ir = tcc_state ? tcc_state->ir : NULL; + if (ir) + { + ir->spill_cache.last_emit_kind = 1; /* STR */ + ir->spill_cache.last_emit_ind = ind; + ir->spill_cache.last_emit_reg = (int8_t)src_reg; + ir->spill_cache.last_emit_offset = frame_offset; + } } /* Like tcc_machine_store_spill_slot, but for stack-passed parameters. 
@@ -2743,9 +4026,10 @@ static ThumbLiteralPoolEntry *th_literal_pool_allocate() dry_run_literal_pool_size <<= 1; dry_run_literal_pool = tcc_realloc(dry_run_literal_pool, dry_run_literal_pool_size * sizeof(ThumbLiteralPoolEntry)); + tcc_chained_hash_reserve(&literal_pool_hash, dry_run_literal_pool_size); } entry = &dry_run_literal_pool[dry_run_literal_pool_count++]; - memset(entry, 0, sizeof(ThumbLiteralPoolEntry)); + entry->sym = NULL; entry->relocation = -1; entry->shared_index = -1; /* Track the count in the main state for code size calculations */ @@ -2758,9 +4042,10 @@ static ThumbLiteralPoolEntry *th_literal_pool_allocate() const int new_size = thumb_gen_state.literal_pool_size << 1; thumb_gen_state.literal_pool = tcc_realloc(thumb_gen_state.literal_pool, new_size * sizeof(ThumbLiteralPoolEntry)); thumb_gen_state.literal_pool_size = new_size; + tcc_chained_hash_reserve(&literal_pool_hash, new_size); } entry = &thumb_gen_state.literal_pool[thumb_gen_state.literal_pool_count++]; - memset(entry, 0, sizeof(ThumbLiteralPoolEntry)); + entry->sym = NULL; entry->relocation = -1; entry->shared_index = -1; return entry; @@ -2772,22 +4057,33 @@ static ThumbLiteralPoolEntry *th_literal_pool_allocate() static ThumbLiteralPoolEntry *th_literal_pool_find_or_allocate(Sym *sym, int64_t imm) { int found_index; - LiteralPoolHashEntry *hash; + uint32_t full_hash; + TCCChainedHash *hash; + LiteralPoolLookupCache *cache; + ThumbLiteralPoolEntry *pool; int new_index; if (dry_run_state.active) { - hash = dry_run_literal_pool_hash; + hash = &literal_pool_hash; + cache = &literal_pool_last_lookup; + pool = dry_run_literal_pool; new_index = dry_run_literal_pool_count; } else { - hash = literal_pool_hash; + hash = &literal_pool_hash; + cache = &literal_pool_last_lookup; + pool = thumb_gen_state.literal_pool; new_index = thumb_gen_state.literal_pool_count; } - /* O(1) hash lookup instead of O(n) linear search */ - found_index = literal_pool_hash_find(hash, sym, imm); + full_hash = 
literal_pool_hash_func(sym, imm); + found_index = literal_pool_lookup_cache_find(cache, full_hash, sym, imm); + if (found_index < 0) + { + found_index = literal_pool_hash_find(hash, pool, full_hash, sym, imm); + } /* Allocate new entry */ ThumbLiteralPoolEntry *entry = th_literal_pool_allocate(); @@ -2798,9 +4094,10 @@ static ThumbLiteralPoolEntry *th_literal_pool_find_or_allocate(Sym *sym, int64_t } else { - /* This is a new primary entry - add to hash table */ - literal_pool_hash_insert(hash, sym, imm, new_index); + literal_pool_hash_insert(hash, full_hash, new_index); + found_index = new_index; } + literal_pool_lookup_cache_insert(cache, full_hash, sym, imm, found_index); return entry; } @@ -2809,15 +4106,44 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) struct Sym *sym = _lfc_sym; _lfc_sym = NULL; int64_t imm = (int64_t)((uint64_t)imm_hi << 32 | (uint64_t)imm_lo); - ElfSym *esym = NULL; ThumbLiteralPoolEntry *entry; - int sym_off = 0; thumb_opcode load_ins; int patch_pos; /* Validate symbol - only use symbols that can be externalized */ sym = validate_sym_for_reloc(sym); + /* Stable cache key: the validated symbol *before* the registration block + * below may NULL it. Registration is skipped during dry-run, so using the + * post-registration `sym` would make the dry and real passes disagree on + * cache hits and desynchronise code size. `reuse_sym` is identical in both + * passes (validate_sym_for_reloc does not depend on dry-run state). */ + Sym *reuse_sym = sym; + + /* Symbol-address reuse: when a register already holds &sym+imm, skip the + * redundant literal-pool load. Uses the same per-register imm_cache that + * is invalidated on every clobbering emit and at IR boundaries, so the + * decision is deterministic across the dry-run and real passes. Only the + * single-register (non-LDRD) form participates. 
*/ + if (reuse_sym && thumb_gen_state.generating_function && r1 == PREG_NONE && r >= 0 && r < 16) + { + if (imm_cache[r].valid && imm_cache[r].sym == reuse_sym && imm_cache[r].value == imm) + return; /* r already holds &sym+imm */ + for (int rr = 0; rr < 16; rr++) + { + if (rr != r && imm_cache[rr].valid && imm_cache[rr].sym == reuse_sym && imm_cache[rr].value == imm) + { + /* Another register holds it: copy instead of reloading from the + * literal pool (saves a memory access and a pool word). */ + ot_check_mov_reg((uint32_t)r, (uint32_t)rr, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + imm_cache[r].value = imm; + imm_cache[r].sym = reuse_sym; + imm_cache[r].valid = 1; + return; + } + } + } + /* During dry-run, skip symbol registration and literal pool allocation. * We just emit the instruction (ot_check handles dry-run mode) to track * code size and scratch register usage, without creating side effects. */ @@ -2830,17 +4156,9 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) if (sym->c <= 0) { /* Registration failed - symbol can't be externalized */ - const char *name = get_tok_str(sym->v & ~SYM_FIELD, NULL); - fprintf(stderr, "[TCC-DIAG] load_full_const: put_extern_sym failed for '%s', c=%d\n", name ? name : "?", - sym->c); sym = NULL; } } - - if (sym) - { - esym = elfsym(sym); - } } TRACE("'load_full_const' to register: %d, with imm: %d\n", r, imm); @@ -2856,11 +4174,22 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) } else { - load_ins = th_ldrd_imm(r, r1, R_PC, 0, 4, ENFORCE_ENCODING_NONE); + load_ins = th_ldrd_imm(r, r1, R_PC, 0, 4); } ot_check(load_ins); patch_pos = ind - load_ins.size; + /* Record that r now holds &sym+imm so a later reference to the same global + * address can be elided. Must run after ot_check(), whose emit-level + * invalidation cleared imm_cache[r] for the LDR we just produced. Keyed on + * the pre-registration `reuse_sym` for dry/real-pass consistency. 
*/ + if (reuse_sym && thumb_gen_state.generating_function && r1 == PREG_NONE && r >= 0 && r < 16) + { + imm_cache[r].value = imm; + imm_cache[r].sym = reuse_sym; + imm_cache[r].valid = 1; + } + /* During dry-run, we still need to create the literal pool entry to ensure * the literal pool behavior (threshold checks, sharing, etc.) matches the real pass. * We still set sym so that find_or_allocate can match entries correctly. @@ -2872,30 +4201,29 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) entry->data_size = (r1 == PREG_NONE) ? 4 : 8; entry->short_instruction = (r1 == PREG_NONE && load_ins.size == 2); + if (!sym) + { + entry->imm = imm; + return; + } + /* Re-derive esym after ot_check(): literal pool generation during ot_check * can call put_elf_sym → section_ptr_add → section_realloc, which may * free and reallocate the symtab section buffer, invalidating any * earlier ElfSym pointer. */ - if (sym) - esym = elfsym(sym); + ElfSym *esym = elfsym(sym); + int sym_off = 0; if (esym) { sym_off = esym->st_shndx; } if (!pic) { - if (sym) - { - entry->relocation = R_ARM_ABS32; - /* The imm value is the addend (offset from symbol base). - For arr[i], imm = i * sizeof(element). - The linker will add the symbol's address to this addend. */ - entry->imm = imm; - } - else - { - entry->imm = imm; - } + entry->relocation = R_ARM_ABS32; + /* The imm value is the addend (offset from symbol base). + For arr[i], imm = i * sizeof(element). + The linker will add the symbol's address to this addend. */ + entry->imm = imm; } else { @@ -2925,25 +4253,38 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) * loader patches the slot to the runtime code address. 
*/ int sym_in_code_section = 0; + int sym_in_rodata = 0; if (sym_off > 0 && sym_off < tcc_state->nb_sections) { Section *sym_sec = tcc_state->sections[sym_off]; if (sym_sec && (sym_sec->sh_flags & SHF_EXECINSTR)) sym_in_code_section = 1; + /* Only the main .rodata section is anchor-addressed: R_ARM_RODATA_OFF + * resolves against rodata_section->sh_addr, so a symbol in any OTHER + * read-only section would be mis-addressed. Exact pointer match. */ + if (sym_sec && sym_sec == rodata_section) + sym_in_rodata = 1; + } + if (tcc_state->share_rodata && (sym->type.t & VT_STATIC) && sym_off != SHN_UNDEF && + sym_in_rodata) + { + /* Same-module pure-const .rodata symbol: address via the rodata + * anchor (shared base) + R_ARM_RODATA_OFF (offset within .rodata), + * not GOTOFF (which assumes rodata sits at a fixed distance from the + * per-process GOT — false once .rodata is shared XIP). */ + entry->relocation = R_ARM_RODATA_OFF; } - if (sym->type.t & VT_STATIC && sym_off != cur_text_section->sh_num && !sym_in_code_section) + else if (sym->type.t & VT_STATIC && sym_off != SHN_UNDEF && sym_off != cur_text_section->sh_num && + !sym_in_code_section) { - /* Static data symbol — GOTOFF (same segment as GOT) */ + /* Static data symbol — GOTOFF (same segment as GOT). + * sym_off == SHN_UNDEF means the function is forward-declared + * but not yet defined — we don't know its section, so we must + * use GOT32 (safe indirect path) instead of GOTOFF. */ entry->relocation = R_ARM_GOTOFF; } else { - if (sym->type.t & VT_STATIC && sym_off != cur_text_section->sh_num && sym_in_code_section) - { - const char *sym_name = get_tok_str(sym->v & ~SYM_FIELD, NULL); - fprintf(stderr, "[TCC] static code sym '%s' in sec %d (cur %d) -> GOT32\n", sym_name ? 
sym_name : "?", - sym_off, cur_text_section->sh_num); - } entry->relocation = R_ARM_GOT32; } } @@ -2978,18 +4319,39 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) if (sym_sec && (sym_sec->sh_flags & SHF_EXECINSTR)) sym_in_code_section_cg = 1; } - if (sym->type.t & VT_STATIC && sym_off != cur_text_section->sh_num && !sym_in_code_section_cg) + if (entry->relocation == R_ARM_RODATA_OFF) + { + /* Shared .rodata anchor: r holds (sym - rodata_base) from the + * R_ARM_RODATA_OFF literal. Add the rodata runtime base from the + * reserved GOT anchor slot: + * push {tmp}; ldr tmp, [R9, #24]; add r, r, tmp; pop {tmp} + * Use a DETERMINISTIC fixed scratch (a low register other than r, + * saved by push/pop), NOT get_scratch_reg_with_save: the latter's + * callee-saved fallback is gated on !dry_run_state.active, so under + * register pressure (e.g. ps's larger functions) it can pick a + * different register in the dry-run vs real pass, desync instruction + * sizes, and corrupt literal-pool offsets — yielding a near-NULL + * rodata address. A fixed push/pop emits identically in both passes. */ + int anchor_tmp = (r == 0) ? 
1 : 0; + ot_check(th_push((uint16_t)(1u << anchor_tmp))); + ot_check_ldr_imm(anchor_tmp, R9, YAFF_RODATA_ANCHOR_GOT_OFFSET, 6, ENFORCE_ENCODING_NONE); + ot_check( + th_add_reg(r, r, anchor_tmp, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_pop((uint16_t)(1u << anchor_tmp))); + } + else if (sym->type.t & VT_STATIC && sym_off != SHN_UNDEF && sym_off != cur_text_section->sh_num && + !sym_in_code_section_cg) { /* Static data symbol — GOTOFF (add R9) */ - ot_check(th_add_reg(r, r, R9, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_reg(r, r, R9, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } else { thumb_opcode ot; - ot_check(th_add_reg(r, r, R9, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_reg(r, r, R9, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(r, r, 0, 6, ENFORCE_ENCODING_NONE)); - ot = th_add_imm(r, r, imm, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(r, r, 0, 6, ENFORCE_ENCODING_NONE); + ot = th_add_imm(r, r, imm, flags_safe(), ENFORCE_ENCODING_NONE); if (ot.size != 0) { ot_check(ot); @@ -3020,7 +4382,7 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) entry2->data_size = 4; entry2->short_instruction = (ldr.size == 2); ot_check( - th_add_reg(r, r, scratch, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + th_add_reg(r, r, scratch, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&scratch_alloc); } } @@ -3029,15 +4391,15 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) { if (sym->type.t & VT_STATIC) { - ot_check(th_add_reg(r, r, R_PC, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - ot_check(th_sub_imm(r, r, 8, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_reg(r, r, R_PC, flags_safe(), 
THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(r, r, 8, flags_safe(), ENFORCE_ENCODING_NONE)); } else { thumb_opcode ot; - ot_check(th_add_reg(r, r, R_PC, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(r, r, 4, 6, ENFORCE_ENCODING_NONE)); - ot = th_add_imm(r, r, imm, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + ot_check(th_add_reg(r, r, R_PC, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(r, r, 4, 6, ENFORCE_ENCODING_NONE); + ot = th_add_imm(r, r, imm, flags_safe(), ENFORCE_ENCODING_NONE); if (ot.size != 0) { ot_check(ot); @@ -3060,7 +4422,7 @@ static void load_full_const(int r, int r1, uint32_t imm_lo, uint32_t imm_hi) entry2->data_size = 4; entry2->short_instruction = (ldr.size == 2); ot_check( - th_add_reg(r, r, scratch, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + th_add_reg(r, r, scratch, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&scratch_alloc); } } @@ -3095,8 +4457,8 @@ ST_FUNC void tcc_machine_addr_of_stack_slot(int dest_reg, int frame_offset, int { if (dest_reg != base_reg) { - ot_check(th_mov_reg(dest_reg, base_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, - false)); + ot_check_mov_reg(dest_reg, base_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, + false); } return; } @@ -3115,8 +4477,8 @@ ST_FUNC void tcc_machine_addr_of_stack_slot(int dest_reg, int frame_offset, int { if (cached_reg != dest_reg) { - ot_check(th_mov_reg(dest_reg, cached_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(dest_reg, cached_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } return; } @@ -3125,8 +4487,8 @@ ST_FUNC void tcc_machine_addr_of_stack_slot(int dest_reg, int frame_offset, int const int neg = (frame_offset < 0); int abs_off = neg ? 
-frame_offset : frame_offset; - thumb_opcode op = neg ? th_sub_imm(dest_reg, base_reg, abs_off, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE) - : th_add_imm(dest_reg, base_reg, abs_off, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + thumb_opcode op = neg ? th_sub_imm(dest_reg, base_reg, abs_off, flags_safe(), ENFORCE_ENCODING_NONE) + : th_add_imm(dest_reg, base_reg, abs_off, flags_safe(), ENFORCE_ENCODING_NONE); if (op.size != 0) { @@ -3150,7 +4512,7 @@ ST_FUNC void tcc_machine_addr_of_stack_slot(int dest_reg, int frame_offset, int } load_full_const(offset_reg, PREG_NONE, LFC_SPLIT(frame_offset)); - ot_check(th_add_reg(dest_reg, base_reg, offset_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + ot_check(th_add_reg(dest_reg, base_reg, offset_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); if (dest_reg == base_reg) @@ -3183,13 +4545,13 @@ ST_FUNC void tcc_machine_load_constant(int dest_reg, int dest_reg_high, int64_t return; } /* Invalid or missing sym - fall through to treat as plain constant */ - { - const char *name = get_tok_str(sym->v & ~SYM_FIELD, NULL); - fprintf(stderr, "[TCC-DIAG] tcc_machine_load_constant: sym '%s' failed validation, loading plain value=%lld\n", - name ? name : "?", (long long)value); - } } + if (!sym && !is_64bit && dest_reg >= 0 && dest_reg < 16 && + imm_cache[dest_reg].valid && imm_cache[dest_reg].sym == NULL && + imm_cache[dest_reg].value == value) + return; + if (is_64bit) { const uint32_t lo = (uint32_t)(value & 0xFFFFFFFF); @@ -3215,6 +4577,13 @@ ST_FUNC void tcc_machine_load_constant(int dest_reg, int dest_reg_high, int64_t /* 32-bit constant */ if (!ot(th_generic_mov_imm(dest_reg, (uint32_t)value))) load_full_const(dest_reg, PREG_NONE, LFC_SPLIT(value)); + + if (!sym && !is_64bit && dest_reg >= 0 && dest_reg < 16) + { + imm_cache[dest_reg].value = value; + imm_cache[dest_reg].sym = NULL; + imm_cache[dest_reg].valid = 1; + } } /* Load comparison result (0 or 1) based on condition flags. 
@@ -3288,8 +4657,28 @@ static void load_from_base(int r, int r1, int irop_btype, int is_unsigned, int f uint32_t exclude = (1u << r) | (1u << ir_high); base_alloc = get_scratch_reg_with_save(exclude); base_reg = (uint32_t)base_alloc.reg; - ot_check(th_mov_reg((int)base_reg, (int)base, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((int)base_reg, (int)base, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); + } + + /* Try LDRD Rt, Rt2, [Rn, #±imm] when both halves share one base. + * T1 encoding requires: Rt != Rt2, Rt/Rt2 not SP/PC, offset 4-byte + * aligned and |offset| <= 1020. LDRD also requires the target address + * to be 4-byte aligned on ARMv7-M/v8-M (faults otherwise, regardless of + * UNALIGN_TRP). Restrict to SP/FP-relative bases where TCC's stack + * allocator guarantees 4-byte alignment of 64-bit slots; arbitrary + * pointers (e.g. into a packed struct) may be unaligned. */ + const int base_is_stack = (base_reg == (uint32_t)R_SP || base_reg == (uint32_t)R_FP); + if (base_is_stack && (fc & 3) == 0 && fc <= 1020 && r >= 0 && r <= R_LR && r != R_SP && ir_high >= 0 && + ir_high <= R_LR && ir_high != R_SP && r != ir_high) + { + uint32_t puw = sign ? 4 : 6; + ot_check(th_ldrd_imm((uint32_t)r, (uint32_t)ir_high, base_reg, fc, puw)); + if (base_alloc.saved) + restore_scratch_reg(&base_alloc); + if (ir_high_alloc.saved) + restore_scratch_reg(&ir_high_alloc); + return; } /* Load low word */ @@ -3383,6 +4772,44 @@ typedef struct ThumbDataProcessingHandler thumb_reg_handler_t reg_handler; } ThumbDataProcessingHandler; +/* Dispatch a reg_handler call through a direct call instead of an indirect + * (function pointer) call. This works around a code-generation bug where + * struct-by-value arguments (thumb_shift) get corrupted when passed through + * indirect calls that also use sret return (thumb_opcode is 8 bytes). 
+ * By comparing the function pointer and branching to a direct call, the + * cross-compiler generates correct struct passing code. */ +static thumb_opcode thumb_call_reg_handler(thumb_reg_handler_t fn, uint32_t rd, uint32_t rn, uint32_t rm, + thumb_flags_behaviour flags, thumb_shift shift, + thumb_enforce_encoding encoding) +{ + if (fn == th_add_reg) + return th_add_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_sub_reg) + return th_sub_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_adc_reg) + return th_adc_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_sbc_reg) + return th_sbc_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_cmp_reg) + return th_cmp_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_lsl_reg) + return th_lsl_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_lsr_reg) + return th_lsr_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_asr_reg) + return th_asr_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_orr_reg) + return th_orr_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_and_reg) + return th_and_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_eor_reg) + return th_eor_reg(rd, rn, rm, flags, shift, encoding); + if (fn == th_bic_reg) + return th_bic_reg(rd, rn, rm, flags, shift, encoding); + /* Unreachable for known handlers — fallback to direct call. 
*/ + return fn(rd, rn, rm, flags, shift, encoding); +} + static void thumb_require_materialized_reg(const char *ctx, const char *operand, int reg) { const bool reg_is_hw = (reg >= 0) && (reg <= 15); @@ -3412,7 +4839,7 @@ static bool thumb_is_hw_reg(int reg) static void thumb_emit_op_imm_fallback(int rd, int rn, uint32_t imm, thumb_flags_behaviour flags, ThumbDataProcessingHandler handler) { - thumb_opcode sub_low = handler.imm_handler(rd, rn, imm, flags, ENFORCE_ENCODING_NONE); + thumb_opcode sub_low = thumb_call_imm_handler(handler.imm_handler, rd, rn, imm, flags, ENFORCE_ENCODING_NONE); if (sub_low.size == 0) { uint32_t exclude = 0; @@ -3422,7 +4849,8 @@ static void thumb_emit_op_imm_fallback(int rd, int rn, uint32_t imm, thumb_flags exclude |= (1u << rn); ScratchRegAlloc scratch = get_scratch_reg_with_save(exclude); tcc_machine_load_constant(scratch.reg, PREG_NONE, (int32_t)imm, 0, NULL); - ot_check(handler.reg_handler(rd, rn, scratch.reg, flags, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(thumb_call_reg_handler(handler.reg_handler, rd, rn, scratch.reg, flags, THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE)); restore_scratch_reg(&scratch); } else @@ -3435,7 +4863,7 @@ typedef thumb_opcode (*thumb_regonly3_handler_t)(uint32_t rd, uint32_t rn, uint3 static thumb_opcode thumb_mul_regonly(uint32_t rd, uint32_t rn, uint32_t rm) { - return th_mul(rd, rn, rm, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + return th_mul(rd, rn, rm, flags_safe(), ENFORCE_ENCODING_NONE); } static thumb_opcode thumb_sdiv_regonly(uint32_t rd, uint32_t rn, uint32_t rm) @@ -3657,19 +5085,58 @@ static void thumb_emit_data_processing_mop64(const MachineOperand *src1, const M else { rn_hi = mach_alloc_scratch(&mctx, excl); - ot_check(th_mov_imm((uint32_t)rn_hi, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm((uint32_t)rn_hi, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } if (thumb_is_hw_reg(rn_hi)) excl |= (1u << (uint32_t)rn_hi); /* 3. 
Load src2 and emit the 64-bit operation. */ - const thumb_flags_behaviour lo_flags = uses_carry ? FLAGS_BEHAVIOUR_SET : FLAGS_BEHAVIOUR_NOT_IMPORTANT; + const thumb_flags_behaviour lo_flags = uses_carry ? FLAGS_BEHAVIOUR_SET : flags_safe(); + /* For CMP, the high-word SBCS must set flags (the following SETIF reads them). */ + const thumb_flags_behaviour hi_flags = (op == TCCIR_OP_CMP) ? FLAGS_BEHAVIOUR_SET : flags_safe(); if (src2->kind == MACH_OP_IMM) { const uint32_t imm_lo = (uint32_t)((uint64_t)src2->u.imm.val & 0xffffffffu); const uint32_t imm_hi = (uint32_t)((uint64_t)src2->u.imm.val >> 32); - thumb_emit_op_imm_fallback(rd_lo, rn_lo, imm_lo, lo_flags, regular); - thumb_emit_op_imm_fallback(rd_hi, rn_hi, imm_hi, FLAGS_BEHAVIOUR_NOT_IMPORTANT, carry_h); + /* Per-half peephole: when the immediate half makes the op a constant + * answer (OR/XOR with 0 → copy src; AND with 0 → load 0; AND with -1 → + * copy src), skip the data-processing op. Cuts dead `orr r, r, #0` and + * `and r, r, #0` halves left behind by 64-bit ops on 32-bit values. */ + const bool is_or = (op == TCCIR_OP_OR); + const bool is_xor = (op == TCCIR_OP_XOR); + const bool is_and = (op == TCCIR_OP_AND); + const bool can_simplify_lo = lo_flags == flags_safe(); + const bool can_simplify_hi = hi_flags == flags_safe(); + for (int half = 0; half < 2; half++) + { + const uint32_t imm = (half == 0) ? imm_lo : imm_hi; + const int rd = (half == 0) ? rd_lo : rd_hi; + const int rn = (half == 0) ? rn_lo : rn_hi; + const thumb_flags_behaviour fb = (half == 0) ? lo_flags : hi_flags; + const bool can_simplify = (half == 0) ? can_simplify_lo : can_simplify_hi; + const ThumbDataProcessingHandler *h = (half == 0) ? 
&regular : &carry_h; + + if (can_simplify && (is_or || is_xor) && imm == 0) + { + if (rd != rn) + ot_check_mov_reg((uint32_t)rd, (uint32_t)rn, flags_safe(), + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + } + else if (can_simplify && is_and && imm == 0) + { + ot_check(th_mov_imm((uint32_t)rd, 0, flags_safe(), ENFORCE_ENCODING_NONE)); + } + else if (can_simplify && is_and && imm == 0xFFFFFFFFu) + { + if (rd != rn) + ot_check_mov_reg((uint32_t)rd, (uint32_t)rn, flags_safe(), + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + } + else + { + thumb_emit_op_imm_fallback(rd, rn, imm, fb, *h); + } + } } else { @@ -3686,12 +5153,14 @@ static void thumb_emit_data_processing_mop64(const MachineOperand *src1, const M else { rm_hi = mach_alloc_scratch(&mctx, excl); - ot_check(th_mov_imm((uint32_t)rm_hi, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm((uint32_t)rm_hi, 0, flags_safe(), ENFORCE_ENCODING_NONE)); + } + { + ot_check(thumb_call_reg_handler(regular.reg_handler, (uint32_t)rd_lo, (uint32_t)rn_lo, (uint32_t)rm_lo, lo_flags, + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(thumb_call_reg_handler(carry_h.reg_handler, (uint32_t)rd_hi, (uint32_t)rn_hi, (uint32_t)rm_hi, + hi_flags, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } - ot_check(regular.reg_handler((uint32_t)rd_lo, (uint32_t)rn_lo, (uint32_t)rm_lo, lo_flags, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE)); - ot_check(carry_h.reg_handler((uint32_t)rd_hi, (uint32_t)rn_hi, (uint32_t)rm_hi, FLAGS_BEHAVIOUR_NOT_IMPORTANT, - THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } /* 4. Write results back to spill/param slots if dest was not pre-allocated. */ @@ -3719,7 +5188,7 @@ static void thumb_emit_data_processing_mop64(const MachineOperand *src1, const M * extracted from MachineOperand rather than IROperand fields.
*/ static void thumb_emit_shift64_mop(const MachineOperand *src1, const MachineOperand *src2, const MachineOperand *dest, - TccIrOp op) + TccIrOp op, bool skip_lo, bool skip_hi) { if (src2->kind != MACH_OP_IMM) { @@ -3794,8 +5263,18 @@ static void thumb_emit_shift64_mop(const MachineOperand *src1, const MachineOper if (thumb_is_hw_reg(src_lo)) excl |= (1u << (uint32_t)src_lo); + /* Skip src1 high-half materialization when the shift will not read it. + * SHL with sh >= 32 only uses src_lo (everything shifts up out of view). + * SHR/SAR with sh >= 64 produces a 0/sign-fill that the emit tail + * generates directly without referencing src_hi. */ + int hi_needed = 1; + if (is_left && sh >= 32) + hi_needed = 0; + else if (!is_left && sh >= 64) + hi_needed = 0; + /* Load src1 high half or compute by extension. */ - int src_hi; + int src_hi = (int)PREG_REG_NONE; if (src1->is_64bit) { MachineOperand s1_hi = mach_make_hi_half(src1); @@ -3803,24 +5282,24 @@ static void thumb_emit_shift64_mop(const MachineOperand *src1, const MachineOper if (thumb_is_hw_reg(src_hi)) excl |= (1u << (uint32_t)src_hi); } - else + else if (hi_needed) { src_hi = mach_alloc_scratch(&mctx, excl); excl |= (1u << (uint32_t)src_hi); if (arith_right) ot_check( - th_asr_imm((uint32_t)src_hi, (uint32_t)src_lo, 31, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + th_asr_imm((uint32_t)src_hi, (uint32_t)src_lo, 31, flags_safe(), ENFORCE_ENCODING_NONE)); else - ot_check(th_mov_imm((uint32_t)src_hi, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm((uint32_t)src_hi, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } /* Emit the shift — logic identical to thumb_emit_shift64_imm core. 
*/ if (sh == 0) { - ot_check(th_mov_reg((uint32_t)dst_lo, (uint32_t)src_lo, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); - ot_check(th_mov_reg((uint32_t)dst_hi, (uint32_t)src_hi, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dst_lo, (uint32_t)src_lo, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); + ot_check_mov_reg((uint32_t)dst_hi, (uint32_t)src_hi, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } else if (sh < 32) { @@ -3828,25 +5307,55 @@ static void thumb_emit_shift64_mop(const MachineOperand *src1, const MachineOper ScratchRegAlloc tmp = get_scratch_reg_with_save(thumb_exclude_mask_for_regs(4, regs) | excl); if (is_left) { - ot_check( - dst_lo_shift((uint32_t)dst_lo, (uint32_t)src_lo, sh, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(cross_shift((uint32_t)tmp.reg, (uint32_t)src_lo, 32 - sh, FLAGS_BEHAVIOUR_NOT_IMPORTANT, + /* Compute the cross-shift into tmp BEFORE any destination is written, + * because dst_lo/dst_hi may alias src_lo/src_hi. */ + ot_check(thumb_call_imm_handler(cross_shift, (uint32_t)tmp.reg, (uint32_t)src_lo, 32 - sh, flags_safe(), ENFORCE_ENCODING_NONE)); - ot_check( - dst_hi_shift((uint32_t)dst_hi, (uint32_t)src_hi, sh, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(th_orr_reg((uint32_t)dst_hi, (uint32_t)dst_hi, (uint32_t)tmp.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, + if (dst_hi == src_lo) + { + /* dst_hi aliases src_lo — compute dst_lo first (needs src_lo). */ + if (!skip_lo) + ot_check( + thumb_call_imm_handler(dst_lo_shift, (uint32_t)dst_lo, (uint32_t)src_lo, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + ot_check( + thumb_call_imm_handler(dst_hi_shift, (uint32_t)dst_hi, (uint32_t)src_hi, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + } + else + { + /* Default order: dst_hi first to avoid clobbering src_hi via dst_lo. 
*/ + ot_check( + thumb_call_imm_handler(dst_hi_shift, (uint32_t)dst_hi, (uint32_t)src_hi, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + if (!skip_lo) + ot_check( + thumb_call_imm_handler(dst_lo_shift, (uint32_t)dst_lo, (uint32_t)src_lo, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + } + ot_check(th_orr_reg((uint32_t)dst_hi, (uint32_t)dst_hi, (uint32_t)tmp.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } else { - ot_check(cross_shift((uint32_t)tmp.reg, (uint32_t)src_hi, 32 - sh, FLAGS_BEHAVIOUR_NOT_IMPORTANT, + /* Compute the cross-shift into tmp BEFORE any destination is written, + * because dst_lo/dst_hi may alias src_lo/src_hi. */ + ot_check(thumb_call_imm_handler(cross_shift, (uint32_t)tmp.reg, (uint32_t)src_hi, 32 - sh, flags_safe(), ENFORCE_ENCODING_NONE)); - ot_check( - th_lsr_imm((uint32_t)dst_lo, (uint32_t)src_lo, sh, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(th_orr_reg((uint32_t)dst_lo, (uint32_t)dst_lo, (uint32_t)tmp.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, + if (dst_lo == src_hi) + { + /* dst_lo aliases src_hi — compute dst_hi first (needs src_hi). */ + ot_check( + thumb_call_imm_handler(dst_hi_shift, (uint32_t)dst_hi, (uint32_t)src_hi, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + ot_check( + th_lsr_imm((uint32_t)dst_lo, (uint32_t)src_lo, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + } + else + { + /* Default order: dst_lo first to avoid clobbering src_lo via dst_hi. 
*/ + ot_check( + th_lsr_imm((uint32_t)dst_lo, (uint32_t)src_lo, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + ot_check( + thumb_call_imm_handler(dst_hi_shift, (uint32_t)dst_hi, (uint32_t)src_hi, sh, flags_safe(), ENFORCE_ENCODING_NONE)); + } + ot_check(th_orr_reg((uint32_t)dst_lo, (uint32_t)dst_lo, (uint32_t)tmp.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - ot_check( - dst_hi_shift((uint32_t)dst_hi, (uint32_t)src_hi, sh, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); } restore_scratch_reg(&tmp); } @@ -3854,69 +5363,99 @@ static void thumb_emit_shift64_mop(const MachineOperand *src1, const MachineOper { if (is_left) { - ot_check(th_mov_imm((uint32_t)dst_lo, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(th_mov_reg((uint32_t)dst_hi, (uint32_t)src_lo, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + /* Emit MOV dst_hi first: dst_lo may alias src_lo. */ + ot_check_mov_reg((uint32_t)dst_hi, (uint32_t)src_lo, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); + if (!skip_lo) + ot_check(th_mov_imm((uint32_t)dst_lo, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } else { - ot_check(th_mov_reg((uint32_t)dst_lo, (uint32_t)src_hi, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); - if (arith_right) - ot_check( - th_asr_imm((uint32_t)dst_hi, (uint32_t)src_hi, 31, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - else - ot_check(th_mov_imm((uint32_t)dst_hi, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + /* Emit MOV dst_lo first: dst_hi may alias src_hi. 
*/ + ot_check_mov_reg((uint32_t)dst_lo, (uint32_t)src_hi, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); + if (!skip_hi) + { + if (arith_right) + ot_check( + th_asr_imm((uint32_t)dst_hi, (uint32_t)src_hi, 31, flags_safe(), ENFORCE_ENCODING_NONE)); + else + ot_check(th_mov_imm((uint32_t)dst_hi, 0, flags_safe(), ENFORCE_ENCODING_NONE)); + } } } else if (sh < 64) { if (is_left) { - ot_check(th_mov_imm((uint32_t)dst_lo, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(dst_hi_shift((uint32_t)dst_hi, (uint32_t)src_lo, sh - 32, FLAGS_BEHAVIOUR_NOT_IMPORTANT, + /* Emit shift into dst_hi first: dst_lo may alias src_lo. */ + ot_check(thumb_call_imm_handler(dst_hi_shift, (uint32_t)dst_hi, (uint32_t)src_lo, sh - 32, flags_safe(), ENFORCE_ENCODING_NONE)); + if (!skip_lo) + ot_check(th_mov_imm((uint32_t)dst_lo, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } else { - ot_check(dst_hi_shift((uint32_t)dst_lo, (uint32_t)src_hi, sh - 32, FLAGS_BEHAVIOUR_NOT_IMPORTANT, - ENFORCE_ENCODING_NONE)); - if (arith_right) - ot_check( - th_asr_imm((uint32_t)dst_hi, (uint32_t)src_hi, 31, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + if (arith_right && dst_lo == src_hi) + { + /* dst_lo aliases src_hi — compute dst_hi (sign extension) first + * while src_hi is still intact, then shift into dst_lo. 
*/ + if (!skip_hi) + ot_check( + th_asr_imm((uint32_t)dst_hi, (uint32_t)src_hi, 31, flags_safe(), ENFORCE_ENCODING_NONE)); + ot_check(thumb_call_imm_handler(dst_hi_shift, (uint32_t)dst_lo, (uint32_t)src_hi, sh - 32, flags_safe(), + ENFORCE_ENCODING_NONE)); + } else - ot_check(th_mov_imm((uint32_t)dst_hi, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + { + ot_check(thumb_call_imm_handler(dst_hi_shift, (uint32_t)dst_lo, (uint32_t)src_hi, sh - 32, flags_safe(), + ENFORCE_ENCODING_NONE)); + if (!skip_hi) + { + if (arith_right) + ot_check( + th_asr_imm((uint32_t)dst_hi, (uint32_t)src_hi, 31, flags_safe(), ENFORCE_ENCODING_NONE)); + else + ot_check(th_mov_imm((uint32_t)dst_hi, 0, flags_safe(), ENFORCE_ENCODING_NONE)); + } + } } } else /* sh >= 64 */ { if (is_left) { - ot_check(th_mov_imm((uint32_t)dst_lo, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(th_mov_imm((uint32_t)dst_hi, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + if (!skip_lo) + ot_check(th_mov_imm((uint32_t)dst_lo, 0, flags_safe(), ENFORCE_ENCODING_NONE)); + if (!skip_hi) + ot_check(th_mov_imm((uint32_t)dst_hi, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } else if (arith_right) { + /* Both halves are the sign of src_hi; dst_lo copies dst_hi, so leave + * this degenerate path intact rather than risk the inter-half dep. 
*/ ot_check( - th_asr_imm((uint32_t)dst_hi, (uint32_t)src_hi, 31, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(th_mov_reg((uint32_t)dst_lo, (uint32_t)dst_hi, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + th_asr_imm((uint32_t)dst_hi, (uint32_t)src_hi, 31, flags_safe(), ENFORCE_ENCODING_NONE)); + ot_check_mov_reg((uint32_t)dst_lo, (uint32_t)dst_hi, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } else { - ot_check(th_mov_imm((uint32_t)dst_lo, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); - ot_check(th_mov_imm((uint32_t)dst_hi, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + if (!skip_lo) + ot_check(th_mov_imm((uint32_t)dst_lo, 0, flags_safe(), ENFORCE_ENCODING_NONE)); + if (!skip_hi) + ot_check(th_mov_imm((uint32_t)dst_hi, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } } - /* Write back. */ - if (store_lo) + /* Write back. A dead half was never materialized, so skip its store. */ + if (store_lo && !skip_lo) { MachineOperand dst_lo_op = mach_make_lo_half(dest); dst_lo_op.btype = IROP_BTYPE_INT32; mach_writeback_dest(&dst_lo_op, dst_lo); } - if (store_hi) + if (store_hi && !skip_hi) { MachineOperand dst_hi_op = mach_make_hi_half(dest); dst_hi_op.btype = IROP_BTYPE_INT32; @@ -3935,11 +5474,103 @@ static void thumb_emit_shift64_mop(const MachineOperand *src1, const MachineOper */ static void thumb_emit_data_processing_mop32(const MachineOperand *src1, const MachineOperand *src2, const MachineOperand *dest, TccIrOp op, ThumbDataProcessingHandler handler, - thumb_flags_behaviour flags) + thumb_flags_behaviour flags, uint32_t barrel_shift) { const bool dest_sets_flags = (op == TCCIR_OP_CMP); MachineCodegenContext mctx = {0}; + /* RSB fast path: SUB with immediate src1 → RSB Rd, src2, #imm. + * Avoids materializing the immediate into a register. 
+ * Only attempt when the immediate is encodable as a Thumb-2 modified + * constant (th_pack_const returns non-zero, or imm==0). */ + if (op == TCCIR_OP_SUB && !dest_sets_flags && barrel_shift == 0 && + src1->kind == MACH_OP_IMM && !src1->needs_deref && !src1->is_64bit) + { + uint32_t imm = (uint32_t)src1->u.imm.val; + if (imm == 0 || th_pack_const(imm) != 0) + { + int dest_reg = mach_get_dest_reg(&mctx, dest, 0); + uint32_t excl = thumb_is_hw_reg(dest_reg) ? (1u << (uint32_t)dest_reg) : 0; + int src2_reg = mach_ensure_in_reg(&mctx, src2, excl); + ot_check(th_rsb_imm((uint32_t)dest_reg, (uint32_t)src2_reg, imm, flags, ENFORCE_ENCODING_NONE)); + if (dest->kind != MACH_OP_NONE) + { + const bool needs_wb = dest->kind == MACH_OP_SPILL || dest->kind == MACH_OP_PARAM_STACK || + (dest->kind == MACH_OP_REG && (dest->needs_deref || dest->u.reg.r0 == (int)PREG_REG_NONE)); + if (needs_wb) + mach_writeback_dest(dest, dest_reg); + } + mach_release_all(&mctx); + return; + } + } + + /* UXTB/UXTH fast path: AND with #0xFF or #0xFFFF → UXTB/UXTH. + * 16-bit encoding (2 bytes) vs 32-bit AND immediate (4 bytes). */ + if (op == TCCIR_OP_AND && !dest_sets_flags && barrel_shift == 0 && + src2->kind == MACH_OP_IMM && !src2->needs_deref && !src2->is_64bit && + flags != FLAGS_BEHAVIOUR_SET) + { + uint32_t mask = (uint32_t)src2->u.imm.val; + if (mask == 0xFF || mask == 0xFFFF) + { + int dest_reg = mach_get_dest_reg(&mctx, dest, 0); + uint32_t excl = thumb_is_hw_reg(dest_reg) ? 
(1u << (uint32_t)dest_reg) : 0; + int src1_reg = mach_ensure_in_reg(&mctx, src1, excl); + if (mask == 0xFF) + ot_check(th_uxtb((uint32_t)dest_reg, (uint32_t)src1_reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + else + ot_check(th_uxth((uint32_t)dest_reg, (uint32_t)src1_reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + if (dest->kind != MACH_OP_NONE) + { + const bool needs_wb = dest->kind == MACH_OP_SPILL || dest->kind == MACH_OP_PARAM_STACK || + (dest->kind == MACH_OP_REG && (dest->needs_deref || dest->u.reg.r0 == (int)PREG_REG_NONE)); + if (needs_wb) + mach_writeback_dest(dest, dest_reg); + } + mach_release_all(&mctx); + return; + } + } + + /* UBFX fast path: AND with a low-contiguous mask #((1<kind == MACH_OP_IMM && !src2->needs_deref && !src2->is_64bit && + flags != FLAGS_BEHAVIOUR_SET) + { + uint32_t mask = (uint32_t)src2->u.imm.val; + if (mask != 0 && mask != 0xFFFFFFFFu && (mask & (mask + 1)) == 0 && th_pack_const(mask) == 0) + { + int width = 0; + while ((mask >> width) & 1u) + width++; + int dest_reg = mach_get_dest_reg(&mctx, dest, 0); + uint32_t excl = thumb_is_hw_reg(dest_reg) ? (1u << (uint32_t)dest_reg) : 0; + int src1_reg = mach_ensure_in_reg(&mctx, src1, excl); + int widthm1 = width - 1; + thumb_opcode ubfx_op; + ubfx_op.size = 4; + ubfx_op.opcode = + 0xF3C00000 | ((uint32_t)src1_reg << 16) | ((uint32_t)dest_reg << 8) | (uint32_t)widthm1; + ot(ubfx_op); + if (dest->kind != MACH_OP_NONE) + { + const bool needs_wb = dest->kind == MACH_OP_SPILL || dest->kind == MACH_OP_PARAM_STACK || + (dest->kind == MACH_OP_REG && (dest->needs_deref || dest->u.reg.r0 == (int)PREG_REG_NONE)); + if (needs_wb) + mach_writeback_dest(dest, dest_reg); + } + mach_release_all(&mctx); + return; + } + } + /* 1. Determine dest register (allocate scratch for spills/param/no-reg). * CMP and other flag-setting ops don't write a result register, so we * use R0 as a dummy (Rd field is architecturally ignored). 
*/ @@ -3951,6 +5582,14 @@ static void thumb_emit_data_processing_mop32(const MachineOperand *src1, const M uint32_t excl = thumb_is_hw_reg(dest_reg) ? (1u << (uint32_t)dest_reg) : 0; + /* Exclude src2's register from scratch allocation for src1. + * Without this, materializing an immediate for src1 could pick src2's + * register, clobbering it before src2 is read. This applies whether + * src2 is a plain register or a dereferenced one (the address register + * must survive until the load). */ + if (src2->kind == MACH_OP_REG && thumb_is_hw_reg(src2->u.reg.r0)) + excl |= (1u << (uint32_t)src2->u.reg.r0); + /* 2. Ensure src1 is in a register; add it to the exclusion mask. */ int src1_reg = mach_ensure_in_reg(&mctx, src1, excl); if (thumb_is_hw_reg(src1_reg)) @@ -3962,9 +5601,23 @@ static void thumb_emit_data_processing_mop32(const MachineOperand *src1, const M mach_ensure_imm_or_reg(&mctx, src2, excl, handler.imm_handler, dest_reg, src1_reg, flags, &imm_emitted); if (!imm_emitted) { - /* Immediate form didn't fit (or src2 isn't an immediate): emit reg form. */ - ot_check(handler.reg_handler((uint32_t)dest_reg, (uint32_t)src1_reg, (uint32_t)src2_reg, flags, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE)); + /* Decode barrel shift annotation (0=none, else type<<5|amount). */ + thumb_shift sh = THUMB_SHIFT_DEFAULT; + if (barrel_shift != 0) + { + static const thumb_shift_type bs_map[] = { + [1] = THUMB_SHIFT_LSL, [2] = THUMB_SHIFT_LSR, + [3] = THUMB_SHIFT_ASR, [4] = THUMB_SHIFT_ROR, + }; + uint32_t stype = (barrel_shift >> 5) & 7; + uint32_t samt = barrel_shift & 31; + sh.type = bs_map[stype]; + sh.value = samt; + sh.mode = THUMB_SHIFT_IMMEDIATE; + } + thumb_enforce_encoding enc = (barrel_shift != 0) ? ENFORCE_ENCODING_32BIT : ENFORCE_ENCODING_NONE; + ot_check(thumb_call_reg_handler(handler.reg_handler, (uint32_t)dest_reg, (uint32_t)src1_reg, (uint32_t)src2_reg, + flags, sh, enc)); } /* 4. Write result back to spill slot / stack param / pointer-dest. 
*/ @@ -3986,12 +5639,29 @@ static void thumb_emit_data_processing_mop32(const MachineOperand *src1, const M * Dispatches to thumb_emit_data_processing_mop64 / thumb_emit_shift64_mop for * 64-bit pair destinations, or thumb_emit_data_processing_mop32 for 32-bit. */ -void tcc_gen_machine_data_processing_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest, TccIrOp op) +static void data_processing_mop_impl(MachineOperand src1, MachineOperand src2, MachineOperand dest, TccIrOp op, + thumb_flags_behaviour flags_override, uint32_t barrel_shift); + +void tcc_gen_machine_data_processing_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest, TccIrOp op, + uint32_t barrel_shift) +{ + data_processing_mop_impl(src1, src2, dest, op, flags_safe(), barrel_shift); +} + +void tcc_gen_machine_data_processing_mop_flags(MachineOperand src1, MachineOperand src2, MachineOperand dest, TccIrOp op) +{ + data_processing_mop_impl(src1, src2, dest, op, FLAGS_BEHAVIOUR_SET, 0); +} + +static void data_processing_mop_impl(MachineOperand src1, MachineOperand src2, MachineOperand dest, TccIrOp op, + thumb_flags_behaviour flags_override, uint32_t barrel_shift) { ThumbDataProcessingHandler handler; ThumbDataProcessingHandler carry_handler; /* used for hi word of 64-bit ops */ bool uses_carry = false; - thumb_flags_behaviour flags = FLAGS_BEHAVIOUR_NOT_IMPORTANT; + /* CMP always sets flags — it has no non-flag-setting variant. + * Ignore FLAGS_BEHAVIOUR_BLOCK for CMP; it must always use SET. */ + thumb_flags_behaviour flags = (op == TCCIR_OP_CMP) ? 
FLAGS_BEHAVIOUR_SET : flags_override; switch (op) { @@ -4010,9 +5680,11 @@ void tcc_gen_machine_data_processing_mop(MachineOperand src1, MachineOperand src uses_carry = true; break; case TCCIR_OP_CMP: - handler.imm_handler = th_cmp_imm; + handler.imm_handler = th_cmp_imm_handler; handler.reg_handler = th_cmp_reg; - carry_handler = handler; + carry_handler.imm_handler = th_sbc_imm; + carry_handler.reg_handler = th_sbc_reg; + uses_carry = true; break; case TCCIR_OP_SHL: handler.imm_handler = th_lsl_imm; @@ -4029,6 +5701,11 @@ void tcc_gen_machine_data_processing_mop(MachineOperand src1, MachineOperand src handler.reg_handler = th_asr_reg; carry_handler = handler; break; + case TCCIR_OP_ROR: + handler.imm_handler = th_ror_imm; + handler.reg_handler = th_ror_reg; + carry_handler = handler; + break; case TCCIR_OP_OR: handler.imm_handler = th_orr_imm; handler.reg_handler = th_orr_reg; @@ -4057,17 +5734,87 @@ void tcc_gen_machine_data_processing_mop(MachineOperand src1, MachineOperand src return; } - /* Dispatch 64-bit pair destinations to the mop64 path. */ - if (dest.is_64bit) + /* Dispatch 64-bit pair destinations to the mop64 path. + * CMP has no dest (MACH_OP_NONE), so also check src1 for 64-bit. */ + if (dest.is_64bit || (op == TCCIR_OP_CMP && src1.is_64bit)) { if (op == TCCIR_OP_SHL || op == TCCIR_OP_SHR || op == TCCIR_OP_SAR) - thumb_emit_shift64_mop(&src1, &src2, &dest, op); + { + bool skip_lo = (barrel_shift >> 16) & 1; + bool skip_hi = (barrel_shift >> 17) & 1; + thumb_emit_shift64_mop(&src1, &src2, &dest, op, skip_lo, skip_hi); + } else thumb_emit_data_processing_mop64(&src1, &src2, &dest, op, handler, carry_handler, uses_carry); return; } - thumb_emit_data_processing_mop32(&src1, &src2, &dest, op, handler, flags); + thumb_emit_data_processing_mop32(&src1, &src2, &dest, op, handler, flags, barrel_shift & 0xFFFFu); +} + +/* tcc_gen_machine_ubfx_mop: emit UBFX Rd, Rn, #lsb, #width. + * src2 encodes lsb (bits 0-4) and width (bits 5-9). 
*/ +void tcc_gen_machine_ubfx_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest) +{ + MachineCodegenContext ctx = {0}; + int rd = mach_get_dest_reg(&ctx, &dest, 0); + uint32_t excl = (1u << (uint32_t)rd); + int rn = mach_ensure_in_reg(&ctx, &src1, excl); + int param = (src2.kind == MACH_OP_IMM) ? (int)src2.u.imm.val : 0; + int lsb = param & 0x1F; + int width = (param >> 5) & 0x1F; + if (width == 0) + width = 8; + int widthm1 = width - 1; + int imm3 = (lsb >> 2) & 0x7; + int imm2 = lsb & 0x3; + /* Thumb-2 UBFX encoding: 11110 0 11 1100 Rn | 0 imm3 Rd imm2 0 widthm1 */ + thumb_opcode op; + op.size = 4; + op.opcode = 0xF3C00000 | ((uint32_t)rn << 16) | ((uint32_t)imm3 << 12) | ((uint32_t)rd << 8) | ((uint32_t)imm2 << 6) | (uint32_t)widthm1; + ot(op); + mach_writeback_dest(&dest, rd); + mach_release_all(&ctx); +} + +/* tcc_gen_machine_bfi_mop: emit BFI Rd, Rn, #lsb, #width. + * src1 = host word (moved into Rd, the BFI base, if not already there), + * src2 = value supplying the field bits (only its low `width` bits are used), + * dest = result. params packs lsb (bits 0-7) and width (bits 8-15). */ +void tcc_gen_machine_bfi_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest, uint32_t params) +{ + MachineCodegenContext ctx = {0}; + int rd = mach_get_dest_reg(&ctx, &dest, 0); + int rn = mach_ensure_in_reg(&ctx, &src2, 0); /* value (Rn) */ + int rword = mach_ensure_in_reg(&ctx, &src1, (1u << (uint32_t)rn)); /* host word */ + /* Establish Rd = host word. If the value happens to live in Rd (RA coalesced + * the result onto src2), preserve it in a scratch before clobbering Rd. 
*/ + if (rd != rword) + { + if (rd == rn) + { + int tmp = mach_alloc_scratch(&ctx, (1u << (uint32_t)rd) | (1u << (uint32_t)rword)); + ot_check_mov_reg((uint32_t)tmp, (uint32_t)rd, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + rn = tmp; + } + ot_check_mov_reg((uint32_t)rd, (uint32_t)rword, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + } + int lsb = (int)(params & 0xFF); + int width = (int)((params >> 8) & 0xFF); + if (width < 1) + width = 1; + int msb = lsb + width - 1; + if (msb > 31) + msb = 31; + int imm3 = (lsb >> 2) & 0x7; + int imm2 = lsb & 0x3; + /* Thumb-2 BFI: 11110 0 11 0110 Rn | 0 imm3 Rd imm2 0 msb */ + thumb_opcode op; + op.size = 4; + op.opcode = 0xF3600000 | ((uint32_t)rn << 16) | ((uint32_t)imm3 << 12) | ((uint32_t)rd << 8) | ((uint32_t)imm2 << 6) | (uint32_t)msb; + ot(op); + mach_writeback_dest(&dest, rd); + mach_release_all(&ctx); } /* ============================================================ @@ -4088,6 +5835,11 @@ static void mach_regonly_binop_mop(MachineCodegenContext *ctx, const MachineOper int dest_reg = mach_get_dest_reg(ctx, dest, 0); uint32_t excl = thumb_is_hw_reg(dest_reg) ? (1u << (uint32_t)dest_reg) : 0; + /* Pre-exclude src2's physical register so that loading src1 (which may + * need a scratch for deref) does not clobber src2's value. */ + if (src2->kind == MACH_OP_REG && !src2->needs_deref && thumb_is_hw_reg(src2->u.reg.r0)) + excl |= (1u << (uint32_t)src2->u.reg.r0); + /* 2. Ensure src1 in a register; extend exclusion mask. */ int src1_reg = mach_ensure_in_reg(ctx, src1, excl); if (thumb_is_hw_reg(src1_reg)) @@ -4131,7 +5883,7 @@ static void mach_mod_mop(MachineCodegenContext *ctx, const MachineOperand *src1, ot_check(thumb_mul_regonly((uint32_t)quotient_reg, (uint32_t)quotient_reg, (uint32_t)src2_reg)); /* 7. 
dest = src1 - quotient */ - ot_check(th_sub_reg(dest_reg, src1_reg, quotient_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + ot_check(th_sub_reg(dest_reg, src1_reg, quotient_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); /* 8. Write result back. */ @@ -4242,105 +5994,634 @@ static void thumb_emit_mul64_mop(MachineCodegenContext *ctx, const MachineOperan * MLA (accumulator; 4-operand) uses tcc_gen_machine_mla_mop. * UMULL (64-bit output from 32-bit inputs) uses tcc_gen_machine_umull_mop. */ -ST_FUNC void tcc_gen_machine_muldiv_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest, TccIrOp op) +/* Decompose multiply-by-constant into shift+add sequences. + * Returns 1 if handled, 0 to fall back to hardware MUL. */ +static int thumb_try_mul_by_const_mop(MachineCodegenContext *ctx, MachineOperand *src1, MachineOperand *src2, + MachineOperand *dest) { - MachineCodegenContext ctx = {0}; - switch (op) + /* Identify which operand is the immediate and which is the variable. */ + const MachineOperand *imm_op, *var_op; + if (src2->kind == MACH_OP_IMM) { - case TCCIR_OP_MUL: - if (src1.is_64bit || src2.is_64bit || dest.is_64bit) - thumb_emit_mul64_mop(&ctx, &src1, &src2, &dest); - else - mach_regonly_binop_mop(&ctx, &src1, &src2, &dest, thumb_mul_regonly); - break; - case TCCIR_OP_DIV: - mach_regonly_binop_mop(&ctx, &src1, &src2, &dest, thumb_sdiv_regonly); - break; - case TCCIR_OP_UDIV: - mach_regonly_binop_mop(&ctx, &src1, &src2, &dest, thumb_udiv_regonly); - break; - case TCCIR_OP_IMOD: - mach_mod_mop(&ctx, &src1, &src2, &dest, thumb_sdiv_regonly); - break; - case TCCIR_OP_UMOD: - mach_mod_mop(&ctx, &src1, &src2, &dest, thumb_udiv_regonly); - break; - case TCCIR_OP_TEST_ZERO: + imm_op = src2; + var_op = src1; + } + else if (src1->kind == MACH_OP_IMM) { - if (src1.is_64bit) + imm_op = src1; + var_op = src2; + } + else + return 0; + + int64_t c = imm_op->u.imm.val; + if (c <= 0) + return 0; + + /* Determine the decomposition pattern. 
+ * We handle: powers of 2, (2^n ± 1), and products thereof. + * + * Pattern Insns Example + * ─────────────────── ───── ─────── + * 2^n 1 LSL Rd, Rn, #n + * 2^n + 1 1 ADD Rd, Rn, Rn LSL #n + * 2^n - 1 1 SUB Rd, Rn LSL #n, Rn (RSB-like via SUB) + * (2^a + 1) * 2^b 2 ADD Rd, Rn, Rn LSL #a; LSL Rd, Rd, #b + * (2^a - 1) * 2^b 2 SUB Rd, Rn LSL #a, Rn; LSL Rd, Rd, #b + * (2^a + 1)(2^b + 1) 2 ADD Rd, Rn, Rn LSL #a; ADD Rd, Rd, Rn LSL #(a+b) + * — only some cases, handled via table + */ + + int shift1 = 0, shift2 = 0; + enum + { + MUL_NONE, + MUL_POWER_OF_2, /* c = 2^n : LSL #n */ + MUL_TWO_N_PLUS_1, /* c = 2^n+1 : ADD Rd, Rn, Rn LSL #n */ + MUL_TWO_N_MINUS_1, /* c = 2^n-1 : SUB Rd, Rn LSL #n, Rn */ + MUL_TWO_N_PLUS_1_SHIFT, /* c = (2^a+1)*2^b : ADD; LSL */ + MUL_TWO_N_MINUS_1_SHIFT, /* c = (2^a-1)*2^b : SUB; LSL */ + } pattern = MUL_NONE; + + /* Check for power of 2 */ + if (c > 0 && (c & (c - 1)) == 0) + { + int n = 0; + int64_t v = c; + while (v > 1) { - /* 64-bit: Z set iff (lo == 0 && hi == 0). - * Use CMP lo,#0; IT EQ; CMPEQ hi,#0 to avoid clobbering source registers. */ - uint32_t excl = 0; - MachineOperand resolved = mach_resolve_deref_64(&ctx, &src1, &excl); - MachineOperand lo = mach_make_lo_half(&resolved); - lo.btype = IROP_BTYPE_INT32; - MachineOperand hi = mach_make_hi_half(&resolved); - hi.btype = IROP_BTYPE_INT32; - int r_lo = mach_ensure_in_reg(&ctx, &lo, excl); - if (thumb_is_hw_reg(r_lo)) - excl |= (1u << (uint32_t)r_lo); - int r_hi = mach_ensure_in_reg(&ctx, &hi, excl); - ot_check(th_cmp_imm(0, r_lo, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); - th_literal_pool_reserve_upcoming_bytes(6); - ot_check(th_it(mapcc(TOK_EQ), 0x8)); /* IT EQ (single instruction) */ - ot_check(th_cmp_imm(0, r_hi, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); + n++; + v >>= 1; } - else + if (n >= 1 && n <= 31) { - /* 32-bit: CMP src, #0 — no destination, only flags. 
*/ - int src_reg = mach_ensure_in_reg(&ctx, &src1, 0); - ot_check(th_cmp_imm(0, src_reg, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); + shift1 = n; + pattern = MUL_POWER_OF_2; } - break; } - default: - tcc_error("compiler_error: tcc_gen_machine_muldiv_mop: unhandled op %d", (int)op); - break; + + /* Check for 2^n + 1 (3, 5, 9, 17, ...) */ + if (pattern == MUL_NONE && c >= 3) + { + int64_t v = c - 1; + if (v > 0 && (v & (v - 1)) == 0) + { + int n = 0; + while (v > 1) + { + n++; + v >>= 1; + } + if (n >= 1 && n <= 31) + { + shift1 = n; + pattern = MUL_TWO_N_PLUS_1; + } + } } - mach_release_all(&ctx); -} -/* tcc_gen_machine_mla_mop: MachineOperand-based entry point for MLA. - * dest = src1 * src2 + accum (all operands are 32-bit) - * - * All four operands are loaded into hardware registers via mach_ensure_in_reg - * before emitting a single MLA instruction. No fallback path is needed - * because mach_ensure_in_reg always returns a valid register. - * - * Note: th_mla(rd, rn, rm, ra) → rd = rn * rm + ra - */ -ST_FUNC void tcc_gen_machine_mla_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest, - MachineOperand accum) -{ - MachineCodegenContext ctx = {0}; + /* Check for 2^n - 1 (7, 15, 31, ...) */ + if (pattern == MUL_NONE && c >= 7) + { + int64_t v = c + 1; + if (v > 0 && (v & (v - 1)) == 0) + { + int n = 0; + while (v > 1) + { + n++; + v >>= 1; + } + if (n >= 2 && n <= 31) + { + shift1 = n; + pattern = MUL_TWO_N_MINUS_1; + } + } + } - int src1_reg = mach_ensure_in_reg(&ctx, &src1, 0); - uint32_t excl = thumb_is_hw_reg(src1_reg) ? (1u << (uint32_t)src1_reg) : 0u; + /* Check for (2^a + 1) * 2^b (6, 10, 12, 20, 24, 40, 48, ...) 
*/ + if (pattern == MUL_NONE && c >= 6) + { + int64_t v = c; + int b = 0; + while ((v & 1) == 0) + { + b++; + v >>= 1; + } + if (b >= 1 && b <= 31) + { + int64_t inner = v - 1; + if (inner > 0 && (inner & (inner - 1)) == 0) + { + int a = 0; + while (inner > 1) + { + a++; + inner >>= 1; + } + if (a >= 1 && a <= 31) + { + shift1 = a; + shift2 = b; + pattern = MUL_TWO_N_PLUS_1_SHIFT; + } + } + } + } - int src2_reg = mach_ensure_in_reg(&ctx, &src2, excl); - if (thumb_is_hw_reg(src2_reg)) - excl |= (1u << (uint32_t)src2_reg); + /* Check for (2^a - 1) * 2^b (14, 28, 30, 56, 60, 62, ...) */ + if (pattern == MUL_NONE && c >= 14) + { + int64_t v = c; + int b = 0; + while ((v & 1) == 0) + { + b++; + v >>= 1; + } + if (b >= 1 && b <= 31) + { + int64_t inner = v + 1; + if (inner > 0 && (inner & (inner - 1)) == 0) + { + int a = 0; + while (inner > 1) + { + a++; + inner >>= 1; + } + if (a >= 2 && a <= 31) + { + shift1 = a; + shift2 = b; + pattern = MUL_TWO_N_MINUS_1_SHIFT; + } + } + } + } - int accum_reg = mach_ensure_in_reg(&ctx, &accum, excl); - if (thumb_is_hw_reg(accum_reg)) - excl |= (1u << (uint32_t)accum_reg); + if (pattern == MUL_NONE) + return 0; - int dest_reg = mach_get_dest_reg(&ctx, &dest, excl); + /* Emit the decomposed sequence. */ + int dest_reg = mach_get_dest_reg(ctx, dest, 0); + uint32_t excl = thumb_is_hw_reg(dest_reg) ? 
(1u << (uint32_t)dest_reg) : 0; + int var_reg = mach_ensure_in_reg(ctx, var_op, excl); + thumb_flags_behaviour fl = flags_safe(); + thumb_shift sh; - /* th_mla(rd, rn, rm, ra): rd = rn * rm + ra */ - ot_check(th_mla((uint32_t)dest_reg, (uint32_t)src1_reg, (uint32_t)src2_reg, (uint32_t)accum_reg)); + switch (pattern) + { + case MUL_POWER_OF_2: + ot_check(th_lsl_imm((uint32_t)dest_reg, (uint32_t)var_reg, (uint32_t)shift1, fl, ENFORCE_ENCODING_NONE)); + break; - mach_writeback_dest(&dest, dest_reg); - mach_release_all(&ctx); -} + case MUL_TWO_N_PLUS_1: + sh = (thumb_shift){THUMB_SHIFT_LSL, (uint16_t)shift1, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_add_reg((uint32_t)dest_reg, (uint32_t)var_reg, (uint32_t)var_reg, fl, sh, ENFORCE_ENCODING_NONE)); + break; -/* tcc_gen_machine_umull_mop: MachineOperand-based entry point for UMULL. - * {dest_hi:dest_lo} = (uint32_t)src1 * (uint32_t)src2 (64-bit unsigned result) - * - * src1 and src2 are 32-bit inputs (is_64bit is cleared before loading). - * dest must be a 64-bit pair; it is split via mach_make_lo/hi_half. - * Each half is allocated independently via mach_get_dest_reg, with the + case MUL_TWO_N_MINUS_1: + { + /* Thumb-2 SUB Rd, Rn, Rm LSL #n = Rn - (Rm << n). + * We need (var << n) - var, which is the reverse. No RSB with shift + * exists in Thumb-2, so we do: LSL tmp, var, #n; SUB Rd, tmp, var. + * The LSL destination must differ from var_reg, otherwise it destroys + * var before the SUB reads it (mach_ensure_in_reg returns an already- + * resident var in dest_reg's register, ignoring the exclusion mask, so + * dest_reg == var_reg is reachable). Shift straight into dest when they + * differ; otherwise borrow a scratch. */ + int tmp = (dest_reg == var_reg) ? 
mach_alloc_scratch(ctx, (1u << (uint32_t)var_reg)) : dest_reg; + ot_check(th_lsl_imm((uint32_t)tmp, (uint32_t)var_reg, (uint32_t)shift1, fl, ENFORCE_ENCODING_NONE)); + sh = (thumb_shift){THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_sub_reg((uint32_t)dest_reg, (uint32_t)tmp, (uint32_t)var_reg, fl, sh, ENFORCE_ENCODING_NONE)); + break; + } + + case MUL_TWO_N_PLUS_1_SHIFT: + sh = (thumb_shift){THUMB_SHIFT_LSL, (uint16_t)shift1, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_add_reg((uint32_t)dest_reg, (uint32_t)var_reg, (uint32_t)var_reg, fl, sh, ENFORCE_ENCODING_NONE)); + ot_check(th_lsl_imm((uint32_t)dest_reg, (uint32_t)dest_reg, (uint32_t)shift2, fl, ENFORCE_ENCODING_NONE)); + break; + + case MUL_TWO_N_MINUS_1_SHIFT: + { + /* (2^a - 1) * 2^b: LSL tmp, var, #a; SUB tmp, tmp, var; LSL Rd, tmp, #b. + * As in MUL_TWO_N_MINUS_1, the first LSL must not target var_reg, or it + * destroys var before the SUB reads it. */ + int tmp = (dest_reg == var_reg) ? mach_alloc_scratch(ctx, (1u << (uint32_t)var_reg)) : dest_reg; + ot_check(th_lsl_imm((uint32_t)tmp, (uint32_t)var_reg, (uint32_t)shift1, fl, ENFORCE_ENCODING_NONE)); + sh = (thumb_shift){THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_sub_reg((uint32_t)tmp, (uint32_t)tmp, (uint32_t)var_reg, fl, sh, ENFORCE_ENCODING_NONE)); + ot_check(th_lsl_imm((uint32_t)dest_reg, (uint32_t)tmp, (uint32_t)shift2, fl, ENFORCE_ENCODING_NONE)); + break; + } + + default: + return 0; + } + + mach_writeback_dest(dest, dest_reg); + mach_release_all(ctx); + return 1; +} + +/* Fused MUL-by-const + ADD peephole. + * Transforms: tmp = var * C; dest = base + tmp + * Into a shorter sequence using ARM shifted-add (ADD Rd, Rn, Rm LSL #imm): + * C = 2^n: ADD dest, base, var LSL #n (1 insn vs 2) + * C = (2^a+1)*2^b: ADD t, var, var LSL #a; + * ADD dest, base, t LSL #b (2 insn vs 3) + * C = (2^a-1)*2^b: LSL t, var, #a; SUB t, t, var; + * ADD dest, base, t LSL #b (3 insn vs 4) + * Returns 1 if fused, 0 to fall back to separate MUL + ADD. 
*/ +ST_FUNC int tcc_gen_machine_mul_const_add_fused_mop(MachineOperand mul_var, int64_t mul_const, + MachineOperand mul_dest, MachineOperand add_base, + MachineOperand add_dest) +{ + if (mul_const <= 0) + return 0; + + int shift1 = 0, shift2 = 0; + enum + { + FUSE_NONE, + FUSE_POW2, + FUSE_TWO_N_PLUS_1_SHIFT, + FUSE_TWO_N_MINUS_1_SHIFT, + } pattern = FUSE_NONE; + + /* Power of 2: C = 2^n */ + if (mul_const > 1 && (mul_const & (mul_const - 1)) == 0) + { + int n = 0; + int64_t v = mul_const; + while (v > 1) { n++; v >>= 1; } + if (n >= 1 && n <= 31) + { + shift1 = n; + pattern = FUSE_POW2; + } + } + + /* (2^a + 1) * 2^b: e.g. 12 = 3*4 = (2^1+1)*2^2 */ + if (pattern == FUSE_NONE && mul_const >= 6) + { + int64_t v = mul_const; + int b = 0; + while ((v & 1) == 0) { b++; v >>= 1; } + if (b >= 1 && b <= 31) + { + int64_t inner = v - 1; + if (inner > 0 && (inner & (inner - 1)) == 0) + { + int a = 0; + while (inner > 1) { a++; inner >>= 1; } + if (a >= 1 && a <= 31) + { + shift1 = a; + shift2 = b; + pattern = FUSE_TWO_N_PLUS_1_SHIFT; + } + } + } + } + + /* (2^a - 1) * 2^b: e.g. 28 = 7*4 = (2^3-1)*2^2 */ + if (pattern == FUSE_NONE && mul_const >= 14) + { + int64_t v = mul_const; + int b = 0; + while ((v & 1) == 0) { b++; v >>= 1; } + if (b >= 1 && b <= 31) + { + int64_t inner = v + 1; + if (inner > 0 && (inner & (inner - 1)) == 0) + { + int a = 0; + while (inner > 1) { a++; inner >>= 1; } + if (a >= 2 && a <= 31) + { + shift1 = a; + shift2 = b; + pattern = FUSE_TWO_N_MINUS_1_SHIFT; + } + } + } + } + + if (pattern == FUSE_NONE) + return 0; + + MachineCodegenContext ctx = {0}; + thumb_flags_behaviour fl = flags_safe(); + thumb_shift sh; + + /* Allocate registers: dest first (may hint to the ADD dest's phys reg), + * then base and var, using exclusion masks to prevent conflicts. */ + int dest_reg = mach_get_dest_reg(&ctx, &add_dest, 0); + uint32_t excl = thumb_is_hw_reg(dest_reg) ? 
(1u << (uint32_t)dest_reg) : 0; + int base_reg = mach_ensure_in_reg(&ctx, &add_base, excl); + if (thumb_is_hw_reg(base_reg)) + excl |= (1u << (uint32_t)base_reg); + int var_reg = mach_ensure_in_reg(&ctx, &mul_var, excl); + + switch (pattern) + { + case FUSE_POW2: + /* ADD dest, base, var LSL #n */ + sh = (thumb_shift){THUMB_SHIFT_LSL, (uint16_t)shift1, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_add_reg((uint32_t)dest_reg, (uint32_t)base_reg, (uint32_t)var_reg, fl, sh, ENFORCE_ENCODING_NONE)); + break; + + case FUSE_TWO_N_PLUS_1_SHIFT: + { + /* Step 1: ADD tmp, var, var LSL #a */ + if (thumb_is_hw_reg(var_reg)) + excl |= (1u << (uint32_t)var_reg); + int tmp_reg = mach_get_dest_reg(&ctx, &mul_dest, excl); + sh = (thumb_shift){THUMB_SHIFT_LSL, (uint16_t)shift1, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_add_reg((uint32_t)tmp_reg, (uint32_t)var_reg, (uint32_t)var_reg, fl, sh, ENFORCE_ENCODING_NONE)); + /* Step 2: ADD dest, base, tmp LSL #b */ + sh = (thumb_shift){THUMB_SHIFT_LSL, (uint16_t)shift2, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_add_reg((uint32_t)dest_reg, (uint32_t)base_reg, (uint32_t)tmp_reg, fl, sh, ENFORCE_ENCODING_NONE)); + break; + } + + case FUSE_TWO_N_MINUS_1_SHIFT: + { + /* Step 1: LSL tmp, var, #a */ + if (thumb_is_hw_reg(var_reg)) + excl |= (1u << (uint32_t)var_reg); + int tmp_reg = mach_get_dest_reg(&ctx, &mul_dest, excl); + ot_check(th_lsl_imm((uint32_t)tmp_reg, (uint32_t)var_reg, (uint32_t)shift1, fl, ENFORCE_ENCODING_NONE)); + /* Step 2: SUB tmp, tmp, var */ + sh = (thumb_shift){THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_sub_reg((uint32_t)tmp_reg, (uint32_t)tmp_reg, (uint32_t)var_reg, fl, sh, ENFORCE_ENCODING_NONE)); + /* Step 3: ADD dest, base, tmp LSL #b */ + sh = (thumb_shift){THUMB_SHIFT_LSL, (uint16_t)shift2, THUMB_SHIFT_IMMEDIATE}; + ot_check(th_add_reg((uint32_t)dest_reg, (uint32_t)base_reg, (uint32_t)tmp_reg, fl, sh, ENFORCE_ENCODING_NONE)); + break; + } + + default: + mach_release_all(&ctx); + return 0; + } + + 
mach_writeback_dest(&add_dest, dest_reg); + mach_release_all(&ctx); + return 1; +} + +ST_FUNC void tcc_gen_machine_muldiv_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest, TccIrOp op) +{ + MachineCodegenContext ctx = {0}; + switch (op) + { + case TCCIR_OP_MUL: + if (src1.is_64bit || src2.is_64bit || dest.is_64bit) + thumb_emit_mul64_mop(&ctx, &src1, &src2, &dest); + else if (!thumb_try_mul_by_const_mop(&ctx, &src1, &src2, &dest)) + mach_regonly_binop_mop(&ctx, &src1, &src2, &dest, thumb_mul_regonly); + break; + case TCCIR_OP_DIV: + mach_regonly_binop_mop(&ctx, &src1, &src2, &dest, thumb_sdiv_regonly); + break; + case TCCIR_OP_UDIV: + mach_regonly_binop_mop(&ctx, &src1, &src2, &dest, thumb_udiv_regonly); + break; + case TCCIR_OP_IMOD: + mach_mod_mop(&ctx, &src1, &src2, &dest, thumb_sdiv_regonly); + break; + case TCCIR_OP_UMOD: + mach_mod_mop(&ctx, &src1, &src2, &dest, thumb_udiv_regonly); + break; + case TCCIR_OP_TEST_ZERO: + { + if (src1.is_64bit) + { + /* 64-bit: Z set iff (lo == 0 && hi == 0). + * Use CMP lo,#0; IT EQ; CMPEQ hi,#0 to avoid clobbering source registers. */ + uint32_t excl = 0; + MachineOperand resolved = mach_resolve_deref_64(&ctx, &src1, &excl); + MachineOperand lo = mach_make_lo_half(&resolved); + lo.btype = IROP_BTYPE_INT32; + MachineOperand hi = mach_make_hi_half(&resolved); + hi.btype = IROP_BTYPE_INT32; + int r_lo = mach_ensure_in_reg(&ctx, &lo, excl); + if (thumb_is_hw_reg(r_lo)) + excl |= (1u << (uint32_t)r_lo); + int r_hi = mach_ensure_in_reg(&ctx, &hi, excl); + ot_check(th_cmp_imm(r_lo, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); + th_literal_pool_reserve_upcoming_bytes(6); + ot_check(th_it(mapcc(TOK_EQ), 0x8)); /* IT EQ (single instruction) */ + ot_check(th_cmp_imm(r_hi, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); + } + else + { + /* 32-bit: CMP src, #0 — no destination, only flags. 
*/ + int src_reg = mach_ensure_in_reg(&ctx, &src1, 0); + ot_check(th_cmp_imm(src_reg, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); + } + break; + } + default: + tcc_error("compiler_error: tcc_gen_machine_muldiv_mop: unhandled op %d", (int)op); + break; + } + mach_release_all(&ctx); +} + +/* tcc_gen_machine_cmp_eq64_mop: 64-bit equality comparison. + * Emits CMP hi1,hi2; IT EQ; CMPEQ lo1,lo2 which correctly sets + * the Z flag for full 64-bit equality (used by SETIF/JUMPIF EQ/NE). */ +ST_FUNC void tcc_gen_machine_cmp_eq64_mop(MachineOperand src1, MachineOperand src2) +{ + MachineCodegenContext ctx = {0}; + uint32_t excl = 0; + + if (src1.kind == MACH_OP_REG) + { + if (src1.u.reg.r0 != (int)PREG_REG_NONE) + excl |= (1u << (uint32_t)src1.u.reg.r0); + if (!src1.needs_deref && src1.is_64bit && src1.u.reg.r1 >= 0) + excl |= (1u << (uint32_t)src1.u.reg.r1); + } + if (src2.kind == MACH_OP_REG) + { + if (src2.u.reg.r0 != (int)PREG_REG_NONE) + excl |= (1u << (uint32_t)src2.u.reg.r0); + if (!src2.needs_deref && src2.is_64bit && src2.u.reg.r1 >= 0) + excl |= (1u << (uint32_t)src2.u.reg.r1); + } + + MachineOperand r_src1 = mach_resolve_deref_64(&ctx, &src1, &excl); + MachineOperand r_src2 = mach_resolve_deref_64(&ctx, &src2, &excl); + + MachineOperand s1_lo = mach_make_lo_half(&r_src1); + s1_lo.btype = IROP_BTYPE_INT32; + int rn_lo = mach_ensure_in_reg(&ctx, &s1_lo, excl); + if (thumb_is_hw_reg(rn_lo)) + excl |= (1u << (uint32_t)rn_lo); + + MachineOperand s1_hi = mach_make_hi_half(&r_src1); + s1_hi.btype = IROP_BTYPE_INT32; + int rn_hi = mach_ensure_in_reg(&ctx, &s1_hi, excl); + if (thumb_is_hw_reg(rn_hi)) + excl |= (1u << (uint32_t)rn_hi); + + /* Immediate-CMP fast path: if src2 is a u64 immediate, try the cmp-imm + * form (`cmp.w Rn, #imm`) for each half — avoids loading the constant + * into a scratch reg. 
Probe encodability before allocating scratches: + * `mach_ensure_in_reg` on a MACH_OP_IMM would unconditionally emit a + * `movs Rscratch, #imm`, which is exactly the instruction we're trying + * to avoid here. */ + thumb_opcode hi_imm_op = {0}; + thumb_opcode lo_imm_op = {0}; + if (r_src2.kind == MACH_OP_IMM) + { + const uint64_t imm = (uint64_t)r_src2.u.imm.val; + const uint32_t imm_lo = (uint32_t)(imm & 0xffffffffu); + const uint32_t imm_hi = (uint32_t)(imm >> 32); + hi_imm_op = th_cmp_imm((uint32_t)rn_hi, imm_hi, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE); + lo_imm_op = th_cmp_imm((uint32_t)rn_lo, imm_lo, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE); + } + + /* Use cmp-imm for whichever halves fit; only allocate scratch + * registers for halves that need them. */ + int hi_uses_imm = (r_src2.kind == MACH_OP_IMM && hi_imm_op.size); + int lo_uses_imm = (r_src2.kind == MACH_OP_IMM && lo_imm_op.size); + + int rm_lo = 0, rm_hi = 0; + if (!lo_uses_imm) + { + MachineOperand s2_lo = mach_make_lo_half(&r_src2); + s2_lo.btype = IROP_BTYPE_INT32; + rm_lo = mach_ensure_in_reg(&ctx, &s2_lo, excl); + if (thumb_is_hw_reg(rm_lo)) + excl |= (1u << (uint32_t)rm_lo); + } + if (!hi_uses_imm) + { + MachineOperand s2_hi = mach_make_hi_half(&r_src2); + s2_hi.btype = IROP_BTYPE_INT32; + rm_hi = mach_ensure_in_reg(&ctx, &s2_hi, excl); + } + + if (hi_uses_imm) + ot_check(hi_imm_op); + else + ot_check(th_cmp_reg(0, (uint32_t)rn_hi, (uint32_t)rm_hi, FLAGS_BEHAVIOUR_SET, THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE)); + th_literal_pool_reserve_upcoming_bytes(6); + ot_check(th_it(mapcc(TOK_EQ), 0x8)); + if (lo_uses_imm) + ot_check(lo_imm_op); + else + ot_check(th_cmp_reg(0, (uint32_t)rn_lo, (uint32_t)rm_lo, FLAGS_BEHAVIOUR_SET, THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE)); + + mach_release_all(&ctx); +} + +/* tcc_gen_machine_subs_eq_select_01: emit + * SUBS dest, src1, #K + * IT NE + * MOVNE dest, #1 + * for the CMP src1,#K + SELECT(#1,#0,NE) / SELECT(#0,#1,EQ) peephole. 
+ * Returns 1 if emitted, 0 if the SUBS immediate didn't encode (caller falls back). */ +ST_FUNC int tcc_gen_machine_subs_eq_select_01(MachineOperand src1, MachineOperand src2, MachineOperand dest) +{ + if (src2.kind != MACH_OP_IMM) + return 0; + if (src1.kind != MACH_OP_REG || src1.needs_deref || src1.u.reg.r0 < 0) + return 0; + if (dest.kind != MACH_OP_REG || dest.needs_deref || dest.u.reg.r0 < 0) + return 0; + + uint32_t src_reg = (uint32_t)src1.u.reg.r0; + uint32_t dst_reg = (uint32_t)dest.u.reg.r0; + uint32_t Ku = (uint32_t)src2.u.imm.val; + + thumb_opcode subs = th_sub_imm(dst_reg, src_reg, Ku, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE); + if (subs.size == 0) + return 0; + + /* Reserve so a literal-pool flush can't split the IT/MOV pair. */ + th_literal_pool_reserve_upcoming_bytes(10); + ot_check(subs); + ot_check(th_it(mapcc(TOK_NE), 0x8u)); + thumb_opcode movne = th_generic_mov_imm(dst_reg, 1); + if (movne.size != 0) { + ot_check(movne); + } else { + /* mov #1 always encodes on ARM Thumb-2, but be safe. */ + load_full_const((int)dst_reg, PREG_NONE, 1u, 0u); + } + return 1; +} + +/* tcc_gen_machine_mla_mop: MachineOperand-based entry point for MLA. + * dest = src1 * src2 + accum (all operands are 32-bit) + * + * All four operands are loaded into hardware registers via mach_ensure_in_reg + * before emitting a single MLA instruction. No fallback path is needed + * because mach_ensure_in_reg always returns a valid register. + * + * Note: th_mla(rd, rn, rm, ra) → rd = rn * rm + ra + */ +ST_FUNC void tcc_gen_machine_mla_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest, + MachineOperand accum) +{ + MachineCodegenContext ctx = {0}; + + /* Pre-exclude registers directly referenced by REG operands so that scratch + * allocations for other operands (e.g. immediates) cannot clobber them. 
+ * The pre-allocated DEST register must be excluded too: if a source load + * grabs it as a saved scratch (push/pop), the restoring pop after the MLA + * overwrites the just-computed result. */ + uint32_t live_regs = 0; + if (src1.kind == MACH_OP_REG && !src1.needs_deref) + live_regs |= (1u << (uint32_t)src1.u.reg.r0); + if (src2.kind == MACH_OP_REG && !src2.needs_deref) + live_regs |= (1u << (uint32_t)src2.u.reg.r0); + if (accum.kind == MACH_OP_REG && !accum.needs_deref) + live_regs |= (1u << (uint32_t)accum.u.reg.r0); + if (dest.kind == MACH_OP_REG && !dest.needs_deref && + dest.u.reg.r0 != (int)PREG_REG_NONE) + live_regs |= (1u << (uint32_t)dest.u.reg.r0); + + int src1_reg = mach_ensure_in_reg(&ctx, &src1, live_regs); + uint32_t excl = live_regs; + if (thumb_is_hw_reg(src1_reg)) + excl |= (1u << (uint32_t)src1_reg); + + int src2_reg = mach_ensure_in_reg(&ctx, &src2, excl); + if (thumb_is_hw_reg(src2_reg)) + excl |= (1u << (uint32_t)src2_reg); + + int accum_reg = mach_ensure_in_reg(&ctx, &accum, excl); + if (thumb_is_hw_reg(accum_reg)) + excl |= (1u << (uint32_t)accum_reg); + + int dest_reg = mach_get_dest_reg(&ctx, &dest, excl); + + /* th_mla(rd, rn, rm, ra): rd = rn * rm + ra */ + ot_check(th_mla((uint32_t)dest_reg, (uint32_t)src1_reg, (uint32_t)src2_reg, (uint32_t)accum_reg)); + + mach_writeback_dest(&dest, dest_reg); + mach_release_all(&ctx); +} + +/* tcc_gen_machine_umull_mop: MachineOperand-based entry point for UMULL. + * {dest_hi:dest_lo} = (uint32_t)src1 * (uint32_t)src2 (64-bit unsigned result) + * + * src1 and src2 are 32-bit inputs (is_64bit is cleared before loading). + * dest must be a 64-bit pair; it is split via mach_make_lo/hi_half. + * Each half is allocated independently via mach_get_dest_reg, with the * exclusion mask preventing rdlo==rdhi and preventing overlap with rn/rm. 
* * Note: th_umull(rdlo, rdhi, rn, rm) → {rdhi:rdlo} = rn * rm (unsigned) @@ -4355,8 +6636,19 @@ ST_FUNC void tcc_gen_machine_umull_mop(MachineOperand src1, MachineOperand src2, MachineOperand s2 = src2; s2.is_64bit = false; - int rn = mach_ensure_in_reg(&ctx, &s1, 0); - uint32_t excl = thumb_is_hw_reg(rn) ? (1u << (uint32_t)rn) : 0u; + /* Pre-exclude the pre-allocated dest pair: a saved-scratch (push/pop) on a + * dest register would have its restoring pop clobber the result. */ + uint32_t dest_excl = 0; + if (dest.kind == MACH_OP_REG && !dest.needs_deref) + { + if (dest.u.reg.r0 != (int)PREG_REG_NONE) + dest_excl |= (1u << (uint32_t)dest.u.reg.r0); + if (dest.is_64bit && dest.u.reg.r1 >= 0 && dest.u.reg.r1 != (int)PREG_REG_NONE) + dest_excl |= (1u << (uint32_t)dest.u.reg.r1); + } + + int rn = mach_ensure_in_reg(&ctx, &s1, dest_excl); + uint32_t excl = dest_excl | (thumb_is_hw_reg(rn) ? (1u << (uint32_t)rn) : 0u); int rm = mach_ensure_in_reg(&ctx, &s2, excl); if (thumb_is_hw_reg(rm)) @@ -4381,6 +6673,175 @@ ST_FUNC void tcc_gen_machine_umull_mop(MachineOperand src1, MachineOperand src2, mach_release_all(&ctx); } +/* tcc_gen_machine_smull_mop: MachineOperand-based entry point for SMULL. + * {dest_hi:dest_lo} = (int32_t)src1 * (int32_t)src2 (64-bit signed result). + * Mirrors umull_mop but emits th_smull. */ +ST_FUNC void tcc_gen_machine_smull_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest) +{ + MachineCodegenContext ctx = {0}; + + MachineOperand s1 = src1; + s1.is_64bit = false; + MachineOperand s2 = src2; + s2.is_64bit = false; + + /* Pre-exclude the pre-allocated dest pair (see umull_mop). 
*/ + uint32_t dest_excl = 0; + if (dest.kind == MACH_OP_REG && !dest.needs_deref) + { + if (dest.u.reg.r0 != (int)PREG_REG_NONE) + dest_excl |= (1u << (uint32_t)dest.u.reg.r0); + if (dest.is_64bit && dest.u.reg.r1 >= 0 && dest.u.reg.r1 != (int)PREG_REG_NONE) + dest_excl |= (1u << (uint32_t)dest.u.reg.r1); + } + + int rn = mach_ensure_in_reg(&ctx, &s1, dest_excl); + uint32_t excl = dest_excl | (thumb_is_hw_reg(rn) ? (1u << (uint32_t)rn) : 0u); + + int rm = mach_ensure_in_reg(&ctx, &s2, excl); + if (thumb_is_hw_reg(rm)) + excl |= (1u << (uint32_t)rm); + + MachineOperand dst_lo = mach_make_lo_half(&dest); + MachineOperand dst_hi = mach_make_hi_half(&dest); + dst_lo.btype = IROP_BTYPE_INT32; + dst_hi.btype = IROP_BTYPE_INT32; + + int rd_lo = mach_get_dest_reg(&ctx, &dst_lo, excl); + if (thumb_is_hw_reg(rd_lo)) + excl |= (1u << (uint32_t)rd_lo); + int rd_hi = mach_get_dest_reg(&ctx, &dst_hi, excl); + + /* th_smull(rdlo, rdhi, rn, rm): {rdhi:rdlo} = (signed)rn * (signed)rm */ + ot_check(th_smull((uint32_t)rd_lo, (uint32_t)rd_hi, (uint32_t)rn, (uint32_t)rm)); + + mach_writeback_dest(&dst_lo, rd_lo); + mach_writeback_dest(&dst_hi, rd_hi); + mach_release_all(&ctx); +} + +/* tcc_gen_machine_mlal_accum_mop: emit SMLAL/UMLAL for + * dest = accum + (int32/uint32)src1 * (int32/uint32)src2 + * + * This narrow helper is used by codegen peepholes after register allocation. + * It only handles the cheap in-place accumulate form, where the ADD destination + * already holds the accumulator pair. Other forms fall back to SMULL/UMULL + * plus the normal 64-bit ADD so we do not risk clobbering multiply sources. 
*/ +ST_FUNC int tcc_gen_machine_mlal_accum_mop(MachineOperand src1, MachineOperand src2, MachineOperand accum, + MachineOperand dest, int is_signed) +{ + if (!dest.is_64bit || !accum.is_64bit) + return 0; + if (dest.kind != MACH_OP_REG || accum.kind != MACH_OP_REG) + return 0; + if (dest.needs_deref || accum.needs_deref) + return 0; + if (dest.u.reg.r0 != accum.u.reg.r0 || dest.u.reg.r1 != accum.u.reg.r1) + return 0; + + int rd_lo = dest.u.reg.r0; + int rd_hi = dest.u.reg.r1; + /* Inline the hw-reg range checks rather than calling thumb_is_hw_reg(): the + * self-host cross drops the argument move into the inlined helper here and + * tests a stale register (the dest pointer) instead of rd_lo/rd_hi, so the + * native compiler wrongly bails out of every in-place 64-bit MLA with + * "unable to lower 64-bit MLA". Direct comparisons on rd_lo/rd_hi (as the + * adjacent rd_lo == rd_hi check already does) compile correctly. */ + if (rd_lo < 0 || rd_lo > 15 || rd_hi < 0 || rd_hi > 15 || rd_lo == rd_hi) + return 0; + + MachineCodegenContext ctx = {0}; + MachineOperand s1 = src1; + s1.is_64bit = false; + MachineOperand s2 = src2; + s2.is_64bit = false; + + uint32_t excl = (1u << (uint32_t)rd_lo) | (1u << (uint32_t)rd_hi); + int rn = mach_ensure_in_reg(&ctx, &s1, excl); + if (thumb_is_hw_reg(rn)) + excl |= (1u << (uint32_t)rn); + + int rm = mach_ensure_in_reg(&ctx, &s2, excl); + if (thumb_is_hw_reg(rm)) + excl |= (1u << (uint32_t)rm); + + if (is_signed) + ot_check(th_smlal((uint32_t)rd_lo, (uint32_t)rd_hi, (uint32_t)rn, (uint32_t)rm)); + else + ot_check(th_umlal((uint32_t)rd_lo, (uint32_t)rd_hi, (uint32_t)rn, (uint32_t)rm)); + + MachineOperand dst_lo = mach_make_lo_half(&dest); + MachineOperand dst_hi = mach_make_hi_half(&dest); + dst_lo.btype = IROP_BTYPE_INT32; + dst_hi.btype = IROP_BTYPE_INT32; + mach_writeback_dest(&dst_lo, rd_lo); + mach_writeback_dest(&dst_hi, rd_hi); + mach_release_all(&ctx); + return 1; +} + +/* tcc_gen_machine_pack64_mop: lower TCCIR_OP_PACK64 by 
emitting two + * 32-bit assigns into the dest's halves. src_lo and src_hi are u32 + * operands; dest is a u64 register pair / spill / param slot. + * + * The two sub-assigns delegate to tcc_gen_machine_assign_mop, so they + * benefit from its existing handling of every dest kind (REG/SPILL/...). + * Often regalloc has already aligned the registers (e.g. dest.r0 = src_lo + * register), in which case the sub-assigns degrade to a no-op MOV that + * the encoder can skip. */ +ST_FUNC void tcc_gen_machine_pack64_mop(MachineOperand src_lo, MachineOperand src_hi, MachineOperand dest) +{ + if (!dest.is_64bit) + { + tcc_error("compiler_error: tcc_gen_machine_pack64_mop: dest not 64-bit"); + return; + } + MachineOperand dst_lo = mach_make_lo_half(&dest); + MachineOperand dst_hi = mach_make_hi_half(&dest); + dst_lo.btype = IROP_BTYPE_INT32; + dst_hi.btype = IROP_BTYPE_INT32; + + /* Detect register-swap aliasing: dst_lo == src_hi AND dst_hi == src_lo. + * Neither write order can preserve both source values; we must stage one + * side through a scratch register. */ + int swap_alias = 0; + if (src_lo.kind == MACH_OP_REG && !src_lo.needs_deref && + src_hi.kind == MACH_OP_REG && !src_hi.needs_deref && + dst_lo.kind == MACH_OP_REG && !dst_lo.needs_deref && + dst_hi.kind == MACH_OP_REG && !dst_hi.needs_deref && + src_hi.u.reg.r0 == dst_lo.u.reg.r0 && src_lo.u.reg.r0 == dst_hi.u.reg.r0 && + src_lo.u.reg.r0 != src_hi.u.reg.r0) + swap_alias = 1; + + if (swap_alias) + { + /* Save src_lo to a scratch before overwriting it via dst_hi. */ + uint32_t excl = (1u << (uint32_t)dst_lo.u.reg.r0) | (1u << (uint32_t)dst_hi.u.reg.r0); + ScratchRegAlloc scratch = get_scratch_reg_with_save(excl); + ot_check_mov_reg((uint32_t)scratch.reg, (uint32_t)src_lo.u.reg.r0, flags_safe(), + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + /* Now dst_hi = src_hi (still live), then dst_lo = scratch (=old src_lo). 
*/ + tcc_gen_machine_assign_mop(src_hi, dst_hi, TCCIR_OP_ASSIGN); + MachineOperand scratch_op = src_lo; + scratch_op.u.reg.r0 = scratch.reg; + tcc_gen_machine_assign_mop(scratch_op, dst_lo, TCCIR_OP_ASSIGN); + restore_scratch_reg(&scratch); + } + else if (src_hi.kind == MACH_OP_REG && !src_hi.needs_deref && + dst_lo.kind == MACH_OP_REG && !dst_lo.needs_deref && + src_hi.u.reg.r0 == dst_lo.u.reg.r0) + { + /* dst_lo == src_hi register: write hi first to free src_hi's slot. */ + tcc_gen_machine_assign_mop(src_hi, dst_hi, TCCIR_OP_ASSIGN); + tcc_gen_machine_assign_mop(src_lo, dst_lo, TCCIR_OP_ASSIGN); + } + else + { + tcc_gen_machine_assign_mop(src_lo, dst_lo, TCCIR_OP_ASSIGN); + tcc_gen_machine_assign_mop(src_hi, dst_hi, TCCIR_OP_ASSIGN); + } +} + /* tcc_gen_machine_assign_mop: MachineOperand-based entry point for simple * 32-bit value assignment. Called from ir/codegen.c instead of * tcc_gen_machine_assign_op when: @@ -4571,8 +7032,8 @@ ST_FUNC void tcc_gen_machine_assign_mop(MachineOperand src, MachineOperand dest, uint32_t excl = thumb_is_hw_reg(dest_reg) ? (1u << (uint32_t)dest_reg) : 0; int src_reg = mach_ensure_in_reg(&mctx, &src, excl); if (src_reg != dest_reg) - ot_check(th_mov_reg((uint32_t)dest_reg, (uint32_t)src_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dest_reg, (uint32_t)src_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); break; } } @@ -4587,16 +7048,22 @@ ST_FUNC void tcc_gen_machine_assign_mop(MachineOperand src, MachineOperand dest, * src must be MACH_OP_IMM carrying the raw condition code in u.imm.val. * * 32-bit dest: - * MOV dest, #0 - * IT - * MOV dest, #1 + * ITE + * MOV dest, #1 (T: cond met) + * MOV dest, #0 (E: cond not met) * * 64-bit dest pair (e.g. long long result = (x > y)): * The boolean result 0 or 1 fits in 32 bits, so hi word is always 0. 
- * MOV dest_lo, #0 - * IT + * ITE * MOV dest_lo, #1 + * MOV dest_lo, #0 * MOV dest_hi, #0 (unconditional, outside IT block — hi is always 0) + * + * Inner MOVs use NOT_IMPORTANT for flags: SETIF is the consumer of the CMP + * flags; once the ITE captures the condition, no subsequent code in this + * lowering depends on CMP's flag state, so the 16-bit T1 encoding (which + * implicitly sets flags) is safe. This shrinks each conditional MOV from + * 4 bytes (mov.w) to 2 bytes (movs). */ ST_FUNC void tcc_gen_machine_setif_mop(MachineOperand src, MachineOperand dest, TccIrOp op) { @@ -4604,6 +7071,10 @@ ST_FUNC void tcc_gen_machine_setif_mop(MachineOperand src, MachineOperand dest, MachineCodegenContext mctx = {0}; const int cond = mapcc((int)src.u.imm.val); + /* ITE mask: 2nd instruction has opposite condition. + * mask[3] = 1 if it should be the 'else' bit (opposite of cond[0]). + * For the T-then-E pattern, mask = ((!cond[0]) << 3) | 0x4. */ + const uint16_t ite_mask = (uint16_t)(((cond ^ 1) & 1) << 3) | 0x4u; if (dest.is_64bit) { @@ -4617,13 +7088,13 @@ ST_FUNC void tcc_gen_machine_setif_mop(MachineOperand src, MachineOperand dest, uint32_t excl = thumb_is_hw_reg(lo_reg) ? (1u << (uint32_t)lo_reg) : 0u; int hi_reg = mach_get_dest_reg(&mctx, &dst_hi, excl); - /* Emit SETIF sequence for lo word. */ - ot_check(th_mov_imm(lo_reg, 0, FLAGS_BEHAVIOUR_BLOCK, ENFORCE_ENCODING_NONE)); + /* Emit ITE sequence for lo word. */ th_literal_pool_reserve_upcoming_bytes(6); - ot_check(th_it(cond, 0x8)); /* IT — single conditioned instruction */ + ot_check(th_it(cond, ite_mask)); /* ITE — two conditioned instructions */ ot_check(th_mov_imm(lo_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(lo_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); /* Hi word is always 0 — boolean result never exceeds 1 (i.e. fits in 32-bit lo). 
*/ - ot_check(th_mov_imm(hi_reg, 0, FLAGS_BEHAVIOUR_BLOCK, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(hi_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); mach_writeback_dest(&dst_lo, lo_reg); mach_writeback_dest(&dst_hi, hi_reg); @@ -4632,10 +7103,10 @@ ST_FUNC void tcc_gen_machine_setif_mop(MachineOperand src, MachineOperand dest, { int dest_reg = mach_get_dest_reg(&mctx, &dest, 0); - ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_BLOCK, ENFORCE_ENCODING_NONE)); th_literal_pool_reserve_upcoming_bytes(6); - ot_check(th_it(cond, 0x8)); /* IT — single conditioned instruction */ + ot_check(th_it(cond, ite_mask)); /* ITE — two conditioned instructions */ ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); mach_writeback_dest(&dest, dest_reg); } @@ -4681,7 +7152,7 @@ ST_FUNC void tcc_gen_machine_bool_mop(MachineOperand src1, MachineOperand src2, int hi1_reg = mach_ensure_in_reg(&mctx, &hi1, excl); excl |= thumb_is_hw_reg(hi1_reg) ? (1u << (uint32_t)hi1_reg) : 0; /* r1 = lo1 | hi1 — is src1 non-zero? */ - ot_check(th_orr_reg(r1, r1, hi1_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_orr_reg(r1, r1, hi1_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } else { @@ -4700,7 +7171,7 @@ ST_FUNC void tcc_gen_machine_bool_mop(MachineOperand src1, MachineOperand src2, int hi2_reg = mach_ensure_in_reg(&mctx, &hi2, excl); excl |= thumb_is_hw_reg(hi2_reg) ? (1u << (uint32_t)hi2_reg) : 0; /* r2 = lo2 | hi2 — is src2 non-zero? 
*/ - ot_check(th_orr_reg(r2, r2, hi2_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_orr_reg(r2, r2, hi2_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } else { @@ -4716,18 +7187,18 @@ ST_FUNC void tcc_gen_machine_bool_mop(MachineOperand src1, MachineOperand src2, ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_BLOCK, ENFORCE_ENCODING_NONE)); th_literal_pool_reserve_upcoming_bytes(6); ot_check(th_it(0x1, 0x8)); /* IT NE */ - ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(dest_reg, 1, flags_safe(), ENFORCE_ENCODING_NONE)); } else /* TCCIR_OP_BOOL_AND */ { - ot_check(th_cmp_imm(0, r1, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); + ot_check(th_cmp_imm(r1, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); th_literal_pool_reserve_upcoming_bytes(6); ot_check(th_it(0x1, 0x8)); /* IT NE */ - ot_check(th_cmp_imm(0, r2, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); /* CMPne r2, #0 */ + ot_check(th_cmp_imm(r2, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); /* CMPne r2, #0 */ ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_BLOCK, ENFORCE_ENCODING_NONE)); th_literal_pool_reserve_upcoming_bytes(6); ot_check(th_it(0x1, 0x8)); /* IT NE */ - ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(dest_reg, 1, flags_safe(), ENFORCE_ENCODING_NONE)); } mach_writeback_dest(&dest, dest_reg); @@ -4750,18 +7221,18 @@ ST_FUNC void tcc_gen_machine_bool_mop(MachineOperand src1, MachineOperand src2, ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_BLOCK, ENFORCE_ENCODING_NONE)); th_literal_pool_reserve_upcoming_bytes(6); ot_check(th_it(0x1, 0x8)); /* IT NE */ - ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(dest_reg, 1, flags_safe(), ENFORCE_ENCODING_NONE)); } else /* TCCIR_OP_BOOL_AND */ { - ot_check(th_cmp_imm(0, src1_reg, 0, 
FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); + ot_check(th_cmp_imm(src1_reg, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); th_literal_pool_reserve_upcoming_bytes(6); ot_check(th_it(0x1, 0x8)); /* IT NE */ - ot_check(th_cmp_imm(0, src2_reg, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); /* CMPne src2, #0 */ + ot_check(th_cmp_imm(src2_reg, 0, FLAGS_BEHAVIOUR_SET, ENFORCE_ENCODING_NONE)); /* CMPne src2, #0 */ ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_BLOCK, ENFORCE_ENCODING_NONE)); th_literal_pool_reserve_upcoming_bytes(6); ot_check(th_it(0x1, 0x8)); /* IT NE */ - ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(dest_reg, 1, flags_safe(), ENFORCE_ENCODING_NONE)); } mach_writeback_dest(&dest, dest_reg); @@ -4825,8 +7296,8 @@ ST_FUNC void tcc_gen_machine_load_mop(MachineOperand src, MachineOperand dest, T { /* Direct register-to-register (treat as MOV — should be ASSIGN, not LOAD) */ if (dest_reg != src.u.reg.r0) - ot_check(th_mov_reg((uint32_t)dest_reg, (uint32_t)src.u.reg.r0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, - THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dest_reg, (uint32_t)src.u.reg.r0, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); /* Narrow sub-word parameter values: when a parameter is declared as * char/short but arrives in a full 32-bit register (AAPCS default * argument promotion), the upper bits may contain garbage. 
Emit @@ -4847,8 +7318,8 @@ ST_FUNC void tcc_gen_machine_load_mop(MachineOperand src, MachineOperand dest, T } /* 64-bit pair: also copy the hi-half register */ if (dest_r1 != PREG_REG_NONE && src.u.reg.r1 >= 0 && dest_r1 != src.u.reg.r1) - ot_check(th_mov_reg((uint32_t)dest_r1, (uint32_t)src.u.reg.r1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, - THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dest_r1, (uint32_t)src.u.reg.r1, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } break; @@ -4900,8 +7371,16 @@ ST_FUNC void tcc_gen_machine_load_mop(MachineOperand src, MachineOperand dest, T /* needs_deref: load symbol address into scratch, then dereference. */ int addr_r = mach_alloc_scratch(&ctx, (uint32_t)1u << (uint32_t)dest_reg); tcc_machine_load_constant(addr_r, PREG_REG_NONE, 0, 0, sym); - load_from_base(dest_reg, dest_r1, btype, is_unsigned, addend < 0 ? (int)(-addend) : (int)addend, addend < 0 ? 1 : 0, - (uint32_t)addr_r); + /* For a 64-bit deref, try LDRD when we can prove the symbol's address + * at `addend` is 4-byte aligned. Otherwise fall back to the pair of + * 32-bit loads via load_from_base. */ + const int sym_sign = (addend < 0), sym_abs = sym_sign ? (int)(-addend) : (int)addend; + if (dest.is_64bit && dest_r1 != PREG_REG_NONE && sym_is_4_byte_aligned_for_64bit(sym, addend) && + try_ldrd_pair(dest_reg, dest_r1, addr_r, sym_abs, sym_sign)) + { + break; + } + load_from_base(dest_reg, dest_r1, btype, is_unsigned, sym_abs, sym_sign, (uint32_t)addr_r); break; } @@ -4910,6 +7389,23 @@ ST_FUNC void tcc_gen_machine_load_mop(MachineOperand src, MachineOperand dest, T tcc_machine_load_constant(dest_reg, dest_r1, src.u.imm.val, (int)dest.is_64bit, NULL); break; + case MACH_OP_FRAME_ADDR: + { + if (!src.needs_deref) + { + /* Load the frame-slot address itself (LEA semantics). 
*/ + tcc_machine_addr_of_stack_slot(dest_reg, src.u.frame.offset, 0); + } + else + { + /* Frame address is a pointer to data — compute addr, then dereference. */ + int addr_r = mach_alloc_scratch(&ctx, (uint32_t)1u << (uint32_t)dest_reg); + tcc_machine_addr_of_stack_slot(addr_r, src.u.frame.offset, 0); + load_from_base(dest_reg, dest_r1, btype, is_unsigned, 0, 0, (uint32_t)addr_r); + } + break; + } + case MACH_OP_CHAIN_REL: { /* Captured variable: load from parent frame via static chain. */ @@ -5036,8 +7532,13 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, MachineOperand src_hi = mach_make_hi_half(&src); src_hi.btype = IROP_BTYPE_INT32; - const int lo_reg = mach_ensure_in_reg(&ctx, &src_lo, 0); - uint32_t excl = thumb_is_hw_reg(lo_reg) ? (1u << (uint32_t)lo_reg) : 0u; + uint32_t dest_excl = 0; + if (dest.kind == MACH_OP_REG && dest.needs_deref && + dest.u.reg.r0 >= 0 && dest.u.reg.r0 < 16) + dest_excl = (1u << (uint32_t)dest.u.reg.r0); + + const int lo_reg = mach_ensure_in_reg(&ctx, &src_lo, dest_excl); + uint32_t excl = dest_excl | (thumb_is_hw_reg(lo_reg) ? (1u << (uint32_t)lo_reg) : 0u); const int hi_reg = mach_ensure_in_reg(&ctx, &src_hi, excl); excl |= thumb_is_hw_reg(hi_reg) ? (1u << (uint32_t)hi_reg) : 0u; @@ -5046,7 +7547,11 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, case MACH_OP_REG: if (dest.needs_deref) { - /* 64-bit pointer-store: STR lo, [base]; STR hi, [base, #4] */ + /* 64-bit pointer-store through a register-held address. Do NOT use + * STRD here: ARMv7-M/v8-M requires 4-byte alignment for STRD and + * faults otherwise, but the pointer may target packed-struct memory + * that is only 1- or 2-byte aligned. Plain STR tolerates unaligned + * (UNALIGN_TRP=0 default) so two 32-bit stores stay safe. 
*/ const uint32_t base = (uint32_t)dest.u.reg.r0; th_store32_imm_or_reg_ex(lo_reg, base, 0, 0, excl | (1u << base)); th_store32_imm_or_reg_ex(hi_reg, base, 4, 0, excl | (1u << base)); @@ -5059,20 +7564,20 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, if (lo_reg == dreg_hi) { if (dreg_lo != lo_reg && dreg_lo != (int)PREG_REG_NONE) - ot_check(th_mov_reg((uint32_t)dreg_lo, (uint32_t)lo_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dreg_lo, (uint32_t)lo_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); if (dreg_hi != hi_reg && dreg_hi != (int)PREG_REG_NONE) - ot_check(th_mov_reg((uint32_t)dreg_hi, (uint32_t)hi_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dreg_hi, (uint32_t)hi_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } else { if (dreg_hi != hi_reg && dreg_hi != (int)PREG_REG_NONE) - ot_check(th_mov_reg((uint32_t)dreg_hi, (uint32_t)hi_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dreg_hi, (uint32_t)hi_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); if (dreg_lo != lo_reg && dreg_lo != (int)PREG_REG_NONE) - ot_check(th_mov_reg((uint32_t)dreg_lo, (uint32_t)lo_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dreg_lo, (uint32_t)lo_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } } break; @@ -5092,14 +7597,22 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, ot_check(th_ldr_reg((uint32_t)ptr_r, base, (uint32_t)rr.reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&rr); } + /* Pointer-through store from an LLOCAL spill slot: the target + * address is arbitrary (may be unaligned packed-struct 
memory), so + * skip STRD. */ th_store32_imm_or_reg_ex(lo_reg, (uint32_t)ptr_r, 0, 0, excl | (1u << (uint32_t)ptr_r)); th_store32_imm_or_reg_ex(hi_reg, (uint32_t)ptr_r, 4, 0, excl | (1u << (uint32_t)ptr_r)); } else { const int adj_hi = adj + 4; - th_store32_imm_or_reg_ex(lo_reg, base, adj < 0 ? -adj : adj, adj < 0 ? 1 : 0, excl | (1u << base)); - th_store32_imm_or_reg_ex(hi_reg, base, adj_hi < 0 ? -adj_hi : adj_hi, adj_hi < 0 ? 1 : 0, excl | (1u << base)); + const int sign = (adj < 0), abs_off = sign ? -adj : adj; + if (!try_strd_pair(lo_reg, hi_reg, (int)base, abs_off, sign)) + { + th_store32_imm_or_reg_ex(lo_reg, base, abs_off, sign, excl | (1u << base)); + th_store32_imm_or_reg_ex(hi_reg, base, adj_hi < 0 ? -adj_hi : adj_hi, adj_hi < 0 ? 1 : 0, + excl | (1u << base)); + } } break; } @@ -5109,20 +7622,32 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, const int adj = dest.u.param.offset + offset_to_args; const int adj_hi = adj + 4; const uint32_t base = (uint32_t)(tcc_state->need_frame_pointer ? R_FP : R_SP); - th_store32_imm_or_reg_ex(lo_reg, base, adj < 0 ? -adj : adj, adj < 0 ? 1 : 0, excl | (1u << base)); - th_store32_imm_or_reg_ex(hi_reg, base, adj_hi < 0 ? -adj_hi : adj_hi, adj_hi < 0 ? 1 : 0, excl | (1u << base)); + const int sign = (adj < 0), abs_off = sign ? -adj : adj; + if (!try_strd_pair(lo_reg, hi_reg, (int)base, abs_off, sign)) + { + th_store32_imm_or_reg_ex(lo_reg, base, abs_off, sign, excl | (1u << base)); + th_store32_imm_or_reg_ex(hi_reg, base, adj_hi < 0 ? -adj_hi : adj_hi, adj_hi < 0 ? 1 : 0, excl | (1u << base)); + } break; } case MACH_OP_SYMBOL: { + /* Global symbol store. STRD needs 4-byte alignment; allow it only when + * the symbol's declared type guarantees natural alignment >= 4 (regular + * scalar globals) or the symbol was explicitly aligned. Packed structs + * and struct-typed globals stay on the STR-pair path. */ Sym *sym = dest.u.sym.sym ? 
validate_sym_for_reloc(dest.u.sym.sym) : NULL; int addr_r = mach_alloc_scratch(&ctx, excl); tcc_machine_load_constant(addr_r, PREG_REG_NONE, 0, 0, sym); const int32_t addend = dest.u.sym.addend; const int32_t addend_hi = addend + 4; - th_store32_imm_or_reg_ex(lo_reg, (uint32_t)addr_r, addend < 0 ? (int)(-addend) : (int)addend, addend < 0 ? 1 : 0, - excl | (1u << addr_r)); + const int sign = (addend < 0), abs_off = sign ? (int)(-addend) : (int)addend; + if (sym_is_4_byte_aligned_for_64bit(sym, addend) && try_strd_pair(lo_reg, hi_reg, addr_r, abs_off, sign)) + { + break; + } + th_store32_imm_or_reg_ex(lo_reg, (uint32_t)addr_r, abs_off, sign, excl | (1u << addr_r)); th_store32_imm_or_reg_ex(hi_reg, (uint32_t)addr_r, addend_hi < 0 ? (int)(-addend_hi) : (int)addend_hi, addend_hi < 0 ? 1 : 0, excl | (1u << addr_r)); break; @@ -5130,6 +7655,7 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, case MACH_OP_IMM: { + /* Store to a constant address — alignment unknown, skip STRD. */ int addr_r = mach_alloc_scratch(&ctx, excl); tcc_machine_load_constant(addr_r, PREG_REG_NONE, dest.u.imm.val, 0, NULL); th_store32_imm_or_reg_ex(lo_reg, (uint32_t)addr_r, 0, 0, excl | (1u << addr_r)); @@ -5141,8 +7667,11 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, { int addr_r = mach_alloc_scratch(&ctx, excl); tcc_machine_addr_of_stack_slot(addr_r, dest.u.frame.offset, 0 /* not param */); - th_store32_imm_or_reg_ex(lo_reg, (uint32_t)addr_r, 0, 0, excl | (1u << addr_r)); - th_store32_imm_or_reg_ex(hi_reg, (uint32_t)addr_r, 4, 0, excl | (1u << addr_r)); + if (!try_strd_pair(lo_reg, hi_reg, addr_r, 0, 0)) + { + th_store32_imm_or_reg_ex(lo_reg, (uint32_t)addr_r, 0, 0, excl | (1u << addr_r)); + th_store32_imm_or_reg_ex(hi_reg, (uint32_t)addr_r, 4, 0, excl | (1u << addr_r)); + } break; } @@ -5156,8 +7685,11 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, int sign = (off < 0), abs_off = sign ? 
(int)(-off) : (int)off; int32_t off_hi = off + 4; int sign_hi = (off_hi < 0), abs_off_hi = sign_hi ? (int)(-off_hi) : (int)off_hi; - th_store32_imm_or_reg_ex(lo_reg, (uint32_t)base, abs_off, sign, excl | (1u << (uint32_t)base)); - th_store32_imm_or_reg_ex(hi_reg, (uint32_t)base, abs_off_hi, sign_hi, excl | (1u << (uint32_t)base)); + if (!try_strd_pair(lo_reg, hi_reg, base, abs_off, sign)) + { + th_store32_imm_or_reg_ex(lo_reg, (uint32_t)base, abs_off, sign, excl | (1u << (uint32_t)base)); + th_store32_imm_or_reg_ex(hi_reg, (uint32_t)base, abs_off_hi, sign_hi, excl | (1u << (uint32_t)base)); + } if (chain_used) restore_scratch_reg(&chain_scratch); break; @@ -5172,8 +7704,25 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, const int btype = dest.btype; /* Store width from destination type */ - /* Get source value register — may allocate a scratch if spilled/const */ - const int src_reg = mach_ensure_in_reg(&ctx, &src, 0); + /* Fast path: plain-register dest (no deref) — load src directly into dest, + * skipping the intermediate scratch + MOV that the generic path emits. + * Covers IMM, SYMBOL, SPILL, FRAME_ADDR, PARAM_STACK, CHAIN_REL, and + * REG-with-or-without-deref src kinds. */ + if (dest.kind == MACH_OP_REG && !dest.needs_deref && dest.u.reg.r0 != (int)PREG_REG_NONE) + { + tcc_gen_mach_load_to_reg(dest.u.reg.r0, &src); + mach_release_all(&ctx); + return; + } + + /* Get source value register — may allocate a scratch if spilled/const. + * When storing through a register-held pointer, protect the base register + * before materializing immediates or spilled values. 
*/ + uint32_t src_excl = 0; + if (dest.kind == MACH_OP_REG && dest.needs_deref && + dest.u.reg.r0 >= 0 && dest.u.reg.r0 < 16) + src_excl |= (1u << (uint32_t)dest.u.reg.r0); + const int src_reg = mach_ensure_in_reg(&ctx, &src, src_excl); switch (dest.kind) { @@ -5194,8 +7743,8 @@ ST_FUNC void tcc_gen_machine_store_mop(MachineOperand dest, MachineOperand src, /* Register-to-register store (MOV) */ const int dreg = dest.u.reg.r0; if (dreg != src_reg && dreg != (int)PREG_REG_NONE) - ot_check(th_mov_reg((uint32_t)dreg, (uint32_t)src_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)dreg, (uint32_t)src_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } break; @@ -5334,35 +7883,157 @@ ST_FUNC void tcc_gen_machine_load_indexed_mop(MachineOperand dest, MachineOperan int shift_amount = (scale.kind == MACH_OP_IMM) ? (int)scale.u.imm.val : 2; if (shift_amount < 0 || shift_amount > 31) shift_amount = 2; - thumb_shift shift = {.type = THUMB_SHIFT_LSL, .value = (uint32_t)shift_amount, .mode = THUMB_SHIFT_IMMEDIATE}; + + /* Fast path: base is &local + constant index — fold into SP/FP-relative load. + * Mirrors the store_indexed FRAME_ADDR fast path. */ + if (!dest.is_64bit && shift_amount == 0 && index.kind == MACH_OP_IMM && + base.kind == MACH_OP_FRAME_ADDR && !base.needs_deref) + { + int combined = base.u.frame.offset + (int)index.u.imm.val; + int adjusted = fp_adjust_local_offset(combined, 0); + int sign = (adjusted < 0); + int abs_off = sign ? -adjusted : adjusted; + if (abs_off <= 4095) + { + const int dest_reg = mach_get_dest_reg(&ctx, &dest, 0); + const int base_reg = tcc_state->need_frame_pointer ? 
R_FP : R_SP; + load_from_base(dest_reg, PREG_REG_NONE, dest.btype, (int)dest.is_unsigned, abs_off, sign, (uint32_t)base_reg); + mach_writeback_dest(&dest, dest_reg); + mach_release_all(&ctx); + return; + } + } + + /* Fast path: constant-displacement load (scale == 0 and index is an immediate). + * Generated by the displacement-fusion pass when folding `ADD base,#imm; LOAD *` + * into a single `LDR dest,[base,#imm]`, matching GCC's addressing-mode output. */ + if (!dest.is_64bit && shift_amount == 0 && index.kind == MACH_OP_IMM) + { + int imm = (int)index.u.imm.val; + int sign = (imm < 0); + int abs_off = sign ? -imm : imm; + if (abs_off <= 4095) + { + const int dest_reg = mach_get_dest_reg(&ctx, &dest, 0); + uint32_t excl = (1u << (uint32_t)dest_reg); + int base_reg = mach_ensure_in_reg(&ctx, &base, excl); + load_from_base(dest_reg, PREG_REG_NONE, dest.btype, (int)dest.is_unsigned, abs_off, sign, (uint32_t)base_reg); + mach_writeback_dest(&dest, dest_reg); + mach_release_all(&ctx); + return; + } + } + + /* scale 0 → no shift: use THUMB_SHIFT_NONE so the 16-bit T1 register-offset + * encoding (all-low regs) can be selected instead of the wide T32 form. */ + thumb_shift shift = (shift_amount == 0) + ? (thumb_shift){.type = THUMB_SHIFT_NONE, .value = 0, .mode = THUMB_SHIFT_IMMEDIATE} + : (thumb_shift){.type = THUMB_SHIFT_LSL, .value = (uint32_t)shift_amount, .mode = THUMB_SHIFT_IMMEDIATE}; + + /* Fast path: 64-bit constant-displacement load using LDRD [base, #imm]. + * LDRD supports word-aligned offsets in range [-1020, 1020]. */ + if (dest.is_64bit && shift_amount == 0 && index.kind == MACH_OP_IMM) + { + int imm = (int)index.u.imm.val; + int sign = (imm < 0); + int abs_off = sign ? 
-imm : imm; + if (abs_off <= 1020 && (abs_off & 3) == 0) + { + const bool dest_is_reg = (dest.kind == MACH_OP_REG && !dest.needs_deref); + int dest_lo, dest_hi; + MachineOperand dest_hi_mop = {0}; + uint32_t excl = 0; + + if (dest_is_reg) + { + dest_lo = dest.u.reg.r0; + if (!thumb_is_hw_reg(dest.u.reg.r1)) + tcc_error("load_indexed_mop: 64-bit dest has invalid r1=%d (r0=%d) — " + "register allocator must produce a valid pair", + dest.u.reg.r1, dest.u.reg.r0); + dest_hi = dest.u.reg.r1; + excl = (1u << (uint32_t)dest_lo) | (1u << (uint32_t)dest_hi); + } + else + { + dest_lo = mach_get_dest_reg(&ctx, &dest, 0); + excl = (1u << (uint32_t)dest_lo); + dest_hi_mop = mach_make_hi_half(&dest); + dest_hi = mach_get_dest_reg(&ctx, &dest_hi_mop, excl); + excl |= (1u << (uint32_t)dest_hi); + } + + int base_reg = mach_ensure_in_reg(&ctx, &base, excl); + uint32_t puw = sign ? 4u : 6u; + ot_check(th_ldrd_imm((uint32_t)dest_lo, (uint32_t)dest_hi, (uint32_t)base_reg, abs_off, puw)); + if (!dest_is_reg) + { + MachineOperand dest_lo_mop = mach_make_lo_half(&dest); + mach_writeback_dest(&dest_lo_mop, dest_lo); + mach_writeback_dest(&dest_hi_mop, dest_hi); + } + mach_release_all(&ctx); + return; + } + } /* 64-bit indexed load: compute EA = base + index< 31) shift_amount = 2; - thumb_shift shift = {.type = THUMB_SHIFT_LSL, .value = (uint32_t)shift_amount, .mode = THUMB_SHIFT_IMMEDIATE}; + + /* Fast path: base is &local + constant index — fold into SP/FP-relative store. + * Avoids emitting a separate `ADD base, sp, #frame_off` LEA before the STR, + * cutting one instruction per access in dense local-array initialization. */ + if (!value.is_64bit && shift_amount == 0 && index.kind == MACH_OP_IMM && + base.kind == MACH_OP_FRAME_ADDR && !base.needs_deref) + { + int combined = base.u.frame.offset + (int)index.u.imm.val; + int adjusted = fp_adjust_local_offset(combined, 0); + int sign = (adjusted < 0); + int abs_off = sign ? 
-adjusted : adjusted; + if (abs_off <= 4095) + { + const int btype = value.btype; + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; + int value_reg = mach_ensure_in_reg(&ctx, &value, 0); + if (btype == IROP_BTYPE_INT8) + th_store8_imm_or_reg(value_reg, (uint32_t)base_reg, abs_off, sign); + else if (btype == IROP_BTYPE_INT16) + th_store16_imm_or_reg(value_reg, (uint32_t)base_reg, abs_off, sign); + else + th_store32_imm_or_reg_ex(value_reg, (uint32_t)base_reg, abs_off, sign, + (1u << (uint32_t)value_reg) | (1u << (uint32_t)base_reg)); + mach_release_all(&ctx); + return; + } + } + + /* Fast path: constant-displacement store (scale == 0 and index is an immediate). + * Mirrors the load_indexed fast path; emits `STR value,[base,#imm]`. */ + if (!value.is_64bit && shift_amount == 0 && index.kind == MACH_OP_IMM) + { + int imm = (int)index.u.imm.val; + int sign = (imm < 0); + int abs_off = sign ? -imm : imm; + if (abs_off <= 4095) + { + const int btype = value.btype; + int value_reg = mach_ensure_in_reg(&ctx, &value, 0); + uint32_t excl = (1u << (uint32_t)value_reg); + int base_reg = mach_ensure_in_reg(&ctx, &base, excl); + if (btype == IROP_BTYPE_INT8) + th_store8_imm_or_reg(value_reg, (uint32_t)base_reg, abs_off, sign); + else if (btype == IROP_BTYPE_INT16) + th_store16_imm_or_reg(value_reg, (uint32_t)base_reg, abs_off, sign); + else + th_store32_imm_or_reg_ex(value_reg, (uint32_t)base_reg, abs_off, sign, + (1u << (uint32_t)value_reg) | (1u << (uint32_t)base_reg)); + mach_release_all(&ctx); + return; + } + } + + /* scale 0 → no shift: use THUMB_SHIFT_NONE so the 16-bit T1 register-offset + * encoding (all-low regs) can be selected instead of the wide T32 form. */ + thumb_shift shift = (shift_amount == 0) + ? 
(thumb_shift){.type = THUMB_SHIFT_NONE, .value = 0, .mode = THUMB_SHIFT_IMMEDIATE} + : (thumb_shift){.type = THUMB_SHIFT_LSL, .value = (uint32_t)shift_amount, .mode = THUMB_SHIFT_IMMEDIATE}; + + /* Fast path: 64-bit constant-displacement store using STRD [base, #imm]. */ + if (value.is_64bit && shift_amount == 0 && index.kind == MACH_OP_IMM) + { + int imm = (int)index.u.imm.val; + int sign = (imm < 0); + int abs_off = sign ? -imm : imm; + if (abs_off <= 1020 && (abs_off & 3) == 0) + { + MachineOperand val_lo = mach_make_lo_half(&value); + val_lo.btype = IROP_BTYPE_INT32; + MachineOperand val_hi = mach_make_hi_half(&value); + val_hi.btype = IROP_BTYPE_INT32; + const int lo_reg = mach_ensure_in_reg(&ctx, &val_lo, 0); + uint32_t excl = (1u << (uint32_t)lo_reg); + const int hi_reg = mach_ensure_in_reg(&ctx, &val_hi, excl); + excl |= (1u << (uint32_t)hi_reg); + int base_reg = mach_ensure_in_reg(&ctx, &base, excl); + uint32_t puw = sign ? 4u : 6u; + ot_check(th_strd_imm((uint32_t)lo_reg, (uint32_t)hi_reg, (uint32_t)base_reg, abs_off, puw)); + mach_release_all(&ctx); + return; + } + } /* 64-bit indexed store: compute EA = base + index<needs_deref) { if (op->u.reg.r0 != target_reg) - ot_check(th_mov_reg((uint32_t)target_reg, (uint32_t)op->u.reg.r0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, - THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)target_reg, (uint32_t)op->u.reg.r0, flags_safe(), + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); } else { @@ -5819,15 +8582,15 @@ static MachineOperand mach_make_complex_imag(const MachineOperand *op) /* Helper: save a double from R0:R1 to SP-relative stack offset. 
*/ static void fp_mop_save_double_to_sp(int off) { - ot_check(th_str_imm(R0, R_SP, off, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_str_imm(R1, R_SP, off + 4, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, off, 6, ENFORCE_ENCODING_NONE); + ot_check_str_imm(R1, R_SP, off + 4, 6, ENFORCE_ENCODING_NONE); } /* Helper: load a double from SP-relative stack offset into (lo_reg, hi_reg). */ static void fp_mop_load_double_from_sp(int lo_reg, int hi_reg, int off) { - ot_check(th_ldr_imm(lo_reg, R_SP, off, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(hi_reg, R_SP, off + 4, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(lo_reg, R_SP, off, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(hi_reg, R_SP, off + 4, 6, ENFORCE_ENCODING_NONE); } /* Process complex double multiplication via MachineOperands. @@ -5854,7 +8617,7 @@ static void thumb_process_complex_mul_double_mop(MachineOperand src1, MachineOpe MachineOperand s2_imag = mach_make_complex_imag(&src2); /* Allocate 8 bytes to save the scalar 'a'. */ - ot_check(th_sub_sp_imm(R_SP, 8, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(R_SP, R_SP, 8, flags_safe(), ENFORCE_ENCODING_NONE)); /* Load scalar 'a' into R0:R1 and save to stack. */ fp_mop_load_double_arg(R0, R1, &src1); @@ -5873,7 +8636,7 @@ static void thumb_process_complex_mul_double_mop(MachineOperand src1, MachineOpe /* R0:R1 = a*d → write to dest imag. */ fp_mop_writeback_result(&d_imag, 1); - ot_check(th_add_sp_imm(R_SP, 8, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_imm(R_SP, R_SP, 8, flags_safe(), ENFORCE_ENCODING_NONE)); } else if (s1_complex && !s2_complex) { @@ -5882,7 +8645,7 @@ static void thumb_process_complex_mul_double_mop(MachineOperand src1, MachineOpe MachineOperand s1_imag = mach_make_complex_imag(&src1); /* Allocate 8 bytes to save the scalar 'c'. 
*/ - ot_check(th_sub_sp_imm(R_SP, 8, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(R_SP, R_SP, 8, flags_safe(), ENFORCE_ENCODING_NONE)); /* Load scalar 'c' into R0:R1 and save to stack. */ fp_mop_load_double_arg(R0, R1, &src2); @@ -5900,7 +8663,7 @@ static void thumb_process_complex_mul_double_mop(MachineOperand src1, MachineOpe fp_mop_do_bl("__aeabi_dmul"); fp_mop_writeback_result(&d_imag, 1); - ot_check(th_add_sp_imm(R_SP, 8, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_imm(R_SP, R_SP, 8, flags_safe(), ENFORCE_ENCODING_NONE)); } else { @@ -5922,7 +8685,7 @@ static void thumb_process_complex_mul_double_mop(MachineOperand src1, MachineOpe const int off_scratch0 = 0, off_scratch1 = 8; const int off_a = 16, off_b = 24, off_c = 32, off_d = 40; - ot_check(th_sub_sp_imm(R_SP, 48, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(R_SP, R_SP, 48, flags_safe(), ENFORCE_ENCODING_NONE)); /* Save all 4 components to stack. */ fp_mop_load_double_arg(R0, R1, &s1_real); @@ -5976,7 +8739,7 @@ static void thumb_process_complex_mul_double_mop(MachineOperand src1, MachineOpe fp_mop_load_double_from_sp(R0, R1, off_scratch1); fp_mop_writeback_result(&d_imag, 1); - ot_check(th_add_sp_imm(R_SP, 48, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_imm(R_SP, R_SP, 48, flags_safe(), ENFORCE_ENCODING_NONE)); } } @@ -5993,8 +8756,8 @@ static void complex_pair_writeback(MachineOperand *d_lo, int lo_reg, MachineOper { /* Total swap: save hi to temp, then write both */ int tmp = (lo_reg != R2 && hi_reg != R2) ? 
R2 : R3; - ot_check(th_mov_reg((uint32_t)tmp, (uint32_t)hi_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)tmp, (uint32_t)hi_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); mach_writeback_dest(d_lo, lo_reg); mach_writeback_dest(d_hi, tmp); } @@ -6002,8 +8765,8 @@ static void complex_pair_writeback(MachineOperand *d_lo, int lo_reg, MachineOper { /* Lo writeback would clobber hi value; save hi first */ int tmp = (lo_reg != R2 && hi_reg != R2) ? R2 : R3; - ot_check(th_mov_reg((uint32_t)tmp, (uint32_t)hi_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)tmp, (uint32_t)hi_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); mach_writeback_dest(d_lo, lo_reg); mach_writeback_dest(d_hi, tmp); } @@ -6045,7 +8808,7 @@ static void thumb_process_complex_op_double_mop(MachineOperand src1, MachineOper * [sp+8] = s1_imag (8 bytes) * [sp+0] = s1_real (8 bytes) */ - ot_check(th_sub_sp_imm(R_SP, 32, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(R_SP, R_SP, 32, flags_safe(), ENFORCE_ENCODING_NONE)); /* Save all 4 components to stack. */ fp_mop_load_double_arg(R0, R1, &s1_real); @@ -6077,7 +8840,7 @@ static void thumb_process_complex_op_double_mop(MachineOperand src1, MachineOper fp_mop_load_double_from_sp(R0, R1, 8); fp_mop_writeback_result(&d_imag, 1); - ot_check(th_add_sp_imm(R_SP, 32, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_imm(R_SP, R_SP, 32, flags_safe(), ENFORCE_ENCODING_NONE)); } /* Process complex addition/subtraction via MachineOperands. 
@@ -6104,28 +8867,28 @@ static void thumb_process_complex_op_mop(MachineOperand src1, MachineOperand src * [sp+4] = s1_imag * [sp+0] = s1_real */ - ot_check(th_sub_sp_imm(R_SP, 16, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(R_SP, R_SP, 16, flags_safe(), ENFORCE_ENCODING_NONE)); /* Load and save each component to stack. */ fp_mop_load_arg(R0, &s1_real); - ot_check(th_str_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE); fp_mop_load_arg(R0, &s1_imag); - ot_check(th_str_imm(R0, R_SP, 4, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, 4, 6, ENFORCE_ENCODING_NONE); fp_mop_load_arg(R0, &s2_real); - ot_check(th_str_imm(R0, R_SP, 8, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, 8, 6, ENFORCE_ENCODING_NONE); fp_mop_load_arg(R0, &s2_imag); - ot_check(th_str_imm(R0, R_SP, 12, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, 12, 6, ENFORCE_ENCODING_NONE); /* Compute real part: func(a.real, b.real) */ - ot_check(th_ldr_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, 8, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, 8, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl(func_name); /* Save real result to stack slot 0 */ - ot_check(th_str_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE); /* Compute imag part: func(a.imag, b.imag) */ - ot_check(th_ldr_imm(R0, R_SP, 4, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, 12, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, 4, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, 12, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl(func_name); /* R0 = imag result */ @@ -6134,8 +8897,8 @@ static void thumb_process_complex_op_mop(MachineOperand src1, MachineOperand src MachineOperand d_imag = mach_make_hi_half(&dest); /* R0 = imag result. Load real result from stack into R1. 
*/ - ot_check(th_ldr_imm(R1, R_SP, 0, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_add_sp_imm(R_SP, 16, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R1, R_SP, 0, 6, ENFORCE_ENCODING_NONE); + ot_check(th_add_imm(R_SP, R_SP, 16, flags_safe(), ENFORCE_ENCODING_NONE)); /* Write back: R1 = real part, R0 = imag part. * Use safe writeback to avoid clobbering when dest overlaps R0/R1. */ @@ -6161,17 +8924,17 @@ static void thumb_process_complex_mul_mop(MachineOperand src1, MachineOperand sr MachineOperand s2_imag = mach_make_hi_half(&src2); /* Allocate 24 bytes on stack */ - ot_check(th_sub_sp_imm(R_SP, 24, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(R_SP, R_SP, 24, flags_safe(), ENFORCE_ENCODING_NONE)); /* Save inputs to stack */ fp_mop_load_arg(R0, &s1_real); - ot_check(th_str_imm(R0, R_SP, 8, 6, ENFORCE_ENCODING_NONE)); /* a */ + ot_check_str_imm(R0, R_SP, 8, 6, ENFORCE_ENCODING_NONE); /* a */ fp_mop_load_arg(R0, &s1_imag); - ot_check(th_str_imm(R0, R_SP, 12, 6, ENFORCE_ENCODING_NONE)); /* b */ + ot_check_str_imm(R0, R_SP, 12, 6, ENFORCE_ENCODING_NONE); /* b */ fp_mop_load_arg(R0, &s2_real); - ot_check(th_str_imm(R0, R_SP, 16, 6, ENFORCE_ENCODING_NONE)); /* c */ + ot_check_str_imm(R0, R_SP, 16, 6, ENFORCE_ENCODING_NONE); /* c */ fp_mop_load_arg(R0, &s2_imag); - ot_check(th_str_imm(R0, R_SP, 20, 6, ENFORCE_ENCODING_NONE)); /* d */ + ot_check_str_imm(R0, R_SP, 20, 6, ENFORCE_ENCODING_NONE); /* d */ const int off_scratch0 = 0; const int off_scratch1 = 4; @@ -6181,47 +8944,47 @@ static void thumb_process_complex_mul_mop(MachineOperand src1, MachineOperand sr const int off_d = 20; /* Step 1: ac = a * c → scratch0 */ - ot_check(th_ldr_imm(R0, R_SP, off_a, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, off_c, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, off_a, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, off_c, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl("__aeabi_fmul"); - 
ot_check(th_str_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE); /* Step 2: bd = b * d → scratch1 */ - ot_check(th_ldr_imm(R0, R_SP, off_b, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, off_d, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, off_b, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, off_d, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl("__aeabi_fmul"); - ot_check(th_str_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE); /* Step 3: real = ac - bd → scratch0 */ - ot_check(th_ldr_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl("__aeabi_fsub"); - ot_check(th_str_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE); /* Step 4: ad = a * d → scratch1 */ - ot_check(th_ldr_imm(R0, R_SP, off_a, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, off_d, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, off_a, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, off_d, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl("__aeabi_fmul"); - ot_check(th_str_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE); /* Step 5: bc = b * c → off_a (no longer needed) */ - ot_check(th_ldr_imm(R0, R_SP, off_b, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, off_c, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, off_b, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, off_c, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl("__aeabi_fmul"); - ot_check(th_str_imm(R0, R_SP, off_a, 6, ENFORCE_ENCODING_NONE)); + 
ot_check_str_imm(R0, R_SP, off_a, 6, ENFORCE_ENCODING_NONE); /* Step 6: imag = ad + bc → scratch1 */ - ot_check(th_ldr_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, R_SP, off_a, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, R_SP, off_a, 6, ENFORCE_ENCODING_NONE); fp_mop_do_bl("__aeabi_fadd"); - ot_check(th_str_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R0, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE); /* Load results and write back */ MachineOperand d_real = mach_make_lo_half(&dest); MachineOperand d_imag = mach_make_hi_half(&dest); - ot_check(th_ldr_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE)); /* real */ - ot_check(th_ldr_imm(R1, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE)); /* imag */ - ot_check(th_add_sp_imm(R_SP, 24, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, R_SP, off_scratch0, 6, ENFORCE_ENCODING_NONE); /* real */ + ot_check_ldr_imm(R1, R_SP, off_scratch1, 6, ENFORCE_ENCODING_NONE); /* imag */ + ot_check(th_add_imm(R_SP, R_SP, 24, flags_safe(), ENFORCE_ENCODING_NONE)); complex_pair_writeback(&d_real, R0, &d_imag, R1); } @@ -6231,19 +8994,17 @@ static void thumb_process_complex_mul_mop(MachineOperand src1, MachineOperand sr * * __divsc3 calling convention (soft-float AAPCS, hidden return pointer): * R0 = hidden return pointer (8-byte buffer for result) - * R1 = a_re (float) - * R2 = a_im (float) - * R3 = b_re (float) - * [sp+0] = b_im (float, on stack) + * R1 = a_real (first float arg) + * R2 = a_imag (second float arg) + * R3 = b_real (third float arg) + * [sp+0] = b_imag (fourth float arg, on stack) * Result written to [R0+0..3] = real, [R0+4..7] = imag * - * Stack layout (24 bytes, 8-byte aligned): - * [sp+0] = b_im for __divsc3 stack arg (4 bytes) - * [sp+4] = a_re staging (4 bytes) - * [sp+8] = a_im staging (4 bytes) - * [sp+12] = b_re staging (4 bytes) - * [sp+16] 
= result buffer: real part (4 bytes) - * [sp+20] = result buffer: imag part (4 bytes) + * Stack layout (16 bytes): + * [sp+0] = b_imag for __divsc3 stack arg (4 bytes) + * [sp+4] = padding (4 bytes) + * [sp+8] = result buffer: real part (4 bytes) + * [sp+12] = result buffer: imag part (4 bytes) */ static void thumb_process_complex_div_mop(MachineOperand src1, MachineOperand src2, MachineOperand dest) { @@ -6252,37 +9013,58 @@ static void thumb_process_complex_div_mop(MachineOperand src1, MachineOperand sr MachineOperand s2_real = mach_make_lo_half(&src2); MachineOperand s2_imag = mach_make_hi_half(&src2); - /* Allocate 24 bytes (8-byte aligned). */ - ot_check(th_sub_sp_imm(R_SP, 24, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + /* In PIC mode, save {r9, r12} BEFORE allocating the call frame so that + * SP-relative offsets within the 16-byte area remain correct when + * __divsc3 reads its stack arg at [sp+0]. */ + if (text_and_data_separation) + ot_check(th_push((uint16_t)((1 << R9) | (1 << R12)))); - /* Stage all four operands to stack via R0 to avoid clobbering. */ - fp_mop_load_arg(R0, &s2_imag); - ot_check(th_str_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE)); /* b_im → [sp+0] (stack arg) */ + /* Allocate 16 bytes on stack. */ + ot_check(th_sub_imm(R_SP, R_SP, 16, flags_safe(), ENFORCE_ENCODING_NONE)); + + /* Save all inputs to stack first to avoid register clobbering. 
*/ fp_mop_load_arg(R0, &s1_real); - ot_check(th_str_imm(R0, R_SP, 4, 6, ENFORCE_ENCODING_NONE)); /* a_re → [sp+4] */ + ot_check_str_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE); /* a_real */ fp_mop_load_arg(R0, &s1_imag); - ot_check(th_str_imm(R0, R_SP, 8, 6, ENFORCE_ENCODING_NONE)); /* a_im → [sp+8] */ + ot_check_str_imm(R0, R_SP, 4, 6, ENFORCE_ENCODING_NONE); /* a_imag */ fp_mop_load_arg(R0, &s2_real); - ot_check(th_str_imm(R0, R_SP, 12, 6, ENFORCE_ENCODING_NONE)); /* b_re → [sp+12] */ + ot_check_str_imm(R0, R_SP, 8, 6, ENFORCE_ENCODING_NONE); /* b_real */ + fp_mop_load_arg(R0, &s2_imag); + ot_check_str_imm(R0, R_SP, 12, 6, ENFORCE_ENCODING_NONE); /* b_imag */ - /* Load register args from staging area. */ - ot_check(th_ldr_imm(R1, R_SP, 4, 6, ENFORCE_ENCODING_NONE)); /* R1 = a_re */ - ot_check(th_ldr_imm(R2, R_SP, 8, 6, ENFORCE_ENCODING_NONE)); /* R2 = a_im */ - ot_check(th_ldr_imm(R3, R_SP, 12, 6, ENFORCE_ENCODING_NONE)); /* R3 = b_re */ + /* Rearrange stack for __divsc3 call: + * Need [sp+0] = b_imag, [sp+8..15] = result buffer. + * Currently [sp+0]=a_real, [sp+4]=a_imag, [sp+8]=b_real, [sp+12]=b_imag. + * Load R1-R3 from stack, then rearrange. */ + ot_check_ldr_imm(R1, R_SP, 0, 6, ENFORCE_ENCODING_NONE); /* R1 = a_real */ + ot_check_ldr_imm(R2, R_SP, 4, 6, ENFORCE_ENCODING_NONE); /* R2 = a_imag */ + ot_check_ldr_imm(R3, R_SP, 8, 6, ENFORCE_ENCODING_NONE); /* R3 = b_real */ + ot_check_ldr_imm(R0, R_SP, 12, 6, ENFORCE_ENCODING_NONE); /* R0 = b_imag */ + ot_check_str_imm(R0, R_SP, 0, 6, ENFORCE_ENCODING_NONE); /* [sp+0] = b_imag (stack arg) */ - /* R0 = pointer to result buffer at [sp+16]. */ - ot_check(th_add_sp_imm(R0, 16, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + /* R0 = pointer to result buffer at [sp+8]. */ + ot_check(th_add_imm(R0, R_SP, 8, flags_safe(), ENFORCE_ENCODING_NONE)); - /* Call __divsc3. 
*/ - fp_mop_do_bl("__divsc3"); + /* Call __divsc3 (directly, not via fp_mop_do_bl which would add another + * push/pop of {r9, r12} and corrupt the stack arg layout). */ + { + Sym *sym = external_global_sym(tok_alloc_const("__divsc3"), &func_old_type); + MachineOperand func_mop = {0}; + func_mop.kind = MACH_OP_SYMBOL; + func_mop.u.sym.sym = sym; + func_mop.u.sym.addend = 0; + gcall_or_jump_mop(0, func_mop); + } /* Read result from buffer and write back to dest. */ MachineOperand d_real = mach_make_lo_half(&dest); MachineOperand d_imag = mach_make_hi_half(&dest); - ot_check(th_ldr_imm(R0, R_SP, 16, 6, ENFORCE_ENCODING_NONE)); /* real */ - ot_check(th_ldr_imm(R1, R_SP, 20, 6, ENFORCE_ENCODING_NONE)); /* imag */ + ot_check_ldr_imm(R0, R_SP, 8, 6, ENFORCE_ENCODING_NONE); /* real */ + ot_check_ldr_imm(R1, R_SP, 12, 6, ENFORCE_ENCODING_NONE); /* imag */ + ot_check(th_add_imm(R_SP, R_SP, 16, flags_safe(), ENFORCE_ENCODING_NONE)); - ot_check(th_add_sp_imm(R_SP, 24, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + if (text_and_data_separation) + ot_check(th_pop((uint16_t)((1 << R9) | (1 << R12)))); complex_pair_writeback(&d_real, R0, &d_imag, R1); } @@ -6314,8 +9096,14 @@ static void thumb_process_complex_div_double_mop(MachineOperand src1, MachineOpe MachineOperand d_real = mach_make_complex_real(&dest); MachineOperand d_imag = mach_make_complex_imag(&dest); + /* In PIC mode, save {r9, r12} BEFORE allocating the call frame so that + * SP-relative offsets within the 40-byte area remain correct when + * __divdc3 reads its stack args at [sp+0..23]. */ + if (text_and_data_separation) + ot_check(th_push((uint16_t)((1 << R9) | (1 << R12)))); + /* Allocate 40 bytes (8-byte aligned). */ - ot_check(th_sub_sp_imm(R_SP, 40, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_imm(R_SP, R_SP, 40, flags_safe(), ENFORCE_ENCODING_NONE)); /* Set up __divdc3 stack args (must be at lowest sp offsets). */ /* [sp+16] = b_im (src2 imag). 
*/ @@ -6332,10 +9120,18 @@ static void thumb_process_complex_div_double_mop(MachineOperand src1, MachineOpe fp_mop_load_double_arg(R2, R3, &s1_real); /* R0 = pointer to result buffer at [sp+24]. */ - ot_check(th_add_sp_imm(R0, 24, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_imm(R0, R_SP, 24, flags_safe(), ENFORCE_ENCODING_NONE)); - /* Call __divdc3. */ - fp_mop_do_bl("__divdc3"); + /* Call __divdc3 (directly, not via fp_mop_do_bl which would add another + * push/pop of {r9, r12} and corrupt the stack arg layout). */ + { + Sym *sym = external_global_sym(tok_alloc_const("__divdc3"), &func_old_type); + MachineOperand func_mop = {0}; + func_mop.kind = MACH_OP_SYMBOL; + func_mop.u.sym.sym = sym; + func_mop.u.sym.addend = 0; + gcall_or_jump_mop(0, func_mop); + } /* Read result from buffer and write back to dest. */ fp_mop_load_double_from_sp(R0, R1, 24); @@ -6343,7 +9139,10 @@ static void thumb_process_complex_div_double_mop(MachineOperand src1, MachineOpe fp_mop_load_double_from_sp(R0, R1, 32); fp_mop_writeback_result(&d_imag, 1); - ot_check(th_add_sp_imm(R_SP, 40, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_add_imm(R_SP, R_SP, 40, flags_safe(), ENFORCE_ENCODING_NONE)); + + if (text_and_data_separation) + ot_check(th_pop((uint16_t)((1 << R9) | (1 << R12)))); } /* tcc_gen_machine_fp_mop: MachineOperand-based entry point for floating-point @@ -6405,7 +9204,7 @@ ST_FUNC void tcc_gen_machine_fp_mop(MachineOperand src1, MachineOperand src2, Ma fp_mop_load_double_arg(R0, R1, &src1); scr = get_scratch_reg_with_save((1u << R0) | (1u << R1)); load_full_const(scr.reg, PREG_NONE, 0x80000000, 0); - ot_check(th_eor_reg(R1, R1, scr.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_eor_reg(R1, R1, scr.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&scr); fp_mop_writeback_result(&dest, 1); } @@ -6415,7 +9214,7 @@ ST_FUNC void 
tcc_gen_machine_fp_mop(MachineOperand src1, MachineOperand src2, Ma fp_mop_load_arg(R0, &src1); scr = get_scratch_reg_with_save(1u << R0); load_full_const(scr.reg, PREG_NONE, 0x80000000, 0); - ot_check(th_eor_reg(R0, R0, scr.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_eor_reg(R0, R0, scr.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&scr); mach_writeback_dest(&dest, R0); } @@ -6590,7 +9389,7 @@ ST_FUNC void tcc_gen_machine_return_value_mop(MachineOperand src, TccIrOp op) MachineCodegenContext ctx = {0}; int src_reg = mach_ensure_in_reg(&ctx, &src, 0); if (src_reg != R0) - ot_check(th_mov_reg(R0, src_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R0, src_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); mach_release_all(&ctx); } @@ -6606,6 +9405,9 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s /* Clear global symbol cache at function start */ thumb_gen_state.cached_global_sym = NULL; thumb_gen_state.cached_global_reg = PREG_NONE; + /* MOV-coalescing cache is per-function: register live ranges don't + * cross function boundaries. 
*/ + mov_equiv_reset_all(); TCCIRState *ir = tcc_state->ir; /* Determine if LR needs saving */ @@ -6621,7 +9423,7 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s if (tcc_state->force_lr_save) tcc_state->need_frame_pointer = 1; - const int need_fp = (tcc_state->force_frame_pointer || tcc_state->need_frame_pointer || (stack_size > 0)); + const int need_fp = (tcc_state->force_frame_pointer || tcc_state->need_frame_pointer); tcc_state->need_frame_pointer = need_fp; /* Use two-phase push (standard frame record) when __builtin_return_address @@ -6654,6 +9456,8 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s } } + int push_align_pad = 0; /* 4 if push count is odd, absorbed into SUB SP */ + if (standard_frame_record) { /* ── Two-phase push: frame record {r7, lr} then callee-saved ── @@ -6666,12 +9470,14 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s frame_count++; } - /* Pad total to even count for 8-byte alignment (AAPCS). */ + /* Pad total to even count for 8-byte alignment (AAPCS). + * Standard frame record uses FP-relative negative offsets (e.g. static + * chain at [FP-4]), so the alignment gap must stay as SUB SP space + * below FP — cannot use a dummy push register here. 
*/ int total = frame_count + callee_count; if (total % 2 != 0) { - callee_regs_local |= (1 << R12); - callee_count++; + push_align_pad = 4; } th_sym_t(); @@ -6688,7 +9494,7 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s ot_check(th_push(frame_regs)); /* MOV r7, sp — FP points at the frame record */ - if (!ot(th_add_imm(R_FP, R_SP, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE))) + if (!ot(th_add_imm(R_FP, R_SP, 0, flags_safe(), ENFORCE_ENCODING_NONE))) { fprintf(stderr, "compiler_error: prolog frame pointer setup failed\n"); exit(1); @@ -6720,11 +9526,24 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s registers_count++; } - /* Keep the total push size 8-byte aligned (AAPCS). */ + /* Keep the total push size 8-byte aligned (AAPCS). + * When no locals/FP (stack_size == 0, no frame pointer), pad by pushing + * a dummy low register (R3) — avoids SUB SP + ADD SP, saving 2 insns. + * When FP is used, alignment pad must stay as SUB SP space because + * FP-relative negative offsets may address that area. + * When locals exist, absorb the gap into SUB SP instead, keeping + * PUSH in 16-bit encoding (no high regs like R12). */ if (registers_count % 2 != 0) { - registers_to_push |= (1 << R12); - registers_count++; + if (stack_size == 0 && !need_fp) + { + registers_to_push |= (1 << R3); + registers_count++; + } + else + { + push_align_pad = 4; + } } th_sym_t(); @@ -6755,10 +9574,20 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s */ if (stack_size & 7) stack_size = (stack_size + 7) & ~7; + + /* allocated_stack_size is the portion of the stack used for locals/spills. + * It must NOT include the alignment pad — local offsets are computed + * relative to this value, and the pad sits below all addressable locals. 
*/ allocated_stack_size = stack_size; + + /* total_stack_dealloc is the full amount to restore in the epilogue, + * including the alignment pad that sits below addressable locals. */ + int total_stack_dealloc = stack_size + push_align_pad; + epilogue_stack_dealloc = total_stack_dealloc; + stack_size = total_stack_dealloc; if (tcc_state->need_frame_pointer && !standard_frame_record) { - if (!ot(th_add_imm(R_FP, R_SP, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE))) + if (!ot(th_add_imm(R_FP, R_SP, 0, flags_safe(), ENFORCE_ENCODING_NONE))) { // todo mov fp, sp // load r12 immediate @@ -6772,6 +9601,17 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s gadd_sp(-stack_size); } + /* When FP is omitted, SP is lower by the full SUB SP amount (locals + + * alignment pad). Adjust offset_to_args so incoming stack parameters + * are found at the correct SP-relative position. */ + if (!need_fp) + offset_to_args += epilogue_stack_dealloc; + + /* However, local addressing uses allocated_stack_size (without pad). + * The pad sits at the top of the SUB SP region, right below pushed regs, + * so locals occupy SP+0 .. SP+allocated_stack_size-1, matching the same + * addresses as the old push-IP-for-alignment approach. */ + /* Save incoming static chain (R10) at fixed chain slot. * With two-phase push, callee-saved regs are below FP, so the chain * slot is at [FP - callee_push_size - 4] instead of [FP - 4]. @@ -6805,7 +9645,10 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s /* __gr_top = FP + offset_to_args (end of pushed r0-r3, start of stack args). * This is the top of the contiguous register save + stack arg area. */ - ot_check(th_add_imm(R12, R_FP, offset_to_args, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + { + const int fp_or_sp = tcc_state->need_frame_pointer ? 
R_FP : R_SP; + ot_check(th_add_imm(R12, fp_or_sp, offset_to_args, flags_safe(), ENFORCE_ENCODING_NONE)); + } tcc_gen_machine_store_to_stack(R12, -(callee_push_size + 20)); /* store the number of named-arg bytes consumed in r0-r3 */ @@ -6831,7 +9674,10 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s adj -= callee_push_size; /* Store stack args pointer (FP + offset_to_args = start of stack args area) */ - ot_check(th_add_imm(R_IP, R_FP, offset_to_args, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + { + const int fp_or_sp = tcc_state->need_frame_pointer ? R_FP : R_SP; + ot_check(th_add_imm(R_IP, fp_or_sp, offset_to_args, flags_safe(), ENFORCE_ENCODING_NONE)); + } tcc_gen_machine_store_to_stack_ex(R_IP, adj, (1u << R0) | (1u << R1) | (1u << R2) | (1u << R3)); /* Store r0-r3 at offsets +4, +8, +12, +16 from the block start */ @@ -7011,8 +9857,7 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s if (dst >= 0 && dst < 32 && (src_mask & (1u << dst))) continue; /* dst's current value is still needed as a source somewhere */ - ot_check( - th_mov_reg(dst, src, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(dst, src, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); moves[i] = moves[--move_count]; --i; progressed = 1; @@ -7042,8 +9887,7 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s * then walk dst<-src edges until we return to the start. 
*/ const int start = moves[0].dst; - ot_check( - th_mov_reg(temp, start, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(temp, start, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); int cur = start; for (;;) @@ -7067,13 +9911,11 @@ ST_FUNC void tcc_gen_machine_prolog(int leaffunc, uint64_t used_registers, int s if (src == start) { - ot_check( - th_mov_reg(cur, temp, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(cur, temp, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); break; } - ot_check( - th_mov_reg(cur, src, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(cur, src, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); cur = src; } } @@ -7092,16 +9934,17 @@ ST_FUNC void tcc_gen_machine_epilog(int leaffunc) { /* ── Two-phase pop (mirrors two-phase push) ── */ /* Restore SP from FP (works even with alloca/VLA since FP is stable) */ - ot_check(th_mov_reg(R_SP, R_FP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); - /* SP = FP; callee-saved regs are below FP. Adjust SP down. */ - gadd_sp(-callee_push_size); + ot_check_mov_reg(R_SP, R_FP, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + /* SP = FP; callee-saved regs are below FP. Adjust SP down. + * R3 is free in the epilogue (not used for return values). 
*/ + gadd_sp_ex(-callee_push_size, R3); ot_check(th_pop(callee_saved_regs)); /* SP is now at FP (pointing at frame record {r7, [lr]}) */ if (vararg_push_size > 0 && lr_saved) { /* Variadic: pop FP+LR, then skip over the pushed r0-r3 area */ ot_check(th_pop((1 << R_FP) | (1 << R_LR))); - gadd_sp(vararg_push_size); + gadd_sp_ex(vararg_push_size, R3); ot_check(th_bx_reg(R_LR)); } else if (lr_saved) @@ -7112,19 +9955,19 @@ ST_FUNC void tcc_gen_machine_epilog(int leaffunc) { ot_check(th_pop(1 << R_FP)); if (vararg_push_size > 0) - gadd_sp(vararg_push_size); + gadd_sp_ex(vararg_push_size, R3); ot_check(th_bx_reg(R_LR)); } } else if (tcc_state->need_frame_pointer) { /* ── Original single-push with FP: restore SP from FP, then pop all ── */ - ot_check(th_mov_reg(R_SP, R_FP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_SP, R_FP, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); if (vararg_push_size > 0 && lr_saved) { /* Variadic: pop all regs with LR (not PC), then skip pushed r0-r3 */ ot_check(th_pop(pushed_registers)); - gadd_sp(vararg_push_size); + gadd_sp_ex(vararg_push_size, R3); ot_check(th_bx_reg(R_LR)); } else if (lr_saved) @@ -7138,15 +9981,15 @@ ST_FUNC void tcc_gen_machine_epilog(int leaffunc) if (pushed_registers > 0) ot_check(th_pop(pushed_registers)); if (vararg_push_size > 0) - gadd_sp(vararg_push_size); + gadd_sp_ex(vararg_push_size, R3); ot_check(th_bx_reg(R_LR)); } } else { /* ── No frame pointer ── */ - if (allocated_stack_size > 0) - gadd_sp(allocated_stack_size); + if (epilogue_stack_dealloc > 0) + gadd_sp_ex(epilogue_stack_dealloc, R3); if (lr_saved) { pushed_registers |= 1 << R_PC; @@ -7166,6 +10009,13 @@ ST_FUNC void tcc_gen_machine_epilog(int leaffunc) thumb_free_call_sites(); } +ST_FUNC void tcc_gen_machine_finish_noreturn(void) +{ + thumb_gen_state.generating_function = 0; + th_literal_pool_generate(); + thumb_free_call_sites(); +} + /* Load Effective Address: compute the 
address of src1 into dest. * This is the explicit "address-of" operation for local variables/arrays. * Unlike LOAD which dereferences, LEA computes FP+offset into a register. @@ -7189,12 +10039,22 @@ ST_FUNC void tcc_gen_machine_lea_mop(MachineOperand dest, MachineOperand src) MachineCodegenContext ctx = {0}; int r; + /* When dest is a writable register, compute the address directly into it + * to avoid a scratch + redundant mov. mach_alloc_scratch is driven by the + * per-instruction live_regs bitmap which already marks the dest reg "live" + * (it's the def target), so it would otherwise pick a different reg and + * writeback would emit `mov dest_reg, scratch`. */ + int dest_reg = -1; + if (dest.kind == MACH_OP_REG && !dest.needs_deref && + dest.u.reg.r0 != (int)PREG_REG_NONE) + dest_reg = dest.u.reg.r0; + switch (src.kind) { case MACH_OP_PARAM_STACK: { /* Compute address of caller's argument slot. */ - r = mach_alloc_scratch(&ctx, 0); + r = (dest_reg >= 0) ? dest_reg : mach_alloc_scratch(&ctx, 0); tcc_machine_addr_of_stack_slot(r, src.u.param.offset, 1 /* is_param */); break; } @@ -7205,20 +10065,24 @@ ST_FUNC void tcc_gen_machine_lea_mop(MachineOperand dest, MachineOperand src) int chain_used = 0; uint32_t excl = 0; int base = resolve_chain_base(tcc_state->ir, src.u.chain.chain_index, excl, &chain_scratch, &chain_used); - r = mach_alloc_scratch(&ctx, excl | (1u << (uint32_t)base)); + /* dest_reg only usable if it doesn't collide with the chain base */ + if (dest_reg >= 0 && dest_reg != base && !(excl & (1u << (uint32_t)dest_reg))) + r = dest_reg; + else + r = mach_alloc_scratch(&ctx, excl | (1u << (uint32_t)base)); int32_t off = src.u.chain.offset; int sign = (off < 0); int abs_off = sign ? 
(int)(-off) : (int)off; if (abs_off == 0) { if (r != base) - ot_check(th_mov_reg((uint32_t)r, (uint32_t)base, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)r, (uint32_t)base, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } else { - thumb_opcode ins = sign ? th_sub_imm(r, base, abs_off, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE) - : th_add_imm(r, base, abs_off, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + thumb_opcode ins = sign ? th_sub_imm(r, base, abs_off, flags_safe(), ENFORCE_ENCODING_NONE) + : th_add_imm(r, base, abs_off, flags_safe(), ENFORCE_ENCODING_NONE); if (ins.size != 0) { ot_check(ins); @@ -7228,9 +10092,9 @@ ST_FUNC void tcc_gen_machine_lea_mop(MachineOperand dest, MachineOperand src) /* Large offset: load into a scratch and use register ADD/SUB */ ScratchRegAlloc off_sc = get_scratch_reg_with_save(excl | (1u << (uint32_t)r) | (1u << (uint32_t)base)); load_full_const(off_sc.reg, PREG_NONE, LFC_SPLIT(abs_off)); - ot_check(sign ? th_sub_reg(r, base, off_sc.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + ot_check(sign ? th_sub_reg(r, base, off_sc.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE) - : th_add_reg(r, base, off_sc.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + : th_add_reg(r, base, off_sc.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&off_sc); } @@ -7239,8 +10103,23 @@ ST_FUNC void tcc_gen_machine_lea_mop(MachineOperand dest, MachineOperand src) restore_scratch_reg(&chain_scratch); break; } + case MACH_OP_FRAME_ADDR: + /* Address of a local stack slot: ADD dest, sp, #offset. */ + r = (dest_reg >= 0) ? dest_reg : mach_alloc_scratch(&ctx, 0); + tcc_machine_addr_of_stack_slot(r, src.u.frame.offset, 0); + break; + case MACH_OP_SYMBOL: + if (!src.needs_deref) + { + Sym *raw_sym = src.u.sym.sym; + Sym *sym = raw_sym ? 
validate_sym_for_reloc(raw_sym) : NULL; + r = (dest_reg >= 0) ? dest_reg : mach_alloc_scratch(&ctx, 0); + tcc_machine_load_constant(r, PREG_REG_NONE, src.u.sym.addend, 0, sym); + break; + } + /* fallthrough for needs_deref */ default: - /* FRAME_ADDR, SYMBOL, REG: mach_ensure_in_reg already computes the address. */ + /* REG and other lvalue-y forms: mach_ensure_in_reg already computes the address. */ r = mach_ensure_in_reg(&ctx, &src, 0); break; } @@ -7281,18 +10160,19 @@ ST_FUNC void tcc_gen_machine_store_to_stack(int reg, int offset) */ ST_FUNC void tcc_gen_machine_store_to_stack_ex(int reg, int offset, uint32_t extra_exclude) { + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; int sign = (offset < 0); int abs_offset = sign ? -offset : offset; /* Try direct STR with immediate offset */ - if (!store_word_to_base(reg, R_FP, abs_offset, sign)) + if (!store_word_to_base(reg, base_reg, abs_offset, sign)) { /* Offset too large, use scratch register */ /* Don't reuse the source register as offset scratch, otherwise we'd * clobber the value before the STR (e.g. store -offset instead of value). */ - ScratchRegAlloc rr_alloc = th_offset_to_reg_ex(abs_offset, sign, (1u << reg) | (1u << R_FP) | extra_exclude); + ScratchRegAlloc rr_alloc = th_offset_to_reg_ex(abs_offset, sign, (1u << reg) | (1u << base_reg) | extra_exclude); int rr = rr_alloc.reg; - ot_check(th_str_reg(reg, R_FP, rr, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_str_reg(reg, base_reg, rr, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&rr_alloc); } } @@ -7328,9 +10208,13 @@ ST_FUNC void tcc_gen_machine_store_to_sp(int reg, int offset) */ static void gcall_or_jump_mop(int is_jmp, MachineOperand target) { + /* Tail-call: promote is_jmp so we emit B/BX instead of BL/BLX. */ + if (tail_call_pending) + is_jmp = 1; + if (target.kind == MACH_OP_SYMBOL) { - /* Direct call via BL with relocation. */ + /* Direct call via BL (or B.W for tail call) with relocation. 
*/ Sym *sym = target.u.sym.sym; int32_t addend = target.u.sym.addend; Sym *validated_sym = sym ? validate_sym_for_reloc(sym) : NULL; @@ -7356,7 +10240,10 @@ static void gcall_or_jump_mop(int is_jmp, MachineOperand target) TRACE("gcall_or_jmp_mop: %d, ind: 0x%x, 0x%x", is_jmp, ind, imm); if (imm) { - ot_check(th_bl_t1(imm)); + if (is_jmp) + ot_check(th_b_t4((int32_t)imm)); + else + ot_check(th_bl_t1(imm)); if (!dry_run_state.active && reloc_sym) { int call_pos = ind - 4; @@ -7372,7 +10259,12 @@ static void gcall_or_jump_mop(int is_jmp, MachineOperand target) uint32_t imm = th_encbranch(ind, ind + (int32_t)target.u.imm.val); TRACE("gcall_or_jmp_mop(imm): %d, ind: 0x%x, 0x%x", is_jmp, ind, imm); if (imm) - ot_check(th_bl_t1(imm)); + { + if (is_jmp) + ot_check(th_b_t4((int32_t)imm)); + else + ot_check(th_bl_t1(imm)); + } return; } @@ -7392,12 +10284,7 @@ static void gcall_or_jump_mop(int is_jmp, MachineOperand target) if (is_jmp) { int r = mach_ensure_in_reg(&mctx, &adjusted, arg_regs); - if (r != R_IP) - { - thumb_shift no_shift = {THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; - ot_check(th_mov_reg(R_IP, r, FLAGS_BEHAVIOUR_NOT_IMPORTANT, no_shift, ENFORCE_ENCODING_NONE, false)); - } - ot_check(th_bx_reg(R_IP)); + ot_check(th_bx_reg(r)); } else { @@ -7461,6 +10348,7 @@ typedef struct ThumbArgMove int local_offset; /* valid when kind==THUMB_ARG_MOVE_LOCAL_ADDR */ int local_is_param; /* valid when kind==THUMB_ARG_MOVE_LOCAL_ADDR - if true, add offset_to_args */ int struct_word_count; /* valid when kind==THUMB_ARG_MOVE_STRUCT */ + int struct_src_align; /* struct natural alignment (bytes); gates source LDRD */ MachineOperand mop; /* valid when kind==THUMB_ARG_MOVE_MOP */ } ThumbArgMove; @@ -7473,6 +10361,9 @@ typedef struct CallGenContext MachineOperand *mops; int argc; int stack_size; + uint32_t arg_move_dst_mask; /* Registers that will be explicitly written by register arg moves. 
+ * These are safe to clobber as scratch during stack arg placement + * because the subsequent register moves will overwrite them. */ } CallGenContext; static void thumb_emit_arg_move(const ThumbArgMove *m) @@ -7481,8 +10372,8 @@ static void thumb_emit_arg_move(const ThumbArgMove *m) { if (m->src_reg == m->dst_reg) return; - ot_check(th_mov_reg(m->dst_reg, m->src_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(m->dst_reg, m->src_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, + false); return; } @@ -7500,30 +10391,78 @@ static void thumb_emit_arg_move(const ThumbArgMove *m) int word_count = m->struct_word_count; int base_dst = m->dst_reg; - /* Get the struct base address into a scratch register */ - int base_addr_reg = get_struct_base_addr_mop(&m->mop, ARM_R12); + /* LDRD fast path for the common 2-word (8-byte) aggregate case sourced + * directly from a stack-backed location. Mirrors the LDRD path used by + * THUMB_ARG_MOVE_MOP for 64-bit scalars — Thumb-2 LDRD requires natural + * 4-byte alignment, which spill slots and the caller param stack both + * provide. Skips the scratch + per-word loads that would otherwise + * emit `add.w ip, sp, #N; ldr lo, [ip]; ldr hi, [ip, #4]` (3 insts). + * + * LDRD writes Rt before Rt2, so Rt2 (dst+1) must not equal the base + * register, otherwise the 2nd half reads from a clobbered base. */ + if (word_count == 2 && !m->mop.needs_deref && + (m->mop.kind == MACH_OP_SPILL || m->mop.kind == MACH_OP_PARAM_STACK)) + { + int raw_off = + (m->mop.kind == MACH_OP_SPILL) ? m->mop.u.spill.offset : m->mop.u.param.offset + offset_to_args; + int adjusted = (m->mop.kind == MACH_OP_SPILL) ? fp_adjust_local_offset(raw_off, 0) : raw_off; + int ldrd_base = tcc_state->need_frame_pointer ? R_FP : R_SP; + int ldrd_sign = (adjusted < 0); + int ldrd_abs_off = ldrd_sign ? 
-adjusted : adjusted; + int dst_hi = base_dst + 1; + if (dst_hi != ldrd_base && base_dst != ldrd_base && + try_ldrd_pair(base_dst, dst_hi, ldrd_base, ldrd_abs_off, ldrd_sign)) + { + return; + } + } - /* Load each word from the struct into consecutive target registers */ - for (int w = 0; w < word_count; ++w) + /* Get the struct base address into a scratch register */ + ScratchRegAlloc struct_scratch = get_scratch_reg_with_save(0); + int base_addr_reg = get_struct_base_addr_mop(&m->mop, struct_scratch.reg); + + /* Load each word from the struct into consecutive target registers. + * Adjacent word pairs use LDRD when the struct's natural alignment is >= 4 + * (so the source address is 4-byte aligned — LDRD faults otherwise) and + * neither destination register aliases the base (LDRD writes Rt then Rt2; + * an alias would read a clobbered base on the fallback path / be unsafe). */ + bool src_aligned = (m->struct_src_align >= 4); + int w = 0; + for (; w + 1 < word_count; ) + { + int dst = base_dst + w; + int dst_hi = base_dst + w + 1; + int offset = w * 4; + if (src_aligned && dst != base_addr_reg && dst_hi != base_addr_reg && + tcc_gen_machine_try_ldrd_base(dst, dst_hi, base_addr_reg, offset)) + { + w += 2; + continue; + } + /* Single-word load of this word; the next iteration handles w+1. */ + if (!load_word_from_base(dst, base_addr_reg, offset, 0)) + { + ScratchRegAlloc off_scratch = get_scratch_reg_with_save((1u << base_addr_reg) | (1u << dst)); + load_immediate(off_scratch.reg, offset, NULL, false); + ot_check(th_ldr_reg(dst, base_addr_reg, off_scratch.reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + restore_scratch_reg(&off_scratch); + } + w += 1; + } + /* Trailing odd word. 
*/ + for (; w < word_count; ++w) { int dst = base_dst + w; int offset = w * 4; if (!load_word_from_base(dst, base_addr_reg, offset, 0)) { - /* Large offset - use R12 as scratch if it's not our base */ - if (base_addr_reg != ARM_R12) - { - load_immediate(ARM_R12, offset, NULL, false); - ot_check(th_ldr_reg(dst, base_addr_reg, ARM_R12, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - } - else - { - /* base_addr_reg is R12, need another approach */ - load_immediate(ARM_LR, offset, NULL, false); - ot_check(th_ldr_reg(dst, base_addr_reg, ARM_LR, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - } + ScratchRegAlloc off_scratch = get_scratch_reg_with_save((1u << base_addr_reg) | (1u << dst)); + load_immediate(off_scratch.reg, offset, NULL, false); + ot_check(th_ldr_reg(dst, base_addr_reg, off_scratch.reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + restore_scratch_reg(&off_scratch); } } + restore_scratch_reg(&struct_scratch); return; } @@ -7559,33 +10498,170 @@ static void thumb_emit_arg_move(const ThumbArgMove *m) uint32_t excl = (1u << m->dst_reg) | (1u << m->dst_reg_hi); base = mach_ensure_in_reg(&mctx, &addr, excl); } - load_from_base(m->dst_reg, PREG_REG_NONE, IROP_BTYPE_INT32, 0, 0, 0, (uint32_t)base); - load_from_base(m->dst_reg_hi, PREG_REG_NONE, IROP_BTYPE_INT32, 0, 4, 0, (uint32_t)base); + /* Use the 64-bit load_from_base path so it preserves the base when + * base == dst_reg (otherwise the lo-load would clobber it before + * the hi-load can use it). */ + load_from_base(m->dst_reg, m->dst_reg_hi, IROP_BTYPE_INT64, 0, 0, 0, (uint32_t)base); } else { - /* 64-bit: load lo and hi halves separately. 
*/ - MachineOperand lo = mach_make_lo_half(&m->mop); - MachineOperand hi = mach_make_hi_half(&m->mop); - uint32_t excl = (1u << m->dst_reg) | (1u << m->dst_reg_hi); - int r_lo = mach_ensure_in_reg(&mctx, &lo, excl); - int r_hi = mach_ensure_in_reg(&mctx, &hi, excl | (1u << (uint32_t)r_lo)); - if (r_lo != m->dst_reg) - ot_check(th_mov_reg(m->dst_reg, r_lo, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); - if (r_hi != m->dst_reg_hi) - ot_check(th_mov_reg(m->dst_reg_hi, r_hi, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + /* Fast path: when the source is a stack-backed 64-bit value (spill + * slot or caller's param stack area) and the destination is a valid + * AAPCS register pair, emit a single LDRD straight into dst_reg / + * dst_reg_hi, skipping the scratch + MOV sequence that the generic + * lo/hi lowering below would produce. + * + * Stack spill slots and the caller-argument frame are guaranteed + * 8-byte aligned (AAPCS stack_align = 8, spill slots obey type + * alignment), so LDRD's 4-byte alignment requirement is satisfied. */ + int ldrd_base = -1; + int ldrd_abs_off = 0; + int ldrd_sign = 0; + int ldrd_ok = 0; + if (!m->mop.needs_deref && (m->mop.kind == MACH_OP_SPILL || m->mop.kind == MACH_OP_PARAM_STACK)) + { + int raw_off = (m->mop.kind == MACH_OP_SPILL) ? m->mop.u.spill.offset : m->mop.u.param.offset + offset_to_args; + int adjusted = (m->mop.kind == MACH_OP_SPILL) ? fp_adjust_local_offset(raw_off, 0) : raw_off; + ldrd_base = tcc_state->need_frame_pointer ? R_FP : R_SP; + ldrd_sign = (adjusted < 0); + ldrd_abs_off = ldrd_sign ? -adjusted : adjusted; + ldrd_ok = 1; + } + if (ldrd_ok && try_ldrd_pair(m->dst_reg, m->dst_reg_hi, ldrd_base, ldrd_abs_off, ldrd_sign)) + { + /* LDRD emitted. */ + } + else + { + /* 64-bit: load lo and hi halves separately. 
*/ + MachineOperand lo = mach_make_lo_half(&m->mop); + MachineOperand hi = mach_make_hi_half(&m->mop); + uint32_t excl = (1u << m->dst_reg) | (1u << m->dst_reg_hi); + int r_lo = mach_ensure_in_reg(&mctx, &lo, excl); + int r_hi = mach_ensure_in_reg(&mctx, &hi, excl | (1u << (uint32_t)r_lo)); + if (r_lo != m->dst_reg) + ot_check_mov_reg(m->dst_reg, r_lo, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); + if (r_hi != m->dst_reg_hi) + ot_check_mov_reg(m->dst_reg_hi, r_hi, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); + } } } else { - /* 32-bit: single-register load. */ - uint32_t excl = (1u << m->dst_reg); - int r = mach_ensure_in_reg(&mctx, &m->mop, excl); - if (r != m->dst_reg) - ot_check(th_mov_reg(m->dst_reg, r, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, - false)); + /* 32-bit: prefer loading directly into dst_reg when the operand kind + * permits it, bypassing the scratch + MOV sequence that + * mach_ensure_in_reg would emit. Kinds that need an extra + * pointer-chain scratch beyond dst_reg (CHAIN_REL) fall through to + * the generic path. */ + const MachineOperand *mop = &m->mop; + const int dst = m->dst_reg; + int handled = 0; + + switch (mop->kind) + { + case MACH_OP_NONE: + tcc_machine_load_constant(dst, PREG_REG_NONE, 0, 0, NULL); + handled = 1; + break; + + case MACH_OP_REG: + if (mop->needs_deref) + { + /* LDR dst, [r0]; legal even when r0 == dst (loaded value + * just replaces the base). */ + load_from_base(dst, PREG_REG_NONE, mop->btype, (int)mop->is_unsigned, 0, 0, + (uint32_t)mop->u.reg.r0); + } + else if (mop->u.reg.r0 != dst) + { + ot_check_mov_reg(dst, mop->u.reg.r0, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); + } + handled = 1; + break; + + case MACH_OP_SPILL: + if (!mop->needs_deref) + { + tcc_machine_load_spill_slot(dst, mop->u.spill.offset); + } + else + { + /* LLOCAL: load pointer into dst, then dereference into dst. 
*/ + tcc_machine_load_spill_slot(dst, mop->u.spill.offset); + load_from_base(dst, PREG_REG_NONE, mop->btype, (int)mop->is_unsigned, 0, 0, + (uint32_t)dst); + } + handled = 1; + break; + + case MACH_OP_PARAM_STACK: + { + const int adjusted = mop->u.param.offset + offset_to_args; + const int base_reg = tcc_state->need_frame_pointer ? R_FP : R_SP; + const int sign = (adjusted < 0); + const int abs_off = sign ? -adjusted : adjusted; + load_from_base(dst, PREG_REG_NONE, mop->btype, (int)mop->is_unsigned, abs_off, sign, + (uint32_t)base_reg); + handled = 1; + break; + } + + case MACH_OP_IMM: + tcc_machine_load_constant(dst, PREG_REG_NONE, mop->u.imm.val, 0, NULL); + handled = 1; + break; + + case MACH_OP_FRAME_ADDR: + if (!mop->needs_deref) + { + tcc_machine_addr_of_stack_slot(dst, mop->u.frame.offset, 0); + } + else + { + tcc_machine_addr_of_stack_slot(dst, mop->u.frame.offset, 0); + load_from_base(dst, PREG_REG_NONE, mop->btype, (int)mop->is_unsigned, 0, 0, + (uint32_t)dst); + } + handled = 1; + break; + + case MACH_OP_SYMBOL: + { + Sym *raw_sym = mop->u.sym.sym; + Sym *sym = raw_sym ? validate_sym_for_reloc(raw_sym) : NULL; + if (!mop->needs_deref) + { + tcc_machine_load_constant(dst, PREG_REG_NONE, mop->u.sym.addend, 0, sym); + } + else + { + tcc_machine_load_constant(dst, PREG_REG_NONE, 0, 0, sym); + const int32_t addend = mop->u.sym.addend; + const int sign = (addend < 0); + const int abs_off = sign ? (int)(-addend) : (int)addend; + load_from_base(dst, PREG_REG_NONE, mop->btype, (int)mop->is_unsigned, abs_off, sign, + (uint32_t)dst); + } + handled = 1; + break; + } + + default: + /* CHAIN_REL etc.: fall through to generic scratch + MOV. 
*/ + break; + } + + if (!handled) + { + uint32_t excl = (1u << dst); + int r = mach_ensure_in_reg(&mctx, mop, excl); + if (r != dst) + ot_check_mov_reg(dst, r, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, + false); + } } mach_release_all(&mctx); return; @@ -7708,8 +10784,8 @@ static void thumb_emit_parallel_arg_moves(ThumbArgMove *moves, int move_count) } thumb_require_materialized_reg("thumb_emit_parallel_arg_moves", "tmp", tmp_alloc.reg); - ot_check(th_mov_reg(tmp_alloc.reg, moves[cyc].src_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(tmp_alloc.reg, moves[cyc].src_reg, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); moves[cyc].src_reg = tmp_alloc.reg; continue; } @@ -7732,10 +10808,10 @@ static void store_word_to_stack(int src_reg, int stack_offset) { if (!store_word_to_base(src_reg, ARM_SP, stack_offset, 0)) { - /* Offset too large - use alternate scratch register */ - int scratch = (src_reg != ARM_R12) ? ARM_R12 : ARM_LR; - load_immediate(scratch, stack_offset, NULL, false); - ot_check(th_str_reg(src_reg, ARM_SP, scratch, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ScratchRegAlloc sc = get_scratch_reg_with_save((1u << src_reg)); + load_immediate(sc.reg, stack_offset, NULL, false); + ot_check(th_str_reg(src_reg, ARM_SP, sc.reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + restore_scratch_reg(&sc); } } @@ -7744,19 +10820,10 @@ static void store_word_to_stack_safe(int src_reg, int stack_offset, int base_add { if (!store_word_to_base(src_reg, ARM_SP, stack_offset, 0)) { - int scratch = (base_addr_reg != ARM_R12) ? 
ARM_R12 : ARM_R0; - if (scratch == ARM_R0) - { - ot_check(th_push(1 << ARM_R0)); - load_immediate(ARM_R0, stack_offset, NULL, false); - ot_check(th_str_reg(src_reg, ARM_SP, ARM_R0, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - ot_check(th_pop(1 << ARM_R0)); - } - else - { - load_immediate(scratch, stack_offset, NULL, false); - ot_check(th_str_reg(src_reg, ARM_SP, scratch, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - } + ScratchRegAlloc sc = get_scratch_reg_with_save((1u << src_reg) | (1u << base_addr_reg)); + load_immediate(sc.reg, stack_offset, NULL, false); + ot_check(th_str_reg(src_reg, ARM_SP, sc.reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + restore_scratch_reg(&sc); } } @@ -7818,7 +10885,8 @@ static int get_struct_base_addr_mop(const MachineOperand *mop, int default_reg) /* Build register move for a struct argument (MOP path) */ static int build_reg_move_struct(ThumbArgMove *moves, int move_count, const MachineOperand *mop, - const TCCAbiArgLoc *loc, int base_reg, ThumbGenCallSite *call_site) + const TCCAbiArgLoc *loc, int base_reg, ThumbGenCallSite *call_site, + int src_align) { int words = loc->reg_count; if (words > 0 && words <= 4) @@ -7828,6 +10896,7 @@ static int build_reg_move_struct(ThumbArgMove *moves, int move_count, const Mach .dst_reg = base_reg, .mop = *mop, .struct_word_count = words, + .struct_src_align = src_align, }; } for (int w = 0; w < words && w < loc->reg_count; w++) @@ -7923,49 +10992,158 @@ static int build_reg_move_32bit(ThumbArgMove *moves, int move_count, const Machi } /* Place a struct argument on stack (MOP path) */ -static void place_stack_arg_struct(const MachineOperand *mop, const TCCAbiArgLoc *loc, int stack_offset) +/* Load one struct word at [base_addr_reg + off] into `reg`, falling back to a + * register-offset load when `off` exceeds the LDR immediate range. 
*/ +static void load_struct_word_into(int reg, int base_addr_reg, int off) +{ + if (!load_word_from_base(reg, base_addr_reg, off, 0)) + { + load_immediate(reg, off, NULL, false); + ot_check(th_ldr_reg(reg, base_addr_reg, reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + } +} + +/* Copy a (possibly split) struct argument's stack portion into the outgoing + * argument area. `src_align` is the struct's natural alignment in bytes. + * + * Adjacent word pairs are copied with LDRD/STRD instead of two LDR/STR. The + * destination is the outgoing arg area — SP-relative with a word-multiple + * offset and SP 8-byte aligned at the call boundary — so STRD is always + * alignment-safe. LDRD additionally requires the *source* address to be + * 4-byte aligned, which holds exactly when the struct's natural alignment is + * >= 4 (the stack portion starts at base + words_in_regs*4, a word multiple). */ +static void place_stack_arg_struct(const MachineOperand *mop, const TCCAbiArgLoc *loc, int stack_offset, + int src_align) { int words_in_regs = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->reg_count : 0; int struct_src_offset = words_in_regs * 4; int struct_size = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->stack_size : loc->size; int words = (struct_size + 3) / 4; - int base_addr_reg = get_struct_base_addr_mop(mop, ARM_R12); + ScratchRegAlloc struct_sc = get_scratch_reg_with_save(0); + int base_addr_reg = get_struct_base_addr_mop(mop, struct_sc.reg); + + /* Second data register (besides LR) for paired LDRD/STRD. find_call_scratch + * never pushes (SP-relative store offsets must stay valid) and we exclude LR + * and the struct base; an R_IP last-resort result is a permanent scratch and + * safe to clobber. 
*/ + int data2 = find_call_scratch((1u << ARM_LR) | (1u << (uint32_t)base_addr_reg), 0); + bool can_pair = (words >= 2 && data2 != ARM_LR && data2 != base_addr_reg && data2 >= 0 && + data2 <= R_LR && data2 != R_SP); + bool src_aligned = (src_align >= 4); + + int w = 0; + if (can_pair) + { + for (; w + 1 < words; w += 2) + { + int src_off = struct_src_offset + w * 4; + int dst_off = stack_offset + w * 4; + + if (!(src_aligned && tcc_gen_machine_try_ldrd_base(ARM_LR, data2, base_addr_reg, src_off))) + { + load_struct_word_into(ARM_LR, base_addr_reg, src_off); + load_struct_word_into(data2, base_addr_reg, src_off + 4); + } + if (!tcc_gen_machine_try_strd_base(ARM_LR, data2, ARM_SP, dst_off)) + { + store_word_to_stack_safe(ARM_LR, dst_off, base_addr_reg); + store_word_to_stack_safe(data2, dst_off + 4, base_addr_reg); + } + } + } - for (int w = 0; w < words; ++w) + /* Trailing odd word, or every word when pairing was unavailable. */ + for (; w < words; ++w) { int src_off = struct_src_offset + w * 4; int dst_off = stack_offset + w * 4; + load_struct_word_into(ARM_LR, base_addr_reg, src_off); + store_word_to_stack_safe(ARM_LR, dst_off, base_addr_reg); + } + restore_scratch_reg(&struct_sc); +} + +/* Find a free scratch register via liveness (no push/pop). + * Returns the register number, or R_IP as last resort. + * Must not push/pop since SP-relative offsets for stack args would shift. + * + * Unlike tcc_ls_find_free_scratch_reg (which refuses callee-saved regs), + * this also considers callee-saved registers already pushed in the prologue. + * Those are safe to clobber because the epilogue will restore them. + * + * arg_move_dst_mask: registers that will be explicitly written by register + * arg moves AFTER stack arg placement. These are safe to clobber even if + * currently live, because the subsequent moves will overwrite them. + * Pass 0 when not in a pre-move stack arg placement context. 
*/ +static int find_call_scratch(uint32_t extra_exclude, uint32_t arg_move_dst_mask) +{ + TCCIRState *ir = tcc_state->ir; + uint32_t exclude = scratch_global_exclude | extra_exclude; + if (ir) + { + /* Standard path: try caller-saved regs via liveness */ + int reg = tcc_ls_find_free_scratch_reg(&ir->ls, ir->codegen_instruction_idx, exclude, ir->leaffunc); + if (reg != PREG_NONE && reg >= 0 && reg < 16 && reg != R_SP && reg != R_PC) + return reg; - /* Load word from struct into LR */ - if (!load_word_from_base(ARM_LR, base_addr_reg, src_off, 0)) + /* Extended path: try callee-saved regs that are already pushed in prologue + * AND not live at this instruction (so we won't clobber active values). */ + if (ir->ls.live_regs_by_instruction && ir->codegen_instruction_idx >= 0 && + ir->codegen_instruction_idx < ir->ls.live_regs_by_instruction_size) { - load_immediate(ARM_LR, src_off, NULL, false); - ot_check(th_ldr_reg(ARM_LR, base_addr_reg, ARM_LR, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + uint32_t live = ir->ls.live_regs_by_instruction[ir->codegen_instruction_idx]; + uint32_t callee_pushed = pushed_registers & 0x0FF0u; /* R4-R11 that were pushed */ + uint32_t candidates = callee_pushed & ~live & ~exclude; + if (candidates) + { + /* Prefer low registers (R4-R7) for 16-bit encoding */ + int r = (int)__builtin_ctz(candidates); + return r; + } } - store_word_to_stack_safe(ARM_LR, dst_off, base_addr_reg); + /* Pre-move path: registers that are destinations of explicit (non-identity) + * register arg moves can be used as scratch — the moves will overwrite them. + * Prefer low registers for 16-bit encoding. 
*/ + if (arg_move_dst_mask) + { + uint32_t candidates = arg_move_dst_mask & ~exclude; + if (candidates) + { + int r = (int)__builtin_ctz(candidates); + if (r >= 0 && r < 16 && r != R_SP && r != R_PC) + return r; + } + } } + return R_IP; } /* Place a 64-bit argument on stack (MOP path) */ -static void place_stack_arg_64bit(const MachineOperand *mop, int stack_offset, TCCIRState *ir) +static void place_stack_arg_64bit(const MachineOperand *mop, int stack_offset, TCCIRState *ir, + uint32_t arg_move_dst_mask) { int lo_offset = stack_offset; int hi_offset = stack_offset + 4; if (mop->kind == MACH_OP_REG && !mop->needs_deref && thumb_is_hw_reg(mop->u.reg.r0) && thumb_is_hw_reg(mop->u.reg.r1)) { + /* If either register is R0-R3, the value was already stored by + * presave_stack_args_from_arg_regs before the register shuffle. */ + if (mop->u.reg.r0 <= ARM_R3 || mop->u.reg.r1 <= ARM_R3) + return; store_word_to_stack(mop->u.reg.r0, lo_offset); store_word_to_stack(mop->u.reg.r1, hi_offset); } else if (mop->kind == MACH_OP_IMM) { uint64_t imm64 = (uint64_t)mop->u.imm.val; - load_immediate(ARM_R12, (uint32_t)imm64, NULL, false); - store_word_to_stack(ARM_R12, lo_offset); - load_immediate(ARM_R12, (uint32_t)(imm64 >> 32), NULL, false); - store_word_to_stack(ARM_R12, hi_offset); + int scr = find_call_scratch(0, arg_move_dst_mask); + load_immediate(scr, (uint32_t)imm64, NULL, false); + store_word_to_stack(scr, lo_offset); + load_immediate(scr, (uint32_t)(imm64 >> 32), NULL, false); + store_word_to_stack(scr, hi_offset); } else if (mop->needs_deref && mop->kind != MACH_OP_PARAM_STACK) { @@ -7979,13 +11157,14 @@ static void place_stack_arg_64bit(const MachineOperand *mop, int stack_offset, T * loads directly from the caller's argument area (ignores needs_deref), * so the else path with mach_make_lo/hi_half handles it correctly. * - * The base register must NOT be ARM_R12 because both halves are loaded - * into ARM_R12 (the scratch destination). 
If base == ARM_R12 the first - * load would clobber the pointer before the second load can use it. */ + * The base register must NOT be the scratch because both halves are + * loaded into the scratch. If base == scratch the first load would + * clobber the pointer before the second load can use it. */ + int scr = find_call_scratch(0, arg_move_dst_mask); int base; MachineCodegenContext mctx = {0}; bool need_release = false; - if (mop->kind == MACH_OP_REG && mop->u.reg.r0 != ARM_R12) + if (mop->kind == MACH_OP_REG && mop->u.reg.r0 != scr) { base = mop->u.reg.r0; } @@ -7995,13 +11174,13 @@ static void place_stack_arg_64bit(const MachineOperand *mop, int stack_offset, T addr.needs_deref = false; addr.is_64bit = false; addr.btype = IROP_BTYPE_INT32; - base = mach_ensure_in_reg(&mctx, &addr, (1u << ARM_R12)); + base = mach_ensure_in_reg(&mctx, &addr, (1u << scr)); need_release = true; } - load_from_base(ARM_R12, PREG_REG_NONE, IROP_BTYPE_INT32, 0, 0, 0, (uint32_t)base); - store_word_to_stack(ARM_R12, lo_offset); - load_from_base(ARM_R12, PREG_REG_NONE, IROP_BTYPE_INT32, 0, 4, 0, (uint32_t)base); - store_word_to_stack(ARM_R12, hi_offset); + load_from_base(scr, PREG_REG_NONE, IROP_BTYPE_INT32, 0, 0, 0, (uint32_t)base); + store_word_to_stack(scr, lo_offset); + load_from_base(scr, PREG_REG_NONE, IROP_BTYPE_INT32, 0, 4, 0, (uint32_t)base); + store_word_to_stack(scr, hi_offset); if (need_release) mach_release_all(&mctx); } @@ -8041,34 +11220,41 @@ static void place_stack_arg_32bit(const MachineOperand *mop, int stack_offset, C } else { - /* Register-indirect: load through the register, then store to stack. */ - ot_check(th_ldr_imm(ARM_R12, mop->u.reg.r0, 0, 6, ENFORCE_ENCODING_NONE)); - store_word_to_stack(ARM_R12, stack_offset); + /* Register-indirect: load through the register, then store to stack. + * Must use btype-aware load so that byte/short values are properly + * zero/sign-extended (LDRB/LDRH) instead of always doing a word LDR. 
*/ + int scr = find_call_scratch(1u << mop->u.reg.r0, ctx->arg_move_dst_mask); + load_from_base(scr, PREG_REG_NONE, mop->btype, mop->is_unsigned, 0, 0, mop->u.reg.r0); + store_word_to_stack(scr, stack_offset); } break; case MACH_OP_IMM: - load_immediate(ARM_R12, (uint32_t)mop->u.imm.val, NULL, false); - store_word_to_stack(ARM_R12, stack_offset); + { + int scr = find_call_scratch(0, ctx->arg_move_dst_mask); + load_immediate(scr, (uint32_t)mop->u.imm.val, NULL, false); + store_word_to_stack(scr, stack_offset); break; + } case MACH_OP_SYMBOL: { + int scr = find_call_scratch(0, ctx->arg_move_dst_mask); Sym *sym = mop->u.sym.sym ? validate_sym_for_reloc(mop->u.sym.sym) : NULL; if (mop->needs_deref) { /* Load value from global symbol address. */ - load_immediate(ARM_R12, 0, sym, false); + load_immediate(scr, 0, sym, false); int32_t addend = mop->u.sym.addend; int sign = (addend < 0); int abs_off = sign ? -addend : addend; - load_from_base(ARM_R12, PREG_REG_NONE, mop->btype, mop->is_unsigned, abs_off, sign, ARM_R12); + load_from_base(scr, PREG_REG_NONE, mop->btype, mop->is_unsigned, abs_off, sign, scr); } else { - load_immediate(ARM_R12, (uint32_t)mop->u.sym.addend, sym, false); + load_immediate(scr, (uint32_t)mop->u.sym.addend, sym, false); } - store_word_to_stack(ARM_R12, stack_offset); + store_word_to_stack(scr, stack_offset); break; } @@ -8136,7 +11322,9 @@ static int build_register_arg_moves(CallGenContext *ctx, ThumbArgMove *reg_moves } else { - move_count = build_reg_move_struct(reg_moves, move_count, mop, loc, base_reg, ctx->call_site); + int src_align = 0; + irop_type_size_align(*arg, &src_align); + move_count = build_reg_move_struct(reg_moves, move_count, mop, loc, base_reg, ctx->call_site, src_align); } } else if (is_64bit) @@ -8165,72 +11353,322 @@ static void presave_stack_args_from_arg_regs(CallGenContext *ctx) if (loc->kind == TCC_ABI_LOC_REG) continue; - if (bt == IROP_BTYPE_STRUCT || mop->is_64bit || mop->is_complex) + if (bt == IROP_BTYPE_STRUCT || 
mop->is_complex) + continue; + if (mop->kind != MACH_OP_REG || mop->needs_deref) + continue; + + if (mop->is_64bit) + { + /* Pre-save 64-bit register pair if either register is in R0-R3. + * The register arg shuffle will overwrite R0-R3, so both halves + * must be stored to the stack before that happens. */ + int r0 = mop->u.reg.r0; + int r1 = mop->u.reg.r1; + if ((thumb_is_hw_reg(r0) && r0 <= ARM_R3) || (thumb_is_hw_reg(r1) && r1 <= ARM_R3)) + { + int stack_offset = loc->stack_off; + if (thumb_is_hw_reg(r0)) + store_word_to_stack(r0, stack_offset); + if (thumb_is_hw_reg(r1)) + store_word_to_stack(r1, stack_offset + 4); + } + } + else + { + /* Only pre-save if operand is in R0-R3 (arg registers that get overwritten). */ + if (mop->u.reg.r0 <= ARM_R3) + { + store_word_to_stack(mop->u.reg.r0, loc->stack_off); + } + } + } +} + +/* True for a plain 32-bit immediate argument destined for a stack slot. */ +static int is_simple_imm_stack_arg(const TCCAbiArgLoc *loc, const MachineOperand *mop) +{ + return loc->kind != TCC_ABI_LOC_REG && mop->kind == MACH_OP_IMM && !mop->is_64bit && + mop->btype != IROP_BTYPE_STRUCT && !mop->is_complex; +} + +/* One collected immediate stack store, for the grouped/windowed emission path. */ +typedef struct StackImmArg +{ + int off; + uint32_t val; +} StackImmArg; + +/* Order by 4 KB window, then value, then offset. Grouping equal values within a + * window lets each distinct value be materialized once per window instead of once + * per argument; the window ordering bounds base-register re-materialization. */ +static int stack_imm_arg_cmp(const void *a, const void *b) +{ + const StackImmArg *x = (const StackImmArg *)a; + const StackImmArg *y = (const StackImmArg *)b; + int wx = x->off & ~0xFFF, wy = y->off & ~0xFFF; + if (wx != wy) + return wx < wy ? -1 : 1; + if (x->val != y->val) + return x->val < y->val ? -1 : 1; + if (x->off != y->off) + return x->off < y->off ? 
-1 : 1; + return 0; +} + +/* Emit a single non-simple-immediate stack argument (struct/complex/64-bit, or a + * non-immediate 32-bit source). Extracted from place_stack_arguments so both the + * inline and the grouped emission paths share identical handling. */ +static void place_one_stack_arg(CallGenContext *ctx, const TCCAbiArgLoc *loc, const MachineOperand *mop, + int stack_offset, int arg_index) +{ + if (mop->btype == IROP_BTYPE_STRUCT || mop->is_complex) + { + /* Complex values in a register pair: store the stack portion directly + * from registers instead of treating the pair as a memory pointer. */ + if (mop->is_complex && mop->kind == MACH_OP_REG && !mop->needs_deref && mop->is_64bit) + { + int words_in_regs = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->reg_count : 0; + int stack_bytes = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->stack_size : loc->size; + int stack_words = (stack_bytes + 3) / 4; + int pair_regs[2] = {mop->u.reg.r0, mop->u.reg.r1}; + for (int w = 0; w < stack_words; w++) + { + int reg_idx = words_in_regs + w; + if (reg_idx < 2) + store_word_to_stack(pair_regs[reg_idx], stack_offset + w * 4); + } + } + else if (mop->is_complex && mop->kind == MACH_OP_IMM) + { + /* Complex immediate on stack: split 64-bit packed value into words. */ + const uint64_t imm64 = (uint64_t)mop->u.imm.val; + int words_in_regs = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->reg_count : 0; + int stack_bytes = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->stack_size : loc->size; + int stack_words = (stack_bytes + 3) / 4; + int scr = find_call_scratch(0, ctx->arg_move_dst_mask); + for (int w = 0; w < stack_words; w++) + { + int word_idx = words_in_regs + w; + uint32_t word_val = (uint32_t)(imm64 >> (word_idx * 32)); + load_immediate(scr, word_val, NULL, false); + store_word_to_stack(scr, stack_offset + w * 4); + } + } + else + { + /* Struct's natural alignment gates source-side LDRD (see + * place_stack_arg_struct). 
Default conservatively to 1 (no LDRD) when + * the originating IR operand is unavailable. */ + int src_align = 1; + if (ctx->args && arg_index >= 0 && arg_index < ctx->argc) + { + int a = 0; + irop_type_size_align(ctx->args[arg_index], &a); + if (a > 0) + src_align = a; + } + place_stack_arg_struct(mop, loc, stack_offset, src_align); + } + } + else if (mop->is_64bit) + place_stack_arg_64bit(mop, stack_offset, tcc_state->ir, ctx->arg_move_dst_mask); + else + place_stack_arg_32bit(mop, stack_offset, ctx); +} + +/* Inline (original-order) emission of every stack argument. Used for the common + * case where stack args stay within the immediate-offset store range. */ +static void place_stack_arguments_inline(CallGenContext *ctx) +{ + int cached_imm_reg = -1; + uint32_t cached_imm_val = 0; + + for (int i = 0; i < ctx->argc; ++i) + { + const TCCAbiArgLoc *loc = &ctx->layout->locs[i]; + const MachineOperand *mop = &ctx->mops[i]; + + if (loc->kind == TCC_ABI_LOC_REG) + continue; + + int stack_offset = loc->stack_off; + + if (is_simple_imm_stack_arg(loc, mop)) + { + uint32_t val = (uint32_t)mop->u.imm.val; + int scr = find_call_scratch(0, ctx->arg_move_dst_mask); + if (cached_imm_reg != scr || cached_imm_val != val) + { + load_immediate(scr, val, NULL, false); + cached_imm_reg = scr; + cached_imm_val = val; + } + store_word_to_stack(scr, stack_offset); + continue; + } + + cached_imm_reg = -1; + place_one_stack_arg(ctx, loc, mop, stack_offset, i); + } +} + +/* Place all stack arguments. + * + * For the common case the inline path is byte-identical to before. 
When simple + * 32-bit immediate stack args spill beyond the immediate-offset store range + * (offset > 4092) — exactly where the naive path emits movw+indexed (3 instr/arg) + * — a windowed/grouped path is used instead: + * - a base register holds sp+window so each store is a single str.w [rb,#disp] + * (re-materialized only when crossing a 4 KB window, ~once / 1024 stores); + * - the immediate stores are reordered by (window, value) so each distinct + * value is loaded once per window rather than once per argument. + * Reordering pure-immediate stores to distinct, non-aliasing stack slots leaves + * the pre-call stack image unchanged, so it is observationally identical. */ +static void place_stack_arguments(CallGenContext *ctx) +{ + int max_imm_off = -1; + int imm_count = 0; + for (int i = 0; i < ctx->argc; ++i) + { + const TCCAbiArgLoc *loc = &ctx->layout->locs[i]; + const MachineOperand *mop = &ctx->mops[i]; + if (is_simple_imm_stack_arg(loc, mop)) + { + imm_count++; + if (loc->stack_off > max_imm_off) + max_imm_off = loc->stack_off; + } + } + + if (!(max_imm_off > 4092 && imm_count >= 2) || getenv("TCC_NO_STACK_ARG_GROUP")) + { + place_stack_arguments_inline(ctx); + return; + } + + /* --- Windowed/grouped path --- */ + + /* Pass 1: emit every non-simple-immediate stack arg first, in original order. */ + for (int i = 0; i < ctx->argc; ++i) + { + const TCCAbiArgLoc *loc = &ctx->layout->locs[i]; + const MachineOperand *mop = &ctx->mops[i]; + if (loc->kind == TCC_ABI_LOC_REG || is_simple_imm_stack_arg(loc, mop)) continue; - - /* Only pre-save if operand is in R0-R3 (arg registers that get overwritten). */ - if (mop->kind == MACH_OP_REG && !mop->needs_deref && mop->u.reg.r0 <= ARM_R3) - { - store_word_to_stack(mop->u.reg.r0, loc->stack_off); + place_one_stack_arg(ctx, loc, mop, loc->stack_off, i); + } + + /* Reserve two stable scratch registers: rv (holds the value) and rb (base + * address). 
Both are free across the whole argument-setup region — the call's + * register args are moved in afterwards, and find_call_scratch only returns + * registers that are dead here or are arg-move destinations (overwritten + * later). Prefer the lower-numbered register for rv so value materialization + * can use the 16-bit MOVS encoding. */ + int s0 = find_call_scratch(0, ctx->arg_move_dst_mask); + int s1 = find_call_scratch(1u << s0, ctx->arg_move_dst_mask); + if (s1 < s0) + { + int t = s0; + s0 = s1; + s1 = t; + } + int rv = s0, rb = s1; + int regs_ok = (rv != rb && rv >= 0 && rv < 16 && rb >= 0 && rb < 16 && rv != ARM_SP && rv != ARM_PC && + rb != ARM_SP && rb != ARM_PC); + + StackImmArg *items = regs_ok ? tcc_malloc(sizeof(StackImmArg) * imm_count) : NULL; + if (!items) + { + /* Out of stable registers (or alloc failure): emit the immediate args inline. */ + int cached_imm_reg = -1; + uint32_t cached_imm_val = 0; + for (int i = 0; i < ctx->argc; ++i) + { + const TCCAbiArgLoc *loc = &ctx->layout->locs[i]; + const MachineOperand *mop = &ctx->mops[i]; + if (!is_simple_imm_stack_arg(loc, mop)) + continue; + uint32_t val = (uint32_t)mop->u.imm.val; + int scr = find_call_scratch(0, ctx->arg_move_dst_mask); + if (cached_imm_reg != scr || cached_imm_val != val) + { + load_immediate(scr, val, NULL, false); + cached_imm_reg = scr; + cached_imm_val = val; + } + store_word_to_stack(scr, loc->stack_off); } + return; } -} -/* Place all stack arguments */ -static void place_stack_arguments(CallGenContext *ctx) -{ + int n = 0; for (int i = 0; i < ctx->argc; ++i) { const TCCAbiArgLoc *loc = &ctx->layout->locs[i]; const MachineOperand *mop = &ctx->mops[i]; - - if (loc->kind == TCC_ABI_LOC_REG) + if (!is_simple_imm_stack_arg(loc, mop)) continue; + items[n].off = loc->stack_off; + items[n].val = (uint32_t)mop->u.imm.val; + n++; + } + qsort(items, n, sizeof(StackImmArg), stack_imm_arg_cmp); - int stack_offset = loc->stack_off; + uint32_t saved_excl = scratch_global_exclude; + 
scratch_global_exclude |= (1u << rv) | (1u << rb); + + int cur_window = -1; /* base offset of the window currently in rb */ + int have_val = 0; + uint32_t cur_val = 0; + for (int k = 0; k < n; ++k) + { + int off = items[k].off; + uint32_t val = items[k].val; + int window = off & ~0xFFF; + int disp = off & 0xFFF; + int base_reg; - if (mop->btype == IROP_BTYPE_STRUCT || mop->is_complex) + if (window == 0) { - /* Complex values in a register pair: store the stack portion directly - * from registers instead of treating the pair as a memory pointer. */ - if (mop->is_complex && mop->kind == MACH_OP_REG && !mop->needs_deref && mop->is_64bit) - { - int words_in_regs = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->reg_count : 0; - int stack_bytes = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->stack_size : loc->size; - int stack_words = (stack_bytes + 3) / 4; - int pair_regs[2] = {mop->u.reg.r0, mop->u.reg.r1}; - for (int w = 0; w < stack_words; w++) - { - int reg_idx = words_in_regs + w; - if (reg_idx < 2) - store_word_to_stack(pair_regs[reg_idx], stack_offset + w * 4); - } - } - else if (mop->is_complex && mop->kind == MACH_OP_IMM) + base_reg = ARM_SP; /* sp+0 — store directly off sp, no base register needed */ + } + else + { + if (window != cur_window) { - /* Complex immediate on stack: split 64-bit packed value into words. */ - const uint64_t imm64 = (uint64_t)mop->u.imm.val; - int words_in_regs = (loc->kind == TCC_ABI_LOC_REG_STACK) ? loc->reg_count : 0; - int stack_bytes = (loc->kind == TCC_ABI_LOC_REG_STACK) ? 
loc->stack_size : loc->size; - int stack_words = (stack_bytes + 3) / 4; - for (int w = 0; w < stack_words; w++) + thumb_opcode op = th_add_imm(rb, ARM_SP, (uint32_t)window, flags_safe(), ENFORCE_ENCODING_NONE); + if (is_valid_opcode(op)) + ot(op); + else { - int word_idx = words_in_regs + w; - uint32_t word_val = (uint32_t)(imm64 >> (word_idx * 32)); - load_immediate(ARM_R12, word_val, NULL, false); - store_word_to_stack(ARM_R12, stack_offset + w * 4); + load_full_const(rb, PREG_NONE, (uint32_t)window, 0); + ot_check(th_add_reg(rb, ARM_SP, rb, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } } - else - { - place_stack_arg_struct(mop, loc, stack_offset); - } + base_reg = rb; + } + cur_window = window; + + if (!have_val || cur_val != val) + { + load_immediate(rv, val, NULL, false); + have_val = 1; + cur_val = val; + } + + if (!store_word_to_base(rv, base_reg, disp, 0)) + { + /* disp <= 4092 always encodes via str.w; keep a correct fallback regardless. */ + ScratchRegAlloc sc = get_scratch_reg_with_save((1u << rv) | (1u << base_reg)); + load_immediate(sc.reg, (uint32_t)off, NULL, false); + ot_check(th_str_reg(rv, ARM_SP, sc.reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + restore_scratch_reg(&sc); } - else if (mop->is_64bit) - place_stack_arg_64bit(mop, stack_offset, tcc_state->ir); - else - place_stack_arg_32bit(mop, stack_offset, ctx); } + + scratch_global_exclude = saved_excl; + tcc_free(items); } /* Handle return value after call (MOP path). @@ -8306,46 +11744,58 @@ ST_FUNC void tcc_gen_machine_func_call_mop(MachineOperand func_mop, IROperand ca .stack_size = stack_size, }; - /* === Preserve nested call registers (R0-R3) === */ + /* Set tail_call_pending if this is a tail-call-only function. */ + if (ir->tail_call_only) + tail_call_pending = 1; + + /* === Preserve nested call registers (R0-R3, R9) via STR to frame === + * Instead of PUSH/POP (which moves SP), store to the pre-reserved + * nested-call save area in the frame. SP stays fixed. 
*/ int arg_regs_in_use = call_site->registers_map & 0x0F; - int arg_regs_push_mask = arg_regs_in_use; - int arg_regs_push_count = __builtin_popcount((unsigned)arg_regs_push_mask); + int arg_regs_save_mask = tail_call_pending ? 0 : (arg_regs_in_use); /* On yasos with no-pic-data-is-text-relative, R9 holds the GOT base and is * caller-saved. Save it alongside the nested-call argument registers so it - * is restored after the callee returns. It must be pushed *before* the - * stack-argument area is reserved so the callee sees the correct SP layout. - */ - if (text_and_data_separation) - { - arg_regs_push_mask |= (1 << ARM_R9); - arg_regs_push_count++; - } - - /* AAPCS requires 8-byte SP alignment - pad with R12 if needed */ - if (arg_regs_push_count & 1) - { - arg_regs_push_mask |= (1 << ARM_R12); - arg_regs_push_count++; - } + * is restored after the callee returns. */ + if (!tail_call_pending && text_and_data_separation) + arg_regs_save_mask |= (1 << ARM_R9); - if (arg_regs_push_mask) - { - ot_check(th_push((uint16_t)arg_regs_push_mask)); - call_site->used_stack_size += arg_regs_push_count * 4; + /* Save nested-call registers to pre-reserved frame area via STR. + * The nested save area is at [SP + ir->call_outgoing_size]. + * + * In functions with VLA/alloca the runtime SP has moved below the static + * frame, so [SP + off] would land inside the dynamically allocated memory + * (the callee then overwrites the saved R9/GOT base with user data). + * Address the slots FP-relative instead: the static SP equals + * FP - callee_push_size - epilogue_stack_dealloc. */ + int nested_save_sp_offset = ir ? ir->call_outgoing_size : 0; + int nested_save_fp_bias = tcc_state->func_dynamic_sp + ? 
-(callee_push_size + epilogue_stack_dealloc) + : 0; + int nested_save_count = 0; + if (arg_regs_save_mask) + { + for (int r = 0; r < 16; r++) + { + if (arg_regs_save_mask & (1 << r)) + { + if (tcc_state->func_dynamic_sp) + tcc_gen_machine_store_to_stack_ex( + r, nested_save_sp_offset + nested_save_count * 4 + nested_save_fp_bias, + arg_regs_save_mask); + else + store_word_to_stack(r, nested_save_sp_offset + nested_save_count * 4); + nested_save_count++; + } + } } - /* === Reserve stack space === */ + /* Stack args are already placed in the pre-reserved outgoing area at [SP+0]. + * No need to adjust SP — the area was allocated in the prologue. */ stack_size = (stack_size + 7) & ~7; /* 8-byte align */ - if (stack_size > 0) - { - gadd_sp(-stack_size); - call_site->used_stack_size += stack_size; - } - /* === Block R0-R3 from scratch allocation during argument setup === */ + /* === Save scratch exclusion state === */ uint32_t saved_scratch_exclude = scratch_global_exclude; - scratch_global_exclude |= 0x0F; /* R0-R3 */ /* === Pre-save indirect call target if it resides in an argument register === * @@ -8361,7 +11811,7 @@ ST_FUNC void tcc_gen_machine_func_call_mop(MachineOperand func_mop, IROperand ca func_mop.u.reg.r0 <= 3) { /* Find a free register outside R0-R3, R12 (stack-arg scratch), SP, PC. */ - uint32_t exclude = scratch_global_exclude | (1u << R_IP) | (1u << R_SP) | (1u << R_PC); + uint32_t exclude = scratch_global_exclude | 0x0Fu | (1u << R_IP) | (1u << R_SP) | (1u << R_PC); int safe_reg = PREG_NONE; if (ir) safe_reg = tcc_ls_find_free_scratch_reg(&ir->ls, ir->codegen_instruction_idx, exclude, ir->leaffunc); @@ -8373,8 +11823,8 @@ ST_FUNC void tcc_gen_machine_func_call_mop(MachineOperand func_mop, IROperand ca /* Move function pointer from arg reg to safe reg. 
*/ thumb_shift no_shift = {THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; - ot_check(th_mov_reg(safe_reg, func_mop.u.reg.r0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, no_shift, ENFORCE_ENCODING_NONE, - false)); + ot_check_mov_reg(safe_reg, func_mop.u.reg.r0, flags_safe(), no_shift, ENFORCE_ENCODING_NONE, + false); /* Rewrite func_mop to point to the safe register. */ func_mop.kind = MACH_OP_REG; @@ -8387,40 +11837,134 @@ ST_FUNC void tcc_gen_machine_func_call_mop(MachineOperand func_mop, IROperand ca } } - /* === Build and execute register argument moves === */ + /* === Build register argument moves === */ ThumbArgMove reg_moves[8]; int reg_move_count = build_register_arg_moves(&ctx, reg_moves); + /* === Compute arg_move_dst_mask and identity-move protection === + * + * Stack arguments are placed BEFORE register argument moves so that + * R0-R3 (non-identity move destinations) can serve as scratch registers + * for stack arg stores, saving 2 bytes per store (16-bit vs 32-bit encoding). + * + * arg_move_dst_mask: registers written by explicit (non-identity) reg moves. + * These will be overwritten by the moves, so they're safe as scratch. + * identity_mask: registers where the reg allocator already placed the correct + * value (no move entry created). These MUST be protected from clobbering. */ + { + uint32_t arg_move_dst_mask = 0; + for (int i = 0; i < reg_move_count; i++) + arg_move_dst_mask |= arg_move_write_set(&reg_moves[i]); + + /* Compute all register-arg destination registers from the ABI layout. */ + uint32_t all_reg_arg_dst = 0; + for (int i = 0; i < ctx.argc; i++) + { + const TCCAbiArgLoc *loc = &ctx.layout->locs[i]; + if (loc->kind == TCC_ABI_LOC_REG || loc->kind == TCC_ABI_LOC_REG_STACK) + { + int base = ARM_R0 + loc->reg_base; + for (int w = 0; w < loc->reg_count; w++) + all_reg_arg_dst |= (1u << (base + w)); + } + } + + /* Protect identity-move registers (value already in place, no move entry).
*/ + uint32_t identity_mask = all_reg_arg_dst & ~arg_move_dst_mask; + scratch_global_exclude |= identity_mask; + + ctx.arg_move_dst_mask = arg_move_dst_mask; + } + /* Pre-save stack args sourcing from R0-R3 before register shuffle */ presave_stack_args_from_arg_regs(&ctx); + /* === Place stack arguments FIRST === + * R0-R3 that are non-identity move destinations can be used as scratch + * via arg_move_dst_mask in find_call_scratch, yielding 16-bit STR + * encodings instead of 32-bit STR.W with R12. */ + place_stack_arguments(&ctx); + + /* === Now block all R0-R3 and emit register argument moves === */ + scratch_global_exclude |= 0x0F; thumb_emit_parallel_arg_moves(reg_moves, reg_move_count); - /* === Place stack arguments === */ - place_stack_arguments(&ctx); + /* === Tail call: tear down frame before branching === */ + if (tail_call_pending) + { + /* For indirect calls, the target may be in a callee-saved register that + * will be popped. Move it to R_IP (R12) before frame teardown. */ + if (func_mop.kind == MACH_OP_REG && !func_mop.needs_deref && + func_mop.u.reg.r0 >= R4 && func_mop.u.reg.r0 <= R11) + { + ot_check_mov_reg(R_IP, func_mop.u.reg.r0, flags_safe(), + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + func_mop.u.reg.r0 = R_IP; + } + if (epilogue_stack_dealloc > 0) + gadd_sp_ex(epilogue_stack_dealloc, R_IP); + /* Only pop true callee-saved registers (R4-R11). R0-R3 may be pushed + * for alignment but now hold call arguments — popping them would clobber + * the prepared args. Skip non-callee slots FIRST (they sit at lower + * addresses after push), then pop callee-saved from correct position. 
*/ + uint32_t callee_pop = pushed_registers & 0x0FF0u; /* R4-R11 only */ + uint32_t non_callee = pushed_registers & ~callee_pop & ~(1u << R_LR) & ~(1u << R_PC); + int non_callee_bytes = __builtin_popcount(non_callee) * 4; + if (non_callee_bytes > 0) + gadd_sp_ex(non_callee_bytes, R_IP); + if (callee_pop) + ot_check(th_pop(callee_pop)); + } /* === Emit call === */ gcall_or_jump_mop(0, func_mop); /* Restore scratch register exclusion */ scratch_global_exclude = saved_scratch_exclude; - /* === Cleanup === */ - if (stack_size > 0) + if (tail_call_pending) { - gadd_sp(stack_size); - call_site->used_stack_size -= stack_size; + tail_call_pending = 0; + goto call_cleanup; } - if (arg_regs_push_mask) + handle_return_value_mop(&dest_mop, drop_value); + + /* === Cleanup: restore nested-call saved registers via LDR === */ + if (arg_regs_save_mask) { - ot_check(th_pop((uint16_t)arg_regs_push_mask)); - call_site->used_stack_size -= arg_regs_push_count * 4; + /* Match the FP-relative addressing used by the save side in functions + * with VLA/alloca (runtime SP has moved; see the save block above). */ + const int restore_base = tcc_state->func_dynamic_sp ? R_FP : ARM_SP; + int restore_idx = 0; + for (int r = 0; r < 16; r++) + { + if (arg_regs_save_mask & (1 << r)) + { + int off = nested_save_sp_offset + restore_idx * 4 + nested_save_fp_bias; + int sign = (off < 0); + int abs_off = sign ? -off : off; + /* R9 restore in text_and_data_separation mode needs the write guard + * temporarily lifted — the safety check blocks all R9 writes, but + * we are legitimately restoring it after a call. 
*/ + if (r == ARM_R9 && text_and_data_separation) + allow_r9_write = 1; + if (!load_word_from_base(r, restore_base, abs_off, sign)) + { + ScratchRegAlloc osc = get_scratch_reg_with_save((1u << r)); + load_immediate(osc.reg, off, NULL, false); + ot_check(th_ldr_reg(r, restore_base, osc.reg, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + restore_scratch_reg(&osc); + } + if (r == ARM_R9 && text_and_data_separation) + allow_r9_write = 0; + restore_idx++; + } + } } - handle_return_value_mop(&dest_mop, drop_value); - call_site->registers_map &= ~0x0F; /* Clear R0-R3 */ +call_cleanup: if (args) tcc_free(args); if (mops) @@ -8429,62 +11973,107 @@ ST_FUNC void tcc_gen_machine_func_call_mop(MachineOperand func_mop, IROperand ca tcc_free(layout.locs); } -ST_FUNC void tcc_gen_machine_jump_mop(TccIrOp op, int32_t target_ir, int ir_idx) +/* Check if a backward branch to target_ir can use a narrow encoding. + * For backward branches, the target code address is already known in + * ir_to_code_mapping (it was emitted earlier in this pass). + * current_ir_idx is the IR index of the branch instruction itself. + * Returns 1 if narrow encoding fits, 0 otherwise. */ +static int can_narrow_backward_branch(int32_t target_ir, int is_conditional, int current_ir_idx) +{ + TCCIRState *ir = tcc_state->ir; + if (!ir || !ir->ir_to_code_mapping) + return 0; + if (target_ir < 0 || target_ir >= ir->ir_to_code_mapping_size) + return 0; + + /* Forward branches have uninitialized ir_to_code_mapping[target_ir] (still 0). + * Only narrow genuinely backward branches where target was already emitted. */ + if (target_ir >= current_ir_idx) + return 0; + + int target_addr = (int)ir->ir_to_code_mapping[target_ir]; + /* ind is the current code address where the branch will be emitted. + * offset = target - (source + 4) for Thumb pipeline. 
*/ + int offset = target_addr - ind - 4; + + /* Only backward branches (negative offset) are safe to narrow here */ + if (offset >= 0) + return 0; + + return is_conditional ? branch_fits_t1(offset) : branch_fits_t2(offset); +} + +ST_FUNC int tcc_gen_machine_jump_mop(TccIrOp op, int32_t target_ir, int ir_idx) { if (dry_run_state.active) { - /* Record branch for later optimization analysis */ - branch_opt_record(ir_idx, ind, target_ir, 0); /* 0 = unconditional */ /* Emit 32-bit placeholder for code size tracking */ ot_check(th_b_t4(0)); - return; + return 4; } - /* Real pass: check if we determined this can be 16-bit */ - BranchEncoding enc = branch_opt_get_encoding(ir_idx); - if (enc == BRANCH_ENC_16BIT) + /* Real pass: try narrow encoding for backward branches */ + if (can_narrow_backward_branch(target_ir, 0, ir_idx)) { - ot_check(th_b_t2(0)); /* 16-bit placeholder */ + ot_check(th_b_t2(0)); /* 16-bit unconditional */ + return 2; } else { - ot_check(th_b_t4(0)); /* 32-bit placeholder */ + ot_check(th_b_t4(0)); /* 32-bit unconditional */ + return 4; } } -ST_FUNC void tcc_gen_machine_conditional_jump_mop(int32_t condition, TccIrOp op, int32_t target_ir, int ir_idx) +ST_FUNC int tcc_gen_machine_conditional_jump_mop(int32_t condition, TccIrOp op, int32_t target_ir, int ir_idx) { int cond = mapcc(condition); if (dry_run_state.active) { - /* Record branch for later optimization analysis */ - branch_opt_record(ir_idx, ind, target_ir, 1); /* 1 = conditional */ /* Emit 32-bit placeholder for code size tracking */ ot_check(th_b_t3(cond, 0)); - return; + return 4; } - /* Real pass: check if we determined this can be 16-bit */ - BranchEncoding enc = branch_opt_get_encoding(ir_idx); - if (enc == BRANCH_ENC_16BIT) + /* Real pass: try narrow encoding for backward branches */ + if (can_narrow_backward_branch(target_ir, 1, ir_idx)) { ot_check(th_b_t1(cond, 0)); /* 16-bit conditional */ + return 2; } else { ot_check(th_b_t3(cond, 0)); /* 32-bit conditional */ + return 4; } } +/* 
Return the maximum bytes a pending literal pool dump could insert. + * Used for CBZ/CBNZ distance safety checks. */ +ST_FUNC int tcc_gen_machine_pending_pool_size(void) +{ + int count = dry_run_state.active ? dry_run_literal_pool_count : thumb_gen_state.literal_pool_count; + return count * 4 + (count > 0 ? 2 : 0); /* entries + possible alignment padding */ +} + +/* Emit CBZ/CBNZ: combined compare-zero + branch in a single 16-bit instruction. + * rn must be r0-r7, target must be forward within 126 bytes. + * Returns the instruction size (always 2). */ +ST_FUNC int tcc_gen_machine_cbz_jump_mop(int rn, int nonzero, int32_t target_ir, int ir_idx) +{ + ot_check(th_cbz((uint16_t)rn, 0, (uint32_t)nonzero)); + return 2; +} + /* Set static chain register: MOV R10, R7 (FP) */ ST_FUNC void tcc_gen_machine_set_chain(void) { int chain_reg = architecture_config.static_chain_reg; thumb_shift no_shift = {THUMB_SHIFT_NONE, 0, THUMB_SHIFT_IMMEDIATE}; /* MOV chain_reg, R_FP (R7 on ARM Thumb) */ - ot_check(th_mov_reg(chain_reg, R_FP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, no_shift, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(chain_reg, R_FP, flags_safe(), no_shift, ENFORCE_ENCODING_NONE, false); } /* Reload static chain register from the chain save slot at [FP - 4]. 
@@ -8521,7 +12110,7 @@ ST_FUNC void tcc_gen_machine_init_chain_slot(IROperand src1) load_full_const(scratch.reg, PREG_NONE, 0, 0); /* STR R7, [scratch, #0] — store frame pointer into chain slot */ - ot_check(th_str_imm(R_FP, scratch.reg, 0, 6, ENFORCE_ENCODING_NONE)); + ot_check_str_imm(R_FP, scratch.reg, 0, 6, ENFORCE_ENCODING_NONE); /* Restore scratch register */ restore_scratch_reg(&scratch); @@ -8563,35 +12152,51 @@ ST_FUNC void tcc_gen_machine_vla_mop(MachineOperand dest, MachineOperand src1, M tcc_error("compiler_error: VLA alloc picked SP as temp"); /* r = SP - r (subtract size from stack pointer) */ - ot_check(th_sub_sp_reg(r, r, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_sub_reg(r, R_SP, r, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); if (align > 1) { /* Align down: r &= ~(align-1). Try immediate BIC first. */ - if (!ot(th_bic_imm(r, r, (uint32_t)(align - 1), FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE))) + if (!ot(th_bic_imm(r, r, (uint32_t)(align - 1), flags_safe(), ENFORCE_ENCODING_NONE))) { /* Fallback: materialize mask in a scratch register. */ int mask_reg = mach_alloc_scratch(&ctx, 1u << (uint32_t)r); if (!ot(th_generic_mov_imm(mask_reg, align - 1))) load_full_const(mask_reg, PREG_NONE, LFC_SPLIT(align - 1)); - ot_check(th_bic_reg(r, r, mask_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); + ot_check(th_bic_reg(r, r, mask_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); } } - ot_check(th_mov_reg(R_SP, r, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_SP, r, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); break; } case TCCIR_OP_VLA_SP_SAVE: - /* Save current SP to the destination save slot via IP as intermediary. 
*/ - ot_check(th_mov_reg(R_IP, R_SP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); - mach_writeback_dest(&dest, R_IP); + { + /* Fast path: when dest is a register-allocated vreg, copy SP directly into + * its register — saves the scratch-mov + writeback-mov pair that the + * generic path would emit. Triggered by the alloca-load-fwd IR pass + * which rewrites a `VLA_SP_SAVE slot; LOAD vreg <- slot` pair into a + * single `VLA_SP_SAVE vreg`. */ + if (dest.kind == MACH_OP_REG && !dest.needs_deref && + dest.u.reg.r0 != (int)PREG_REG_NONE) + { + ot_check_mov_reg((uint32_t)dest.u.reg.r0, R_SP, flags_safe(), + THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + break; + } + /* Save current SP to the destination save slot via a scratch register. */ + ScratchRegAlloc sp_scratch = get_scratch_reg_with_save(0); + ot_check_mov_reg(sp_scratch.reg, R_SP, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, + false); + mach_writeback_dest(&dest, sp_scratch.reg); + restore_scratch_reg(&sp_scratch); break; + } case TCCIR_OP_VLA_SP_RESTORE: { /* Load the saved SP from src1 into a register, then restore SP. */ int saved_sp = mach_ensure_in_reg(&ctx, &src1, 0); - ot_check( - th_mov_reg(R_SP, saved_sp, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_SP, saved_sp, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); break; } default: @@ -8600,6 +12205,354 @@ ST_FUNC void tcc_gen_machine_vla_mop(MachineOperand dest, MachineOperand src1, M mach_release_all(&ctx); } +/* Block copy from const data section to stack using LDM/STM. 
+ * dest = STACKOFF (destination stack offset, is_local=1) + * src = SYMREF (anonymous symbol in rodata) + * size = number of bytes to copy (must be multiple of 4) + * + * Generated code for 20 bytes (5 words): + * LDR r_src, [PC, #lit_pool] ; load rodata address + * ADD r_dst, FP/SP, #stack_off ; compute stack dest + * LDMIA r_src!, {r0, r1, r2, r3} ; load 4 words from rodata + * STMIA r_dst!, {r0, r1, r2, r3} ; store 4 words to stack + * LDR r0, [r_src] ; load remaining word + * STR r0, [r_dst] ; store remaining word + */ +/* tcc_gen_machine_select_mop: Conditional select using ITE block. + * Emits: ITE ; MOV dest, then_val; MOV dest, else_val + * + * For simple register/immediate operands, this is 3 instructions (ITE + 2 MOVs) + * instead of 5+ (B.cond + MOV + B + MOV + ...) with branching. + */ +/* Check if a MachineOperand can be materialized in exactly one instruction. + * Returns 1 for: IMM (any value), REG (no deref), SYMBOL (no deref), SPILL (no deref). + * Returns 0 for: multi-instruction sequences (deref, chain_rel, etc). */ +static int select_can_inline(const MachineOperand *op) +{ + switch (op->kind) + { + case MACH_OP_IMM: + return 1; /* MOV/MOVW/MVN or literal pool LDR — always 1 instruction */ + case MACH_OP_REG: + return !op->needs_deref; /* MOV reg is 1 instr; deref needs LDR too */ + case MACH_OP_SYMBOL: + /* A symbol address is a single literal-pool LDR only in the plain, + * non-PIC, non-separated layout. Under PIC/PIE or text+data separation it + * expands to a multi-instruction GOT/GOTOFF sequence (ldr GOT-slot; add r9; + * ldr; ...). Emitting that "inline" inside an IT block predicates only the + * FIRST instruction and lets the remaining ones run unconditionally, which + * clobbers the select result with the else-operand's address. Force + * pre-materialization into a scratch register in those modes. 
*/ + return !op->needs_deref && !pic && !text_and_data_separation; + case MACH_OP_SPILL: + return !op->needs_deref; /* LDR from stack is 1 instr; deref (VT_LLOCAL) needs 2 */ + case MACH_OP_FRAME_ADDR: + return 1; /* ADD reg, FP, #off is 1 instr */ + default: + return 0; + } +} + +/* Emit a single-instruction materialization of 'op' into 'reg'. + * Caller must ensure select_can_inline(op) returned 1. */ +static void select_emit_inline(MachineCodegenContext *ctx, const MachineOperand *op, int reg) +{ + switch (op->kind) + { + case MACH_OP_IMM: + { + thumb_opcode imm_op = th_generic_mov_imm((uint32_t)reg, (int)op->u.imm.val); + if (imm_op.size != 0) + ot(imm_op); + else + load_full_const(reg, PREG_NONE, LFC_SPLIT(op->u.imm.val)); + break; + } + case MACH_OP_REG: + ot_check_mov_reg(reg, op->u.reg.r0, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, + true); + break; + case MACH_OP_SYMBOL: + { + Sym *raw_sym = op->u.sym.sym; + Sym *sym = raw_sym ? validate_sym_for_reloc(raw_sym) : NULL; + tcc_machine_load_constant(reg, PREG_REG_NONE, op->u.sym.addend, 0, sym); + break; + } + case MACH_OP_SPILL: + tcc_machine_load_spill_slot(reg, op->u.spill.offset); + break; + case MACH_OP_FRAME_ADDR: + tcc_machine_addr_of_stack_slot(reg, op->u.frame.offset, 0); + break; + default: + tcc_error("compiler_error: select_emit_inline: unhandled kind %d", (int)op->kind); + break; + } +} + +ST_FUNC void tcc_gen_machine_select_mop(MachineOperand then_val, MachineOperand else_val, MachineOperand dest, + int cond_code) +{ + MachineCodegenContext mctx = {0}; + + int cond = mapcc(cond_code); + + /* Get destination register */ + int dest_reg = mach_get_dest_reg(&mctx, &dest, 0); + uint32_t excl = (1u << (uint32_t)dest_reg); + + /* Determine if each operand can be materialized in exactly one instruction. + * If so, we can emit it directly inside the ITE block into dest_reg, + * saving scratch registers and pre-materialization instructions. 
+ * + * Emitting inside the IT block is preferred because: + * - It avoids flag clobber (MOVS before ITE would destroy CMP flags) + * - It saves scratch registers (no pre-materialization needed) + * - It produces smaller code */ + int then_inline = select_can_inline(&then_val); + int else_inline = select_can_inline(&else_val); + + int then_reg = -1, else_reg = -1; + + /* Pre-materialize operands that need multi-instruction sequences. + * These are loaded into scratch registers BEFORE the ITE block. */ + if (!then_inline) + { + then_reg = mach_ensure_in_reg(&mctx, &then_val, excl); + excl |= (1u << (uint32_t)then_reg); + } + if (!else_inline) + { + else_reg = mach_ensure_in_reg(&mctx, &else_val, excl); + excl |= (1u << (uint32_t)else_reg); + } + + /* Identity-then shortcut: if the then-value is already in dest_reg, the + * predicated mov would be `movXX dest, dest` — a real instruction inside an + * IT block (the usual elision in ot_check_mov_reg is suppressed by in_it). + * Emit `IT ` + the else mov instead. Saves one instruction. */ + int then_is_identity = 0; + if (then_inline && then_val.kind == MACH_OP_REG && !then_val.needs_deref && + (int)then_val.u.reg.r0 == dest_reg) + then_is_identity = 1; + else if (!then_inline && then_reg == dest_reg) + then_is_identity = 1; + + if (then_is_identity) + { + int inv_cond = cond ^ 1; + th_literal_pool_reserve_upcoming_bytes(8); /* IT(2) + instr(2-4) */ + ot_check(th_it((uint16_t)inv_cond, 0x8u)); /* IT , single insn */ + if (else_inline) + select_emit_inline(&mctx, &else_val, dest_reg); + else + ot_check_mov_reg(dest_reg, else_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, true); + mach_writeback_dest(&dest, dest_reg); + mach_release_all(&mctx); + return; + } + + /* ITE mask: the second instruction uses the opposite condition. + * mask encoding: bit3 = E_flag for 2nd instr, bit2 = end marker. 
+ * E_flag = opposite of cond[0], so: mask = ((cond[0]^1) << 3) | (1 << 2) */ + uint32_t ite_mask = (uint32_t)(((cond & 1) ^ 1) << 3) | 0x4u; + + /* Reserve literal pool space to prevent pool dumps inside the IT block */ + th_literal_pool_reserve_upcoming_bytes(10); /* ITE(2) + instr(2-4) + instr(2-4) */ + + ot_check(th_it((uint16_t)cond, (uint16_t)ite_mask)); + + /* Emit the Then instruction inside IT block */ + if (then_inline) + select_emit_inline(&mctx, &then_val, dest_reg); + else + ot_check_mov_reg(dest_reg, then_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, + true); + + /* Emit the Else instruction inside IT block */ + if (else_inline) + select_emit_inline(&mctx, &else_val, dest_reg); + else + ot_check_mov_reg(dest_reg, else_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, + true); + + mach_writeback_dest(&dest, dest_reg); + mach_release_all(&mctx); +} + +ST_FUNC void tcc_gen_machine_block_copy_mop(TCCIRState *ir, IROperand dest, IROperand src, int size) +{ + if (size <= 0 || (size & 3)) + tcc_error("compiler_error: block_copy size must be positive multiple of 4, got %d", size); + + /* Get the source symbol from the SYMREF operand */ + IRPoolSymref *symref = irop_get_symref_ex(ir, src); + if (!symref || !symref->sym) + tcc_error("compiler_error: block_copy source is not a valid symbol reference"); + Sym *sym = validate_sym_for_reloc(symref->sym); + + /* Get the destination stack offset */ + int frame_offset = (int)irop_get_imm64_ex(ir, dest); + + /* For large copies, call memcpy instead of inline LDM/STM. + * Compute dest address into r0 BEFORE pushing lr, since the address is + * sp-relative and pushing changes sp. The BL to memcpy clobbers lr, + * so we must save/restore it for leaf functions whose prologue didn't. 
*/ + if (size >= 64) + { + tcc_machine_addr_of_stack_slot(R0, frame_offset, 0 /* not param */); + tcc_machine_load_constant(R1, PREG_REG_NONE, symref->addend, 0, sym); + tcc_machine_load_constant(R2, PREG_REG_NONE, size, 0, NULL); + int need_lr_save = ir->leaffunc; + if (need_lr_save) + ot_check(th_push(1u << ARM_LR)); + Sym *memcpy_sym = external_global_sym(tok_alloc_const("memcpy"), &func_old_type); + MachineOperand func_mop = {0}; + func_mop.kind = MACH_OP_SYMBOL; + func_mop.u.sym.sym = memcpy_sym; + func_mop.u.sym.addend = 0; + if (text_and_data_separation) + ot_check(th_push((uint16_t)((1 << R9) | (1 << R12)))); + gcall_or_jump_mop(0, func_mop); + if (text_and_data_separation) + ot_check(th_pop((uint16_t)((1 << R9) | (1 << R12)))); + if (need_lr_save) + ot_check(th_pop(1u << ARM_LR)); + return; + } + + int nwords = size / 4; + + /* Allocate pointer registers first and compute addresses BEFORE allocating + * data registers. Data register saves may use PUSH which modifies SP, + * so all SP-relative address computation must happen before that. */ + ScratchRegAlloc src_scratch = get_scratch_reg_with_save(0); + int r_src = src_scratch.reg; + ScratchRegAlloc dst_scratch = get_scratch_reg_with_save(1u << (uint32_t)r_src); + int r_dst = dst_scratch.reg; + + /* Load source address (rodata symbol) into r_src */ + tcc_machine_load_constant(r_src, PREG_REG_NONE, symref->addend, 0, sym); + + /* Compute destination stack address into r_dst BEFORE any data reg saves + * that might change SP via PUSH */ + tcc_machine_addr_of_stack_slot(r_dst, frame_offset, 0 /* not param */); + + /* Now allocate data registers for LDM/STM. Even if these saves use PUSH + * and modify SP, we've already captured the destination address in r_dst. */ + int max_data = nwords < 4 ? 
nwords : 4; + if (max_data < 1) + max_data = 1; + + ScratchRegAlloc data_scratches[4]; + int data_regs[4]; + int ndata = 0; + uint32_t exclude = (1u << (uint32_t)r_src) | (1u << (uint32_t)r_dst); + for (int k = 0; k < max_data; k++) + { + data_scratches[k] = get_scratch_reg_with_save(exclude); + data_regs[k] = data_scratches[k].reg; + exclude |= (1u << (uint32_t)data_regs[k]); + ndata++; + } + + int remaining_words = nwords; + + /* Process in chunks of ndata words using LDM/STM with writeback */ + while (remaining_words >= ndata && ndata >= 2) + { + uint32_t regset = 0; + for (int j = 0; j < ndata; j++) + regset |= (1u << (uint32_t)data_regs[j]); + + ot_check(th_ldm(r_src, regset, 1 /* writeback */, ENFORCE_ENCODING_NONE)); + ot_check(th_stm(r_dst, regset, 1 /* writeback */, ENFORCE_ENCODING_NONE)); + remaining_words -= ndata; + } + + /* Handle remaining words individually */ + int dr = data_regs[0]; /* first data register */ + while (remaining_words > 0) + { + ot_check_ldr_imm(dr, r_src, 0, 6, ENFORCE_ENCODING_NONE); + ot_check_str_imm(dr, r_dst, 0, 6, ENFORCE_ENCODING_NONE); + if (remaining_words > 1) + { + if (!ot(th_add_imm(r_src, r_src, 4, flags_safe(), ENFORCE_ENCODING_NONE))) + tcc_error("compiler_error: block_copy cannot advance source pointer"); + if (!ot(th_add_imm(r_dst, r_dst, 4, flags_safe(), ENFORCE_ENCODING_NONE))) + tcc_error("compiler_error: block_copy cannot advance dest pointer"); + } + remaining_words--; + } + + /* Restore all scratch registers in reverse order: data regs first, then ptrs */ + for (int k = ndata - 1; k >= 0; k--) + restore_scratch_reg(&data_scratches[k]); + restore_scratch_reg(&dst_scratch); + restore_scratch_reg(&src_scratch); +} + +ST_FUNC void tcc_gen_machine_spill_block_copy(int32_t src_spill_off, int32_t dst_spill_off, int nwords) +{ + ScratchRegAlloc src_scratch = get_scratch_reg_with_save(0); + int r_src = src_scratch.reg; + ScratchRegAlloc dst_scratch = get_scratch_reg_with_save(1u << (uint32_t)r_src); + int r_dst = 
dst_scratch.reg; + + tcc_machine_addr_of_stack_slot(r_src, src_spill_off, 0); + tcc_machine_addr_of_stack_slot(r_dst, dst_spill_off, 0); + + int max_data = nwords < 4 ? nwords : 4; + if (max_data < 1) + max_data = 1; + + ScratchRegAlloc data_scratches[4]; + int data_regs[4]; + int ndata = 0; + uint32_t exclude = (1u << (uint32_t)r_src) | (1u << (uint32_t)r_dst); + for (int k = 0; k < max_data; k++) + { + data_scratches[k] = get_scratch_reg_with_save(exclude); + data_regs[k] = data_scratches[k].reg; + exclude |= (1u << (uint32_t)data_regs[k]); + ndata++; + } + + int remaining = nwords; + + while (remaining >= ndata && ndata >= 2) + { + uint32_t regset = 0; + for (int j = 0; j < ndata; j++) + regset |= (1u << (uint32_t)data_regs[j]); + ot_check(th_ldm(r_src, regset, 1 /* writeback */, ENFORCE_ENCODING_NONE)); + ot_check(th_stm(r_dst, regset, 1 /* writeback */, ENFORCE_ENCODING_NONE)); + remaining -= ndata; + } + + int dr = data_regs[0]; + while (remaining > 0) + { + ot_check_ldr_imm(dr, r_src, 0, 6, ENFORCE_ENCODING_NONE); + ot_check_str_imm(dr, r_dst, 0, 6, ENFORCE_ENCODING_NONE); + if (remaining > 1) + { + if (!ot(th_add_imm(r_src, r_src, 4, flags_safe(), ENFORCE_ENCODING_NONE))) + tcc_error("compiler_error: spill_block_copy cannot advance source pointer"); + if (!ot(th_add_imm(r_dst, r_dst, 4, flags_safe(), ENFORCE_ENCODING_NONE))) + tcc_error("compiler_error: spill_block_copy cannot advance dest pointer"); + } + remaining--; + } + + for (int k = ndata - 1; k >= 0; k--) + restore_scratch_reg(&data_scratches[k]); + restore_scratch_reg(&dst_scratch); + restore_scratch_reg(&src_scratch); +} + ST_FUNC void tcc_gen_machine_trap_mop(void) { /* Emit UDF #0xfe - Undefined instruction for trap */ @@ -8633,14 +12586,15 @@ ST_FUNC void tcc_gen_machine_prefetch_mop(MachineOperand addr, int rw) case MACH_OP_SPILL: { /* Spill slot: compute address (FP + offset) then PLD */ - /* Load offset into IP (R12), add FP, then PLD [R12] */ int32_t offset = addr.u.spill.offset; if (offset 
!= 0) { - load_full_const(ARM_R12, PREG_NONE, LFC_SPLIT(offset)); - ot_check(th_add_reg(ARM_R12, R_FP, ARM_R12, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + ScratchRegAlloc scr = get_scratch_reg_with_save(0); + load_full_const(scr.reg, PREG_NONE, LFC_SPLIT(offset)); + ot_check(th_add_reg(scr.reg, R_FP, scr.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - ot_check(th_pld_imm(ARM_R12, 0, 0)); + ot_check(th_pld_imm(scr.reg, 0, 0)); + restore_scratch_reg(&scr); } else { @@ -8651,17 +12605,20 @@ ST_FUNC void tcc_gen_machine_prefetch_mop(MachineOperand addr, int rw) case MACH_OP_IMM: { /* For immediate addresses, load into a register first */ - /* Use R12 (IP) as scratch since it's caller-saved */ - load_full_const(ARM_R12, PREG_NONE, LFC_SPLIT(addr.u.imm.val)); - ot_check(th_pld_imm(ARM_R12, 0, 0)); + ScratchRegAlloc scr = get_scratch_reg_with_save(0); + load_full_const(scr.reg, PREG_NONE, LFC_SPLIT(addr.u.imm.val)); + ot_check(th_pld_imm(scr.reg, 0, 0)); + restore_scratch_reg(&scr); break; } case MACH_OP_SYMBOL: { /* For symbol addresses, load into a register first */ + ScratchRegAlloc scr = get_scratch_reg_with_save(0); _lfc_sym = addr.u.sym.sym; - load_full_const(ARM_R12, PREG_NONE, LFC_SPLIT(addr.u.sym.addend)); - ot_check(th_pld_imm(ARM_R12, 0, 0)); + load_full_const(scr.reg, PREG_NONE, LFC_SPLIT(addr.u.sym.addend)); + ot_check(th_pld_imm(scr.reg, 0, 0)); + restore_scratch_reg(&scr); break; } case MACH_OP_FRAME_ADDR: @@ -8670,10 +12627,12 @@ ST_FUNC void tcc_gen_machine_prefetch_mop(MachineOperand addr, int rw) int32_t offset = addr.u.frame.offset; if (offset != 0) { - load_full_const(ARM_R12, PREG_NONE, LFC_SPLIT(offset)); - ot_check(th_add_reg(ARM_R12, R_FP, ARM_R12, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + ScratchRegAlloc scr = get_scratch_reg_with_save(0); + load_full_const(scr.reg, PREG_NONE, LFC_SPLIT(offset)); + ot_check(th_add_reg(scr.reg, R_FP, scr.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); - 
ot_check(th_pld_imm(ARM_R12, 0, 0)); + ot_check(th_pld_imm(scr.reg, 0, 0)); + restore_scratch_reg(&scr); } else { @@ -8688,14 +12647,24 @@ ST_FUNC void tcc_gen_machine_prefetch_mop(MachineOperand addr, int rw) /* __builtin_setjmp implementation for ARM Thumb-2. * - * Jump buffer layout (3 words, fits in the standard 5-word buffer): + * GCC's documented ABI gives __builtin_setjmp a 5-word buffer; callers + * (e.g. gcc.c-torture pr84521) really do pass `void *buf[5]`, so nothing + * larger may be written through the buffer pointer. The callee-saved + * register file (r4-r11) still must be restored on longjmp — the register + * allocator keeps VARs and the R9 GOT base in r4-r11 across the setjmp — + * so those 8 words live in a hidden, compiler-allocated save area in the + * setjmp-containing function's frame (src2/area), which stays valid for + * as long as a longjmp to this buffer is legal. + * + * Jump buffer layout (4 words used, fits the standard 5-word buffer): * buf[0] = frame pointer (R7/FP) * buf[1] = resume address (Thumb-bit set) * buf[2] = stack pointer (SP) + * buf[3] = address of the hidden r4-r11 save area (32 bytes) * * Returns 0 on initial call, 1 when returning via longjmp. 
*/ -ST_FUNC void tcc_gen_machine_setjmp_mop(MachineOperand buf, MachineOperand dest) +ST_FUNC void tcc_gen_machine_setjmp_mop(MachineOperand buf, MachineOperand area, MachineOperand dest) { MachineCodegenContext ctx = {0}; int buf_reg; @@ -8703,19 +12672,44 @@ ST_FUNC void tcc_gen_machine_setjmp_mop(MachineOperand buf, MachineOperand dest) if (buf.kind == MACH_OP_NONE) { buf_reg = mach_alloc_scratch(&ctx, 0); - ot_check(th_mov_imm(buf_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(buf_reg, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } else { - buf_reg = mach_ensure_in_reg(&ctx, &buf, 0); + /* Exclude r4-r11 as scratch candidates: a saved-scratch there would + * hold the buffer pointer when the area stores below run, corrupting + * the saved register file (same class as the MLA scratch-pop bug). */ + buf_reg = mach_ensure_in_reg(&ctx, &buf, 0x0FF0); + } + + /* ---- save callee-saved r4-r11 into the hidden frame area ---- + * The area address is computed in IP (caller-saved) so the r4-r11 + * values stored are the untouched setjmp-time ones; a scratch from + * mach_alloc_scratch could pick a callee-saved register. 
*/ + if (area.kind == MACH_OP_FRAME_ADDR) + { + tcc_machine_addr_of_stack_slot(R_IP, area.u.frame.offset, 0 /* not param */); + } + else + { + tcc_error("compiler_error: setjmp save area must be a frame slot (kind %d)", (int)area.kind); } + ot_check_str_imm(4, R_IP, 0, 6, ENFORCE_ENCODING_NONE); /* r4 -> area[0] */ + ot_check_str_imm(5, R_IP, 4, 6, ENFORCE_ENCODING_NONE); /* r5 -> area[1] */ + ot_check_str_imm(6, R_IP, 8, 6, ENFORCE_ENCODING_NONE); /* r6 -> area[2] */ + ot_check_str_imm(R_FP, R_IP, 12, 6, ENFORCE_ENCODING_NONE); /* r7 -> area[3] */ + ot_check_str_imm(8, R_IP, 16, 6, ENFORCE_ENCODING_NONE); /* r8 -> area[4] */ + ot_check_str_imm(9, R_IP, 20, 6, ENFORCE_ENCODING_NONE); /* r9 -> area[5] */ + ot_check_str_imm(10, R_IP, 24, 6, ENFORCE_ENCODING_NONE); /* r10 -> area[6] */ + ot_check_str_imm(11, R_IP, 28, 6, ENFORCE_ENCODING_NONE); /* r11 -> area[7] */ + ot_check_str_imm(R_IP, buf_reg, 12, 6, ENFORCE_ENCODING_NONE); /* &area -> buf[3] */ /* ---- save frame pointer ---- */ - ot_check(th_str_imm(R_FP, buf_reg, 0, 6, ENFORCE_ENCODING_NONE)); /* r7 -> buf[0] */ + ot_check_str_imm(R_FP, buf_reg, 0, 6, ENFORCE_ENCODING_NONE); /* r7 -> buf[0] */ /* ---- save SP ---- */ - ot_check(th_mov_reg(R_IP, R_SP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); - ot_check(th_str_imm(R_IP, buf_reg, 8, 6, ENFORCE_ENCODING_NONE)); /* SP -> buf[2] */ + ot_check_mov_reg(R_IP, R_SP, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + ot_check_str_imm(R_IP, buf_reg, 8, 6, ENFORCE_ENCODING_NONE); /* SP -> buf[2] */ /* ---- save resume address (ADR IP, resume_label) ---- */ int adr_addr = ind; @@ -8725,16 +12719,16 @@ ST_FUNC void tcc_gen_machine_setjmp_mop(MachineOperand buf, MachineOperand dest) int adr_imm = resume_label_addr - adr_base; ot_check(th_adr_imm(R_IP, adr_imm, ENFORCE_ENCODING_32BIT)); - ot_check(th_orr_imm(R_IP, R_IP, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); /* Thumb bit */ - 
ot_check(th_str_imm(R_IP, buf_reg, 4, 6, ENFORCE_ENCODING_NONE)); /* -> buf[1] */ + ot_check(th_orr_imm(R_IP, R_IP, 1, flags_safe(), ENFORCE_ENCODING_NONE)); /* Thumb bit */ + ot_check_str_imm(R_IP, buf_reg, 4, 6, ENFORCE_ENCODING_NONE); /* -> buf[1] */ /* ---- normal path: return 0 ---- */ int dest_reg = mach_get_dest_reg(&ctx, &dest, 0); - ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_32BIT)); /* dest = 0 */ + ot_check(th_mov_imm(dest_reg, 0, flags_safe(), ENFORCE_ENCODING_32BIT)); /* dest = 0 */ ot_check(th_b_t4(4)); /* B.W +4 (skip resume) */ /* ---- resume_label: longjmp lands here ---- */ - ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_32BIT)); /* dest = 1 */ + ot_check(th_mov_imm(dest_reg, 1, flags_safe(), ENFORCE_ENCODING_32BIT)); /* dest = 1 */ /* ---- end_label ---- */ mach_writeback_dest(&dest, dest_reg); @@ -8758,7 +12752,7 @@ ST_FUNC void tcc_gen_machine_nl_setjmp_mop(MachineOperand buf, MachineOperand de if (buf.kind == MACH_OP_NONE) { buf_reg = mach_alloc_scratch(&ctx, 0); - ot_check(th_mov_imm(buf_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); + ot_check(th_mov_imm(buf_reg, 0, flags_safe(), ENFORCE_ENCODING_NONE)); } else { @@ -8766,18 +12760,18 @@ ST_FUNC void tcc_gen_machine_nl_setjmp_mop(MachineOperand buf, MachineOperand de } /* ---- save callee-saved registers r4-r11 ---- */ - ot_check(th_str_imm(4, buf_reg, 0, 6, ENFORCE_ENCODING_NONE)); /* r4 -> buf[0] */ - ot_check(th_str_imm(5, buf_reg, 4, 6, ENFORCE_ENCODING_NONE)); /* r5 -> buf[1] */ - ot_check(th_str_imm(6, buf_reg, 8, 6, ENFORCE_ENCODING_NONE)); /* r6 -> buf[2] */ - ot_check(th_str_imm(R_FP, buf_reg, 12, 6, ENFORCE_ENCODING_NONE)); /* r7 -> buf[3] */ - ot_check(th_str_imm(8, buf_reg, 16, 6, ENFORCE_ENCODING_NONE)); /* r8 -> buf[4] */ - ot_check(th_str_imm(9, buf_reg, 20, 6, ENFORCE_ENCODING_NONE)); /* r9 -> buf[5] */ - ot_check(th_str_imm(10, buf_reg, 24, 6, ENFORCE_ENCODING_NONE)); /* r10 -> buf[6] 
*/ - ot_check(th_str_imm(11, buf_reg, 28, 6, ENFORCE_ENCODING_NONE)); /* r11 -> buf[7] */ + ot_check_str_imm(4, buf_reg, 0, 6, ENFORCE_ENCODING_NONE); /* r4 -> buf[0] */ + ot_check_str_imm(5, buf_reg, 4, 6, ENFORCE_ENCODING_NONE); /* r5 -> buf[1] */ + ot_check_str_imm(6, buf_reg, 8, 6, ENFORCE_ENCODING_NONE); /* r6 -> buf[2] */ + ot_check_str_imm(R_FP, buf_reg, 12, 6, ENFORCE_ENCODING_NONE); /* r7 -> buf[3] */ + ot_check_str_imm(8, buf_reg, 16, 6, ENFORCE_ENCODING_NONE); /* r8 -> buf[4] */ + ot_check_str_imm(9, buf_reg, 20, 6, ENFORCE_ENCODING_NONE); /* r9 -> buf[5] */ + ot_check_str_imm(10, buf_reg, 24, 6, ENFORCE_ENCODING_NONE); /* r10 -> buf[6] */ + ot_check_str_imm(11, buf_reg, 28, 6, ENFORCE_ENCODING_NONE); /* r11 -> buf[7] */ /* ---- save SP ---- */ - ot_check(th_mov_reg(R_IP, R_SP, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); - ot_check(th_str_imm(R_IP, buf_reg, 32, 6, ENFORCE_ENCODING_NONE)); /* SP -> buf[8] */ + ot_check_mov_reg(R_IP, R_SP, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + ot_check_str_imm(R_IP, buf_reg, 32, 6, ENFORCE_ENCODING_NONE); /* SP -> buf[8] */ /* ---- save resume address (ADR IP, resume_label) ---- */ int adr_addr = ind; @@ -8787,16 +12781,16 @@ ST_FUNC void tcc_gen_machine_nl_setjmp_mop(MachineOperand buf, MachineOperand de int adr_imm = resume_label_addr - adr_base; ot_check(th_adr_imm(R_IP, adr_imm, ENFORCE_ENCODING_32BIT)); - ot_check(th_orr_imm(R_IP, R_IP, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE)); /* Thumb bit */ - ot_check(th_str_imm(R_IP, buf_reg, 36, 6, ENFORCE_ENCODING_NONE)); /* -> buf[9] */ + ot_check(th_orr_imm(R_IP, R_IP, 1, flags_safe(), ENFORCE_ENCODING_NONE)); /* Thumb bit */ + ot_check_str_imm(R_IP, buf_reg, 36, 6, ENFORCE_ENCODING_NONE); /* -> buf[9] */ /* ---- normal path: return 0 ---- */ int dest_reg = mach_get_dest_reg(&ctx, &dest, 0); - ot_check(th_mov_imm(dest_reg, 0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_32BIT)); /* dest = 
0 */ + ot_check(th_mov_imm(dest_reg, 0, flags_safe(), ENFORCE_ENCODING_32BIT)); /* dest = 0 */ ot_check(th_b_t4(4)); /* B.W +4 (skip resume) */ /* ---- resume_label: longjmp lands here ---- */ - ot_check(th_mov_imm(dest_reg, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_32BIT)); /* dest = 1 */ + ot_check(th_mov_imm(dest_reg, 1, flags_safe(), ENFORCE_ENCODING_32BIT)); /* dest = 1 */ /* ---- end_label ---- */ mach_writeback_dest(&dest, dest_reg); @@ -8805,11 +12799,13 @@ ST_FUNC void tcc_gen_machine_nl_setjmp_mop(MachineOperand buf, MachineOperand de /* __builtin_longjmp implementation for ARM Thumb-2. * - * Restores FP and SP saved by __builtin_setjmp, then jumps to the resume - * address. Uses the minimal 3-word buffer layout. + * Restores the callee-saved register file (r4-r11, from the hidden save + * area whose address setjmp left in buf[3]) and SP, then jumps to the + * resume address. This function does not return, so every caller-saved + * register is fair game as a temporary. 
* - * Buffer layout (must match __builtin_setjmp): - * buf[0] = FP, buf[1] = resume_addr, buf[2] = SP + * Buffer layout (must match tcc_gen_machine_setjmp_mop): + * buf[0] = FP, buf[1] = resume_addr, buf[2] = SP, buf[3] = &save_area */ ST_FUNC void tcc_gen_machine_longjmp_mop(MachineOperand buf) { @@ -8824,18 +12820,30 @@ ST_FUNC void tcc_gen_machine_longjmp_mop(MachineOperand buf) buf_reg = mach_ensure_in_reg(&ctx, &buf, 0); - /* Copy buf pointer to IP so it survives FP restore */ - ot_check(th_mov_reg(R_IP, buf_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); - - /* Read resume address and saved SP into caller-saved regs first */ - ot_check(th_ldr_imm(0, R_IP, 4, 6, ENFORCE_ENCODING_NONE)); /* r0 = resume addr */ - ot_check(th_ldr_imm(1, R_IP, 8, 6, ENFORCE_ENCODING_NONE)); /* r1 = saved SP */ - - /* Restore frame pointer */ - ot_check(th_ldr_imm(R_FP, R_IP, 0, 6, ENFORCE_ENCODING_NONE)); /* r7 = FP */ + /* Copy buf pointer to IP so it survives the register restores */ + ot_check_mov_reg(R_IP, buf_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + + /* Read resume address, saved SP and save-area pointer into caller-saved + * regs before clobbering anything callee-saved. */ + ot_check_ldr_imm(0, R_IP, 4, 6, ENFORCE_ENCODING_NONE); /* r0 = resume addr */ + ot_check_ldr_imm(1, R_IP, 8, 6, ENFORCE_ENCODING_NONE); /* r1 = saved SP */ + ot_check_ldr_imm(2, R_IP, 12, 6, ENFORCE_ENCODING_NONE); /* r2 = &save_area */ + + /* Restore callee-saved r4-r11 (r7/FP comes from the area too; the copy + * in buf[0] is identical). 
*/ + ot_check_ldr_imm(4, 2, 0, 6, ENFORCE_ENCODING_NONE); /* r4 */ + ot_check_ldr_imm(5, 2, 4, 6, ENFORCE_ENCODING_NONE); /* r5 */ + ot_check_ldr_imm(6, 2, 8, 6, ENFORCE_ENCODING_NONE); /* r6 */ + ot_check_ldr_imm(R_FP, 2, 12, 6, ENFORCE_ENCODING_NONE); /* r7 */ + ot_check_ldr_imm(8, 2, 16, 6, ENFORCE_ENCODING_NONE); /* r8 */ + allow_r9_write = 1; /* restoring the setjmp-time GOT base is the point */ + ot_check_ldr_imm(9, 2, 20, 6, ENFORCE_ENCODING_NONE); /* r9 */ + allow_r9_write = 0; + ot_check_ldr_imm(10, 2, 24, 6, ENFORCE_ENCODING_NONE); /* r10 */ + ot_check_ldr_imm(11, 2, 28, 6, ENFORCE_ENCODING_NONE); /* r11 */ /* Restore SP */ - ot_check(th_mov_reg(R_SP, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_SP, 1, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); /* Jump to resume address (Thumb bit already set by setjmp code) */ ot_check(th_bx_reg(0)); @@ -8876,14 +12884,14 @@ ST_FUNC void tcc_gen_machine_nl_longjmp_mop(MachineOperand buf) if (abs_off == 0) { if (buf_reg != base) - ot_check(th_mov_reg((uint32_t)buf_reg, (uint32_t)base, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, - ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg((uint32_t)buf_reg, (uint32_t)base, flags_safe(), THUMB_SHIFT_DEFAULT, + ENFORCE_ENCODING_NONE, false); } else { thumb_opcode ins = sign - ? th_sub_imm(buf_reg, base, abs_off, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE) - : th_add_imm(buf_reg, base, abs_off, FLAGS_BEHAVIOUR_NOT_IMPORTANT, ENFORCE_ENCODING_NONE); + ? 
th_sub_imm(buf_reg, base, abs_off, flags_safe(), ENFORCE_ENCODING_NONE) + : th_add_imm(buf_reg, base, abs_off, flags_safe(), ENFORCE_ENCODING_NONE); if (ins.size != 0) { ot_check(ins); @@ -8892,9 +12900,9 @@ ST_FUNC void tcc_gen_machine_nl_longjmp_mop(MachineOperand buf) { ScratchRegAlloc off_sc = get_scratch_reg_with_save(excl | (1u << (uint32_t)buf_reg) | (1u << (uint32_t)base)); load_full_const(off_sc.reg, PREG_NONE, LFC_SPLIT(abs_off)); - ot_check(sign ? th_sub_reg(buf_reg, base, off_sc.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + ot_check(sign ? th_sub_reg(buf_reg, base, off_sc.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE) - : th_add_reg(buf_reg, base, off_sc.reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, + : th_add_reg(buf_reg, base, off_sc.reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE)); restore_scratch_reg(&off_sc); } @@ -8908,25 +12916,27 @@ ST_FUNC void tcc_gen_machine_nl_longjmp_mop(MachineOperand buf) } /* Copy buf pointer to IP so it survives register restores */ - ot_check(th_mov_reg(R_IP, buf_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_IP, buf_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); /* Load resume address and saved SP into caller-saved regs first * (before we clobber r4+ with the restore) */ - ot_check(th_ldr_imm(0, R_IP, 36, 6, ENFORCE_ENCODING_NONE)); /* r0 = resume addr */ - ot_check(th_ldr_imm(1, R_IP, 32, 6, ENFORCE_ENCODING_NONE)); /* r1 = saved SP */ + ot_check_ldr_imm(0, R_IP, 36, 6, ENFORCE_ENCODING_NONE); /* r0 = resume addr */ + ot_check_ldr_imm(1, R_IP, 32, 6, ENFORCE_ENCODING_NONE); /* r1 = saved SP */ /* Restore callee-saved registers r4-r11 */ - ot_check(th_ldr_imm(4, R_IP, 0, 6, ENFORCE_ENCODING_NONE)); /* r4 = buf[0] */ - ot_check(th_ldr_imm(5, R_IP, 4, 6, ENFORCE_ENCODING_NONE)); /* r5 = buf[1] */ - ot_check(th_ldr_imm(6, R_IP, 8, 6, ENFORCE_ENCODING_NONE)); /* r6 = buf[2] */ - 
ot_check(th_ldr_imm(R_FP, R_IP, 12, 6, ENFORCE_ENCODING_NONE)); /* r7 = buf[3] (FP) */ - ot_check(th_ldr_imm(8, R_IP, 16, 6, ENFORCE_ENCODING_NONE)); /* r8 = buf[4] */ - ot_check(th_ldr_imm(9, R_IP, 20, 6, ENFORCE_ENCODING_NONE)); /* r9 = buf[5] */ - ot_check(th_ldr_imm(10, R_IP, 24, 6, ENFORCE_ENCODING_NONE)); /* r10 = buf[6] */ - ot_check(th_ldr_imm(11, R_IP, 28, 6, ENFORCE_ENCODING_NONE)); /* r11 = buf[7] */ + ot_check_ldr_imm(4, R_IP, 0, 6, ENFORCE_ENCODING_NONE); /* r4 = buf[0] */ + ot_check_ldr_imm(5, R_IP, 4, 6, ENFORCE_ENCODING_NONE); /* r5 = buf[1] */ + ot_check_ldr_imm(6, R_IP, 8, 6, ENFORCE_ENCODING_NONE); /* r6 = buf[2] */ + ot_check_ldr_imm(R_FP, R_IP, 12, 6, ENFORCE_ENCODING_NONE); /* r7 = buf[3] (FP) */ + ot_check_ldr_imm(8, R_IP, 16, 6, ENFORCE_ENCODING_NONE); /* r8 = buf[4] */ + allow_r9_write = 1; + ot_check_ldr_imm(9, R_IP, 20, 6, ENFORCE_ENCODING_NONE); /* r9 = buf[5] */ + allow_r9_write = 0; + ot_check_ldr_imm(10, R_IP, 24, 6, ENFORCE_ENCODING_NONE); /* r10 = buf[6] */ + ot_check_ldr_imm(11, R_IP, 28, 6, ENFORCE_ENCODING_NONE); /* r11 = buf[7] */ /* Restore SP */ - ot_check(th_mov_reg(R_SP, 1, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_SP, 1, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); /* Jump to resume address (Thumb bit already set by setjmp code) */ ot_check(th_bx_reg(0)); @@ -8970,25 +12980,41 @@ ST_FUNC void tcc_gen_machine_builtin_apply_mop(MachineOperand fn, MachineOperand { MachineCodegenContext ctx = {0}; - /* Step 1: Load args block pointer into a callee-saved scratch register. - * We use the scratch allocator which will pick a suitable register. */ - int args_reg = mach_ensure_in_reg(&ctx, &args, 0); + /* Registers destroyed by the restore-and-call sequence below: r0-r3 are + * reloaded with the saved argument values, and ip(r12)+lr are clobbered by + * the BLX. 
The args-block base pointer (used by all four restore loads) and + * the callee address must therefore live OUTSIDE this set until used. */ + const uint32_t clobbered = + (1u << R0) | (1u << R1) | (1u << R2) | (1u << R3) | (1u << (uint32_t)R_IP); + + /* Step 1: Materialize the args block pointer, then guarantee it is in a + * register the restore loads won't overwrite. mach_ensure_in_reg returns an + * already-allocated operand register verbatim (ignoring the exclusion mask), + * so when the value already lives in r0-r3 / ip we must relocate it to a + * safe scratch — otherwise the very first load (r0 <- [base+4]) destroys the + * base pointer and the remaining loads read from garbage addresses. */ + int args_reg = mach_ensure_in_reg(&ctx, &args, clobbered); + if (clobbered & (1u << (uint32_t)args_reg)) + { + int safe = mach_alloc_scratch(&ctx, clobbered); + ot_check_mov_reg(safe, args_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); + args_reg = safe; + } /* Step 2: Load the function pointer into R12 (IP), which survives the * register loads below because IP is not one of r0-r3. */ - int fn_reg = mach_ensure_in_reg(&ctx, &fn, (1u << args_reg)); + int fn_reg = mach_ensure_in_reg(&ctx, &fn, (1u << (uint32_t)args_reg)); if (fn_reg != R_IP) { - ot_check( - th_mov_reg(R_IP, fn_reg, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(R_IP, fn_reg, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); } /* Step 3: Restore r0-r3 from the args block. * Layout: [+0]=stack_args_ptr, [+4]=r0, [+8]=r1, [+12]=r2, [+16]=r3. 
*/ - ot_check(th_ldr_imm(R0, args_reg, 4, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R1, args_reg, 8, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R2, args_reg, 12, 6, ENFORCE_ENCODING_NONE)); - ot_check(th_ldr_imm(R3, args_reg, 16, 6, ENFORCE_ENCODING_NONE)); + ot_check_ldr_imm(R0, args_reg, 4, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R1, args_reg, 8, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R2, args_reg, 12, 6, ENFORCE_ENCODING_NONE); + ot_check_ldr_imm(R3, args_reg, 16, 6, ENFORCE_ENCODING_NONE); /* Step 4: Call the function via BLX R12. * This clobbers LR and r0-r3 (caller-saved). */ @@ -8998,8 +13024,7 @@ ST_FUNC void tcc_gen_machine_builtin_apply_mop(MachineOperand fn, MachineOperand int dest_reg = mach_get_dest_reg(&ctx, &dest, 0); if (dest_reg != R0) { - ot_check( - th_mov_reg(dest_reg, R0, FLAGS_BEHAVIOUR_NOT_IMPORTANT, THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false)); + ot_check_mov_reg(dest_reg, R0, flags_safe(), THUMB_SHIFT_DEFAULT, ENFORCE_ENCODING_NONE, false); } mach_writeback_dest(&dest, dest_reg); @@ -9223,3 +13248,141 @@ ST_FUNC void tcc_gen_machine_func_parameter_mop(MachineOperand src1, MachineOper /* Store parameter information - for now just mark as present */ call_site->function_argument_list[param_index] = 1; /* Mark parameter as present */ } +/* Emit a nested-function trampoline into the current text section. + * chain_slot_sym: TCC symbol for the chain slot in .data + * func_sym: TCC symbol for the nested function in .text + * + * The trampoline loads the parent frame pointer from the chain slot + * into R10 (the static-chain register) and tail-calls the nested function. + * + * Two variants: + * - GOT-indirect (text_and_data_separation): uses R9-relative GOT loads, + * relocations are R_ARM_GOT32 (linker-resolved, no absolute addresses + * in the code section). + * - Direct: inline literal pool with R_ARM_ABS32 relocations. 
+ */ +ST_FUNC addr_t gen_nested_func_trampoline(Sym *chain_slot_sym, Sym *func_sym) +{ + Section *text_sec = cur_text_section; + int use_got = tcc_state->text_and_data_separation; + + section_prealloc(text_sec, use_got ? 36 : 24); + + /* Align ind to 4-byte boundary */ + while (ind & 3) + text_sec->data[ind++] = 0x00; + + addr_t tramp_start = ind; + + if (use_got) + { + /* GOT-indirect trampoline (32 bytes): + * +0: LDR r12, [pc, #20] ; GOT offset of chain_slot (from +24) + * +4: LDR r10, [r9, r12] ; chain_slot address via GOT + * +8: LDR r10, [r10, #0] ; *chain_slot = parent FP + * +12: LDR r12, [pc, #12] ; GOT offset of function (from +28) + * +16: LDR r12, [r9, r12] ; function address via GOT + * +20: BX r12 ; tail-call + * +22: NOP + * +24: .word 0 ; R_ARM_GOT32 chain_slot + * +28: .word 0 ; R_ARM_GOT32 function + */ + + /* +0: LDR R12, [PC, #20] - F8DF C014 */ + text_sec->data[ind++] = 0xDF; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x14; + text_sec->data[ind++] = 0xC0; + + /* +4: LDR R10, [R9, R12] - F859 A00C */ + text_sec->data[ind++] = 0x59; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x0C; + text_sec->data[ind++] = 0xA0; + + /* +8: LDR R10, [R10, #0] - F8DA A000 */ + text_sec->data[ind++] = 0xDA; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0xA0; + + /* +12: LDR R12, [PC, #12] - F8DF C00C */ + text_sec->data[ind++] = 0xDF; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x0C; + text_sec->data[ind++] = 0xC0; + + /* +16: LDR R12, [R9, R12] - F859 C00C */ + text_sec->data[ind++] = 0x59; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x0C; + text_sec->data[ind++] = 0xC0; + + /* +20: BX R12 - 4760 */ + text_sec->data[ind++] = 0x60; + text_sec->data[ind++] = 0x47; + + /* +22: NOP - BF00 */ + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0xBF; + + /* +24: chain slot GOT offset */ + greloc(text_sec, chain_slot_sym, ind, R_ARM_GOT32); + text_sec->data[ind++] = 
0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + + /* +28: function GOT offset */ + greloc(text_sec, func_sym, ind, R_ARM_GOT32); + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + } + else + { + /* Direct trampoline (20 bytes): + * +0: LDR r10, [pc, #8] ; chain_slot address (from +12) + * +4: LDR r10, [r10, #0] ; *chain_slot = parent FP + * +8: LDR pc, [pc, #4] ; function address (from +16), tail call + * +12: .word chain_slot ; R_ARM_ABS32 + * +16: .word function ; R_ARM_ABS32 + */ + + /* LDR R10, [PC, #8] - F8DF A008 */ + text_sec->data[ind++] = 0xDF; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x08; + text_sec->data[ind++] = 0xA0; + + /* LDR R10, [R10, #0] - F8DA A000 */ + text_sec->data[ind++] = 0xDA; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0xA0; + + /* LDR PC, [PC, #4] - F8DF F004 */ + text_sec->data[ind++] = 0xDF; + text_sec->data[ind++] = 0xF8; + text_sec->data[ind++] = 0x04; + text_sec->data[ind++] = 0xF0; + + /* chain slot address */ + greloc(text_sec, chain_slot_sym, ind, R_ARM_ABS32); + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + + /* function address */ + greloc(text_sec, func_sym, ind, R_ARM_ABS32); + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + text_sec->data[ind++] = 0x00; + } + + text_sec->data_offset = ind; + return tramp_start + 1; /* +1 for Thumb interworking bit */ +} diff --git a/arm-thumb-opcodes.c b/arm-thumb-opcodes.c deleted file mode 100644 index 10a9d919..00000000 --- a/arm-thumb-opcodes.c +++ /dev/null @@ -1,3900 +0,0 @@ -/* - * ARMvX-m opcodes for TCC - * Uses thumb instruction set - * - * Based on: - * ARM Thumb 2 instruction functions for TCC - * Copyright (c) 2020 Erlend J. 
Sveen - * from: - * https://git.erlendjs.no/erlendjs/tinycc/-/blob/arm-thumb/arm-thumb-gen.c - * https://git.erlendjs.no/erlendjs/tinycc/-/blob/arm-thumb/arm-thumb-instructions.c - * - * And - * - * ARMv4 code generator for TCC - * - * Copyright (c) 2003 Daniel Glöckner - * Copyright (c) 2012 Thomas Preud'homme - * - * Based on i386-gen.c by Fabrice Bellard - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#define USING_GLOBALS -#include "arm-thumb-opcodes.h" -#include "tcc.h" - -static void th_trace_regset(uint16_t regs) -{ - THOP_TRACE("{"); - for (unsigned r = 0; r < 16; ++r) - { - if (regs & (1u << r)) - { - THOP_TRACE("%s%s", first ? 
"" : ",", th_reg_name(r)); - } - } - THOP_TRACE("}"); -} - -static void th_trace_shift_suffix(thumb_shift shift) -{ - if (shift.type == THUMB_SHIFT_NONE) - return; - if (shift.type == THUMB_SHIFT_RRX) - { - THOP_TRACE(", rrx"); - return; - } - if (shift.mode == THUMB_SHIFT_REGISTER) - THOP_TRACE(", %s %s", th_shift_name(shift.type), th_reg_name(shift.value)); - else - THOP_TRACE(", %s #%u", th_shift_name(shift.type), (unsigned)shift.value); -} - -thumb_opcode th_nop(thumb_enforce_encoding encoding) -{ - if (encoding == ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3af8000, - }; - } - return (thumb_opcode){ - .size = 2, - .opcode = 0xbf00, - }; -} - -thumb_opcode th_sev(thumb_enforce_encoding encoding) -{ - if (encoding == ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3af8004, - }; - } - return (thumb_opcode){ - .size = 2, - .opcode = 0xbf40, - }; -} - -uint32_t th_packimm_10_11_0(uint32_t imm) -{ - const uint32_t imm11 = (imm >> 1) & 0x7ff; - const uint32_t imm10 = (imm >> 12) & 0x3ff; - const uint32_t s = (imm >> 24) & 1; - const uint32_t j1 = ~((imm >> 23) ^ s) & 1; - const uint32_t j2 = ~((imm >> 22) ^ s) & 1; - return (s << 26) | (imm10 << 16) | (j1 << 13) | (j2 << 11) | imm11; -} - -uint32_t th_packimm_3_8_1(uint32_t imm) -{ - const uint32_t imm8 = imm & 0xff; - const uint32_t imm3 = (imm >> 8) & 0x7; - const uint32_t i = (imm >> 9) & 1; - return (i << 26) | (imm3 << 12) | imm8; -} - -uint32_t th_pack_const(uint32_t imm) -{ - // 00000000 00000000 00000000 abcdefgh - if ((imm & 0xffffff00) == 0) - { - return imm; - } - // 00000000 abcdefgh 00000000 abcdefgh - else if (!(imm & 0xff00ff00) && (imm >> 16) == (imm & 0xff)) - { - return (1 << 12) | (imm & 0xff); - } - // abcdefgh 00000000 abcdefgh 00000000 - else if (!(imm & 0x00ff00ff) && ((imm >> 16) & 0xff00) == (imm & 0xff00)) - { - return (2 << 12) | ((imm >> 8) & 0xff); - } - // abcdefgh abcdefgh abcdefgh abcdefgh - else if ((imm & 0xffff) 
== ((imm >> 16) & 0xffff) && ((imm >> 8) & 0xff) == (imm & 0xff)) - { - return (3 << 12) | (imm & 0xff); - } - else - { - for (uint32_t i = 8, j = 0; i <= 0x1F; i++, j++) - { - uint32_t mask = 0xFF000000 >> j; - uint32_t one = 0x80000000 >> j; - - if ((imm & one) == one && (imm & ~mask) == 0) - { - uint32_t _i = i >> 4; - uint32_t imm3 = (i >> 1) & 7; - uint32_t a = i & 1; - uint32_t bcdefgh = (imm >> (24 - j)) & 0x7f; - - return (_i << 26) | (imm3 << 12) | (a << 7) | bcdefgh; - } - } - } - return 0; -} - -uint32_t th_encbranch_b_t3(uint32_t imm) -{ - const uint32_t s = (imm >> 19) & 1; - const uint32_t imm6 = (imm >> 11) & 0x3f; - const uint32_t imm11 = imm & 0x7ff; - const uint32_t j2 = (imm >> 18) & 1; - const uint32_t j1 = (imm >> 17) & 1; - const uint32_t a = (s << 10) | imm6; - const uint32_t b = (j1 << 13) | (j2 << 11) | imm11; - return (a << 16) | b; -} - -uint32_t th_encbranch(int pos, int addr) -{ - TRACE("th_encbranch pos: 0x%x, addr: 0x%x", pos, addr); - return addr - pos - 4; -} - -uint32_t th_encbranch_8(int pos, int addr) -{ - addr = (addr - pos - 4) >> 1; - if (addr > 127 || addr < -128) - { - tcc_error("compiler_error: th_encbranch_8 too far address: %i\n", addr); - return 0; - } - return addr & 0xff; -} - -uint32_t th_encbranch_11(int pos, int addr) -{ - addr = (addr - pos - 4) >> 1; - if (addr >= 1023 || addr < -1024) - { - tcc_error("compiler_error: th_encbranch_11 too far address: %i\n", addr); - return 0; - } - return addr & 0x7ff; -} - -uint32_t th_encbranch_20(int pos, int addr) -{ - addr = (addr - pos - 4) >> 1; - TRACE("th_encbranch_20 pos %x addr %x\n", pos, addr); - return addr; -} - -uint32_t th_encbranch_24(int pos, int addr) -{ - addr = (addr - pos - 4) >> 1; - TRACE("th_encbranch_24 pos %x addr %x\n", pos, addr); - return addr; -} - -thumb_opcode th_bx_reg(uint16_t rm) -{ - THOP_TRACE("bx %s\n", th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x4700 | ((rm & 0xf) << 3)), - }; -} - -thumb_opcode th_bl_t1(uint32_t 
imm) -{ - THOP_TRACE("bl \n", (unsigned)imm); - const uint32_t packed = th_packimm_10_11_0(imm) | 0xF000D000; - return (thumb_opcode){ - .size = 4, - .opcode = packed, - }; -} - -thumb_opcode th_blx_reg(uint16_t rm) -{ - THOP_TRACE("blx %s\n", th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x4780 | (rm << 3)), - }; -} - -thumb_opcode th_b_t1(uint32_t cond, uint32_t imm8) -{ - THOP_TRACE("b%s \n", th_cond_name(cond & 0xf), (unsigned)imm8); - return (thumb_opcode){ - .size = 2, - .opcode = 0xd000 | ((cond & 0xf) << 8) | (imm8 & 0xff), - }; -} - -thumb_opcode th_b_t2(int32_t imm11) -{ - THOP_TRACE("b \n", (int)imm11); - const int32_t i = imm11 >> 1; - if (i < 1023 && i > -1024 && !(imm11 & 1)) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0xe000 | (i & 0x7ff)), - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_b_t3(uint32_t op, uint32_t imm) -{ - THOP_TRACE("b%s.w \n", th_cond_name(op & 0xf), (unsigned)imm); - const uint32_t enc = th_encbranch_b_t3(imm); - return (thumb_opcode){ - .size = 4, - .opcode = (0xf0008000 | (op << 22) | enc), - }; -} - -thumb_opcode th_b_t4(int32_t imm) -{ - THOP_TRACE("b.w \n", (int)imm); - if (imm > 16777215 || imm < -16777215) - tcc_error("compiler_error: th_b_t4 too far address: 0x%x\n", imm); - - return (thumb_opcode){ - .size = 4, - .opcode = 0xf0009000 | th_packimm_10_11_0(imm), - }; -} - -thumb_opcode th_cbz(uint16_t rn, uint32_t imm, uint32_t nonzero) -{ - THOP_TRACE("%s %s, \n", nonzero ? 
"cbnz" : "cbz", th_reg_name(rn), (unsigned)imm); - const uint32_t imm5 = imm & 0x1f; - const uint32_t i = (imm >> 5) & 0x1; - - return (thumb_opcode){ - .size = 2, - .opcode = 0xb100 | nonzero << 11 | i << 9 | imm5 << 3 | rn, - }; -} - -uint32_t th_shift_type_to_op(thumb_shift shift) -{ - switch (shift.type) - { - case THUMB_SHIFT_ASR: - return 4; - case THUMB_SHIFT_LSL: - return 2; - case THUMB_SHIFT_LSR: - return 3; - case THUMB_SHIFT_ROR: - return 7; - default: - tcc_error("compiler_error: 'th_shift_type_to_op', unknown shift type %d\n", shift.type); - return 0; - } -} - -uint32_t th_shift_value_to_sr_type(thumb_shift shift) -{ - switch (shift.type) - { - case THUMB_SHIFT_NONE: - case THUMB_SHIFT_LSL: - return 0; - case THUMB_SHIFT_LSR: - return 1; - case THUMB_SHIFT_ASR: - return 2; - case THUMB_SHIFT_ROR: - case THUMB_SHIFT_RRX: - return 3; - }; - return 0; -} - -// all t32 arch -thumb_opcode th_mov_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding, bool in_it) -{ - if (shift.mode == THUMB_SHIFT_REGISTER && shift.type != THUMB_SHIFT_NONE) - { - return th_mov_reg_shift(rd, rm, shift.value, flags, shift, encoding); - } - - if (flags != FLAGS_BEHAVIOUR_SET && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - const uint16_t D = (rd >> 3) & 1; - THOP_TRACE("mov %s, %s\n", th_reg_name(rd), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x4600 | (D << 7) | (rm << 3) | (rd & 0x7)), - }; - } - if (encoding != ENFORCE_ENCODING_32BIT && rd < 8 && rm < 8 && shift.type != THUMB_SHIFT_RRX && - shift.type != THUMB_SHIFT_ROR && - ((flags == FLAGS_BEHAVIOUR_SET && !in_it) || (flags != FLAGS_BEHAVIOUR_SET && in_it))) - { - THOP_TRACE("%s %s, %s, #%u\n", th_shift_name(shift.type), th_reg_name(rd), th_reg_name(rm), (unsigned)shift.value); - return (thumb_opcode){ - .size = 2, - .opcode = (0x0000 | (th_shift_value_to_sr_type(shift) << 11) | shift.value << 6 | (rm << 3) | 
rd), - }; - } - if (encoding != ENFORCE_ENCODING_16BIT) - { - THOP_TRACE("mov%s %s, %s", flags == FLAGS_BEHAVIOUR_SET ? "s" : "", th_reg_name(rd), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("\n"); - return th_generic_op_reg_shift_with_status(0xea4f, rd, 0xf, rm, flags, shift); - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_mov_imm(uint16_t rd, uint32_t imm, thumb_flags_behaviour setflags, thumb_enforce_encoding encoding) -{ - if (rd <= 7 && imm >= 0 && imm <= 255 && setflags != FLAGS_BEHAVIOUR_BLOCK && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("movs %s, #%u\n", th_reg_name(rd), (unsigned)imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x2000 | (rd << 8) | imm, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - - if (rd != R_SP && rd != R_PC && encoding != ENFORCE_ENCODING_16BIT) - { - const uint32_t enc = th_pack_const(imm); - const uint32_t s = (setflags == FLAGS_BEHAVIOUR_SET) ? 1 : 0; - if (enc) - { - THOP_TRACE("mov%s %s, #%u\n", s ? 
"s" : "", th_reg_name(rd), (unsigned)imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf04f0000 | enc | ((rd & 0xf) << 8) | (s << 20), - }; - } - } - - if (imm >= 0 && imm <= 0xffff && rd != R_SP && rd != R_PC && setflags != FLAGS_BEHAVIOUR_SET && - encoding != ENFORCE_ENCODING_16BIT) - { - const uint16_t i = (imm >> 11) & 1; - const uint32_t imm4 = (imm >> 12) & 0xf; - const uint32_t imm3 = (imm >> 8) & 0x7; - THOP_TRACE("movw %s, #%u\n", th_reg_name(rd), (unsigned)imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf2400000 | (i << 26) | (imm4 << 16) | (imm3 << 12) | (rd << 8) | (imm & 0xff), - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_movt(uint32_t rd, uint32_t imm16) -{ - const uint32_t imm8 = imm16 & 0xff; - const uint32_t imm3 = (imm16 >> 8) & 0x7; - const uint32_t i = (imm16 >> 11) & 0x1; - const uint32_t imm4 = (imm16 >> 12) & 0xf; - - if (rd == R_SP || rd == R_PC || imm16 > 0xffff) - { - tcc_error("compiler_error: 'th_movt', SP or PC can't be used as rd\n"); - return (thumb_opcode){0, 0}; - } - - return (thumb_opcode){ - .size = 4, - .opcode = 0xf2c00000 | i << 26 | imm4 << 16 | imm3 << 12 | rd << 8 | imm8, - }; -} - -thumb_opcode th_generic_op_imm_with_status(uint16_t op, uint16_t rd, uint16_t rn, uint32_t imm, - thumb_flags_behaviour setflags) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - const uint32_t packed = th_pack_const(imm); - if (packed || imm == 0) - { - const uint32_t A = packed >> 16; - const uint32_t B = packed & 0xffff; - return (thumb_opcode){ - .size = 4, - .opcode = ((op | ((setflags == FLAGS_BEHAVIOUR_SET) << 4) | rn | A) << 16) | (rd << 8 | B), - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_generic_op_imm(uint16_t op, uint16_t rd, uint16_t rn, uint32_t imm) -{ - return th_generic_op_imm_with_status(op, rd, rn, imm, FLAGS_BEHAVIOUR_NOT_IMPORTANT); -} - -thumb_opcode th_add_reg(uint32_t rd, uint32_t rn, uint32_t rm, 
thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if ((rd == R_PC) && (rm == R_PC)) - { - tcc_error("compiler_error: 'th_add_reg', PC can't be used as rdn and rm\n"); - } - if (rm < 8 && rd < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - // T1: ADD , , — all low registers, no shift - THOP_TRACE("add%s %s, %s, %s\n", flags == FLAGS_BEHAVIOUR_SET ? "s" : "", th_reg_name(rd), th_reg_name(rn), - th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x1800 | (rm << 6) | (rn << 3) | (rd), - }; - } - - if (rd == rn && flags != FLAGS_BEHAVIOUR_SET && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - // T2: ADD , — 16-bit, allows PC/SP as Rm - const uint16_t DN = (rd >> 3) & 1; - THOP_TRACE("add %s, %s\n", th_reg_name(rd), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x4400 | (DN << 7) | ((rm & 0xf) << 3) | (rd & 0x7), - }; - } - - /* T3: ADD{S}.W , , {, } — 32-bit encoding - * ARMv8-M constraints: Rd in {13,15} or Rn == 15 or Rm in {13,15} → UNPREDICTABLE. - * (Rd==13 is allowed only without shift and S==0, but we reject it for safety.) - * If PC is needed as an operand, the caller must use the 16-bit T2 encoding instead. */ - if (rn == R_PC || rm == R_PC || rm == R_SP) - { - tcc_error("compiler_error: 'th_add_reg' T3 (32-bit) encoding: " - "Rn=PC or Rm in {SP,PC} is UNPREDICTABLE on ARMv8-M " - "(rd=r%d, rn=r%d, rm=r%d). 
Use 16-bit T2 encoding for PC.\n", - rd, rn, rm); - } - if (rd == R_PC && flags != FLAGS_BEHAVIOUR_SET) - { - tcc_error("compiler_error: 'th_add_reg' T3 (32-bit) encoding: " - "Rd=PC with S==0 is UNPREDICTABLE on ARMv8-M " - "(rd=r%d, rn=r%d, rm=r%d).\n", - rd, rn, rm); - } - if (rd == R_SP && (shift.type != THUMB_SHIFT_NONE || flags == FLAGS_BEHAVIOUR_SET)) - { - tcc_error("compiler_error: 'th_add_reg' T3 (32-bit) encoding: " - "Rd=SP with shift or S==1 is UNPREDICTABLE on ARMv8-M " - "(rd=r%d, rn=r%d, rm=r%d).\n", - rd, rn, rm); - } - - THOP_TRACE("add%s %s, %s, %s", flags == FLAGS_BEHAVIOUR_SET ? "s" : "", th_reg_name(rd), th_reg_name(rn), - th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("\n"); - return th_generic_op_reg_shift_with_status(0xeb00, rd, rn, rm, flags, shift); -} - -thumb_opcode th_add_imm_t4(uint32_t rd, uint32_t rn, uint32_t imm) -{ - if (imm <= 4095) - { - const uint16_t i = (imm >> 11) & 1; - const uint32_t imm3 = (imm >> 8) & 7; - uint32_t op = (0xf200 | (i << 10) | rn) << 16; - op |= ((imm3 << 12) | (rd << 8) | (imm & 0xff)); - return (thumb_opcode){ - .size = 4, - .opcode = op, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_add_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - thumb_opcode op = {0, 0}; - if (rd == rn && rd < 8 && imm <= 255 && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("add%s %s, #%u\n", flags == FLAGS_BEHAVIOUR_SET ? "s" : "", th_reg_name(rd), (unsigned)imm); - return (thumb_opcode){ - .size = 2, - .opcode = (0x3000 | (rd << 8) | imm), - }; - } - - if (imm <= 7 && rd < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("add%s %s, %s, #%u\n", flags == FLAGS_BEHAVIOUR_SET ? 
"s" : "", th_reg_name(rd), th_reg_name(rn), - (unsigned)imm); - return (thumb_opcode){ - .size = 2, - .opcode = (0x1c00 | (imm << 6) | (rn << 3) | rd), - }; - } - - op = th_generic_op_imm_with_status(0xf100, rd, rn, imm, flags); - if (op.size != 0) - { - THOP_TRACE("add%s %s, %s, #%u\n", flags == FLAGS_BEHAVIOUR_SET ? "s" : "", th_reg_name(rd), th_reg_name(rn), - (unsigned)imm); - return op; - } - if (imm <= 4095 && encoding != ENFORCE_ENCODING_16BIT && flags != FLAGS_BEHAVIOUR_SET) - { - THOP_TRACE("add %s, %s, #%u\n", th_reg_name(rd), th_reg_name(rn), (unsigned)imm); - return th_add_imm_t4(rd, rn, imm); - } - return op; -} - -thumb_opcode th_adr_imm(uint32_t rd, int imm, thumb_enforce_encoding encoding) -{ - if (imm <= 1020 && imm >= 0 && encoding != ENFORCE_ENCODING_32BIT && imm % 4 == 0) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xA000 | (rd << 8) | (imm >> 2), - }; - } - - if (imm >= 0 && imm <= 4095) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf20f0000 | (rd << 8) | th_packimm_3_8_1(imm), - }; - } - - if (imm < 0 && imm >= -4096) - { - imm = -imm; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf2af0000 | (rd << 8) | th_packimm_3_8_1(imm), - }; - } - - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} -thumb_opcode th_bic_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - if (rd != R_SP && rd != R_PC && rn != R_SP && rd != R_PC) - { - const uint32_t packed = th_pack_const(imm); - const uint32_t s = (flags == FLAGS_BEHAVIOUR_SET); - if (packed || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf0200000 | packed | (rn << 16) | (rd << 8) | (s << 20), - }; - } - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_bic_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rm < 8 && rd < 8 && rn < 8 && 
shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4380 | (rm << 3) | rd, - }; - } - return th_generic_op_reg_shift_with_status(0xea20, rd, rn, rm, flags, shift); -} - -thumb_opcode th_and_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags, - thumb_enforce_encoding encoding) -{ - thumb_opcode op = th_generic_op_imm_with_status(0xf000, rd, rn, imm, setflags); - return op.size != 0 ? op : th_bic_imm(rd, rn, ~imm, setflags, encoding); -} - -thumb_opcode th_and_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rm < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4000 | (rm << 3) | rd, - }; - } - return th_generic_op_reg_shift_with_status(0xea00, rd, rn, rm, flags, shift); -} - -thumb_opcode th_xor_reg(uint16_t rd, uint16_t rn, uint16_t rm) -{ - if (rd == rn && rm < 8 && rn < 8) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4040 | (rm << 3) | rd, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rd != R_SP && rd != R_PC && rn != R_SP && rn != R_PC) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xea800000 | (rn << 16) | (rd << 8) | rm, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_xor_imm(uint16_t rd, uint16_t rn, uint32_t imm) -{ - return th_generic_op_imm(0xf080, rd, rn, imm); -} - -thumb_opcode th_rsb_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - return th_generic_op_reg_shift_with_status(0xebc0, rd, rn, rm, flags, shift); -} - -thumb_opcode th_sub_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd < 8 && rm < 8 && rn < 8 && shift.type == 
THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("sub%s %s, %s, %s\n", flags == FLAGS_BEHAVIOUR_SET ? "s" : "", th_reg_name(rd), th_reg_name(rn), - th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x1a00 | (rm << 6) | (rn << 3) | rd, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rd != R_SP && rd != R_PC && rn != R_PC) - { - const uint32_t imm3 = (shift.value >> 2) & 0x7; - const uint32_t imm2 = shift.value & 0x3; - const uint32_t s = (flags == FLAGS_BEHAVIOUR_SET) ? 1 : 0; - /* rn == R_SP uses opcode 0xeba0 (SUB.W Rd, SP, Rm), otherwise 0xeba0 with - * the full rn field. Both emit the same 32-bit T2 encoding - the opcode - * base already encodes SP when rn=13. */ - THOP_TRACE("sub%s %s, %s, %s", s ? "s" : "", th_reg_name(rd), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xeba00000 | (s << 20) | (rn << 16) | (rd << 8) | rm | imm3 << 12 | imm2 << 6 | - th_shift_value_to_sr_type(shift) << 4, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_sub_sp_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - return th_generic_op_reg_shift_with_status(0xeba0, rd, R_SP, rm, flags, shift); -} - -thumb_opcode th_generic_op_reg_shift_with_status(uint32_t op, uint32_t rd, uint32_t rn, uint32_t rm, - thumb_flags_behaviour flags, thumb_shift shift) -{ - int s = 0; - const int sr = th_shift_value_to_sr_type(shift); - const int imm2 = shift.value & 0x3; - const int imm3 = (shift.value >> 2) & 0x7; - if (flags == FLAGS_BEHAVIOUR_SET) - s = 1; - - /* Guard against invalid register values (e.g., -1 or PREG_SPILLED) */ - if (rd > 15 || rn > 15 || rm > 15) - { - tcc_error("compiler_error: 'th_generic_op_reg_shift_with_status' invalid register: rd=%d, rn=%d, rm=%d (op=0x%x)\n", - rd, rn, rm, op); - } - - return (thumb_opcode){ - .size = 4, - 
.opcode = (op << 16) | (rn << 16) | (rd << 8) | rm | (sr << 4) | (imm2 << 6) | (imm3 << 12) | (s << 20), - }; -} - -thumb_opcode th_adc_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rm < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4140 | (rm << 3) | rd, - }; - } - - return th_generic_op_reg_shift_with_status(0xeb40, rd, rn, rm, flags, shift); -} - -thumb_opcode th_adc_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags, - thumb_enforce_encoding encoding) -{ - return th_generic_op_imm_with_status(0xf140, rd, rn, imm, setflags); -} - -thumb_opcode th_sbc_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - return th_generic_op_imm_with_status(0xf160, rd, rn, imm, flags); -} - -thumb_opcode th_sbc_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rm < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4180 | (rm << 3) | rd, - }; - } - return th_generic_op_reg_shift_with_status(0xeb60, rd, rn, rm, flags, shift); -} - -thumb_opcode th_orr_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags, - thumb_enforce_encoding encoding) -{ - (void)encoding; /* currently unused */ - if (rn != R_SP && rd != R_SP && rn != R_PC) - { - return th_generic_op_imm_with_status(0xf040, rd, rn, imm, setflags); - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_cmp_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - (void)rd; /* CMP doesn't use rd - result goes to flags */ - (void)flags; /* CMP always 
sets flags */ - if (rm < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("cmp %s, %s\n", th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x4280 | (rm << 3) | rn), - }; - } - else if (!(rm < 8 && rn < 8) && rm != R_PC && rn != R_PC && encoding != ENFORCE_ENCODING_32BIT && - shift.type == THUMB_SHIFT_NONE) - { - const uint16_t N = (rn >> 3) & 0x1; - THOP_TRACE("cmp %s, %s\n", th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x4500 | (N << 7) | (rm << 3) | (rn & 0x7)), - }; - } - THOP_TRACE("cmp %s, %s", th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("\n"); - return th_generic_op_reg_shift_with_status(0xebb0, 0xf, rn, rm, FLAGS_BEHAVIOUR_SET, shift); -} - -thumb_opcode th_orr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rm < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0x4300 | (rm << 3) | rd), - }; - } - return th_generic_op_reg_shift_with_status(0xea40, rd, rn, rm, flags, shift); -} - -thumb_opcode th_sub_imm_t4(uint32_t rd, uint32_t rn, uint32_t imm) -{ - if (rd != R_SP && rd != R_PC && imm <= 0xfff) - { - // T4 - const uint16_t i = imm >> 11; - const uint32_t imm3 = (imm >> 8) & 0x7; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf2a00000 | (i << 26) | (rn << 16) | (imm3 << 12) | (rd << 8) | (imm & 0xff), - }; - } - - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_sub_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - if (rd == rn && imm <= 255 && rd < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - // T2 - THOP_TRACE("sub%s %s, #%u\n", flags == FLAGS_BEHAVIOUR_SET ? 
"s" : "", th_reg_name(rd), (unsigned)imm); - return (thumb_opcode){ - .size = 2, - .opcode = (0x3800 | (rd << 8) | imm), - }; - } - - if (rd < 8 && rn < 8 && imm <= 7 && encoding != ENFORCE_ENCODING_32BIT) - { - // T1 - THOP_TRACE("sub%s %s, %s, #%u\n", flags == FLAGS_BEHAVIOUR_SET ? "s" : "", th_reg_name(rd), th_reg_name(rn), - (unsigned)imm); - return (thumb_opcode){ - .size = 2, - .opcode = (0x1e00 | (imm << 6) | (rn << 3) | rd), - }; - } - - if (rd != 13 && rd != 15) - { - const uint32_t enc = th_pack_const(imm); - const uint32_t s = (flags == FLAGS_BEHAVIOUR_SET) ? 1 : 0; - if (enc || imm == 0) - { - THOP_TRACE("sub%s %s, %s, #%u\n", s ? "s" : "", th_reg_name(rd), th_reg_name(rn), (unsigned)imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf1a00000 | s << 20 | (rn << 16) | (rd << 8) | enc, - }; - } - } - THOP_TRACE("sub %s, %s, #%u\n", th_reg_name(rd), th_reg_name(rn), (unsigned)imm); - return th_sub_imm_t4(rd, rn, imm); -} - -thumb_opcode th_push(uint32_t regs) -{ - // T1 encoding R0-R7 + LR only, all armv-m - // (T2 in armv8-m - inconsistent naming in reference manual) - if (!(regs & 0xbf00)) - { - const uint16_t lr = (regs >> 14) & 1; - THOP_TRACE("push "); - th_trace_regset(regs); - THOP_TRACE("\n"); - return (thumb_opcode){ - .size = 2, - .opcode = (0xb400 | (lr << 8) | (regs & 0xff)), - }; - } -// T2 encoding R0-R12 + LR only, Thumb-2 (not available on ARMv6-M) -// (T1 in armv8-m - inconsistent naming in reference manual) -#ifndef TCC_TARGET_ARM_ARCHV6M - if (!(regs & 0xa000)) - { - THOP_TRACE("push "); - th_trace_regset(regs); - THOP_TRACE("\n"); - return (thumb_opcode){ - .size = 4, - .opcode = (0xe92dU << 16 | regs), - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -int th_ldr_literal_estimate(uint16_t rt, uint32_t imm) -{ - if (rt < 8 && !(imm & 3) && imm <= 0x3ff) - return 2; -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (imm <= 0xfff) - return 4; -#endif - return 0; -} - -thumb_opcode th_ldrsh_imm(uint32_t rt, 
uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (rt != R_SP && imm <= 4095 && puw == 6 && rn != R_PC) - { - uint32_t ins = (0xf9b0 | ((rn & 0xf))) << 16; - ins |= (((rt & 0xf) << 12) | imm); - THOP_TRACE("ldrsh %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = ins, - }; - } - else if (imm <= 4095 && rn == R_PC) - { - const uint32_t u = (puw & 0x2) >> 1; - THOP_TRACE("ldrsh %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf93f0000 | (rn << 16) | (rt << 12) | (u << 23) | imm, - }; - } - else if (rt != R_SP && imm <= 255) - { - uint32_t ins = (0xf930 | (rn & 0xf)) << 16; - ins |= (0x0800 | ((rt & 0xf) << 12) | (puw << 8) | imm); -#if THUMB_OPCODE_TRACE - { - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("ldrsh %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("ldrsh %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("ldrsh %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("ldrsh %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? 
'+' : '-', imm, - (unsigned)puw); - } - } -#endif - return (thumb_opcode){ - .size = 4, - .opcode = ins, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldrsh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_LSL) - { - tcc_error("compiler_error: 'th_ldrsh_reg', only LSL shift supported\n"); - } - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (rm < 8 && rt < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("ldrsh %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x5e00 | (rm << 6) | (rn << 3) | rt, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_SP) - { - THOP_TRACE("ldrsh %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf9300000 | (rn << 16) | (rt << 12) | rm | shift.value << 4, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldrh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ - // T1 encoding, on armv6-m this one is the only one available - if (puw == 6 && rn < 8 && rt < 8 && imm <= 62 && encoding != ENFORCE_ENCODING_32BIT && !(imm & 1)) - { - THOP_TRACE("ldrh %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - imm = imm >> 1; - // imm[0] is enforced to be 0, and sould be divided by 2, thus offset is 5 - return (thumb_opcode){ - .size = 2, - .opcode = (0x8800 | (imm << 6) | (rn << 3) | rt), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (puw == 6 && rt != R_SP && imm >= 0 && imm <= 4095 && rn != R_PC) - { - THOP_TRACE("ldrh %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return 
(thumb_opcode){ - .size = 4, - .opcode = 0xf8b00000 | (rn << 16) | (rt << 12) | imm, - }; - } - else if (imm >= 0 && imm <= 4095 && rn == R_PC) - { - const uint32_t u = (puw & 0x2) >> 1; - THOP_TRACE("ldrh %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf83f0000 | (u << 23) | (rn << 16) | (rt << 12) | imm, - }; - } - else if (rt != R_SP && imm <= 255) - { -#if THUMB_OPCODE_TRACE - { - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("ldrh %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("ldrh %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("ldrh %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("ldrh %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? 
'+' : '-', imm, - (unsigned)puw); - } - } -#endif - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8300800 | (rn << 16) | (rt << 12) | (puw << 8) | imm, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldrh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_LSL) - { - tcc_error("compiler_error: 'th_ldr_reg', only LSL shift supported\n"); - } - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (rm < 8 && rt < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("ldrh %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x5a00 | (rm << 6) | (rn << 3) | rt, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_PC) - { - THOP_TRACE("ldrh %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8300000 | (rn << 16) | (rt << 12) | rm | shift.value << 4, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldrsb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (rt != R_SP && imm <= 4095 && puw == 6 && rn != R_PC) - { - THOP_TRACE("ldrsb %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf9900000 | (rn << 16) | (rt << 12) | imm, - }; - } - else if (imm <= 4095 && rn == R_PC) - { - const uint32_t u = (puw & 0x2) >> 1; - THOP_TRACE("ldrsb %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? 
'+' : '-', imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf91f0000 | (rn << 16) | (rt << 12) | (u << 23) | imm, - }; - } - else if (rt != R_SP && imm <= 255) - { - { -#if THUMB_OPCODE_TRACE - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("ldrsb %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("ldrsb %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("ldrsb %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("ldrsb %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm, - (unsigned)puw); - } -#endif - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf9100800 | (rn << 16) | (rt << 12) | (puw << 8) | imm, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldrsb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_LSL) - { - tcc_error("compiler_error: 'th_ldr_reg', only LSL shift supported\n"); - } - - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (rm < 8 && rt < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - THOP_TRACE("ldrsb %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x5600 | (rm << 6) | (rn << 3) | rt, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_SP) - { - THOP_TRACE("ldrsb %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf9100000 | (rn << 16) | (rt << 12) | rm | shift.value << 4, - }; - } 
-#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldrb_imm(uint16_t rt, uint16_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ - // T1 encoding, on armv6-m this one is the only one available - if (puw == 6 && rn < 8 && rt < 8 && imm <= 31 && encoding != ENFORCE_ENCODING_32BIT) - { - // imm[0] is enforced to be 0, and sould be divided by 2, thus offset is 5 - THOP_TRACE("ldrb %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x7800 | (imm << 6) | (rn << 3) | rt, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (puw == 6 && rt != R_SP && imm >= 0 && imm <= 4095 && rn != R_PC) - { - THOP_TRACE("ldrb %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8900000 | (rn << 16) | (rt << 12) | imm, - }; - } - else if (imm >= 0 && imm <= 4095 && rn == R_PC) - { - uint32_t u = (puw & 0x2) >> 1; - THOP_TRACE("ldrb %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf81f0000 | (u << 23) | (rt << 12) | imm, - }; - } - else if (rt != R_SP && imm <= 255) - { - { -#if THUMB_OPCODE_TRACE - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("ldrb %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("ldrb %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("ldrb %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("ldrb %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? 
'+' : '-', imm, - (unsigned)puw); - } -#endif - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8100800 | (rn << 16) | (rt << 12) | (puw << 8) | imm, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldrb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_LSL) - { - tcc_error("compiler_error: 'th_ldr_reg', only LSL shift supported\n"); - } - if (rm < 8 && rt < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("ldrb %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x5c00 | (rm << 6) | (rn << 3) | rt, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_PC) - { - THOP_TRACE("ldrb %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8100000 | (rn << 16) | (rt << 12) | rm | shift.value << 4, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldr_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (puw == 6 && rn < 8 && rt < 8 && imm <= 124 && !(imm & 3) && encoding != ENFORCE_ENCODING_32BIT) - { - // imm[0] is enforced to be 0, and sould be divided by 4, thus offset is 4 - THOP_TRACE("ldr %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x6800 | (imm << 4) | (rn << 3) | rt, - }; - } - else if (puw == 6 && rn == R_SP && rt < 8 && imm <= 1020 && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("ldr %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), 
imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x9800 | (rt << 8) | (imm >> 2), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (puw == 6 && imm <= 4095 && rn != R_PC) - { - uint32_t ins = (0xf8d0 | (rn & 0xf)) << 16; - ins |= (rt << 12) | imm; - THOP_TRACE("ldr %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = ins, - }; - } - else if (imm >= 0 && imm <= 4095 && rn == R_PC) - { - uint32_t u = (puw & 0x2) >> 1; - THOP_TRACE("ldr %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf85f0000 | (u << 23) | (rt << 12) | imm, - }; - } - else if (imm <= 255) - { - uint32_t ins = (0xf850 | (rn & 0xf)) << 16; - ins |= (0x0800 | ((rt & 0xf) << 12) | ((puw & 0x7) << 8) | imm); - { -#if THUMB_OPCODE_TRACE - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("ldr %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("ldr %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("ldr %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("ldr %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? 
'+' : '-', imm, - (unsigned)puw); - } -#endif - } - return (thumb_opcode){ - .size = 4, - .opcode = ins, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldr_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_LSL) - { - tcc_error("compiler_error: 'th_ldr_reg', only LSL shift supported\n"); - } - if (rm < 8 && rt < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("ldr %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x5800 | (rm << 6) | (rn << 3) | rt), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_PC) - { - THOP_TRACE("ldr %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8500000 | (rn << 16) | (rt << 12) | rm | shift.value << 4, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldr_literal(uint16_t rt, uint32_t imm, uint32_t add) -{ - if (rt < 8 && imm <= 1020) - { - THOP_TRACE("ldr %s, [%s, #%c%u]\n", th_reg_name(rt), th_reg_name(R_PC), (add & 1) ? '+' : '-', (unsigned)imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x4800 | (rt << 8) | imm >> 2, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_PC && imm <= 0xffff) - { - THOP_TRACE("ldr %s, [%s, #%c%u]\n", th_reg_name(rt), th_reg_name(R_PC), (add & 1) ? 
'+' : '-', (unsigned)imm); - uint32_t ins = (0xf85f | ((add & 1) << 7)) << 16; - ins |= (rt & 0xf) << 12 | imm; - return (thumb_opcode){ - .size = 4, - .opcode = ins, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_pop(uint16_t regs) -{ - // T1 encoding R0-R7 + PC only, all armv-m - // (T2 in armv8-m - inconsistent naming in reference manual) - if (!(regs & 0x7f00)) - { - const uint16_t pc = (regs >> 15) & 1; - THOP_TRACE("pop "); - th_trace_regset(regs); - THOP_TRACE("\n"); - return (thumb_opcode){ - .size = 2, - .opcode = 0xbc00 | (pc << 8) | (regs & 0xff), - }; - } -// T2 encoding R0-R12 + PC + LR, Thumb-2 (not available on ARMv6-M) -// (T1 in armv8-m - inconsistent naming in reference manual) -#ifndef TCC_TARGET_ARM_ARCHV6M - if (!(regs & 0x2000)) - { - THOP_TRACE("pop "); - th_trace_regset(regs); - THOP_TRACE("\n"); - return (thumb_opcode){ - .size = 4, - .opcode = (0xe8bdU << 16) | regs, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -// STR -thumb_opcode th_strh_imm(uint16_t rt, uint16_t rn, int imm, uint16_t puw, thumb_enforce_encoding encoding) -{ - // T1 encoding, on armv6-m this one is the only one available - if (puw == 6 && rn < 8 && rt < 8 && imm <= 62 && encoding != ENFORCE_ENCODING_32BIT && !(imm & 1)) - { - // imm[0] is enforced to be 0, and sould be divided by 2, thus offset is 5 - THOP_TRACE("strh %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - imm >>= 1; - return (thumb_opcode){ - .size = 2, - .opcode = (0x8000 | (imm << 6) | (rn << 3) | rt), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (puw == 6 && rt != R_SP && imm <= 4095) - { - THOP_TRACE("strh %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = (0xf8a00000 | (rn << 16) | (rt << 12) | imm), - }; - } - else if (rt != R_SP && imm <= 255) - { - { -#if THUMB_OPCODE_TRACE - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 
1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("strh %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("strh %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("strh %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("strh %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm, - (unsigned)puw); - } -#endif - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8200800 | (rn << 16) | (rt << 12) | ((puw & 0x7) << 8) | imm, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_strh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (rm < 8 && rt < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - THOP_TRACE("strh %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = 0x5200 | (rm << 6) | (rn << 3) | rt, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_PC) - { - THOP_TRACE("strh %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8200000 | (rn << 16) | (rt << 12) | rm | shift.value << 4, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_strb_imm(uint16_t rt, uint16_t rn, int imm, uint16_t puw, thumb_enforce_encoding encoding) -{ - // T1 encoding, on armv6-m this one is the only one available - if (puw == 6 && rn < 8 && rt < 8 && imm <= 31 && encoding != ENFORCE_ENCODING_32BIT) - { - // imm[0] is enforced to be 0, and sould be divided by 2, thus offset is 5 - 
THOP_TRACE("strb %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x7000 | (imm << 6) | (rn << 3) | rt, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (puw == 6 && rt != R_SP && imm <= 4095) - { - THOP_TRACE("strb %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8800000 | (rn << 16) | (rt << 12) | imm, - }; - } - else if (rt != R_SP && imm <= 255) - { - { -#if THUMB_OPCODE_TRACE - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("strb %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("strb %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("strb %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("strb %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? 
'+' : '-', imm, - (unsigned)puw); - } -#endif - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8000800 | (rn << 16) | (rt << 12) | ((puw & 0x7) << 8) | imm, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_strb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (rm < 8 && rt < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("strb %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x5400 | (rm << 6) | (rn << 3) | rt), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_PC) - { - THOP_TRACE("strb %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8000000 | (rn << 16) | (rt << 12) | rm | shift.value << 4, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_str_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_LSL) - { - tcc_error("compiler_error: 'th_str_reg', only LSL shift supported\n"); - } - - if (rm < 8 && rt < 8 && rn < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("str %s, [%s, %s]\n", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - return (thumb_opcode){ - .size = 2, - .opcode = (0x5000 | (rm << 6) | (rn << 3) | rt), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rt != R_SP && rm != R_SP && rm != R_PC) - { - THOP_TRACE("str %s, [%s, %s", th_reg_name(rt), th_reg_name(rn), th_reg_name(rm)); - th_trace_shift_suffix(shift); - THOP_TRACE("]\n"); - return (thumb_opcode){ - .size = 4, - .opcode = 
(0xf8400000 | (rn << 16) | (rt << 12) | rm | shift.value << 4), - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_mul(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding) -{ - if (rd == rm && rd < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0x4340 | ((rn & 0x7) << 3) | (rm & 0x7)), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else - { - return (thumb_opcode){ - .size = 4, - .opcode = (0xfb00f000 | ((rn & 0xf) << 16) | ((rd & 0xf) << 8) | (rm & 0xf)), - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_umull(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - return (thumb_opcode){ - .size = 4, - .opcode = 0xfba00000 | ((rn & 0xf) << 16) | ((rdlo & 0xf) << 12) | ((rdhi & 0xf) << 8) | (rm & 0xf), - }; -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_udiv(uint16_t rd, uint16_t rn, uint16_t rm) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - return (thumb_opcode){ - .size = 4, - .opcode = 0xfbb0f0f0 | (rn << 16) | (rd << 8) | rm, - }; -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_sdiv(uint16_t rd, uint16_t rn, uint16_t rm) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - return (thumb_opcode){ - .size = 4, - .opcode = 0xfb90f0f0 | (rn << 16) | (rd << 8) | rm, - }; -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_add_sp_imm_t4(uint32_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding) -{ - if (rd != R_PC && imm <= 4095 && (encoding != ENFORCE_ENCODING_16BIT) && (flags != FLAGS_BEHAVIOUR_SET)) - { - const uint16_t i = (imm >> 11) & 1; - const uint32_t imm3 = (imm >> 8) & 7; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf20d0000 | (i << 26) | (imm3 << 12) | (rd << 8) | (imm & 0xff), - }; - } - return 
(thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_add_sp_imm(uint16_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding) -{ - // T1 on all armv-m - if (rd < 8 && imm <= 1020 && !(imm & 0x3) && (flags != FLAGS_BEHAVIOUR_SET) && (encoding != ENFORCE_ENCODING_32BIT)) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0xa800 | (rd << 8) | (imm >> 2)), - }; - } - // T2 on all armv-m - else if (rd == R_SP && imm <= 508 && !(imm & 0x3) && (flags != FLAGS_BEHAVIOUR_SET) && - (encoding != ENFORCE_ENCODING_32BIT)) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xb000 | (imm >> 2), - }; - } -#if !defined(TCC_TARGET_ARM_ARCHV6M) - // T3 - else if (rd != R_PC && (encoding != ENFORCE_ENCODING_16BIT)) - { - const uint32_t enc = th_pack_const(imm); - const uint32_t s = (flags == FLAGS_BEHAVIOUR_SET) ? 1 : 0; - if (enc || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf10d0000 | enc | (rd << 8) | (s << 20), - }; - } - } - return th_add_sp_imm_t4(rd, imm, flags, encoding); -#else - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -#endif -} - -thumb_opcode th_add_sp_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding, - thumb_shift shift) -{ - if (rd == rm && flags != FLAGS_BEHAVIOUR_SET && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - const uint16_t rdm = rd & 7; - const uint16_t dm = rd >> 3; - return (thumb_opcode){ - .size = 2, - .opcode = 0x4468 | (dm << 7) | rdm, - }; - } - - if (rd == R_SP && flags != FLAGS_BEHAVIOUR_SET && encoding != ENFORCE_ENCODING_32BIT && - shift.type == THUMB_SHIFT_NONE) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4485 | (rm << 3), - }; - } - - if (encoding != ENFORCE_ENCODING_16BIT) - { - const uint32_t s = flags == FLAGS_BEHAVIOUR_SET; - const uint32_t imm2 = shift.value & 0x3; - const uint32_t imm3 = (shift.value >> 2) & 0x7; - const uint32_t sr = th_shift_value_to_sr_type(shift); - 
return (thumb_opcode){ - .size = 4, - .opcode = 0xeb0d0000 | (s << 20) | (imm3 << 12) | (rd << 8) | (imm2 << 6) | (sr << 4) | rm, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_rsb_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags, - thumb_enforce_encoding encoding) -{ - if (rd < 8 && rn < 8 && imm == 0 && setflags == FLAGS_BEHAVIOUR_SET) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4240 | (rn << 3) | rd, - }; - } - else if (rd != R_SP && rd != R_PC && rn != R_SP && rn != R_PC) - { - return th_generic_op_imm_with_status(0xf1c0, rd, rn, imm, setflags); - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_shift_armv7m(uint16_t rd, uint16_t rm, uint32_t imm, uint32_t type, thumb_flags_behaviour setflags) -{ - const uint32_t imm3 = (imm >> 2) & 7; - const uint32_t imm2 = imm & 0x3; - const uint32_t s = setflags == FLAGS_BEHAVIOUR_SET; - return (thumb_opcode){ - .size = 4, - .opcode = 0xea4f0000 | (imm3 << 12) | (rd << 8) | (imm2 << 6) | (type << 4) | rm | s << 20, - }; -} - -thumb_opcode th_lsl_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - (void)shift; /* shift parameter unused for LSL_reg - shift amount is in rm */ - if (rd == rn && rm < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4080 | (rm << 3) | rd, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rd != R_SP && rd != R_PC && rn != R_SP && rn != R_PC && rm != R_SP && rm != R_PC) - { - const uint32_t s = flags == FLAGS_BEHAVIOUR_SET; - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa00f000 | (rn << 16) | (rd << 8) | rm | s << 20, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_lsl_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - thumb_shift shift 
= { - .type = THUMB_SHIFT_LSL, - .value = imm, - .mode = THUMB_SHIFT_IMMEDIATE, - }; - return th_mov_reg(rd, rn, flags, shift, encoding, false); -} - -thumb_opcode th_lsr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rm < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x40c0 | (rm << 3) | rd, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rd != R_SP && rd != R_PC && rn != R_SP && rn != R_PC && rm != R_SP && rm != R_PC) - { - const uint32_t s = flags == FLAGS_BEHAVIOUR_SET; - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa20f000 | (rn << 16) | (rd << 8) | rm | s << 20, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_lsr_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - if (rm < 8 && rd < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0x0800 | (imm << 6) | (rm << 3) | rd), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (imm >= 1 && imm <= 31) - { - return th_shift_armv7m(rd, rm, imm, 1, flags); - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_asr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rm < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0x4100 | (rm << 3) | rd), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (rd != R_SP && rd != R_PC && rn != R_SP && rn != R_PC && rm != R_SP && rm != R_PC) - { - const uint32_t s = flags == FLAGS_BEHAVIOUR_SET; - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa40f000 | (rn << 16) | (rd << 8) | rm | s << 20, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - 
-thumb_opcode th_asr_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - if (rm < 8 && rd < 8 && encoding != ENFORCE_ENCODING_32BIT && flags == FLAGS_BEHAVIOUR_SET && imm != 0) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x1000 | (imm << 6) | (rm << 3) | rd, - }; - } - - if (rm < 8 && rd < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x1000 | (imm << 6) | (rm << 3) | rd, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (imm >= 1 && imm <= 31) - { - return th_shift_armv7m(rd, rm, imm, 2, flags); - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_mov_reg_shift(uint32_t rd, uint32_t rm, uint32_t rs, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - const uint32_t s = flags == FLAGS_BEHAVIOUR_SET; - if (rd == rm && rd < 8 && rs < 8 && encoding != ENFORCE_ENCODING_32BIT && shift.type != THUMB_SHIFT_RRX) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4000 | (rs << 3) | th_shift_type_to_op(shift) << 6 | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa00f000 | th_shift_value_to_sr_type(shift) << 21 | s << 20 | rm << 16 | rd << 8 | rs, - }; -} - -thumb_opcode th_ror_imm(uint16_t rd, uint16_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - if (rm < 8 && rd < 8 && encoding != ENFORCE_ENCODING_32BIT && flags == FLAGS_BEHAVIOUR_SET && imm != 0) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x0000 | (imm << 6) | (rm << 3) | rd, - }; - } - if (rm < 8 && rd < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = ((imm << 6) | (rm << 3) | rd), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (imm >= 1 && imm <= 31) - { - return th_shift_armv7m(rd, rm, imm, 0, flags); - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode 
th_cmp_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - (void)rd; /* CMP doesn't use rd - result goes to flags */ - (void)flags; /* CMP always sets flags */ - if (rn < 8 && imm <= 255 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x2800 | (rn << 8) | imm, - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else - { - const uint32_t packed = th_pack_const(imm); - if (packed || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf1b00f00 | (rn << 16) | packed, - }; - } - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -// VFP instructions - -/* VFP arithmetic instructions - single and double precision */ - -/* VADD.F32 Sd, Sn, Sm or VADD.F64 Dd, Dn, Dm - * sz=0 for single (F32), sz=1 for double (F64) - */ -thumb_opcode th_vadd_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) -{ - uint32_t D, N, M, Vd, Vn, Vm; - if (sz) - { - /* Double precision: D:Vd, N:Vn, M:Vm where D/N/M are bit 4 */ - D = (vd >> 4) & 1; - Vd = vd & 0xf; - N = (vn >> 4) & 1; - Vn = vn & 0xf; - M = (vm >> 4) & 1; - Vm = vm & 0xf; - } - else - { - /* Single precision: Vd:D, Vn:N, Vm:M where D/N/M are bit 0 */ - D = vd & 1; - Vd = (vd >> 1) & 0xf; - N = vn & 1; - Vn = (vn >> 1) & 0xf; - M = vm & 1; - Vm = (vm >> 1) & 0xf; - } - /* VADD: 1110 1110 0D11 nnnn dddd 101s N0M0 mmmm */ - return (thumb_opcode){ - .size = 4, - .opcode = 0xee300a00 | (D << 22) | (Vn << 16) | (Vd << 12) | (sz << 8) | (N << 7) | (M << 5) | Vm, - }; -} - -/* VSUB.F32 Sd, Sn, Sm or VSUB.F64 Dd, Dn, Dm */ -thumb_opcode th_vsub_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) -{ - uint32_t D, N, M, Vd, Vn, Vm; - if (sz) - { - D = (vd >> 4) & 1; - Vd = vd & 0xf; - N = (vn >> 4) & 1; - Vn = vn & 0xf; - M = (vm >> 4) & 1; - Vm = vm & 0xf; - } - else - { - D = vd & 1; - Vd = (vd >> 1) & 0xf; - N = vn & 1; - Vn = (vn >> 1) & 0xf; - M = vm & 1; - Vm = (vm >> 1) & 0xf; - } - /* VSUB: 1110 
1110 0D11 nnnn dddd 101s N1M0 mmmm */ - return (thumb_opcode){ - .size = 4, - .opcode = 0xee300a40 | (D << 22) | (Vn << 16) | (Vd << 12) | (sz << 8) | (N << 7) | (M << 5) | Vm, - }; -} - -/* VMUL.F32 Sd, Sn, Sm or VMUL.F64 Dd, Dn, Dm */ -thumb_opcode th_vmul_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) -{ - uint32_t D, N, M, Vd, Vn, Vm; - if (sz) - { - D = (vd >> 4) & 1; - Vd = vd & 0xf; - N = (vn >> 4) & 1; - Vn = vn & 0xf; - M = (vm >> 4) & 1; - Vm = vm & 0xf; - } - else - { - D = vd & 1; - Vd = (vd >> 1) & 0xf; - N = vn & 1; - Vn = (vn >> 1) & 0xf; - M = vm & 1; - Vm = (vm >> 1) & 0xf; - } - /* VMUL: 1110 1110 0D10 nnnn dddd 101s N0M0 mmmm */ - return (thumb_opcode){ - .size = 4, - .opcode = 0xee200a00 | (D << 22) | (Vn << 16) | (Vd << 12) | (sz << 8) | (N << 7) | (M << 5) | Vm, - }; -} - -/* VDIV.F32 Sd, Sn, Sm or VDIV.F64 Dd, Dn, Dm */ -thumb_opcode th_vdiv_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz) -{ - uint32_t D, N, M, Vd, Vn, Vm; - if (sz) - { - D = (vd >> 4) & 1; - Vd = vd & 0xf; - N = (vn >> 4) & 1; - Vn = vn & 0xf; - M = (vm >> 4) & 1; - Vm = vm & 0xf; - } - else - { - D = vd & 1; - Vd = (vd >> 1) & 0xf; - N = vn & 1; - Vn = (vn >> 1) & 0xf; - M = vm & 1; - Vm = (vm >> 1) & 0xf; - } - /* VDIV: 1110 1110 1D00 nnnn dddd 101s N0M0 mmmm */ - return (thumb_opcode){ - .size = 4, - .opcode = 0xee800a00 | (D << 22) | (Vn << 16) | (Vd << 12) | (sz << 8) | (N << 7) | (M << 5) | Vm, - }; -} - -/* VNEG.F32 Sd, Sm or VNEG.F64 Dd, Dm */ -thumb_opcode th_vneg_f(uint32_t vd, uint32_t vm, uint32_t sz) -{ - uint32_t D, M, Vd, Vm; - if (sz) - { - D = (vd >> 4) & 1; - Vd = vd & 0xf; - M = (vm >> 4) & 1; - Vm = vm & 0xf; - } - else - { - D = vd & 1; - Vd = (vd >> 1) & 0xf; - M = vm & 1; - Vm = (vm >> 1) & 0xf; - } - /* VNEG: 1110 1110 1D11 0001 dddd 101s 01M0 mmmm */ - return (thumb_opcode){ - .size = 4, - .opcode = 0xeeb10a40 | (D << 22) | (Vd << 12) | (sz << 8) | (M << 5) | Vm, - }; -} - -/* VCMP.F32 Sd, Sm or VCMP.F64 Dd, Dm - * Compares and sets 
FPSCR flags - */ -thumb_opcode th_vcmp_f(uint32_t vd, uint32_t vm, uint32_t sz) -{ - uint32_t D, M, Vd, Vm; - if (sz) - { - D = (vd >> 4) & 1; - Vd = vd & 0xf; - M = (vm >> 4) & 1; - Vm = vm & 0xf; - } - else - { - D = vd & 1; - Vd = (vd >> 1) & 0xf; - M = vm & 1; - Vm = (vm >> 1) & 0xf; - } - /* VCMP: 1110 1110 1D11 0100 dddd 101s E1M0 mmmm (E=0 for quiet compare) */ - return (thumb_opcode){ - .size = 4, - .opcode = 0xeeb40a40 | (D << 22) | (Vd << 12) | (sz << 8) | (M << 5) | Vm, - }; -} - -/* VCMPE.F32 Sd, Sm or VCMPE.F64 Dd, Dm - * Compares and sets FPSCR flags, signals exception on any NaN - */ -thumb_opcode th_vcmpe_f(uint32_t vd, uint32_t vm, uint32_t sz) -{ - uint32_t D, M, Vd, Vm; - if (sz) - { - D = (vd >> 4) & 1; - Vd = vd & 0xf; - M = (vm >> 4) & 1; - Vm = vm & 0xf; - } - else - { - D = vd & 1; - Vd = (vd >> 1) & 0xf; - M = vm & 1; - Vm = (vm >> 1) & 0xf; - } - /* VCMPE: 1110 1110 1D11 0100 dddd 101s E1M0 mmmm (E=1) */ - return (thumb_opcode){ - .size = 4, - .opcode = 0xeeb40ac0 | (D << 22) | (Vd << 12) | (sz << 8) | (M << 5) | Vm, - }; -} - -thumb_opcode th_vpush(uint32_t regs, uint32_t is_doubleword) -{ - int first_register = 0; - int register_count = 0; - uint32_t D = 0; - uint32_t Vd = 0; - for (int i = 0; i < 32; i++) - { - if (regs & (1 << i)) - { - first_register = i; - break; - } - } - - register_count = 0; - for (int i = 0; i < 32; i++) - { - if (regs & (1 << i)) - { - register_count++; - } - } - - if (is_doubleword) - { - D = first_register >> 4; - Vd = first_register & 0xf; - register_count <<= 1; - } - else - { - D = first_register & 1; - Vd = first_register >> 1; - } - - return (thumb_opcode){ - .size = 4, - .opcode = 0xed2d0a00 | D << 22 | (Vd << 12) | (register_count & 0xff) | (is_doubleword << 8), - }; -} - -thumb_opcode th_vpop(uint32_t regs, uint32_t is_doubleword) -{ - int first_register = 0; - int register_count = 0; - uint32_t D = 0; - uint32_t Vd = 0; - for (int i = 0; i < 32; i++) - { - if (regs & (1 << i)) - { - first_register = 
i; - break; - } - } - - register_count = 0; - for (int i = 0; i < 32; i++) - { - if (regs & (1 << i)) - { - register_count++; - } - } - - if (is_doubleword) - { - D = first_register >> 4; - Vd = first_register & 0xf; - register_count <<= 1; - } - else - { - D = first_register & 1; - Vd = first_register >> 1; - } - - return (thumb_opcode){ - .size = 4, - .opcode = 0xecbd0a00 | D << 22 | (Vd << 12) | (register_count & 0xff) | (is_doubleword << 8), - }; -} - -thumb_opcode th_vmov_register(uint16_t vd, uint16_t vm, uint32_t sz) -{ - if (sz == 0) - { - /* Single precision: S-register number 0-31, D bit is bit 0 */ - if (vd <= 0x1f && vm <= 0x1f) - { - const uint16_t d = vd & 1; - const uint16_t m = vm & 1; - vd >>= 1; - vm >>= 1; - return (thumb_opcode){ - .size = 4, - .opcode = 0xeeb00a40 | (d << 22) | (vd << 12) | (m << 5) | vm | (sz << 8), - }; - } - } - else - { - /* Double precision: D-register number 0-15, no bit splitting needed */ - if (vd <= 0x0f && vm <= 0x0f) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xeeb00b40 | (vd << 12) | vm, /* sz=1 -> bit 8 set -> 0xb */ - }; - } - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_vldr(uint32_t rn, uint32_t vd, uint32_t add, uint32_t is_doubleword, uint32_t imm) -{ - const uint32_t D = (vd >> 4) & 1; - if (imm > 1020 || (imm & 0x3)) - { - tcc_error("compiler_error: 'th_vldr' imm is outside of range: 0x%x, max " - "value: 0xff\n", - imm); - } - if (is_doubleword) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xed100b00 | (D << 22) | ((add & 1) << 23) | (rn << 16) | (vd << 12) | (imm >> 2), - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xed100a00 | ((add & 1) << 23) | (D << 22) | (rn << 16) | (vd << 12) | (imm >> 2), - }; -} - -thumb_opcode th_vstr(uint32_t rn, uint32_t vd, uint32_t add, uint32_t is_doubleword, uint32_t imm) -{ - const uint32_t D = (vd >> 4) & 1; - if (imm > 1020 || (imm & 0x3)) - { - tcc_error("compiler_error: 'th_vstr' imm is outside of 
range: 0x%x, max " - "value: 0xff\n", - imm); - } - if (is_doubleword) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xed000b00 | (D << 22) | ((add & 1) << 23) | (rn << 16) | (vd << 12) | (imm >> 2), - - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xed000a00 | (D << 22) | ((add & 1) << 23) | (rn << 16) | (vd << 12) | (imm >> 2), - }; -} - -// move between core general purpose register and single precision floating -// point register -thumb_opcode th_vmov_gp_sp(uint16_t rt, uint16_t sn, uint16_t to_arm_register) -{ - /* Sn encoding: Vn (bits 19:16) = Sn[4:1], N (bit 7) = Sn[0] */ - const uint16_t Vn = (sn >> 1) & 0xf; - const uint16_t N = sn & 1; - return (thumb_opcode){ - .size = 4, - .opcode = 0xee000a10 | (to_arm_register << 20) | (Vn << 16) | (rt << 12) | (N << 7), - }; -} - -// move between two general purpose registers and one doubleword register -thumb_opcode th_vmov_2gp_dp(uint16_t rt, uint16_t rt2, uint16_t dm, uint16_t to_arm_register) -{ - const uint16_t M = (dm >> 4) & 1; - return (thumb_opcode){ - .size = 4, - .opcode = 0xec400b10 | (to_arm_register << 20) | (rt2 << 16) | (rt << 12) | (M << 5) | dm, - }; -} - -thumb_opcode th_sub_sp_imm_t3(uint32_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding) -{ - if (rd != R_PC && imm <= 4095 && encoding != ENFORCE_ENCODING_16BIT && flags != FLAGS_BEHAVIOUR_SET) - { - const uint32_t i = (imm >> 11) & 1; - const uint32_t imm3 = (imm >> 8) & 0x7; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf2ad0000 | (i << 26) | (imm3 << 12) | (rd << 8) | (imm & 0xff), - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_sub_sp_imm(uint32_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding) -{ - // T1 encoding - if (rd == R_SP && imm <= 508 && !(imm & 0x3) && encoding != ENFORCE_ENCODING_32BIT && flags != FLAGS_BEHAVIOUR_SET) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xb080 | (imm >> 2), - }; - } - 
- if (rd != R_PC) - { - const uint32_t enc = th_pack_const(imm); - const uint32_t s = flags == FLAGS_BEHAVIOUR_SET ? 1 : 0; - if (enc || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf1ad0000 | s << 20 | (rd << 8) | enc, - }; - } - } - - return th_sub_sp_imm_t3(rd, imm, flags, encoding); -} - -thumb_opcode th_vmrs(uint16_t rt) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xeef10a10 | (rt << 12), - }; -} - -thumb_opcode th_vcvt_float_to_double(uint32_t vd, uint32_t vm) -{ - /* VCVT.F64.F32 Dd, Sm - * vd = destination Dd index (0-15), vm = source Sm index (0-31) - * Sm encoding: M = Sm[0] (bit 5), Vm = Sm[4:1] (bits 3:0) - */ - uint32_t M = vm & 1; - uint32_t Vm = (vm >> 1) & 0xf; - return (thumb_opcode){ - .size = 4, - .opcode = (0xeeb70ac0 | (vd << 12) | (M << 5) | Vm), - }; -} - -thumb_opcode th_vcvt_double_to_float(uint32_t vd, uint32_t vm) -{ - /* VCVT.F32.F64 Sd, Dm - * vd = destination Sd index (0-31), vm = source Dm index (0-15) - * Sd encoding: D = Sd[0] (bit 22), Vd = Sd[4:1] (bits 15:12) - */ - uint32_t D = vd & 1; - uint32_t Vd = (vd >> 1) & 0xf; - return (thumb_opcode){ - .size = 4, - .opcode = 0xeeb70bc0 | (D << 22) | (Vd << 12) | vm, - }; -} - -thumb_opcode th_vcvt_fp_int(uint32_t vd, uint32_t vm, uint32_t opc, uint32_t is_double, uint32_t op) -{ - /* VCVT.S32.F32 or VCVT.S32.F64 - floating-point to integer - * vd = destination Sd (single register index 0-31) - * vm = source Sm for single, Dm for double - * opc = operation: 4=unsigned, 5=signed (round toward zero) - * is_double = 0 for F32 source, 1 for F64 source - * op = 1 for fp-to-int, 0 for int-to-fp - */ - uint32_t D = (vd >> 4) & 1; /* Sd[4] */ - uint32_t Vd = vd & 0xf; /* Sd[3:0] */ - uint32_t sz = is_double ? 
1 : 0; /* bit 8: 0=F32, 1=F64 source */ - uint32_t M, Vm; - - /* Both single and double use Sm/Dm = Vm:M encoding */ - M = vm & 1; - Vm = (vm >> 1) & 0xf; - - return (thumb_opcode){ - .size = 4, - .opcode = 0xeeb80a40 | (D << 22) | (opc << 16) | (Vd << 12) | (sz << 8) | (op << 7) | (M << 5) | Vm, - }; -} - -thumb_opcode th_vcvt_convert(uint32_t vd, uint32_t vm, const char *dest_type, const char *src_type) -{ - // Helper function for VCVT conversions with type strings - // Examples: dest_type="s32", src_type="f32" for vcvt.s32.f32 - - // Float to int conversion (f32/f64 -> s32/u32) - if ((strcmp(dest_type, "s32") == 0 || strcmp(dest_type, "u32") == 0) && strcmp(src_type, "f32") == 0) - { - int is_unsigned = strcmp(dest_type, "u32") == 0; - return th_vcvt_fp_int(vd, vm, is_unsigned ? 0x4 : 0x5, 0, 1); - } - else if ((strcmp(dest_type, "s32") == 0 || strcmp(dest_type, "u32") == 0) && strcmp(src_type, "f64") == 0) - { - int is_unsigned = strcmp(dest_type, "u32") == 0; - return th_vcvt_fp_int(vd, vm, is_unsigned ? 0x4 : 0x5, 1, 1); - } - // Int to float conversion (s32/u32 -> f32/f64) - else if ((strcmp(dest_type, "f32") == 0 || strcmp(dest_type, "f64") == 0) && - (strcmp(src_type, "s32") == 0 || strcmp(src_type, "u32") == 0)) - { - int dst_is_double = strcmp(dest_type, "f64") == 0; - int is_unsigned = strcmp(src_type, "u32") == 0; - return th_vcvt_fp_int(vd, vm, 0, dst_is_double, is_unsigned ? 
0 : 1); - } - // Float precision conversion (f32 <-> f64) - else if (strcmp(dest_type, "f64") == 0 && strcmp(src_type, "f32") == 0) - { - return th_vcvt_float_to_double(vd / 2, vm); - } - else if (strcmp(dest_type, "f32") == 0 && strcmp(src_type, "f64") == 0) - { - return th_vcvt_double_to_float(vd, vm / 2); - } - - // Unsupported conversion - return (thumb_opcode){.size = 0, .opcode = 0}; -} - -thumb_opcode th_it(uint16_t cond, uint16_t mask) -{ - return (thumb_opcode){ - .size = 2, - .opcode = 0xbf00 | (cond << 4) | (mask & 0xf), - }; -} - -thumb_opcode th_clrex() -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3bf8f2f, - }; -} - -thumb_opcode th_svc(uint32_t imm) -{ - if (imm <= 0xff) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xdf00 | imm, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_bkpt(uint32_t imm) -{ - if (imm <= 0xff) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xbe00 | imm, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_bfc(uint32_t rd, uint32_t lsb, uint32_t width) -{ - const uint32_t imm2 = lsb & 0x3; - const uint32_t imm3 = (lsb >> 2) & 0x7; - const uint32_t msb = lsb + width - 1; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf36f0000 | (rd << 8) | (imm3 << 12) | (imm2 << 6) | msb, - }; -} - -thumb_opcode th_bfi(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width) -{ - const uint32_t imm2 = lsb & 0x3; - const uint32_t imm3 = (lsb >> 2) & 0x7; - const uint32_t msb = lsb + width - 1; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3600000 | (rn << 16) | (rd << 8) | (imm3 << 12) | (imm2 << 6) | msb, - }; -} - -thumb_opcode th_clz(uint32_t rd, uint32_t rm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xfab0f080 | rm << 16 | rd << 8 | rm, - }; -} - -thumb_opcode th_cmn_imm(uint32_t rn, uint32_t imm) -{ -#ifndef TCC_TARGET_ARM_ARCHV6M - if (rn != R_PC) - { - const uint32_t packed = th_pack_const(imm); - if (packed || imm 
== 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf1100f00 | packed | (rn << 16), - }; - } - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_cmn_reg(uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - if (rn < 8 && rm < 8 && shift.type == THUMB_SHIFT_NONE && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x42c0 | (rm << 3) | rn, - }; - } - return th_generic_op_reg_shift_with_status(0xeb10, 0xf, rn, rm, FLAGS_BEHAVIOUR_SET, shift); -} - -thumb_opcode th_cps(uint32_t enable, uint32_t i, uint32_t f) -{ - return (thumb_opcode){ - .size = 2, - .opcode = 0xb660 | (enable << 4) | (i << 1) | f, - }; -} - -thumb_opcode th_csdb() -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3af8014, - }; -} - -thumb_opcode th_dmb(uint32_t option) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3bf8f50 | option, - }; -} - -thumb_opcode th_dsb(uint32_t option) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3bf8f40 | option, - }; -} - -thumb_opcode th_isb(uint32_t option) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3bf8f60 | option, - }; -} - -thumb_opcode th_eor_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - uint32_t S = (flags == FLAGS_BEHAVIOUR_SET) ? 
1 : 0; - uint32_t packed = th_pack_const(imm); - if (packed || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf0800000 | (S << 20) | (rd << 8) | (rn << 16) | packed, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_eor_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rm < 8 && rn < 8 && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0x4040 | (rm << 3) | rd), - }; - } - return th_generic_op_reg_shift_with_status(0xea80, rd, rn, rm, flags, shift); -} - -thumb_opcode th_lda(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00faf | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldab(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00f8f | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldaex(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00fef | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldaexb(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00fcf | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldaexh(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00fdf | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldah(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00f9f | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding) -{ - if (rn < 8 && regset <= 0xff && encoding != ENFORCE_ENCODING_32BIT && writeback == 1) - { - if (writeback) - { - regset &= ~(1 << rn); - } - else - { - regset |= 1 << rn; - } - return (thumb_opcode){ - .size = 2, - .opcode = 0xc800 | rn << 8 | regset, - }; - }; - if (rn == R_SP && ((regset & 0x7f00) 
== 0) && encoding != ENFORCE_ENCODING_32BIT && writeback == 1) - { - const uint8_t p = (regset >> R_PC) & 1; - regset &= 0x00ff; - return (thumb_opcode){ - .size = 2, - .opcode = 0xbc00 | regset | (p << 8), - }; - } - - if (!(writeback && (regset & (1 << rn)))) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8900000 | (writeback << 21) | (rn << 16) | regset, - }; - } - - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_ldmdb(uint32_t rn, uint32_t regset, uint32_t writeback) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe9100000 | (writeback << 21) | (rn << 16) | regset, - }; -} - -thumb_opcode th_ldrbt(uint32_t rt, uint32_t rn, int imm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8100e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_ldrd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ - const uint32_t pu = (puw >> 1) & 0x3; - const uint32_t w = puw & 0x1; - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8500000 | (pu << 23) | w << 21 | rn << 16 | rt << 12 | rt2 << 8 | (imm >> 2), - }; -} - -thumb_opcode th_ldrex(uint32_t rt, uint32_t rn, int imm) -{ - if (imm < 0 || imm > 1020) - { - tcc_error("compiler_error: 'th_ldrex' imm is outside of range: 0x%x, max " - "value: 0x3fc\n", - imm); - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8500f00 | (rn << 16) | (rt << 12) | (imm >> 2), - }; -} - -thumb_opcode th_ldrexb(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00f4f | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldrexh(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d00f5f | (rn << 16) | (rt << 12), - }; -} - -thumb_opcode th_ldrht(uint32_t rt, uint32_t rn, int imm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8300e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_ldrsbt(uint32_t rt, uint32_t rn, int imm) -{ - 
return (thumb_opcode){ - .size = 4, - .opcode = 0xf9100e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_ldrsht(uint32_t rt, uint32_t rn, int imm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf9300e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_ldrt(uint32_t rt, uint32_t rn, int imm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8500e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_mla(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xfb000000 | (rn << 16) | (ra << 12) | (rd << 8) | rm, - }; -} - -thumb_opcode th_mls(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xfb000010 | (rn << 16) | (ra << 12) | (rd << 8) | rm, - }; -} - -thumb_opcode th_mrs(uint32_t rd, uint32_t specreg) -{ - if (rd == R_SP || rd == R_PC) - { - tcc_error("compiler_error: 'th_msr', SP or PC can't be used as rd\n"); - return (thumb_opcode){0, 0}; - } - if (specreg > 0xff) - { - tcc_error("compiler_error: 'th_msr', invalid special register\n"); - return (thumb_opcode){0, 0}; - } - - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3ef8000 | (rd << 8) | specreg, - }; -} - -thumb_opcode th_msr(uint32_t specreg, uint32_t rn, uint32_t mask) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3808000 | (mask << 10) | (rn << 16) | specreg, - }; -} - -thumb_opcode th_mvn_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - - uint32_t S = (flags == FLAGS_BEHAVIOUR_SET) ? 
1 : 0; - uint32_t packed = th_pack_const(imm); - if (packed == 0) - { - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf06f0000 | (S << 20) | (rd << 8) | packed, - }; -} - -thumb_opcode th_mvn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - if (rd == rn && rd < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - return (thumb_opcode){ - .size = 2, - .opcode = (0x43c0 | (rm << 3) | rd), - }; - } - return th_generic_op_reg_shift_with_status(0xea6f, rd, rn, rm, flags, shift); -} - -thumb_opcode th_orn_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding) -{ - uint32_t S = (flags == FLAGS_BEHAVIOUR_SET) ? 1 : 0; - uint32_t packed = th_pack_const(imm); - if (packed || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf0600000 | (S << 20) | (rd << 8) | (rn << 16) | packed, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_orn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding) -{ - return th_generic_op_reg_shift_with_status(0xea60, rd, rn, rm, flags, shift); -} - -thumb_opcode th_pkhbt(uint32_t rd, uint32_t rn, uint32_t rm, thumb_shift shift) -{ - const uint32_t imm2 = shift.value & 0x3; - const uint32_t imm3 = (shift.value >> 2) & 0x7; - uint32_t tb = 0; - if (shift.type == THUMB_SHIFT_LSL || shift.value == 0) - { - tb = 0; - } - else if (shift.type == THUMB_SHIFT_ASR) - { - tb = 1; - } - else - { - tcc_error("compiler_error: 'th_pkhbt', invalid shift type\n"); - return (thumb_opcode){0, 0}; - } - - return (thumb_opcode){ - .size = 4, - .opcode = 0xeac00000 | rn << 16 | imm3 << 12 | rd << 8 | imm2 << 6 | tb << 5 | rm, - }; -} - -thumb_opcode th_pld_literal(int imm) -{ - int u = 1; - if (imm < 0) - { - u = 
0; - imm = -imm; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf81ff000 | u << 23 | imm, - }; -} - -thumb_opcode th_pld_imm(uint32_t rn, uint32_t w, int imm) -{ - if (imm >= 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf890f000 | w << 22 | rn << 16 | imm, - }; - } - imm = -imm; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf810fc00 | w << 22 | rn << 16 | imm, - }; -} - -thumb_opcode th_pld_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift) -{ - if (shift.type == THUMB_SHIFT_NONE) - { - shift.type = THUMB_SHIFT_LSL; - } - if (shift.type != THUMB_SHIFT_LSL || shift.value > 3 || shift.value < 0) - { - tcc_error("compiler_error: 'th_pld_reg', invalid shift type\n"); - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf810f000 | w << 22 | rn << 16 | rm | shift.value << 4, - }; -} - -thumb_opcode th_pli_literal(int imm) -{ - int u = 1; - if (imm < 0) - { - u = 0; - imm = -imm; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf91ff000 | u << 23 | imm, - }; -} - -thumb_opcode th_pli_imm(uint32_t rn, uint32_t w, int imm) -{ - if (imm >= 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf990f000 | w << 22 | rn << 16 | imm, - }; - } - imm = -imm; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf910fc00 | w << 22 | rn << 16 | imm, - }; -} - -thumb_opcode th_pli_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift) -{ - if (shift.type == THUMB_SHIFT_NONE) - { - shift.type = THUMB_SHIFT_LSL; - } - if (shift.type != THUMB_SHIFT_LSL || shift.value > 3 || shift.value < 0) - { - tcc_error("compiler_error: 'th_pli_reg', invalid shift type\n"); - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf910f000 | w << 22 | rn << 16 | rm | shift.value << 4, - }; -} - -thumb_opcode th_rbit(uint32_t rd, uint32_t rm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa90f0a0 | (rm << 16) | (rd << 8) | rm, - }; -} - -thumb_opcode th_rev(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding) -{ - if (rd < 8 && rm < 
8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xba00 | (rm << 3) | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa90f080 | (rm << 16) | (rd << 8) | rm, - }; -} - -thumb_opcode th_rev16(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding) -{ - if (rd < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xba40 | (rm << 3) | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa90f090 | (rm << 16) | (rd << 8) | rm, - }; -} - -thumb_opcode th_revsh(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding) -{ - if (rd < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xbac0 | (rm << 3) | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa90f0b0 | (rm << 16) | (rd << 8) | rm, - }; -} - -thumb_opcode th_sbfx(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width) -{ - const uint32_t imm2 = lsb & 0x3; - const uint32_t imm3 = (lsb >> 2) & 0x7; - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3400000 | (rn << 16) | (rd << 8) | (imm3 << 12) | (imm2 << 6) | (width - 1), - }; -} - -thumb_opcode th_smlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xfbc00000 | (rn << 16) | (rdlo << 12) | (rdhi << 8) | rm, - }; -} - -thumb_opcode th_smull(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xfb800000 | (rn << 16) | (rdlo << 12) | (rdhi << 8) | rm, - }; -} - -thumb_opcode th_ssat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift) -{ - const uint32_t sh = (shift.type == THUMB_SHIFT_LSL) ? 
0 : 1; - const uint32_t imm2 = shift.value & 0x3; - const uint32_t imm3 = (shift.value >> 2) & 0x7; - - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3000000 | (sh << 21) | (rn << 16) | (imm3 << 12) | (rd << 8) | (imm2 << 6) | (imm - 1), - }; -} - -thumb_opcode th_usat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift) -{ - const uint32_t sh = (shift.type == THUMB_SHIFT_LSL) ? 0 : 1; - const uint32_t imm2 = shift.value & 0x3; - const uint32_t imm3 = (shift.value >> 2) & 0x7; - - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3800000 | (sh << 21) | (rn << 16) | (imm3 << 12) | (rd << 8) | (imm2 << 6) | imm, - }; -} - -thumb_opcode th_ssbb() -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3bf8f40, - }; -} - -thumb_opcode th_stl(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00faf | rn << 16 | rt << 12, - }; -} - -thumb_opcode th_stlb(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00f8f | rn << 16 | rt << 12, - }; -} - -thumb_opcode th_stlex(uint32_t rd, uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00fe0 | rn << 16 | rt << 12 | rd, - }; -} - -thumb_opcode th_stlexb(uint32_t rd, uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00fc0 | rn << 16 | rt << 12 | rd, - }; -} - -thumb_opcode th_stlexh(uint32_t rd, uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00fd0 | rn << 16 | rt << 12 | rd, - }; -} - -thumb_opcode th_stlh(uint32_t rt, uint32_t rn) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00f9f | rn << 16 | rt << 12, - }; -} - -thumb_opcode th_stm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding) -{ - if (rn < 8 && regset <= 0xff && encoding != ENFORCE_ENCODING_32BIT && writeback == 1) - { - if (writeback) - { - regset &= ~(1 << rn); - } - else - { - regset |= 1 << rn; - } - return (thumb_opcode){ - .size = 2, - .opcode 
= 0xc000 | rn << 8 | regset, - }; - }; - - if (!(writeback && (regset & (1 << rn)))) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8800000 | (writeback << 21) | (rn << 16) | regset, - }; - } - - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_stmdb(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding) -{ - - if (rn == R_SP && encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xb400 | writeback << 8 | (regset & 0xff), - }; - } - - return (thumb_opcode){ - .size = 4, - .opcode = 0xe9000000 | (writeback << 21) | (rn << 16) | regset, - }; -} - -thumb_opcode th_str_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ - // puw == 6 means positive offset on rn, so T1 encoding can be used - if (puw == 6 && rn < 8 && rt < 8 && imm <= 124 && !(imm & 3) && encoding != ENFORCE_ENCODING_32BIT) - { - // imm[0] is enforced to be 0, and sould be divided by 4, thus offset is 4 - THOP_TRACE("str %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x6000 | (imm << 4) | (rn << 3) | rt, - }; - } - else if (puw == 6 && rn == R_SP && rt < 8 && imm <= 1020 && encoding != ENFORCE_ENCODING_32BIT) - { - THOP_TRACE("str %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 2, - .opcode = 0x9000 | (rt << 8) | (imm >> 2), - }; - } -#ifndef TCC_TARGET_ARM_ARCHV6M - else if (puw == 6 && imm <= 4095 && rn != R_PC) - { - uint32_t ins = (0xf8c0 | (rn & 0xf)) << 16; - ins |= (rt << 12) | imm; - THOP_TRACE("str %s, [%s, #%d]\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = ins, - }; - } - else if (imm >= 0 && imm <= 4095 && rn == R_PC) - { - uint32_t u = (puw & 0x2) >> 1; - THOP_TRACE("str %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? 
'+' : '-', imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf85f0000 | (u << 23) | (rt << 12) | imm, - }; - } - else if (imm <= 255) - { - uint32_t ins = (0xf840 | (rn & 0xf)) << 16; - ins |= (0x0800 | ((rt & 0xf) << 12) | ((puw & 0x7) << 8) | imm); - { -#if THOP_TRACE_ENABLED - const uint32_t p = (puw >> 2) & 1; - const uint32_t u = (puw >> 1) & 1; - const uint32_t w = (puw >> 0) & 1; - if (p && !w) - { - THOP_TRACE("str %s, [%s, #%c%d]\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (p && w) - { - THOP_TRACE("str %s, [%s, #%c%d]!\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else if (!p && w) - { - THOP_TRACE("str %s, [%s], #%c%d\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm); - } - else - { - THOP_TRACE("str %s, [%s, #%c%d] (puw=%u)\n", th_reg_name(rt), th_reg_name(rn), u ? '+' : '-', imm, - (unsigned)puw); - } -#endif - } - return (thumb_opcode){ - .size = 4, - .opcode = ins, - }; - } -#endif - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_strbt(uint32_t rt, uint32_t rn, int imm) -{ - THOP_TRACE("strbt %s, [%s], #%d\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8000e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_strd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding) -{ - const uint32_t pu = (puw >> 1) & 0x3; - const uint32_t w = puw & 0x1; - THOP_TRACE("strd %s, %s, [%s, #%d]%s\n", th_reg_name(rt), th_reg_name(rt2), th_reg_name(rn), imm, w ? "!" 
: ""); - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8400000 | (pu << 23) | w << 21 | rn << 16 | rt << 12 | rt2 << 8 | (imm >> 2), - }; -} - -thumb_opcode th_strex(uint32_t rd, uint32_t rt, uint32_t rn, int imm) -{ - if (imm < 0 || imm > 1020) - { - tcc_error("compiler_error: 'th_strex' imm is outside of range: 0x%x, max " - "value: 0x3fc\n", - imm); - } - THOP_TRACE("strex %s, %s, [%s, #%d]\n", th_reg_name(rd), th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8400000 | (rn << 16) | (rt << 12) | (rd << 8) | (imm >> 2), - }; -} - -thumb_opcode th_strexb(uint32_t rd, uint32_t rt, uint32_t rn) -{ - THOP_TRACE("strexb %s, %s, [%s]\n", th_reg_name(rd), th_reg_name(rt), th_reg_name(rn)); - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00f40 | (rn << 16) | (rt << 12) | rd, - }; -} - -thumb_opcode th_strexh(uint32_t rd, uint32_t rt, uint32_t rn) -{ - THOP_TRACE("strexh %s, %s, [%s]\n", th_reg_name(rd), th_reg_name(rt), th_reg_name(rn)); - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8c00f50 | (rn << 16) | (rt << 12) | rd, - }; -} - -thumb_opcode th_strht(uint32_t rt, uint32_t rn, int imm) -{ - THOP_TRACE("strht %s, [%s], #%d\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8200e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_strt(uint32_t rt, uint32_t rn, int imm) -{ - THOP_TRACE("strt %s, [%s], #%d\n", th_reg_name(rt), th_reg_name(rn), imm); - return (thumb_opcode){ - .size = 4, - .opcode = 0xf8400e00 | (rn << 16) | (rt << 12) | (imm & 0xff), - }; -} - -thumb_opcode th_sxtb(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - - const uint32_t rotate = shift.value >> 3; - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_ROR) - { - tcc_error("compiler_error: 'th_sxtb', invalid shift type\n"); - return (thumb_opcode){0, 0}; - } - - if (shift.value != 0 && shift.value != 8 && shift.value != 16 && 
shift.value != 24) - { - tcc_error("compiler_error: 'th_sxtb', invalid shift value\n"); - return (thumb_opcode){0, 0}; - } - - if (rd < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT && (shift.type == THUMB_SHIFT_NONE || shift.value == 0)) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xb240 | (rm << 3) | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa4ff080 | rd << 8 | rm | rotate << 4, - }; -} - -thumb_opcode th_sxth(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - - const uint32_t rotate = shift.value >> 3; - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_ROR) - { - tcc_error("compiler_error: 'th_sxth', invalid shift type\n"); - return (thumb_opcode){0, 0}; - } - - if (shift.value != 0 && shift.value != 8 && shift.value != 16 && shift.value != 24) - { - tcc_error("compiler_error: 'th_sxth', invalid shift value\n"); - return (thumb_opcode){0, 0}; - } - - if (rd < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT && (shift.type == THUMB_SHIFT_NONE || shift.value == 0)) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xb200 | (rm << 3) | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa0ff080 | rd << 8 | rm | rotate << 4, - }; -} - -thumb_opcode th_tbb(uint32_t rn, uint32_t rm, uint32_t h) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe8d0f000 | (rn << 16) | rm | h << 4, - }; -} - -thumb_opcode th_teq(uint32_t rn, uint32_t imm) -{ - const uint32_t packed = th_pack_const(imm); - if (packed || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf0900f00 | (rn << 16) | packed, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - -thumb_opcode th_tst_imm(uint32_t rn, uint32_t imm) -{ - const uint32_t packed = th_pack_const(imm); - if (packed || imm == 0) - { - return (thumb_opcode){ - .size = 4, - .opcode = 0xf0100f00 | (rn << 16) | packed, - }; - } - return (thumb_opcode){ - .size = 0, - .opcode = 0, - }; -} - 
-thumb_opcode th_tst_reg(uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - if (rn < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT && shift.type == THUMB_SHIFT_NONE) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0x4200 | (rm << 3) | rn, - }; - } - return th_generic_op_reg_shift_with_status(0xea10, 0xf, rn, rm, FLAGS_BEHAVIOUR_NOT_IMPORTANT, shift); -} - -thumb_opcode th_tt(uint32_t rd, uint32_t rn, uint32_t a, uint32_t t) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xe840f000 | rn << 16 | rd << 8 | a << 7 | t << 6, - }; -} - -thumb_opcode th_udf(uint32_t imm, thumb_enforce_encoding encoding) -{ - const uint32_t imm4 = (imm >> 12) & 0xf; - const uint32_t imm12 = imm & 0xfff; - - if (encoding != ENFORCE_ENCODING_32BIT && imm <= 0xff) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xde00 | imm, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf7f0a000 | imm4 << 16 | imm12, - }; -} - -thumb_opcode th_umlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm) -{ - return (thumb_opcode){ - .size = 4, - .opcode = 0xfbe00000 | (rn << 16) | (rdlo << 12) | (rdhi << 8) | rm, - }; -} - -thumb_opcode th_uxtb(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - - const uint32_t rotate = shift.value >> 3; - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_ROR) - { - tcc_error("compiler_error: 'th_uxtb', invalid shift type\n"); - return (thumb_opcode){0, 0}; - } - - if (shift.value != 0 && shift.value != 8 && shift.value != 16 && shift.value != 24) - { - tcc_error("compiler_error: 'th_uxtb', invalid shift value\n"); - return (thumb_opcode){0, 0}; - } - - if (rd < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT && (shift.type == THUMB_SHIFT_NONE || shift.value == 0)) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xb2c0 | (rm << 3) | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa5ff080 | rd << 8 | rm | rotate << 4, - }; -} - 
-thumb_opcode th_uxth(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding) -{ - - const uint32_t rotate = shift.value >> 3; - if (shift.type != THUMB_SHIFT_NONE && shift.type != THUMB_SHIFT_ROR) - { - tcc_error("compiler_error: 'th_uxth', invalid shift type\n"); - return (thumb_opcode){0, 0}; - } - - if (shift.value != 0 && shift.value != 8 && shift.value != 16 && shift.value != 24) - { - tcc_error("compiler_error: 'th_uxth', invalid shift value\n"); - return (thumb_opcode){0, 0}; - } - - if (rd < 8 && rm < 8 && encoding != ENFORCE_ENCODING_32BIT && (shift.type == THUMB_SHIFT_NONE || shift.value == 0)) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xb280 | (rm << 3) | rd, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xfa1ff080 | rd << 8 | rm | rotate << 4, - }; -} - -thumb_opcode th_wfe(thumb_enforce_encoding encoding) -{ - if (encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xbf20, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3af8002, - }; -} - -thumb_opcode th_wfi(thumb_enforce_encoding encoding) -{ - if (encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xbf30, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3af8003, - }; -} - -thumb_opcode th_yield(thumb_enforce_encoding encoding) -{ - if (encoding != ENFORCE_ENCODING_32BIT) - { - return (thumb_opcode){ - .size = 2, - .opcode = 0xbf10, - }; - } - return (thumb_opcode){ - .size = 4, - .opcode = 0xf3af8001, - }; -} - -// Thumb ELF management -// Start of T32 instructions -void th_sym_t() -{ - const int info = ELFW(ST_INFO)(STB_LOCAL, STT_NOTYPE); - set_elf_sym(symtab_section, ind, 0, info, 0, 1, "$t"); -} - -// Start of A32 instructions -void th_sym_a() -{ - const int info = ELFW(ST_INFO)(STB_LOCAL, STT_NOTYPE); - set_elf_sym(symtab_section, ind, 0, info, 0, 1, "$a"); -} - -// Start of data -void th_sym_d() -{ - const int info = ELFW(ST_INFO)(STB_LOCAL, 
STT_NOTYPE); - set_elf_sym(symtab_section, ind, 0, info, 0, 1, "$d"); -} diff --git a/arm-thumb-opcodes.h b/arm-thumb-opcodes.h deleted file mode 100644 index f08e9e9a..00000000 --- a/arm-thumb-opcodes.h +++ /dev/null @@ -1,456 +0,0 @@ -/* - * ARMvX-m opcodes for TCC - * Uses thumb instruction set - * - * Based on: - * ARM Thumb 2 instruction functions for TCC - * Copyright (c) 2020 Erlend J. Sveen - * from: - * https://git.erlendjs.no/erlendjs/tinycc/-/blob/arm-thumb/arm-thumb-gen.c - * https://git.erlendjs.no/erlendjs/tinycc/-/blob/arm-thumb/arm-thumb-instructions.c - * - * And - * - * ARMv4 code generator for TCC - * - * Copyright (c) 2003 Daniel Glöckner - * Copyright (c) 2012 Thomas Preud'homme - * - * Based on i386-gen.c by Fabrice Bellard - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#pragma once - -#include -#include -#include - -/* Optional mnemonic-style tracing for opcode builders (th_*). - * Enable with e.g.: make CFLAGS+='-DTHUMB_OPCODE_TRACE=1' - * Printed output goes to stderr. - */ -#ifndef THUMB_OPCODE_TRACE -#define THUMB_OPCODE_TRACE 0 -#endif - -#if THUMB_OPCODE_TRACE -#define THOP_TRACE(...) fprintf(stderr, __VA_ARGS__) -#else -#define THOP_TRACE(...) 
\ - do \ - { \ - } while (0) -#endif - -#ifndef TCC_DEBUG -#define TCC_DEBUG 0 -#endif - -#define TRACE(...) -#define LOG(...) - -#if TCC_DEBUG == 1 || TCC_DEBUG == 2 -#undef LOG -#define LOG(...) \ - printf("[INF]: "); \ - printf(__VA_ARGS__); \ - printf("\n") -#endif - -#if TCC_DEBUG == 2 -#undef TRACE -#define TRACE(...) \ - printf("[TRC]: "); \ - printf(__VA_ARGS__); \ - printf("\n") -#endif - - -#define ceil_div(x, d) ((x + (d - 1)) / d) - -#define R0 0 -#define R1 1 -#define R2 2 -#define R3 3 -#define R4 4 -#define R5 5 -#define R6 6 -#define R7 7 -#define R8 8 -#define R9 9 -#define R10 10 -#define R11 11 -#define R12 12 -#define R_IP R12 -#define R_SP 13 -#define R_LR 14 -#define R_PC 15 - -#define R_IP R12 -#define R_FP R7 - -typedef enum -{ - FLAGS_BEHAVIOUR_NOT_IMPORTANT = 0, - FLAGS_BEHAVIOUR_SET = 1, - FLAGS_BEHAVIOUR_BLOCK = 2, -} thumb_flags_behaviour; - -typedef enum -{ - ENFORCE_ENCODING_NONE = 0, - ENFORCE_ENCODING_16BIT = 1, - ENFORCE_ENCODING_32BIT = 2, -} thumb_enforce_encoding; - -typedef struct thumb_opcode -{ - uint8_t size; - uint32_t opcode; -} thumb_opcode; - -typedef enum thumb_shift_type -{ - THUMB_SHIFT_NONE, - THUMB_SHIFT_RRX, - THUMB_SHIFT_LSL, - THUMB_SHIFT_LSR, - THUMB_SHIFT_ASR, - THUMB_SHIFT_ROR, -} thumb_shift_type; - -typedef enum thumb_shift_mode -{ - THUMB_SHIFT_IMMEDIATE, - THUMB_SHIFT_REGISTER, -} thumb_shift_mode; - -typedef struct thumb_shift -{ - thumb_shift_type type; - uint32_t value; - thumb_shift_mode mode; -} thumb_shift; - -#define THUMB_SHIFT_DEFAULT \ - (thumb_shift) \ - { \ - .type = THUMB_SHIFT_NONE, .value = 0, .mode = THUMB_SHIFT_IMMEDIATE \ - } - -uint32_t th_packimm_10_11_0(uint32_t imm); -uint32_t th_packimm_3_8_1(uint32_t imm); - -uint32_t th_pack_const(uint32_t imm); -uint32_t th_encbranch_b_t3(uint32_t imm); - -uint32_t th_encbranch(int pos, int addr); -uint32_t th_encbranch_8(int pos, int addr); -uint32_t th_encbranch_11(int pos, int addr); -uint32_t th_encbranch_20(int pos, int addr); -uint32_t 
th_encbranch_24(int pos, int addr); - -thumb_opcode th_nop(thumb_enforce_encoding encoding); -thumb_opcode th_sev(thumb_enforce_encoding encoding); - -thumb_opcode th_bx_reg(uint16_t rm); -thumb_opcode th_bl_t1(uint32_t imm); -thumb_opcode th_blx_reg(uint16_t rm); -thumb_opcode th_b_t1(uint32_t cond, uint32_t imm8); -thumb_opcode th_b_t2(int32_t imm11); -thumb_opcode th_b_t3(uint32_t op, uint32_t imm); -thumb_opcode th_b_t4(int32_t imm); -thumb_opcode th_cbz(uint16_t rn, uint32_t imm, uint32_t nonzero); - -thumb_opcode th_mov_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding, bool in_it); - -thumb_opcode th_mov_imm(uint16_t rd, uint32_t imm, thumb_flags_behaviour setflags, thumb_enforce_encoding encoding); - -thumb_opcode th_movt(uint32_t rd, uint32_t imm16); - -thumb_opcode th_mov_reg_shift(uint32_t rd, uint32_t rm, uint32_t rs, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); - -thumb_opcode th_generic_op_imm_with_status(uint16_t op, uint16_t rd, uint16_t rn, uint32_t imm, - thumb_flags_behaviour setflags); -thumb_opcode th_generic_op_imm(uint16_t op, uint16_t rd, uint16_t rn, uint32_t imm); - -thumb_opcode th_generic_op_reg_shift_with_status(uint32_t op, uint32_t rd, uint32_t rn, uint32_t rm, - thumb_flags_behaviour setflags, thumb_shift shift); - -thumb_opcode th_add_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); - -thumb_opcode th_add_imm_t4(uint32_t rd, uint32_t rn, uint32_t imm); - -thumb_opcode th_add_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); - -thumb_opcode th_adr_imm(uint32_t rd, int imm, thumb_enforce_encoding encoding); - -thumb_opcode th_bic_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_bic_reg(uint32_t rd, uint32_t rn, uint32_t rm, 
thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); - -thumb_opcode th_and_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags, - thumb_enforce_encoding encoding); -thumb_opcode th_and_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_xor_reg(uint16_t rd, uint16_t rn, uint16_t rm); -thumb_opcode th_xor_imm(uint16_t rd, uint16_t rn, uint32_t imm); - -thumb_opcode th_rsb_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags, - thumb_enforce_encoding encoding); -thumb_opcode th_rsb_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); - -thumb_opcode th_sub_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_adc_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_adc_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour setflags, - thumb_enforce_encoding encoding); - -thumb_opcode th_sbc_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_sbc_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); - -thumb_opcode th_orr_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_cmp_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_orr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_sub_imm(uint32_t rd, uint32_t rn, uint32_t imm, 
thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_sub_imm_t4(uint32_t rd, uint32_t rn, uint32_t imm); - -thumb_opcode th_push(uint32_t regs); -int th_ldr_literal_estimate(uint16_t rt, uint32_t imm); -thumb_opcode th_ldrsh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_ldrsh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); -thumb_opcode th_ldrh_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_ldrh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); -thumb_opcode th_ldrsb_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_ldrsb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); -thumb_opcode th_ldrb_imm(uint16_t rt, uint16_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_ldrb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); -thumb_opcode th_ldr_imm(uint32_t rt, uint32_t rn, int imm, uint32_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_ldr_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); -thumb_opcode th_ldr_literal(uint16_t rt, uint32_t imm, uint32_t add); - -thumb_opcode th_pop(uint16_t regs); -thumb_opcode th_strh_imm(uint16_t rt, uint16_t rn, int imm, uint16_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_strh_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); -thumb_opcode th_strb_imm(uint16_t rt, uint16_t rn, int imm, uint16_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_strb_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); -thumb_opcode th_str_imm(uint32_t rt, uint32_t rn, int imm, 
uint32_t puw, thumb_enforce_encoding encoding); -thumb_opcode th_str_reg(uint32_t rt, uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); - -thumb_opcode th_mul(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_umull(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm); -thumb_opcode th_udiv(uint16_t rd, uint16_t rn, uint16_t rm); -thumb_opcode th_sdiv(uint16_t rd, uint16_t rn, uint16_t rm); - -thumb_opcode th_add_sp_imm_t4(uint32_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); -thumb_opcode th_add_sp_imm(uint16_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); -thumb_opcode th_add_sp_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding, - thumb_shift shift); - -thumb_opcode th_shift_armv7m(uint16_t rd, uint16_t rm, uint32_t imm, uint32_t type, thumb_flags_behaviour setflags); - -thumb_opcode th_lsl_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_lsl_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_lsr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_lsr_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_asr_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_asr_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_ror_reg(uint16_t rd, uint16_t rn, uint16_t rm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_ror_imm(uint16_t rd, 
uint16_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); - -thumb_opcode th_cmp_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); - -/* VFP arithmetic instructions */ -thumb_opcode th_vadd_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); -thumb_opcode th_vsub_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); -thumb_opcode th_vmul_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); -thumb_opcode th_vdiv_f(uint32_t vd, uint32_t vn, uint32_t vm, uint32_t sz); -thumb_opcode th_vneg_f(uint32_t vd, uint32_t vm, uint32_t sz); -thumb_opcode th_vcmp_f(uint32_t vd, uint32_t vm, uint32_t sz); -thumb_opcode th_vcmpe_f(uint32_t vd, uint32_t vm, uint32_t sz); - -thumb_opcode th_vpush(uint32_t regs, uint32_t is_doubleword); -thumb_opcode th_vpop(uint32_t regs, uint32_t is_doubleword); -thumb_opcode th_vmov_register(uint16_t vd, uint16_t vm, uint32_t sz); -thumb_opcode th_vldr(uint32_t rn, uint32_t vd, uint32_t add, uint32_t is_doubleword, uint32_t imm); -thumb_opcode th_vstr(uint32_t rn, uint32_t vd, uint32_t add, uint32_t is_doubleword, uint32_t imm); -thumb_opcode th_vmov_gp_sp(uint16_t rt, uint16_t sn, uint16_t to_arm_register); -thumb_opcode th_vmov_2gp_dp(uint16_t rt, uint16_t rt2, uint16_t dm, uint16_t to_arm_register); - -thumb_opcode th_sub_sp_imm(uint32_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); - -thumb_opcode th_sub_sp_imm_t3(uint32_t rd, uint32_t imm, thumb_flags_behaviour flags, thumb_enforce_encoding encoding); - -thumb_opcode th_sub_sp_reg(uint32_t rd, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); - -thumb_opcode th_vmrs(uint16_t rt); -thumb_opcode th_vcvt_float_to_double(uint32_t vd, uint32_t vm); -thumb_opcode th_vcvt_double_to_float(uint32_t vd, uint32_t vm); -thumb_opcode th_vcvt_fp_int(uint32_t vd, uint32_t vm, uint32_t opc, uint32_t sz, uint32_t op); - -/* Helper 
function for VCVT conversions with type strings */ -thumb_opcode th_vcvt_convert(uint32_t vd, uint32_t vm, const char *dest_type, const char *src_type); - -thumb_opcode th_it(uint16_t condition, uint16_t mask); - -thumb_opcode th_clrex(); -thumb_opcode th_svc(uint32_t imm); -thumb_opcode th_bkpt(uint32_t imm); - -thumb_opcode th_bfc(uint32_t rd, uint32_t lsb, uint32_t width); -thumb_opcode th_bfi(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width); - -thumb_opcode th_clz(uint32_t rd, uint32_t rm); - -thumb_opcode th_cmn_imm(uint32_t rn, uint32_t imm); -thumb_opcode th_cmn_reg(uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); - -thumb_opcode th_cps(uint32_t enable, uint32_t i, uint32_t f); -thumb_opcode th_csdb(); -thumb_opcode th_dmb(uint32_t option); -thumb_opcode th_dsb(uint32_t option); -thumb_opcode th_isb(uint32_t option); - -thumb_opcode th_eor_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_eor_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); - -thumb_opcode th_lda(uint32_t rd, uint32_t rn); -thumb_opcode th_ldab(uint32_t rd, uint32_t rn); -thumb_opcode th_ldaex(uint32_t rd, uint32_t rn); -thumb_opcode th_ldaexb(uint32_t rd, uint32_t rn); -thumb_opcode th_ldaexh(uint32_t rd, uint32_t rn); -thumb_opcode th_ldah(uint32_t rd, uint32_t rn); - -thumb_opcode th_ldm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding); -thumb_opcode th_ldmdb(uint32_t rn, uint32_t regset, uint32_t writeback); -thumb_opcode th_ldrbt(uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_ldrd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw, - thumb_enforce_encoding encoding); - -thumb_opcode th_ldrex(uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_ldrexb(uint32_t rt, uint32_t rn); -thumb_opcode th_ldrexh(uint32_t rt, uint32_t rn); -thumb_opcode 
th_ldrht(uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_ldrsbt(uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_ldrsht(uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_ldrt(uint32_t rt, uint32_t rn, int imm); - -thumb_opcode th_mla(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra); -thumb_opcode th_mls(uint32_t rd, uint32_t rn, uint32_t rm, uint32_t ra); -thumb_opcode th_mrs(uint32_t rd, uint32_t specreg); -thumb_opcode th_msr(uint32_t specreg, uint32_t rn, uint32_t mask); - -thumb_opcode th_mvn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_mvn_imm(uint32_t rd, uint32_t rm, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); -thumb_opcode th_orn_reg(uint32_t rd, uint32_t rn, uint32_t rm, thumb_flags_behaviour flags, thumb_shift shift, - thumb_enforce_encoding encoding); -thumb_opcode th_orn_imm(uint32_t rd, uint32_t rn, uint32_t imm, thumb_flags_behaviour flags, - thumb_enforce_encoding encoding); - -thumb_opcode th_pkhbt(uint32_t rd, uint32_t rn, uint32_t rm, thumb_shift shift); - -thumb_opcode th_pld_literal(int imm); -thumb_opcode th_pld_imm(uint32_t rn, uint32_t w, int imm); -thumb_opcode th_pld_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift); -thumb_opcode th_pli_literal(int imm); -thumb_opcode th_pli_imm(uint32_t rn, uint32_t w, int imm); -thumb_opcode th_pli_reg(uint32_t rn, uint32_t rm, uint32_t w, thumb_shift shift); - -thumb_opcode th_rbit(uint32_t rd, uint32_t rm); -thumb_opcode th_rev(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding); -thumb_opcode th_rev16(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding); -thumb_opcode th_revsh(uint32_t rd, uint32_t rm, thumb_enforce_encoding encoding); - -thumb_opcode th_sbfx(uint32_t rd, uint32_t rn, uint32_t lsb, uint32_t width); -thumb_opcode th_smlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm); -thumb_opcode th_smull(uint32_t rdlo, 
uint32_t rdhi, uint32_t rn, uint32_t rm); - -thumb_opcode th_ssat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift); -thumb_opcode th_usat(uint32_t rd, uint32_t imm, uint32_t rn, thumb_shift shift); - -thumb_opcode th_ssbb(); - -thumb_opcode th_stl(uint32_t rt, uint32_t rn); -thumb_opcode th_stlb(uint32_t rt, uint32_t rn); -thumb_opcode th_stlex(uint32_t rd, uint32_t rt, uint32_t rn); -thumb_opcode th_stlexb(uint32_t rd, uint32_t rt, uint32_t rn); -thumb_opcode th_stlexh(uint32_t rd, uint32_t rt, uint32_t rn); -thumb_opcode th_stlh(uint32_t rt, uint32_t rn); -thumb_opcode th_stm(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding); -thumb_opcode th_stmdb(uint32_t rn, uint32_t regset, uint32_t writeback, thumb_enforce_encoding encoding); -thumb_opcode th_strbt(uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_strd_imm(uint32_t rt, uint32_t rt2, uint32_t rn, int imm, uint32_t puw, - thumb_enforce_encoding encoding); -thumb_opcode th_strex(uint32_t rd, uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_strexb(uint32_t rd, uint32_t rt, uint32_t rn); -thumb_opcode th_strexh(uint32_t rd, uint32_t rt, uint32_t rn); -thumb_opcode th_strht(uint32_t rt, uint32_t rn, int imm); -thumb_opcode th_strt(uint32_t rt, uint32_t rn, int imm); - -thumb_opcode th_sxtb(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); - -thumb_opcode th_sxth(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); - -thumb_opcode th_tbb(uint32_t rn, uint32_t rm, uint32_t h); - -thumb_opcode th_teq(uint32_t rn, uint32_t imm); -thumb_opcode th_tst_imm(uint32_t rn, uint32_t imm); -thumb_opcode th_tst_reg(uint32_t rn, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); - -thumb_opcode th_tt(uint32_t rd, uint32_t rn, uint32_t a, uint32_t t); -thumb_opcode th_udf(uint32_t imm, thumb_enforce_encoding encoding); -thumb_opcode th_umlal(uint32_t rdlo, uint32_t rdhi, uint32_t rn, uint32_t rm); - 
-thumb_opcode th_uxtb(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); - -thumb_opcode th_uxth(uint32_t rd, uint32_t rm, thumb_shift shift, thumb_enforce_encoding encoding); - -thumb_opcode th_wfe(thumb_enforce_encoding encoding); -thumb_opcode th_wfi(thumb_enforce_encoding encoding); -thumb_opcode th_yield(thumb_enforce_encoding encoding); - -void th_sym_t(); -void th_sym_a(); -void th_sym_d(); \ No newline at end of file diff --git a/arm-thumb-scratch.c b/arm-thumb-scratch.c index 4e99e093..d9f20cc0 100644 --- a/arm-thumb-scratch.c +++ b/arm-thumb-scratch.c @@ -2,7 +2,8 @@ #include -#include "arm-thumb-opcodes.h" +#include "arch/arm/thumb/thumb.h" +#include "arch/arm/thumb/thop_block.h" #include "tccls.h" /* Provided by arm-thumb-gen.c */ @@ -59,24 +60,24 @@ ScratchRegAllocs get_scratch_regs_with_save(uint32_t exclude_regs, int count) else { int reg_to_save = -1; - if (!(exclude & (1u << R_IP))) + /* Prefer R0-R3: 16-bit PUSH/POP and 16-bit ALU encoding */ + for (int r = 0; r <= 3; ++r) { - reg_to_save = R_IP; + if (!(exclude & (1u << r))) + { + reg_to_save = r; + break; + } } - else if (ir && ir->leaffunc && !(exclude & (1u << R_LR))) + + if (reg_to_save < 0 && ir && ir->leaffunc && !(exclude & (1u << R_LR))) { reg_to_save = R_LR; } - else + + if (reg_to_save < 0 && !(exclude & (1u << R_IP))) { - for (int r = 0; r <= 3; ++r) - { - if (!(exclude & (1u << r))) - { - reg_to_save = r; - break; - } - } + reg_to_save = R_IP; } if (reg_to_save < 0) @@ -199,24 +200,24 @@ ScratchRegAlloc get_scratch_reg_with_save(uint32_t exclude_regs) } int reg_to_save = -1; - if (!(exclude_regs & (1u << R_IP))) + /* Prefer R0-R3: 16-bit PUSH/POP and 16-bit ALU encoding */ + for (int r = 0; r <= 3; ++r) { - reg_to_save = R_IP; + if (!(exclude_regs & (1u << r))) + { + reg_to_save = r; + break; + } } - else if (ir && ir->leaffunc && !(exclude_regs & (1u << R_LR))) + + if (reg_to_save < 0 && ir && ir->leaffunc && !(exclude_regs & (1u << R_LR))) { reg_to_save = 
R_LR; } - else + + if (reg_to_save < 0 && !(exclude_regs & (1u << R_IP))) { - for (int r = 0; r <= 3; ++r) - { - if (!(exclude_regs & (1u << r))) - { - reg_to_save = r; - break; - } - } + reg_to_save = R_IP; } if (reg_to_save < 0) diff --git a/bugs/01-const-prop-tmp-missing-divmod-folds.md b/bugs/01-const-prop-tmp-missing-divmod-folds.md new file mode 100644 index 00000000..51e45e3b --- /dev/null +++ b/bugs/01-const-prop-tmp-missing-divmod-folds.md @@ -0,0 +1,56 @@ +# 01 — `const_prop_tmp` does not fold IMOD/UMOD/DIV/UDIV/PDIV + +**Status:** FIXED in this branch ([ir/opt_constprop.c:4340-4378](../ir/opt_constprop.c#L4340-L4378)) +**Severity:** Medium — blocks bigger cascades, not a miscompile. + +## Symptom + +`const_prop_tmp`'s two-immediate fold table in [ir/opt_constprop.c:4294-4353](../ir/opt_constprop.c#L4294-L4353) covers +`ADD`/`SUB`/`AND`/`OR`/`XOR`/`SHL`/`SHR`/`SAR`/`ROR`/`MUL`/`UMULL`/`UBFX` +but **not** integer division/remainder. After propagation, an op like +`T11 <-- #-13 IMOD #61` stays in the IR with both operands as immediates +and never folds to `T11 <-- ASSIGN #-13`. + +## Repro + +`tests/gcctestsuite/.../gcc.c-torture/execute/bitfld-1.c` at `-O2`. The +"AFTER LOOP ROTATION" dump shows: + +``` +0008: T11 <-- #-13 IMOD #61 +0009: CMP T11,#-13 +0010: JMP to 13 if "==" +0011: FUNCPARAMVOID FUNCPARAMVOID #131072 +0012: CALL GlobalSym(1137) CALL #131072 ; abort() +``` + +`T11` should fold to `#-13`, `CMP` to a tautology, JMP to unconditional, +and `abort()` to dead code that DCE removes. + +## Why it matters + +Beyond the static fold itself, this stalls **all downstream cascades**: +the `CALL abort()` between a stack STORE and a later stack read keeps +`sl_forward` from forwarding the stored value (it conservatively assumes +a call may clobber memory). Without the fold, the call stays, and the +read-after-store chain never collapses. 
+ +## Fix + +Extend the fold switch with: + +```c +case TCCIR_OP_DIV: +case TCCIR_OP_PDIV: +case TCCIR_OP_UDIV: +case TCCIR_OP_IMOD: +case TCCIR_OP_UMOD: +``` + +each handling `v2 == 0` (and `INT64_MIN / -1` for the signed variants) by +setting `ok = 0` so the fold is skipped on UB inputs. + +## Related + +- [[02]] — without `known_bits`, the operands of these IMOD/UMODs would never *become* both-immediate in the first place. Both bugs together gate the bitfld-1 cascade. +- [[04]] — even after this fold fires, the downstream cleanup needs the pipeline to keep iterating. diff --git a/bugs/02-shl-shr-fold-unequal-amounts.md b/bugs/02-shl-shr-fold-unequal-amounts.md new file mode 100644 index 00000000..87317956 --- /dev/null +++ b/bugs/02-shl-shr-fold-unequal-amounts.md @@ -0,0 +1,59 @@ +# 02 — `SHL N → SHR M` peephole only handles `N == M` + +**Status:** WORKED AROUND via [ir/opt_knownbits.c](../ir/opt_knownbits.c) +**Severity:** Medium — large class of missed folds on bitfield reads. + +## Symptom + +The peephole at [ir/opt_constprop.c:1436-1475](../ir/opt_constprop.c#L1436-L1475) handles only the +byte-/half-cast pattern `SHL #N → SHR #N → AND #mask`: + +```c +if (shl_amt != shr_amt || shl_amt <= 0 || shl_amt >= 32) + continue; +``` + +The bitfield-extract idiom uses **unequal** amounts: + +- 7-bit unsigned bitfield at bit position 7: `SHL #18 → SHR #25` +- 7-bit signed bitfield at bit position 0: `SHL #25 → SAR #25` + +These never collapse. They also can't be folded by `const_prop_tmp` alone +because the source value usually isn't fully constant — only specific bit +ranges are (from a preceding `(x AND mask) OR const` insert). + +## Repro + +bitfld-1's chain after the insert sequence: + +``` +T5 = (...) OR #115 ; bits 0..6 = 115 (= -13 in 7b sign) +T9 = T5 SHL #18 +T10 = T9 SHR #25 ; expect: bits 7..13 of T5 = 61 +T14 = T5 SHL #25 +T15 = T14 SAR #25 ; expect: bits 0..6 sign-ext = -13 +``` + +`const_prop` can fold neither chain. 
The whole abort-test ladder stays alive. + +## Workaround + +Added [ir/opt_knownbits.c](../ir/opt_knownbits.c) — a known-bits lattice (per-temp +and per-stack-slot `known_zero`/`known_one` masks). It propagates through +`AND`/`OR`/`XOR`/`SHL`/`SHR`/`SAR` and rewrites the op to `ASSIGN imm` +when all 32 bits become known. This covers the bitfield extract because +the relevant bits of `T5` are forced known by the preceding inserts even +though `T5`'s full value is not. + +## A simpler, narrower alternative + +For the unequal-shift peephole alone, generalize the existing fold: +when `shl_amt <= shr_amt`, replace with `(x >> (M - N)) & ((1 << (32 - M)) - 1)` +(`SHR` + `AND`). This won't help when the source value is partially known +but not constant — the cascade still needs known-bits — so the workaround +went the more general route. + +## Related + +- [[01]] — even when known_bits folds the SHL/SHR chain to a constant, the downstream IMOD needs the IMOD fold to also fire. +- [[04]] — and the resulting dead `abort()` call needs the pipeline to iterate so `sl_forward` can forward the stack store to subsequent reads. diff --git a/bugs/03-dead-local-slot-missing-lea-deref.md b/bugs/03-dead-local-slot-missing-lea-deref.md new file mode 100644 index 00000000..455fdde6 --- /dev/null +++ b/bugs/03-dead-local-slot-missing-lea-deref.md @@ -0,0 +1,79 @@ +# 03 — `dead_local_slot_elim` ignores STOREs via LEA temp deref + +**Status:** FIXED in this branch via new pass [ir/opt_dead_lea_store.c](../ir/opt_dead_lea_store.c) +**Severity:** Medium — leaves dead bitfield writes after upstream chains collapse. 
+ +## Symptom + +`dead_local_slot_elim` ([ir/opt_memory.c:4406-4441](../ir/opt_memory.c#L4406-L4441)) +only NOPs STOREs whose `dest` operand is a **direct** `StackLoc[X]` form: + +```c +if (q->op != TCCIR_OP_STORE) continue; +IROperand dest = tcc_ir_op_get_dest(ir, q); +if (irop_get_tag(dest) != IROP_TAG_STACKOFF) continue; +if (!dest.is_local || irop_get_vreg(dest) != -1) continue; +``` + +It silently skips the equally common temp-deref form: + +``` +T0 <-- Addr[StackLoc[-4]] +T0***DEREF*** <-- T2 [STORE] +``` + +The `live[]` collection at [ir/opt_memory.c:4273-4342](../ir/opt_memory.c#L4273-L4342) has the same +asymmetry — temp-deref reads aren't registered either, so even the +elimination logic that *does* fire is working from an incomplete picture +of which slots are live. + +## Repro + +bitfld-1 after the [[02]] workaround folds all the bitfield extractors — +the IR collapses to just the two bitfield-insert STOREs: + +``` +0007: R0(T3)***DEREF*** <-- R2(T5) [STORE] ; never read again +0008: RETURNVALUE #0 +``` + +`dead_local_slot_elim` walks past those STOREs (dest tag != STACKOFF), +the stack frame stays, the bitfield computation stays. Final size: +15 instructions vs GCC's 2. + +## Fix + +New pass [ir/opt_dead_lea_store.c](../ir/opt_dead_lea_store.c): + +1. Identify single-def TEMPs whose RHS is `Addr[StackLoc[Y]]` + (single-def required so the slot mapping is stable; lval dests are + skipped from the def count — that's the gotcha from [[05]]). +2. Resolve both STORE dests and lval-source reads through that map, + so the temp-deref form participates in liveness. +3. Eliminate a STORE whose byte range is never read by a later instruction. + +Conservative bails: any IJUMP / SETJMP / INLINE_ASM / VLA in the function, +any non-mem* CALL, any escape of the address to a VAR/PARAM or untracked +TEMP, any mem* `PARAM1` (the source side) with unknown size or unknown +source. 
The existing `dead_local_slot_elim` does similar tameness work +for the direct-stack-ref form — extending its 1500-line implementation +to also recognize the temp-deref shape was deemed higher risk than a +narrower companion pass. + +## Why both passes? + +The two forms cover different upstream sources: + +- Direct `STORE StackLoc[X]` form arises after `sl_forward` canonicalizes + a `LEA + STORE T_DEREF` pair — `dead_local_slot_elim` handles these. +- Temp-deref `STORE T0_DEREF` form survives when `sl_forward` doesn't + canonicalize (the LEA temp is reused, has multi-use shape, etc.). + The new pass handles these. + +A future refactor could unify both into one pass with a slot-resolver +helper, but the current split keeps each pass small and obviously sound. + +## Related + +- [[02]] — without `known_bits` the downstream reads of the slot don't go away, so this pass would correctly leave the STOREs alive. +- [[05]] — gotcha that bit the first attempt at this pass. diff --git a/bugs/04-memory-pipeline-trigger-stall.md b/bugs/04-memory-pipeline-trigger-stall.md new file mode 100644 index 00000000..ebf7879c --- /dev/null +++ b/bugs/04-memory-pipeline-trigger-stall.md @@ -0,0 +1,86 @@ +# 04 — `memory_passes` group stalls when its trigger returns 0 mid-cascade + +**Status:** WORKED AROUND via the `kb_cascade` compound pass in [ir/opt_pipeline.c](../ir/opt_pipeline.c) +**Severity:** Medium — limits how far a single pipeline run can drive a chain reaction. + +## Symptom + +`pipeline_run_group` ([ir/opt_pipeline.c:63-118](../ir/opt_pipeline.c#L63-L118)) iterates a pass +group until the *trigger* pass returns 0: + +```c +if (group->trigger_idx >= 0) { + int tch = trigger->run(ctx); + ... + if (tch <= 0) break; +} +``` + +The `memory_passes` group uses `sl_forward` as its trigger +([ir/opt_pipeline.c:220-232](../ir/opt_pipeline.c#L220-L232)). 
Once `sl_forward` exhausts the +*currently visible* forwarding opportunities, the group exits — even if +other passes in the group (or future iterations) would create new +opportunities for it. + +## Repro + +bitfld-1, iteration 1 of `memory_passes`: + +1. `sl_forward` — forwards stored value into the *first* chain's + re-read. Returns >0. Group continues. +2. `const_cascade`, `known_bits`, `branch_fold_2x`, `dce`, + `elim_fallthru` — together they fold the first chain, kill its + `abort()`, NOP the now-trivial JMP-to-next. + +Iteration 2: + +3. `sl_forward` re-runs on the cleaned-up IR. With the `abort()` call + gone, it *could now* forward the stack store across to the **next** + chain's read. But its analysis returns 0 because the changes from + step 2 haven't been re-discovered as new forwarding sites in this + iteration's pre-scan, **or** sl_forward's incremental check decides + there's nothing new. Group exits. The other three chains never fold. + +End state: only the first of four `abort()` chains is eliminated. + +## Workaround + +A compound pass `kb_cascade` ([ir/opt_pipeline.c:150-169](../ir/opt_pipeline.c#L150-L169)) loops the +relevant subset internally to a fixed point: + +```c +for (int i = 0; i < 8; i++) { + ch += tcc_ir_opt_known_bits(ir); + ch += tcc_ir_opt_const_prop_tmp(ir); + ch += tcc_ir_opt_branch_folding(ir); + tcc_ir_opt_dce(ir); + ch += tcc_ir_opt_eliminate_fallthrough(ir); + tcc_ir_opt_compact_nops(ir); + ch += tcc_ir_opt_sl_forward(ir); + if (!ch) break; +} +``` + +It's added at the end of `memory_passes`. With this, all four bitfld-1 +chains cascade in a single pipeline step. + +## Better fix (deferred) + +The trigger mechanism is a useful optimization (skip the group when +nothing's primed it), but it should be triggered by *any* pass returning +`> 0`, not specifically the indexed trigger. Two options: + +1.
Change `pipeline_run_group` to compute `round_changes` from the full + group and re-iterate while `round_changes > 0`, falling back to the + trigger only as a first-iteration gate. +2. Promote `sl_forward` out of the trigger slot, run the group based on + `round_changes` like the trigger-less groups already do. + +Either change affects every group, so it needs a wider sweep to verify +no group depends on the early-exit behavior. The narrow `kb_cascade` +workaround sidesteps that risk. + +## Related + +- [[02]] — the cascade only matters because `known_bits` *can* fold the chain heads; the trigger stall hid that we needed to. +- [[01]] — the chain head's IMOD fold is what creates the dead `abort()` whose removal lets `sl_forward` continue. diff --git a/bugs/05-var-param-stackoff-encoding.md b/bugs/05-var-param-stackoff-encoding.md new file mode 100644 index 00000000..32ab33d8 --- /dev/null +++ b/bugs/05-var-param-stackoff-encoding.md @@ -0,0 +1,73 @@ +# 05 — VAR/PARAM operands carry `tag=STACKOFF` for their spill slot + +**Status:** DOCUMENTED (footgun, not a bug per se) +**Severity:** Low for existing code; High for new pass authors. + +## What surprised me + +When a VAR or PARAM is referenced via its potential stack-spill encoding, +the operand has: + +- `tag == IROP_TAG_STACKOFF` +- `is_local == 1` +- `is_lval == 1` +- `vreg_type != 0` (the originating VAR/PARAM index) +- `u.imm32` = the spill-slot offset (which may collide with offsets of + real, distinct stack allocations) + +This is **indistinguishable** from a real direct stack reference like +`StackLoc[-4]` (which has `vreg_type == 0`) on every field *except* +`vreg_type`. + +A new pass that filters operands with: + +```c +if (op.tag == IROP_TAG_STACKOFF && op.is_local && op.is_lval) { /* stack ref */ } +``` + +will silently treat a VAR's spill encoding as if it were a real slot. +If the pass also tracks per-stack-slot state (e.g. 
known-bits) and a +real STORE happens to write the *same offset*, it will load that state +when the VAR is read — and miscompile. + +## How it bit me + +`opt_knownbits.c`'s first cut treated `tag=STACKOFF, is_lval, is_local` +as a direct stack read. On +`tests/.../gcc.c-torture/execute/20040313-1.c`, a `V0` variable holding +`d = 0` was encoded as `StackLoc[-4100], vreg_type=VAR, pos=0`. The +array `t[1025]` happened to start at the same offset `-4100`, with +`t[0] = 1024` stored to it shortly before `d`'s read. The pass loaded +the `t[0]` known-bits value (1024) as if it were `d`'s value, computed +`d << 2 = 4096`, and folded that into a downstream address — turning +`t[d=0]` into `t[1024]`. Tests that depended on `d == 0` corrupted at +runtime. + +## Suggested check for new passes + +When treating a `STACKOFF` operand as a real stack slot reference: + +```c +if (op.tag == IROP_TAG_STACKOFF && op.is_local && op.is_lval && + op.vreg_type == 0) /* MUST: no vreg attached */ +{ + /* genuine direct StackLoc[X] ref */ +} +``` + +`vreg_type == 0` (no vreg) is the only encoding for a true direct stack +reference. Anything else is a vreg-backed pseudoreg whose offset field +is metadata about *where it would spill*, not where the program reads +from. + +## Where this would help + +A short comment in [tccir_operand.h](../tccir_operand.h) at the IROperand definition +documenting this case would have saved hours. The existing +`dead_local_slot_elim` already gets it right (it filters +`irop_get_vreg(op) != -1`), but the convention isn't called out +anywhere I could find. + +## Related + +- [[03]] — the same encoding gotcha affects the new dead-LEA-store pass; it uses the same `vreg_type == 0` guard. 
diff --git a/bugs/06-tu-summary-store-indexed-is-lval.md b/bugs/06-tu-summary-store-indexed-is-lval.md new file mode 100644 index 00000000..0d8275ee --- /dev/null +++ b/bugs/06-tu-summary-store-indexed-is-lval.md @@ -0,0 +1,56 @@ +# 06 — `collect_tu_func_summary` missed STORE_INDEXED / STORE_POSTINC writes when `is_lval` was cleared + +**Status:** FIXED in this branch ([ir/opt.c:822-844](../ir/opt.c#L822-L844)) +**Severity:** Medium — silently prevented end-of-TU dead-static-store elimination. + +## Symptom + +`tcc_ir_collect_tu_func_summary` recorded a write to a static global only +when the STORE dest carried both `is_sym=1` and `is_lval=1`: + +```c +if (q->op == TCCIR_OP_STORE || q->op == TCCIR_OP_STORE_INDEXED || + q->op == TCCIR_OP_STORE_POSTINC) { + IROperand dest = tcc_ir_op_get_dest(ir, q); + if (dest.is_sym && dest.is_lval) { ... } // <-- too strict +} +``` + +But `disp_fusion` may have cleared `is_lval` on the *base* operand of +`STORE_INDEXED` / `STORE_POSTINC` (see comment in [ir/opt_fusion.c:1925-1928](../ir/opt_fusion.c#L1925-L1928): *"disp_fusion clears +is_lval on STORE_INDEXED's base, so the is_lval test alone would +mis-classify it as a redef"*). Result: writes to a static global through +an indexed/postinc form were silently dropped from the summary. + +## Repro + +`tests/gcctestsuite/.../gcc.c-torture/compile/pr25483.c` at `-O2`. +`decode_init` writes `mdct_win[j] = (int)(d * 3)` inside a loop. After +fusion, the IR contains: + +``` +0019: GlobalSym(1182) <-- R0(T6) STORE_INDEXED R6(T7) +``` + +The summary collector saw `dest.is_lval=0` (cleared by `disp_fusion`) and +skipped the entry, so `mdct_win` never appeared in `static_writes`. +Without that record, [[08]]'s `tcc_ir_tu_analyze_dead_statics` could not +mark `mdct_win` as `tu_no_readers` and `decode_init` was never +re-optimized. 
+ +## Fix + +Relax the `is_lval` check specifically for the indexed/postinc forms — +their dest *is* the memory write target regardless of the flag: + +```c +int dest_is_write_target = + dest.is_sym && + (dest.is_lval || q->op == TCCIR_OP_STORE_INDEXED || + q->op == TCCIR_OP_STORE_POSTINC); +``` + +## Related + +- [[07]] — the same is_lval over-restriction affected `dead_static_store_elim` itself. +- [[08]] — the late_reopt mechanism that this summary feeds. diff --git a/bugs/07-dead-static-store-unfused-temp-deref.md b/bugs/07-dead-static-store-unfused-temp-deref.md new file mode 100644 index 00000000..f5e74898 --- /dev/null +++ b/bugs/07-dead-static-store-unfused-temp-deref.md @@ -0,0 +1,88 @@ +# 07 — `dead_static_store_elim` missed the pre-fusion `T = ADD(SYMREF, …); *T = v` form + +**Status:** FIXED in this branch ([ir/opt_memory.c:5336-5440](../ir/opt_memory.c#L5336-L5440)) +**Severity:** Medium — pass was effectively a no-op for static-array writes. + +## Symptom + +`dead_static_store_elim` looked for the *post-fusion* shape only: + +```c +IROperand dest = tcc_ir_op_get_dest(ir, q); +if (!dest.is_sym || !dest.is_lval) continue; +``` + +i.e. it required the STORE dest itself to be a `SYMREF` operand. But +during the IR optimization pipeline, the canonical form of a static-array +write is still: + +``` +T_addr = ADD(SYMREF, scaled_index) ; or LEA / ASSIGN of SYMREF +*T_addr = value ; STORE through TEMP, dest=lval TEMP +``` + +The fusion from "TEMP-DEREF STORE" to "STORE_INDEXED with SYMREF base" +runs during machine_op / codegen translation, **after** the late_cleanup +pass group has already run. So in practice, the pass never matched a +real-world write to a file-scope static array — it was only fixing +direct `static_int = 0` style scalar writes. 
+ +## Repro + +`tests/gcctestsuite/.../gcc.c-torture/compile/pr25483.c` at `-O2`: + +```c +static int mdct_win[8]; +int decode_init(double d) { + int j; + for (j = 4; j; j--) { d *= 0.5; mdct_win[j] = (d * 3); } +} +``` + +IR in the late_cleanup phase (pre-codegen): + +``` +0011: T3 <-- V0 SHL #2 +0012: T4 <-- GlobalSym(1182) ADD T3 ; T4 = &mdct_win[j] +0018: T4***DEREF*** <-- T6 [STORE] ; *T4 = (int)(d*3) +``` + +`dest=T4` is a TEMP, not a SYMREF, so the pass skipped the STORE even +though `mdct_win` was correctly marked `tu_no_readers`. + +## Fix + +Add an indirect-resolution helper that, when dest is a single-def lval +TEMP, traces back to the TEMP's defining `ADD`/`LEA`/`ASSIGN` and pulls +the SYMREF from `src1`: + +```c +static Sym *dss_resolve_store_dest_sym(TCCIRState *ir, IRQuadCompact *q, + int store_idx) { + IROperand dest = tcc_ir_op_get_dest(ir, q); + if (dest.is_sym) { ... handle direct form ... } + if (q->op != TCCIR_OP_STORE || !dest.is_lval) return NULL; + /* TEMP-DEREF: trace back to single-def ADD/LEA/ASSIGN of SYMREF */ + ... +} +``` + +Constraints kept tight to stay sound: single-def TEMP only, no other +defs anywhere in the function, src1 must be a non-lval SYMREF. + +## Why it matters (cascade) + +NOPing the STORE alone is small; the win is what DCE drops afterward. +For pr25483, NOPing the STORE_INDEXED to `mdct_win` lets DCE remove the +chain feeding it: + +- `T6 = CALL __aeabi_d2iz(T5)` — pure aeabi call, result now dead +- `T5 = CALL __aeabi_dmul(d, 3.0)` — pure aeabi call, result now dead +- `T3 = SHL V0, 2` and `T4 = ADD(mdct_win, T3)` — address dead + +Final result: 30 instructions → 16 instructions for `decode_init`. + +## Related + +- [[06]] — companion is_lval over-restriction in the summary collector. +- [[08]] — without late_reopt firing at all, this pass wouldn't run on pr25483 regardless. 
diff --git a/bugs/08-late-reopt-gated-on-inline-fns.md b/bugs/08-late-reopt-gated-on-inline-fns.md new file mode 100644 index 00000000..f21a10c9 --- /dev/null +++ b/bugs/08-late-reopt-gated-on-inline-fns.md @@ -0,0 +1,110 @@ +# 08 — `gen_late_reopt_functions` only iterated `inline_fns`, locking out non-auto-inline functions + +**Status:** FIXED in this branch ([tccgen.c:29381-29453](../tccgen.c#L29381-L29453)) +**Severity:** Medium — entire end-of-TU dead-static-store mechanism silently skipped most candidate functions. + +## Symptom + +`gen_late_reopt_functions` walks `tcc_state->inline_fns` and re-compiles +entries with `func_late_reopt` set: + +```c +for (i = 0; i < s->nb_inline_fns; ++i) { + fn = s->inline_fns[i]; + sym = fn->sym; + if (!sym->type.ref->f.func_late_reopt) continue; + ... begin_macro(compile_ts, 1); next(); gen_function(sym); ... +} +``` + +It requires `fn->func_str` (the saved token stream) to replay-compile. +Tokens are saved only when the function takes one of the inline-related +paths in `decl()` — specifically when `sym->type.t & VT_INLINE` is set +or `auto_inline_sig_ok(sym)` returns 1. + +`auto_inline_sig_ok` rejects: +- `double` / `long double` parameters or return type (via `auto_inline_type_ok` enum) +- struct *parameters* in non-static functions +- `_Complex` types +- unnamed parameters +- VLA parameters +- vector types +- structs > 16 bytes + +Any function matching one of these signatures fell through to the plain +`else { gen_function(sym); }` branch with **no token preservation**. +At end-of-TU, those functions could not be re-compiled even when +`tcc_ir_tu_analyze_dead_statics` marked their writes as dead. 
+ +## Repro + +`tests/gcctestsuite/.../gcc.c-torture/compile/pr25483.c`: + +```c +static int mdct_win[8]; +int decode_init(double d) { /* double param → auto_inline_sig_ok = 0 */ + int j; + for (j = 4; j; j--) { d *= 0.5; mdct_win[j] = (d * 3); } +} +``` + +`mdct_win` has no readers in the TU — TU analysis correctly flagged it +`tu_no_readers` and `decode_init` as `func_late_reopt`. But +`decode_init` was never in `inline_fns`, so `gen_late_reopt_functions` +silently skipped it. Output: 30 instructions vs GCC's 1. + +## Fix + +In `decl()`'s "regular function definition" `else` branch, when +`opt_dead_store` is enabled, take the same save+replay path that the +auto-inline TOO-LARGE branch uses: + +```c +if (tcc_state->opt_dead_store) { + struct InlineFunc *fn = tcc_malloc(...); + fn->sym = sym; + skip_or_save_block(&fn->func_str); + int body_len = fn->func_str->len; + if (body_len <= 512) { + dynarray_add(&tcc_state->inline_fns, &tcc_state->nb_inline_fns, fn); + /* replay-compile */ + begin_macro(compile_ts, 1); next(); gen_function(sym); end_macro(); + if (!sym->type.ref->f.tu_static_writer) { + /* not a writer — drop tokens, detach so gen_inline_functions skips */ + fn->sym = NULL; tok_str_free(fn->func_str); + } + } else { + /* body too large to retain — still need to replay-compile from the + * saved stream because skip_or_save_block consumed the tokens */ + begin_macro(fn->func_str, 1); next(); gen_function(sym); end_macro(); + } +} +``` + +For `tu_static_writer` entries that weren't flagged for late_reopt +(their statics turned out to have readers), the *existing* +`gen_inline_functions` walk re-emits the body anyway — overwriting +only the symbol's `st_value` and leaving the first emission's bytes +as orphan in `.text`. That re-emission is desirable: it produces a +more optimized body once all auto-inline candidates have had their +flags finalized. 
Do *not* attempt to detach those entries from +`inline_fns` to suppress the re-emit — doing so leaves you with the +sub-optimized first emission (regression observed on +`tests/tests2/55_lshift_type.c`, main grew 532 → 1459 instructions). + +Also gate the "function might return no value" warning on +`!ir_late_reopt_phase` so the second compile doesn't double-emit it. + +## Why it matters (cascade) + +Pairs with [[06]] (summary collector now records the write) and [[07]] +(late_cleanup pass can now NOP the unfused TEMP-DEREF STORE). The three +together close pr25483's gap from 30 instructions to 16. Further wins +beyond that need a pure-loop elimination pass (the remaining +`__aeabi_dmul` calls only feed `d`, but `d`'s final value is never observed +— GCC reaches `bx lr` by recognizing the whole loop is dead). + +## Related + +- [[06]] — write summary collector fix. +- [[07]] — DSE pass fix to match the unfused store form. diff --git a/bugs/README.md b/bugs/README.md new file mode 100644 index 00000000..67b4f7e2 --- /dev/null +++ b/bugs/README.md @@ -0,0 +1,20 @@ +# Bug Reports + +Issues observed in the TCC IR optimizer during the bitfld-1 gap-closure work +(2026-05). Each report stands alone; cross-references use `[[NN]]` style.
+ +| # | Title | Status | +|----|---------------------------------------------------------------|------------| +| 01 | `const_prop_tmp` does not fold `IMOD`/`UMOD`/`DIV`/`UDIV`/`PDIV` with two-immediate operands | FIXED | +| 02 | `SHL N → SHR M` peephole only handles `N == M`; misses bitfield-extract (`N != M`) | WORKED AROUND | +| 03 | `dead_local_slot_elim` ignores STOREs through a LEA temp (`T = Addr[StackLoc[X]]; STORE T***DEREF***`) | FIXED | +| 04 | `memory_passes` group stalls when its trigger (`sl_forward`) returns 0 mid-cascade | WORKED AROUND | +| 05 | VAR/PARAM operands carry `tag=STACKOFF` for their potential spill slot; conflated with direct stack refs in new passes | DOCUMENTED | +| 06 | `collect_tu_func_summary` missed STORE_INDEXED / STORE_POSTINC writes when `is_lval` was cleared | FIXED | +| 07 | `dead_static_store_elim` only matched post-fusion SYMREF dest; missed the pre-fusion TEMP-DEREF form | FIXED | +| 08 | `gen_late_reopt_functions` only iterated `inline_fns`, locking out functions failing `auto_inline_sig_ok` | FIXED | + +Statuses: +- **FIXED**: a code change in this commit/branch resolves it. +- **WORKED AROUND**: the underlying limitation is still present; mitigated by an additional pass or extra pipeline pass. +- **DOCUMENTED**: footgun that bit a new pass author; recorded for next person. 
diff --git a/build.txt b/build.txt deleted file mode 100644 index 947a1dc5..00000000 --- a/build.txt +++ /dev/null @@ -1,586 +0,0 @@ -=== IR BEFORE OPTIMIZATIONS === -0000: T0 <-- P0 [ASSIGN] -0001: T1 <-- GlobalSym(935)***DEREF*** ADD T0***DEREF*** -0002: GlobalSym(935)***DEREF*** <-- T1 [STORE] -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: R1(T0) <-- R0(P0) [ASSIGN] -0001: R2(T1) <-- GlobalSym(935)***DEREF*** ADD R1(T0)***DEREF*** -0002: GlobalSym(935)***DEREF*** <-- R2(T1) [STORE] -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435457) [ASSIGN] -0001: V1 <-- #0 [ASSIGN] -0002: V2 <-- GlobalSym(268435458) [ASSIGN] -0003: PARAM0[call_0] GlobalSym(268435459) -0004: PARAM1[call_0] V1 -0005: CALL GlobalSym(934) --> GlobalSym(268435457)***DEREF*** -0006: CMP V1,#0 -0007: JMP to 10 if "!=" -0008: V1 <-- #1 [ASSIGN] -0009: JMP to 3 -0010: T1 <-- &V2 -0011: PARAM0[call_1] T1 -0012: CALL GlobalSym(947) CALL #65537 -0013: V3 <-- GlobalSym(268435460) [ASSIGN] -0014: JMP to 17 -0015: V3 <-- GlobalSym(268435461) [ASSIGN] -0016: JMP to 21 -0017: T2 <-- &V3 -0018: PARAM0[call_2] T2 -0019: CALL GlobalSym(947) CALL #131073 -0020: JMP to 54 -0021: T3 <-- &V3 -0022: PARAM0[call_3] T3 -0023: CALL GlobalSym(947) CALL #196609 -0024: V1 <-- #2 [ASSIGN] -0025: V4 <-- GlobalSym(268435462) [ASSIGN] -0026: V5 <-- GlobalSym(268435463) [ASSIGN] -0027: V6 <-- GlobalSym(268435464) [ASSIGN] -0028: JMP to 31 -0029: V6 <-- GlobalSym(268435465) [ASSIGN] -0030: JMP to 35 -0031: T4 <-- &V6 -0032: PARAM0[call_4] T4 -0033: CALL GlobalSym(947) CALL #262145 -0034: JMP to 39 -0035: T5 <-- &V6 -0036: PARAM0[call_5] T5 -0037: CALL GlobalSym(947) CALL #327681 -0038: JMP to 43 -0039: T6 <-- &V5 -0040: PARAM0[call_6] T6 -0041: CALL GlobalSym(947) CALL #393217 -0042: JMP to 47 -0043: T7 <-- &V5 -0044: PARAM0[call_7] T7 -0045: CALL GlobalSym(947) CALL #458753 -0046: JMP to 51 -0047: T8 <-- &V4 -0048: PARAM0[call_8] T8 -0049: CALL 
GlobalSym(947) CALL #524289 -0050: JMP to 54 -0051: T9 <-- &V4 -0052: PARAM0[call_9] T9 -0053: CALL GlobalSym(947) CALL #589825 -0054: CMP V1,#2 -0055: JMP to 57 if "==" -0056: JMP to 24 -0057: V7 <-- GlobalSym(268435466) [ASSIGN] -0058: V1 <-- V1 ADD #1 -0059: T11 <-- V1 [LOAD] -0060: CMP V1,#3 -0061: JMP to 66 if "==" -0062: T12 <-- &V7 -0063: PARAM0[call_10] T12 -0064: CALL GlobalSym(947) CALL #655361 -0065: JMP to 54 -0066: T13 <-- &V7 -0067: PARAM0[call_11] T13 -0068: CALL GlobalSym(947) CALL #720897 -0069: T14 <-- &V0 -0070: PARAM0[call_12] T14 -0071: CALL GlobalSym(947) CALL #786433 -0072: T15 <-- &V0 -0073: PARAM0[call_13] T15 -0074: CALL GlobalSym(947) CALL #851969 -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435457) [ASSIGN] -0001: R4(V1) <-- #0 [ASSIGN] -0002: V2 <-- GlobalSym(268435458) [ASSIGN] -0003: PARAM0[call_0] GlobalSym(268435459) -0004: PARAM1[call_0] R4(V1) -0005: CALL GlobalSym(934) --> GlobalSym(268435457)***DEREF*** -0006: CMP R4(V1),#0 -0007: JMP to 10 if "!=" -0008: R4(V1) <-- #1 [ASSIGN] -0009: JMP to 3 -0010: R5(T1) <-- &V2 -0011: PARAM0[call_1] R5(T1) -0012: CALL GlobalSym(947) CALL #65537 -0013: V3 <-- GlobalSym(268435460) [ASSIGN] -0014: JMP to 17 -0015: NOP -0016: NOP -0017: R5(T2) <-- &V3 -0018: PARAM0[call_2] R5(T2) -0019: CALL GlobalSym(947) CALL #131073 -0020: JMP to 54 -0021: NOP -0022: NOP -0023: NOP -0024: R4(V1) <-- #2 [ASSIGN] -0025: V4 <-- GlobalSym(268435462) [ASSIGN] -0026: V5 <-- GlobalSym(268435463) [ASSIGN] -0027: V6 <-- GlobalSym(268435464) [ASSIGN] -0028: JMP to 31 -0029: NOP -0030: NOP -0031: R5(T4) <-- &V6 -0032: PARAM0[call_4] R5(T4) -0033: CALL GlobalSym(947) CALL #262145 -0034: JMP to 39 -0035: NOP -0036: NOP -0037: NOP -0038: NOP -0039: R5(T6) <-- &V5 -0040: PARAM0[call_6] R5(T6) -0041: CALL GlobalSym(947) CALL #393217 -0042: JMP to 47 -0043: NOP -0044: NOP -0045: NOP -0046: NOP -0047: R5(T8) <-- &V4 -0048: PARAM0[call_8] R5(T8) -0049: CALL GlobalSym(947) CALL 
#524289 -0050: JMP to 54 -0051: NOP -0052: NOP -0053: NOP -0054: CMP R4(V1),#2 -0055: JMP to 57 if "==" -0056: JMP to 24 -0057: V7 <-- GlobalSym(268435466) [ASSIGN] -0058: R4(V1) <-- R4(V1) ADD #1 -0059: R0(T11) <-- R4(V1) [LOAD] -0060: CMP R4(V1),#3 -0061: JMP to 66 if "==" -0062: R5(T12) <-- &V7 -0063: PARAM0[call_10] R5(T12) -0064: CALL GlobalSym(947) CALL #655361 -0065: JMP to 54 -0066: R4(T13) <-- &V7 -0067: PARAM0[call_11] R4(T13) -0068: CALL GlobalSym(947) CALL #720897 -0069: R4(T14) <-- &V0 -0070: PARAM0[call_12] R4(T14) -0071: CALL GlobalSym(947) CALL #786433 -0072: R4(T15) <-- &V0 -0073: PARAM0[call_13] R4(T15) -0074: CALL GlobalSym(947) CALL #851969 -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435467) -0001: T0 <-- P0 [ASSIGN] -0002: PARAM1[call_0] T0***DEREF*** -0003: CALL GlobalSym(934) --> GlobalSym(268435467)***DEREF*** -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435467) -0001: R4(T0) <-- R0(P0) [ASSIGN] -0002: PARAM1[call_0] R4(T0)***DEREF*** -0003: CALL GlobalSym(934) --> GlobalSym(268435467)***DEREF*** -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435468) -0001: T0 <-- P0 [ASSIGN] -0002: PARAM1[call_0] T0***DEREF*** -0003: CALL GlobalSym(934) --> GlobalSym(268435468)***DEREF*** -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435468) -0001: R4(T0) <-- R0(P0) [ASSIGN] -0002: PARAM1[call_0] R4(T0)***DEREF*** -0003: CALL GlobalSym(934) --> GlobalSym(268435468)***DEREF*** -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435469) [ASSIGN] -0001: T0 <-- &V0 -0002: PARAM0[call_0] T0 -0003: CALL GlobalSym(947) CALL #1 -0004: T1 <-- V0 [LOAD] -0005: RETURNVALUE T1 -0006: T2 <-- &V0 -0007: PARAM0[call_1] T2 -0008: CALL GlobalSym(947) CALL #65537 -=== END IR BEFORE OPTIMIZATIONS === 
-=== IR AFTER OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435469) [ASSIGN] -0001: R4(T0) <-- &V0 -0002: PARAM0[call_0] R4(T0) -0003: CALL GlobalSym(947) CALL #1 -0004: R0(T1) <-- V0 [LOAD] -0005: RETURNVALUE R0(T1) -0006: NOP -0007: NOP -0008: NOP -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435470) -0001: CALL GlobalSym(934) --> GlobalSym(268435470)***DEREF*** -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435470) -0001: CALL GlobalSym(934) --> GlobalSym(268435470)***DEREF*** -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435471) [ASSIGN] -0001: PARAM0[call_0] V0 -0002: CALL GlobalSym(964) CALL #1 -0003: T0 <-- &V0 -0004: PARAM0[call_1] T0 -0005: CALL GlobalSym(947) CALL #65537 -0006: T1 <-- &V0 -0007: PARAM0[call_2] T1 -0008: CALL GlobalSym(947) CALL #131073 -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435471) [ASSIGN] -0001: PARAM0[call_0] V0 -0002: CALL GlobalSym(964) CALL #1 -0003: R4(T0) <-- &V0 -0004: PARAM0[call_1] R4(T0) -0005: CALL GlobalSym(947) CALL #65537 -0006: R4(T1) <-- &V0 -0007: PARAM0[call_2] R4(T1) -0008: CALL GlobalSym(947) CALL #131073 -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435472) [ASSIGN] -0001: V1 <-- GlobalSym(268435473) [ASSIGN] -0002: T0 <-- &V1 -0003: PARAM0[call_0] T0 -0004: CALL GlobalSym(947) CALL #1 -0005: V2 <-- GlobalSym(268435474) [ASSIGN] -0006: T1 <-- &V2 -0007: PARAM0[call_1] T1 -0008: CALL GlobalSym(947) CALL #65537 -0009: T2 <-- &V0 -0010: PARAM0[call_2] T2 -0011: CALL GlobalSym(947) CALL #131073 -0012: JMP to 19 -0013: T3 <-- &V2 -0014: PARAM0[call_3] T3 -0015: CALL GlobalSym(947) CALL #196609 -0016: T4 <-- &V0 -0017: PARAM0[call_4] T4 -0018: CALL GlobalSym(947) CALL #262145 -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: V0 
<-- GlobalSym(268435472) [ASSIGN] -0001: V1 <-- GlobalSym(268435473) [ASSIGN] -0002: R4(T0) <-- &V1 -0003: PARAM0[call_0] R4(T0) -0004: CALL GlobalSym(947) CALL #1 -0005: V2 <-- GlobalSym(268435474) [ASSIGN] -0006: R4(T1) <-- &V2 -0007: PARAM0[call_1] R4(T1) -0008: CALL GlobalSym(947) CALL #65537 -0009: R4(T2) <-- &V0 -0010: PARAM0[call_2] R4(T2) -0011: CALL GlobalSym(947) CALL #131073 -0012: JMP to 19 -0013: NOP -0014: NOP -0015: NOP -0016: NOP -0017: NOP -0018: NOP -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: V0 <-- #0 [ASSIGTEST_ZERO: is_lval=0 needs_load=0 is64=0 pr0=4 pr1=31 vr=268697601 btype=0 ind=0x238 -TEST_ZERO: is_lval=0 needs_load=0 is64=0 pr0=4 pr1=31 vr=268697601 btype=0 ind=0x244 -N] -0001: CMP V0,#0 -0002: JMP to 13 if "!=" -0003: V1 <-- GlobalSym(268435475) [ASSIGN] -0004: T0 <-- V0 [ASSIGN] -0005: V0 <-- T0 ADD #1 -0006: T2 <-- &V1 -0007: PARAM0[call_0] T2 -0008: CALL GlobalSym(947) CALL #1 -0009: JMP to 1 -0010: T3 <-- &V1 -0011: PARAM0[call_1] T3 -0012: CALL GlobalSym(947) CALL #65537 -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: R4(V0) <-- #0 [ASSIGN] -0001: CMP R4(V0),#0 -0002: JMP to 13 if "!=" -0003: V1 <-- GlobalSym(268435475) [ASSIGN] -0004: R0(T0) <-- R4(V0) [ASSIGN] -0005: R4(V0) <-- R0(T0) ADD #1 -0006: R5(T2) <-- &V1 -0007: PARAM0[call_0] R5(T2) -0008: CALL GlobalSym(947) CALL #1 -0009: JMP to 1 -0010: NOP -0011: NOP -0012: NOP -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435476) [ASSIGN] -0001: V1 <-- #0 [ASSIGN] -0002: TEST_ZERO V1 -0003: JMP to 11 if "==" -0004: V2 <-- GlobalSym(268435477) [ASSIGN] -0005: PARAM0[call_0] GlobalSym(268435478) -0006: PARAM1[call_0] V1 -0007: CALL GlobalSym(934) --> GlobalSym(268435476)***DEREF*** -0008: T1 <-- &V2 -0009: PARAM0[call_1] T1 -0010: CALL GlobalSym(947) CALL #65537 -0011: CMP V1,#0 -0012: JMP to 24 if "!=" -0013: V3 <-- GlobalSym(268435479) [ASSIGN] -0014: CMP V1,#0 -0015: JMP to 
21 if "!=" -0016: V1 <-- #1 [ASSIGN] -0017: T2 <-- &V3 -0018: PARAM0[call_2] T2 -0019: CALL GlobalSym(947) CALL #131073 -0020: JMP to 4 -0021: T3 <-- &V3 -0022: PARAM0[call_3] T3 -0023: CALL GlobalSym(947) CALL #196609 -0024: T4 <-- &V0 -0025: PARAM0[call_4] T4 -0026: CALL GlobalSym(947) CALL #262145 -0027: RETURNVALUE #0 -0028: T5 <-- &V0 -0029: PARAM0[call_5] T5 -0030: CALL GlobalSym(947) CALL #327681 -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: V0 <-- GlobalSym(268435476) [ASSIGN] -0001: R4(V1) <-- #0 [ASSIGN] -0002: TEST_ZERO R4(V1) -0003: JMP to 11 if "==" -0004: V2 <-- GlobalSym(268435477) [ASSIGN] -0005: PARAM0[call_0] GlobalSym(268435478) -0006: PARAM1[call_0] R4(V1) -0007: CALL GlobalSym(934) --> GlobalSym(268435476)***DEREF*** -0008: R5(T1) <-- &V2 -0009: PARAM0[call_1] R5(T1) -0010: CALL GlobalSym(947) CALL #65537 -0011: CMP R4(V1),#0 -0012: JMP to 24 if "!=" -0013: V3 <-- GlobalSym(268435479) [ASSIGN] -0014: CMP R4(V1),#0 -0015: JMP to 21 if "!=" -0016: R4(V1) <-- #1 [ASSIGN] -0017: R5(T2) <-- &V3 -0018: PARAM0[call_2] R5(T2) -0019: CALL GlobalSym(947) CALL #131073 -0020: JMP to 4 -0021: R4(T3) <-- &V3 -0022: PARAM0[call_3] R4(T3) -0023: CALL GlobalSym(947) CALL #196609 -0024: R4(T4) <-- &V0 -0025: PARAM0[call_4] R4(T4) -0026: CALL GlobalSym(947) CALL #262145 -0027: RETURNVALUE #0 -0028: NOP -0029: NOP -0030: NOP -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435480) -0001: T0 <-- P0 [ASSIGN] -0002: PARAM1[call_0] T0***DEREF*** -0003: CALL GlobalSym(934) --> GlobalSym(268435480)***DEREF*** -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: PARAM0[call_0] GlobalSym(268435480) -0001: R4(T0) <-- R0(P0) [ASSIGN] -0002: PARAM1[call_0] R4(T0)***DEREF*** -0003: CALL GlobalSym(934) --> GlobalSym(268435480)***DEREF*** -=== END IR AFTER OPTIMIZATIONS === -=== IR BEFORE OPTIMIZATIONS === -0000: V0 <-- #1000 [ASSIGN] -0001: PARAM0[call_0] 
GlobalSym(268435481) -0002: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0003: V1 <-- #0 [ASSIGN] -0004: CMP V1,#10 -0005: JMP to 15 if ">=S" -0006: JMP to 10 -0007: V1 <-- V1 ADD #1 -0008: T2 <-- V1 [LOAD] -0009: JMP to 4 -0010: V2 <-- #100 [ASSIGN] -0011: T3 <-- &V2 -0012: PARAM0[call_1] T3 -0013: CALL GlobalSym(971) CALL #65537 -0014: JMP to 7 -0015: T4 <-- &V1 -0016: PARAM0[call_2] T4 -0017: CALL GlobalSym(971) CALL #131073 -0018: PARAM0[call_3] GlobalSym(268435482) -0019: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0020: V3 <-- #0 [ASSIGN] -0021: CMP V3,#10 -0022: JMP to 36 if ">=S" -0023: JMP to 27 -0024: V3 <-- V3 ADD #1 -0025: T7 <-- V3 [LOAD] -0026: JMP to 21 -0027: V4 <-- #200 [ASSIGN] -0028: T8 <-- &V4 -0029: PARAM0[call_4] T8 -0030: CALL GlobalSym(971) CALL #262145 -0031: JMP to 24 -0032: T9 <-- &V4 -0033: PARAM0[call_5] T9 -0034: CALL GlobalSym(971) CALL #327681 -0035: JMP to 24 -0036: T10 <-- &V3 -0037: PARAM0[call_6] T10 -0038: CALL GlobalSym(971) CALL #393217 -0039: PARAM0[call_7] GlobalSym(268435483) -0040: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0041: V5 <-- #0 [ASSIGN] -0042: CMP V5,#10 -0043: JMP to 57 if ">=S" -0044: JMP to 48 -0045: V5 <-- V5 ADD #1 -0046: T13 <-- V5 [LOAD] -0047: JMP to 42 -0048: V6 <-- #300 [ASSIGN] -0049: T14 <-- &V6 -0050: PARAM0[call_8] T14 -0051: CALL GlobalSym(971) CALL #524289 -0052: JMP to 57 -0053: T15 <-- &V6 -0054: PARAM0[call_9] T15 -0055: CALL GlobalSym(971) CALL #589825 -0056: JMP to 45 -0057: T16 <-- &V5 -0058: PARAM0[call_10] T16 -0059: CALL GlobalSym(971) CALL #655361 -0060: PARAM0[call_11] GlobalSym(268435484) -0061: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0062: V7 <-- #0 [ASSIGN] -0063: CMP V7,#2 -0064: JMP to 92 if ">=S" -0065: JMP to 69 -0066: V7 <-- V7 ADD #1 -0067: T19 <-- V7 [LOAD] -0068: JMP to 63 -0069: V8 <-- #400 [ASSIGN] -0070: JMP to 84 -0071: T20 <-- &V8 -0072: PARAM0[call_12] T20 -0073: CALL GlobalSym(971) CALL #786433 -0074: JMP to 66 
-0075: V9 <-- #500 [ASSIGN] -0076: T21 <-- &V9 -0077: PARAM0[call_13] T21 -0078: CALL GlobalSym(971) CALL #851969 -0079: JMP to 88 -0080: T22 <-- &V9 -0081: PARAM0[call_14] T22 -0082: CALL GlobalSym(971) CALL #917505 -0083: JMP to 88 -0084: T23 <-- V7 [ASSIGN] -0085: CMP T23,#0 -0086: JMP to 71 if "==" -0087: JMP to 75 -0088: T24 <-- &V8 -0089: PARAM0[call_15] T24 -0090: CALL GlobalSym(971) CALL #983041 -0091: JMP to 66 -0092: PARAM0[call_16] GlobalSym(268435485) -0093: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0094: T26 <-- &V0 -0095: PARAM0[call_17] T26 -0096: CALL GlobalSym(971) CALL #1114113 -=== END IR BEFORE OPTIMIZATIONS === -=== IR AFTER OPTIMIZATIONS === -0000: V0 <-- #1000 [ASSIGN] -0001: PARAM0[call_0] GlobalSym(268435481) -0002: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0003: V1 <-- #0 [ASSIGN] -0004: CMP V1,#10 -0005: JMP to 15 if ">=S" -0006: JMP to 10 -0007: V1 <-- V1 ADD #1 -0008: R0(T2) <-- V1 [LOAD] -0009: JMP to 4 -0010: V2 <-- #100 [ASSIGN] -0011: R4(T3) <-- &V2 -0012: PARAM0[call_1] R4(T3) -0013: CALL GlobalSym(971) CALL #65537 -0014: JMP to 7 -0015: R4(T4) <-- &V1 -0016: PARAM0[call_2] R4(T4) -0017: CALL GlobalSym(971) CALL #131073 -0018: PARAM0[call_3] GlobalSym(268435482) -0019: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0020: V3 <-- #0 [ASSIGN] -0021: CMP V3,#10 -0022: JMP to 36 if ">=S" -0023: JMP to 27 -0024: V3 <-- V3 ADD #1 -0025: R0(T7) <-- V3 [LOAD] -0026: JMP to 21 -0027: V4 <-- #200 [ASSIGN] -0028: R4(T8) <-- &V4 -0029: PARAM0[call_4] R4(T8) -0030: CALL GlobalSym(971) CALL #262145 -0031: JMP to 24 -0032: NOP -0033: NOP -0034: NOP -0035: NOP -0036: R4(T10) <-- &V3 -0037: PARAM0[call_6] R4(T10) -0038: CALL GlobalSym(971) CALL #393217 -0039: PARAM0[call_7] GlobalSym(268435483) -0040: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0041: V5 <-- #0 [ASSIGN] -0042: CMP V5,#10 -0043: JMP to 57 if ">=S" -0044: JMP to 48 -0045: NOP -0046: NOP -0047: NOP -0048: V6 <-- #300 [ASSIGN] -0049: 
R4(T14) <-- &V6 -0050: PARAM0[call_8] R4(T14) -0051: CALL GlobalSym(971) CALL #524289 -0052: JMP to 57 -0053: NOP -0054: NOP -0055: NOP -0056: NOP -0057: R4(T16) <-- &V5 -0058: PARAM0[call_10] R4(T16) -0059: CALL GlobalSym(971) CALL #655361 -0060: PARAM0[call_11] GlobalSym(268435484) -0061: CALL GlobalSym(934) --> GlobalSym(268435481)***DEREF*** -0062: R4(V7) <-- #0 [ASSIGN] -0063: CMP R4(V7),#2 -0064: JMP to 92 if ">=S" -0065: JMP to 69 -0066: R4(V7) <-- R4(V7) ADD #1 -0067: R0(T19) <-- R4(V7) [LOAD] -0068: JMP to 63 -0069: V8 <-- #400 [ASSIGN] -0070: JMP to 84 -0071: R5(T20) <-- &V8 -0072: PARAM0[call_12] R5(T20) -0073: CALL GlobalSym(971) CALL #786433 -0074: JMP to 66 -0075: V9 <-- #500 [ASSIGN] -0076: R6(T21) <-- &V9 -0077: PARAM0[call_13] R6(T21) -0078: CALL GlobalSym(971) CALL #851969 -0079: JMP to 88 -0080: NOP -0081: NOP -0082: NOP -0083: NOP -0084: R0(T23) <-- R4(V7) [ASSIGN] -0085: CMP R0(T23),#0 -0086: JMP to 71 if "==" -0087: JMP to 75 -0088: R5(T24) <-- &V8 -0089: PARAM0[call_15] R5(T24) -0090: CALL GlobalSym(971) CALL #983041 -0091: JMP to 66 -0092: PARAM0[call_16] GlobalSym(268435485) -0093: CALL GlobalSym(934) --> GlobalSym(268435481)*** \ No newline at end of file diff --git a/docs/codegen_dry_run_opt.md b/docs/codegen_dry_run_opt.md new file mode 100644 index 00000000..adf1fc85 --- /dev/null +++ b/docs/codegen_dry_run_opt.md @@ -0,0 +1,159 @@ +# Codegen dry-run optimisation plan + +Two complementary optimisations to reduce compilation time on memory-constrained +hardware (4–6 MB for TCC). + +--- + +## Option A — Skip dry-run for scratch-conflict-free functions + +### Rationale + +The dry-run serves three purposes: + +1. Scratch tracking — fills `dry_insn_scratch[]` / `dry_insn_saves[]`, feeds Phase-3 fixup. +2. LR-in-prologue detection — `tcc_gen_machine_dry_run_get_lr_push_count()`. +3. Branch offset analysis — `branch_opt_analyze()` selects 16-bit vs 32-bit encodings. 
+ +If scratch pushes are provably impossible, purposes 1 and 2 are no-ops and the +dry-run can be skipped entirely. Purpose 3 falls back to conservative 32-bit +encodings (already the default fallback), costing 2 bytes per branch — acceptable. + +### Condition + +ARM has r0–r12 = 13 allocatable integer registers; scratch needs at most 2 +simultaneously. If there are always ≥2 free integer registers and ≥2 free VFP +registers at every program point, no push/pop can occur. + +```c +int can_skip_dry_run = + __builtin_popcountll(ir->ls.dirty_registers) <= 11 && + __builtin_popcountll(ir->ls.dirty_float_registers) <= 14; // 16 s-regs available +``` + +Evaluated once, just before the two-pass loop in `tcc_ir_codegen_generate`. + +### What changes when skipping + +| Concern | Effect | +|---|---| +| `dry_insn_scratch[]` / `dry_insn_saves[]` | Stay zero (`tcc_mallocz`) — correct | +| Phase-3 fixup loop | Sees all-zero saves — no-op, safe to run or skip | +| LR in prologue | No scratch push → no LR push; `leaffunc` already set correctly | +| Branch optimizer | `branch_opt_analyze` not called → 32-bit fallback for all branches | +| Prologue emission | Uses `ir->ls.dirty_registers` + `stack_size` directly — both available | + +### Loop structure change + +```c +// still call branch_opt_init so get_encoding returns the 32-bit fallback cleanly +tcc_gen_machine_branch_opt_init(); + +int pass_start = can_skip_dry_run ? 1 : 0; +for (int pass = pass_start; pass < 2; pass++) +{ + ... +} +``` + +When `pass_start == 1`, emit the prologue at the point where it was previously +emitted inside the dry-run finalisation block (just before the real-run starts). + +--- + +## Modified Option B — Cache decoded operands, reuse in real-run + +Only active when Option A did **not** fire. + +### Rationale + +Every instruction goes through `decode_mop_args` → `machine_op_from_ir` (interval +table lookups, register resolution) **twice** — once in the dry-run, once in the +real-run. 
Caching the dry-run results eliminates the second decode pass. + +Only `dest`, `src1`, `src2` are cached (3 slots × 24 bytes = 72 bytes/instruction). +`scale` and `accum` operands (indexed memory ops, MLA) are rare and re-decoded in +the real-run. + +### Memory cost + +`3 × sizeof(MachineOperand) × N` on a 32-bit host: + +| Instructions | Memory | +|---|---| +| 50 | 3.6 KB | +| 100 | 7.2 KB | +| 500 | 36 KB | + +### Allocation + +```c +// allocated before the two-pass loop, only when !can_skip_dry_run +MachineOperand *mop_cache = tcc_malloc(3 * ir->next_instruction_index * sizeof(MachineOperand)); +// layout: [3*i+0] = dest, [3*i+1] = src1, [3*i+2] = src2 +``` + +### Dry-run: fill cache + +After every `DECODE(...)` call in the dry-run instruction loop: + +```c +mop_cache[3*i+0] = a.dest; +mop_cache[3*i+1] = a.src1; +mop_cache[3*i+2] = a.src2; +``` + +### After dry-run: decide whether cache is valid + +Phase-3 fixup mutates the interval table when `any_fixup != 0`. + +```c +int use_mop_cache = !any_fixup; +if (!use_mop_cache) { + tcc_free(mop_cache); + mop_cache = NULL; +} +``` + +### Real-run: use cache via wrapper macro + +```c +#define DECODE(...) (use_mop_cache \ + ? cached_mop_args(mop_cache, i, (MopSpec){__VA_ARGS__}, \ + ir, cq, &src1_ir, &src2_ir, &dest_ir, has_incoming_jump) \ + : decode_mop_args(ir, cq, &src1_ir, &src2_ir, &dest_ir, i, \ + has_incoming_jump, (MopSpec){__VA_ARGS__})) +``` + +`cached_mop_args` reads dest/src1/src2 from the cache and re-calls +`machine_op_from_ir` only for `scale` and `accum` when the spec requests them. + +### Teardown + +```c +tcc_free(mop_cache); // after real-run ends; safe when NULL (tcc_free checks) +``` + +--- + +## Combined control flow + +``` +can_skip_dry_run == 1 + Option A fires: single pass (pass=1 only), no cache, 32-bit branches, + prologue emitted immediately before real-run. + +can_skip_dry_run == 0 + Option B active: two passes, mop_cache allocated. 
+ any_fixup == 0 → cache reused in real-run + any_fixup != 0 → cache freed, normal decode in real-run +``` + +--- + +## Files to modify + +| File | Change | +|---|---| +| `ir/codegen.c` | Condition check, `pass_start`, prologue placement, cache alloc/fill/use/free | +| `arm-thumb-gen.c` | Ensure `branch_opt_init` is safe to call without a subsequent `branch_opt_analyze` | diff --git a/docs/design_loop_unrolling.md b/docs/design_loop_unrolling.md new file mode 100644 index 00000000..191bad21 --- /dev/null +++ b/docs/design_loop_unrolling.md @@ -0,0 +1,550 @@ +# Loop Unrolling Design + +## Goal + +Unroll small constant-trip-count loops to eliminate branch overhead and enable +further optimizations (constant folding, dead code elimination). + +## Motivating Example + +```c +const char *str = "hello"; +int sum = 0; +for (int i = 0; i < 5; i++) { + sum += strlen(str); +} +``` + +After strlen folding, the IR loop body becomes `V1 = V1 + #5` repeated 5 times. +The actual optimized IR before unrolling (from dump_ir.txt): + +``` +0000: V0 <-- GlobalSym(268435461) [ASSIGN] ; str = "hello" +0001: V1 <-- #0 [ASSIGN] ; sum = 0 +0002: V2 <-- #0 [ASSIGN] ; i = 0 +0003: CMP V2, #5 ; HEADER: i < 5? +0004: JMP to 14 if ">=S" ; EXIT: jump past loop +0005: JMP to 11 ; jump to body (skip latch on first iter) +0006: T0 <-- V2 [ASSIGN] ; LATCH: save old i +0007: V2 <-- T0 ADD #1 ; i++ +0008: JMP to 3 ; back to header +0009: NOP +0010: NOP ; (folded PARAM — was strlen arg) +0011: NOP ; (folded CALL — strlen folded to #5) +0012: V1 <-- V1 ADD #5 ; BODY: sum += 5 +0013: JMP to 6 ; jump to latch +0014: ... ; EXIT TARGET: printf etc. 
+``` + +Loop structure detected by `tcc_ir_detect_loops()`: +- Backward jump: instruction 8 (`JMP to 3`) — this is the latch +- `header_idx = 3`, `start_idx = 3`, `end_idx = 8` +- Body extends to 13 via forward jump analysis (instr 5 jumps to 11, instr 13 jumps to 6) +- `preheader_idx = 2` (the `V2 <-- #0` instruction before header) + +With full unrolling, this becomes: + +``` +0001: V1 <-- #0 +0012: V1 <-- V1 ADD #5 ; iteration 0 + V1 <-- V1 ADD #5 ; iteration 1 + V1 <-- V1 ADD #5 ; iteration 2 + V1 <-- V1 ADD #5 ; iteration 3 + V1 <-- V1 ADD #5 ; iteration 4 +``` + +And the existing iterative constant propagation (Phase 1) collapses it to `V1 <-- #25`. + +## Scope + +**Full unrolling only** for loops where: +- Trip count is a compile-time constant +- Trip count <= threshold (16) +- Loop body is small (<= 32 non-NOP instructions) +- No nested loops (single-level only) +- Simple exit condition: `CMP IV, #N` followed by conditional jump +- Total expanded size: `trip_count * body_insn_count <= 128` + +Partial unrolling (unroll-by-factor) is out of scope for the initial +implementation. + +## Where It Fits in the Pipeline + +In `tccgen.c` (around line 23991), between dead store elimination and LICM: + +``` +Phase 4: Store-load forwarding, redundant/dead store elimination (existing, ~line 23963-23990) +Phase 5a: Loop unrolling (NEW) +Phase 5a': Re-run Phase 1 iterative const prop + DCE (NEW — collapse unrolled code) +Phase 5: LICM (existing, disabled, ~line 23992) +Phase 6: IV strength reduction (existing, ~line 24008) +``` + +The key is that loop unrolling runs **after** strlen/constant folding has +simplified the body and **before** IV strength reduction (which would be +confused by an unrolled loop). After unrolling, we re-run the Phase 1 iterative +loop so constant propagation can collapse `0 + 5 + 5 + 5 + 5 + 5 → 25`. + +## Data Structures + +No new data structures. 
Reuse existing ones: + +| Structure | Defined in | Used for | +|-----------|-----------|----------| +| `IRLoop` | `ir/licm.h:28` | Loop bounds: header_idx, start_idx, end_idx, preheader_idx | +| `IRLoops` | `ir/licm.h:41` | Collection of detected loops | +| `InductionVar` | `ir/opt.c:7991` | IV: vreg, init_val, step, def_idx, init_idx | + +## Algorithm — Detailed + +### Phase 1: Detect loops and find candidates + +```c +int tcc_ir_opt_loop_unroll(TCCIRState *ir) +{ + IRLoops *loops = tcc_ir_detect_loops(ir); + // Process innermost loops first (highest start_idx) + // For each loop, call try_unroll_loop() +} +``` + +For each loop, `try_unroll_loop()` performs these checks: + +#### 1a. Find the induction variable + +Reuse `find_induction_vars()` (ir/opt.c:8021). This function: +- Scans `[loop->start_idx, loop->end_idx]` for `V = V + const` pattern +- Verifies V has exactly 1 definition inside the loop +- Looks for initialization `V = #const` in preheader (up to 5 instructions back) +- Returns `InductionVar { vreg, init_val, step, def_idx, init_idx }` + +**Requirement**: exactly 1 basic IV found (multi-IV loops are too complex). + +#### 1b. Find the exit condition + +Scan from `loop->header_idx` forward (at most 2 instructions) for: + +``` +CMP Viv, #limit +JMP to exit_target if COND +``` + +Where: +- `Viv` is the IV vreg from step 1a +- `#limit` is an immediate constant +- `COND` is one of: `>=S` (for `i < N`), `>S` (for `i <= N`), `==` (for `i != N`) +- `exit_target > loop->end_idx` (jumps past the loop) + +Extract: `cmp_idx`, `jmpif_idx`, `exit_target`, `limit`, `cond_token`. + +#### 1c. 
Compute trip count + +```c +switch (cond_token) { + case TOK_GE: // >=S means loop runs while < + trip_count = (limit - init_val + step - 1) / step; // ceiling division + break; + case TOK_GT: // >S means loop runs while <= + trip_count = (limit - init_val) / step + 1; + break; + case TOK_EQ: // == exit test means loop runs while != + if ((limit - init_val) % step != 0) return 0; // infinite loop risk + trip_count = (limit - init_val) / step; + break; +} +``` + +**Bail if**: `trip_count <= 0`, `trip_count > 16`, or `step <= 0`. + +#### 1d. Identify the body instructions + +The "body" is everything between the exit conditional jump and the back-edge +jump that is NOT: +- The CMP instruction (`cmp_idx`) +- The conditional exit JMP (`jmpif_idx`) +- The IV increment (`iv.def_idx`) +- The back-edge JMP (latch jump to header) +- NOP instructions +- The `T0 <-- V2 [ASSIGN]` preceding the IV increment (save-old-IV pattern) + +In the example IR: +``` +Body instructions to clone = { 0012: V1 <-- V1 ADD #5 } +``` + +Count them: `body_insn_count`. **Bail if** `body_insn_count > 32` or +`trip_count * body_insn_count > 128`. + +#### 1e. Check no nested loops + +Scan body for backward JMP instructions (target < source). If any found, +bail — this is a nested loop. + +#### 1f. Check no side effects that prevent unrolling + +Scan body for instructions that are problematic: +- `FUNCCALLVAL` / `FUNCCALLVOID` — bail (calls can have side effects) + - Exception: if we later add pure-function tracking, pure calls are OK +- `INLINE_ASM` — bail +- `SETJMP` / `LONGJMP` — bail + +**Note**: `STORE` instructions are fine to unroll — they just happen N times to +different addresses (array writes). `LOAD` too. + +### Phase 2: Emit unrolled code + +Strategy: **in-place overwrite + `insert_instr_at()` for overflow**. 
+ +Since `insert_instr_at()` (ir/opt.c:8284) already exists and correctly updates +all jump targets, we can use it when the unrolled body doesn't fit in the +original loop's instruction slots. + +However, to avoid the index-shifting complexity entirely for the common case, +use this two-tier approach: + +#### 2a. NOP out the entire loop region + +```c +for (int i = loop->start_idx; i <= loop_actual_end; i++) + ir->compact_instructions[i].op = TCCIR_OP_NOP; +``` + +Also NOP the IV initialization in the preheader (`iv.init_idx`). + +Also NOP the forward-jump into the body (`instr 5: JMP to 11` in our example) +if it's within the loop region. + +#### 2b. Compute write positions + +Available NOP slots: count NOPs in `[loop->start_idx, loop_actual_end]`. +Needed slots: `trip_count * body_insn_count`. + +- If `needed <= available`: write in-place starting at `loop->start_idx` +- If `needed > available`: write what fits in-place, then use `insert_instr_at()` + to insert remaining instructions at `loop_actual_end + 1` + +#### 2c. Clone body instructions for each iteration + +For each iteration `k = 0 .. trip_count - 1`: + For each body instruction `orig`: + + 1. Copy the instruction: `new.op = orig.op` + 2. Copy operands from the original (read src1, src2, dest from pool) + 3. **Remap operands**: + - If src1/src2 references the IV vreg → replace with constant + `#(init_val + k * step)` — but only if the IV is used as a value, + not being defined + - If dest is the IV vreg → this is the IV increment, already excluded + - VAR vregs defined inside the body: for each iteration k > 0, + allocate fresh TMPs via `tcc_ir_vreg_alloc_temp(ir)` and remap + all references to them within that iteration's copy + 4. Write to the next available slot using: + ```c + ir->compact_instructions[write_pos].op = new_op; + ir->compact_instructions[write_pos].operand_base = tcc_ir_pool_add(ir, dest); + tcc_ir_pool_add(ir, src1); + tcc_ir_pool_add(ir, src2); + ``` + 5. 
Clear `is_jump_target` on cloned instructions + +#### 2d. Patch the entry + +The original `JMP to exit if >=S` at `jmpif_idx` was NOPed. We need the +code to flow from the preheader into the first unrolled instruction. + +Since we write the unrolled body starting at `loop->start_idx` (which is the +header), the preheader naturally falls through into it. No patching needed — +the NOP'd header is replaced by the first unrolled body instruction. + +But we need to handle the `exit_target`: make sure the last unrolled +instruction falls through to `exit_target`. If the unrolled code ends before +`exit_target`, insert `JMP to exit_target` as the final instruction. + +#### 2e. Concrete example walkthrough + +For our test case (trip_count=5, body=[`V1 <-- V1 ADD #5`]): + +Original slots 3–13 (11 slots) get NOPed. We need 5 instructions. + +Write at positions 3–7: +``` +0003: V1 <-- V1 ADD #5 ; iteration 0 +0004: V1 <-- V1 ADD #5 ; iteration 1 +0005: V1 <-- V1 ADD #5 ; iteration 2 +0006: V1 <-- V1 ADD #5 ; iteration 3 +0007: V1 <-- V1 ADD #5 ; iteration 4 +0008: NOP ; (remaining slots stay NOP) +... +0013: NOP +0014: ... ; EXIT TARGET (unchanged) +``` + +Falls through to 0014 naturally. Phase 1 re-run folds: +``` +V1 = 0; V1 = V1+5; V1 = V1+5; ... 
→ V1 = 25 +``` + +### Phase 3: Re-run constant propagation + +After unrolling, call the Phase 1 iterative loop again: + +```c +if (unrolled_count > 0) { + int iter2 = 0; + int ch2; + do { + ch2 = 0; + if (tcc_state->opt_dce) ch2 += tcc_ir_opt_dce(ir); + if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop(ir); + if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop_tmp(ir); + if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_branch_folding(ir); + } while (ch2 > 0 && ++iter2 < 10); +} +``` + +## File-by-file Implementation Plan + +### Step 1: Add flag — `tcc.h` and `libtcc.c` + +**tcc.h** (~line 1147, after `opt_iv_strength_red`): +```c +unsigned char opt_loop_unroll; /* -floop-unroll: full unroll small loops */ +``` + +**libtcc.c** (~line 1724, in flag table after `iv-strength-red`): +```c +{offsetof(TCCState, opt_loop_unroll), 0, "loop-unroll"}, +``` + +**libtcc.c** (~line 2279, in -O1 block): +```c +s->opt_loop_unroll = 1; /* Full-unroll small constant-trip-count loops */ +``` + +### Step 2: Declare API — `ir/opt.h` + +Add declarations (near the other loop optimization declarations): +```c +int tcc_ir_opt_loop_unroll(TCCIRState *ir); +int tcc_ir_opt_loop_unroll_with_loops(TCCIRState *ir, IRLoops *loops); +``` + +### Step 3: Implement — `ir/opt.c` + +Add a new section after the IV strength reduction code (~line 8570). + +**Helper: `find_loop_exit_condition()`** +```c +/* Scan from header_idx for: CMP Viv, #limit; JUMPIF exit_target COND + * Returns 1 if found, fills out_cmp_idx, out_jmpif_idx, out_limit, out_cond, + * out_exit_target. 
*/ +static int find_loop_exit_condition(TCCIRState *ir, IRLoop *loop, + int iv_vreg, + int *out_cmp_idx, int *out_jmpif_idx, + int *out_limit, int *out_cond, int *out_exit_target); +``` + +Scan instructions `[header_idx, header_idx+3]`: +- Find `CMP` where one operand is `iv_vreg` and the other is immediate +- Find `JUMPIF` immediately after the CMP +- Extract condition token from the JUMPIF +- Extract exit target (must be > loop->end_idx to be an exit) + +**Helper: `compute_trip_count()`** +```c +static int compute_trip_count(int init_val, int limit, int step, int cond_token); +``` + +Handle: +- `>=S` (generated by `i < N`): `trip_count = ceil((limit - init_val) / step)` + with `ceil(a/b) = (a + b - 1) / b` for positive values +- `>S` (generated by `i <= N`): `trip_count = (limit - init_val) / step + 1` +- Validate: `trip_count >= 0`, `(limit - init_val)` is exact multiple of step + for `!=` conditions + +**Helper: `collect_body_instructions()`** +```c +/* Collect non-control-flow, non-IV body instructions to clone. + * Returns count, fills body_indices[] array. 
*/ +static int collect_body_instructions(TCCIRState *ir, IRLoop *loop, + int iv_vreg, int cmp_idx, int jmpif_idx, int iv_def_idx, + int *body_indices, int max_body); +``` + +Walk `[loop->start_idx, loop_actual_end]`, skip: +- NOP instructions +- CMP at cmp_idx +- JUMPIF at jmpif_idx +- All JMP (unconditional) instructions +- IV increment at iv_def_idx +- ASSIGN that copies IV to a temp (pattern: `T = Viv` where T is only + used by the IV increment on the next line) + +**Main: `try_unroll_loop()`** +```c +static int try_unroll_loop(TCCIRState *ir, IRLoop *loop) +{ + InductionVar ivs[MAX_IV]; + int num_ivs = find_induction_vars(ir, loop, ivs, MAX_IV); + if (num_ivs != 1) return 0; + + InductionVar *iv = &ivs[0]; + int cmp_idx, jmpif_idx, limit, cond, exit_target; + if (!find_loop_exit_condition(ir, loop, iv->vreg, + &cmp_idx, &jmpif_idx, &limit, &cond, &exit_target)) + return 0; + + int trip_count = compute_trip_count(iv->init_val, limit, iv->step, cond); + if (trip_count <= 0 || trip_count > 16) return 0; + + int body_indices[128]; + int body_count = collect_body_instructions(ir, loop, iv->vreg, + cmp_idx, jmpif_idx, iv->def_idx, body_indices, 128); + if (body_count <= 0 || body_count > 32) return 0; + if (trip_count * body_count > 128) return 0; + + // Check no nested loops (backward jumps in body) + // Check no CALL/ASM instructions in body + + // === EMIT === + // NOP out entire loop region [start_idx .. actual_end] + IV init + // Write trip_count copies of body at start_idx + // Add JMP to exit_target at the end if needed + + return 1; +} +``` + +**Vreg remapping during clone:** + +For each body instruction being cloned for iteration k: +- Read original dest, src1, src2 +- If src1 or src2 has vreg == iv_vreg: replace with `irop_make_imm32(-1, init_val + k * step, VT_INT)` +- For VAR vregs defined in the body (not the IV): need per-iteration copies. 
+ But since we use full unrolling and the accumulator pattern is `V = V + const`, + we do NOT remap — the same V is accumulated across iterations. This is correct: + ``` + V1 = V1 + 5 ; iter 0: V1 goes from 0 → 5 + V1 = V1 + 5 ; iter 1: V1 goes from 5 → 10 + ``` + +The only remapping needed is: uses of the IV as a value (e.g., `arr[i] = i` +where i appears as src). The IV definition itself is excluded from the body. + +**Writing an instruction in-place at a NOP slot:** +```c +static void write_instr_at(TCCIRState *ir, int pos, TccIrOp op, + IROperand dest, IROperand src1, IROperand src2) +{ + IRQuadCompact *q = &ir->compact_instructions[pos]; + q->op = op; + q->is_jump_target = 0; + q->operand_base = tcc_ir_pool_add(ir, dest); + tcc_ir_pool_add(ir, src1); + tcc_ir_pool_add(ir, src2); +} +``` + +This reuses the existing `tcc_ir_pool_add()` to allocate operand pool entries. +The old operand pool entries for the NOPed instructions become garbage but are +harmless (the pool only grows; it's freed when the IR block is freed). + +### Step 4: Wire into pipeline — `tccgen.c` + +At ~line 23991, after dead store elimination, before LICM: + +```c + /* Phase 5a: Loop Unrolling - fully unroll small constant-trip-count loops */ + int unrolled_count = 0; + if (tcc_state->opt_loop_unroll) + unrolled_count = tcc_ir_opt_loop_unroll(ir); + + /* Phase 5a': After unrolling, re-run iterative constant propagation + DCE + * to collapse the expanded constant arithmetic (e.g. 
0+5+5+5+5+5 → 25) */ + if (unrolled_count > 0) + { + int iter2 = 0, ch2; + do { + ch2 = 0; + if (tcc_state->opt_dce) ch2 += tcc_ir_opt_dce(ir); + if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop(ir); + if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_const_prop_tmp(ir); + if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_branch_folding(ir); + if (tcc_state->opt_const_prop) ch2 += tcc_ir_opt_value_tracking(ir); + } while (ch2 > 0 && ++iter2 < 10); + } +``` + +### Step 5: Add tests + +**Test 1**: Existing `100_pure_func_strlen.c` — verify with `--dump-ir` that +the loop is eliminated and `V1 <-- #25` appears in the optimized IR. +Update the expect file if output changes (it shouldn't — same result, less work). + +**Test 2**: New `101_loop_unroll_basic.c`: +```c +#include <stdio.h> +int main() { + int sum = 0; + for (int i = 0; i < 4; i++) sum += 10; + printf("%d\n", sum); // expect: 40 + return sum != 40; +} +``` + +**Test 3**: New `102_loop_unroll_no_unroll.c`: +```c +#include <stdio.h> +int main() { + int sum = 0; + int n = 100; + for (int i = 0; i < n; i++) sum += 1; // n not const — don't unroll + printf("%d\n", sum); + return sum != 100; +} +``` + +**Test 4**: New `103_loop_unroll_with_array.c`: +```c +#include <stdio.h> +int main() { + int arr[4]; + for (int i = 0; i < 4; i++) arr[i] = i * 10; + printf("%d %d %d %d\n", arr[0], arr[1], arr[2], arr[3]); + return 0; +} +``` + +Add all to `TEST_FILES` in `tests/ir_tests/test_qemu.py`. 
+ +### Step 6: Validate + +```bash +make cross && make test -j16 # IR tests (must all pass) +make test-asm -j16 # ASM tests (no regressions) +# Optionally: +make test-gcc-torture-compile # GCC torture compile tests +``` + +## Edge Cases + +| Case | Expected behavior | +|------|-------------------| +| `for (i=0; i<0; i++)` | trip_count=0: intended result is NOP out loop, keep init values — but the `trip_count <= 0` bail in `try_unroll_loop()` currently rejects it, so special-case 0 before that check or leave the loop to branch folding + DCE | +| `for (i=0; i<1; i++)` | trip_count=1, emit body once (no loop overhead) | +| `for (i=5; i<10; i+=2)` | trip_count=ceil(5/2)=3, emit 3 copies with IV=5,7,9 | +| `for (i=0; i<17; i++)` | trip_count=17 > threshold, skip | +| Body has `if/else` | Body contains JUMPIF → forward jumps within body. These need target remapping per iteration. Complex — bail for v1 | +| IV used after loop | Keep IV final value: `V2 = init + trip_count * step` assigned before exit | + +## Risks and Mitigations + +| Risk | Mitigation | +|------|-----------| +| Code size explosion | Conservative threshold: trip_count * body_size <= 128 | +| Instruction index corruption (like LICM bug) | Write into NOP slots — no shifting. Only use insert_instr_at() as fallback | +| Incorrect vreg remapping | Keep it simple: V accumulators aren't remapped (correct for `V=V+C`). IV uses get constant substitution. 
Fresh TMPs only for TMP vregs defined in body | +| Interactions with IV strength reduction | Unrolling eliminates the loop; IV SR detects no loops (safe) | +| Register pressure increase | Unrolled code reuses same VARs; linear scan handles spills | +| Body with internal branches | v1: bail on bodies containing JUMPIF (revisit later) | +| Operand pool growth | Pool only grows, old entries become dead — acceptable for small unrolls | diff --git a/docs/design_scalar_evolution.md b/docs/design_scalar_evolution.md new file mode 100644 index 00000000..ed5b5008 --- /dev/null +++ b/docs/design_scalar_evolution.md @@ -0,0 +1,216 @@ +# Scalar Evolution / Loop Accumulator Optimization Design + +## Goal + +Recognize simple accumulation patterns in loops and replace them with a +closed-form computation, eliminating the loop entirely without unrolling. + +## Motivating Example + +After strlen folding, the loop: + +```c +int sum = 0; +for (int i = 0; i < 5; i++) { + sum += 5; // strlen("hello") folded to 5 +} +``` + +produces IR: + +``` +V1 <-- #0 ; sum = 0 +V2 <-- #0 ; i = 0 +loop: + CMP V2, #5 + JMP exit if >=S + V1 <-- V1 ADD #5 ; sum += 5 + V2 <-- V2 ADD #1 ; i++ + JMP loop +exit: + ... use V1 ... +``` + +Scalar evolution recognizes that `V1` has the closed form: +`V1_final = init + trip_count * stride = 0 + 5 * 5 = 25` + +The entire loop is replaced with: + +``` +V1 <-- #25 +``` + +## Relationship to Loop Unrolling + +These are complementary optimizations: + +| | Loop Unrolling | Scalar Evolution | +|---|---|---| +| Approach | Replicate body N times | Compute final value directly | +| When better | Body has side effects, memory ops | Body is pure accumulation | +| Code size | Grows with trip count | Constant (1-2 instructions) | +| Generality | Works for any small loop | Only for reducible patterns | + +Scalar evolution is strictly better when applicable, but applies to fewer cases. 
+Loop unrolling is more general and also enables scalar evolution indirectly +(by exposing constant patterns to the existing constant propagation). + +**Recommended order**: Try scalar evolution first; if it fails, fall back to +loop unrolling. + +## Scope + +**Patterns recognized** (initial implementation): + +1. **Constant accumulation**: `acc += constant` over N iterations + - Result: `acc = init + N * constant` +2. **Linear induction final value**: `i = 0; i < N; i += step` + - Result: `i_final = N` (or `init + trip_count * step`) +3. **Constant assignment in loop**: `x = constant` repeated N times + - Result: `x = constant` (one assignment) + +**Not in scope** (future work): +- Polynomial induction (`sum += i` → triangular number) +- Reduction with non-constant stride (`sum += arr[i]`) +- Floating-point accumulation (precision semantics differ) +- Multiple exit loops + +## Where It Fits in the Pipeline + +``` +Phase 1: Constant propagation + strlen folding (existing) +Phase 5a: Scalar evolution / loop replacement (NEW) +Phase 5b: Loop unrolling (for remaining loops) (NEW) +Phase 1': Re-run constant prop + DCE (collapse results) +Phase 5: LICM (existing, disabled) +Phase 6: IV strength reduction (existing) +``` + +Runs in the same slot as loop unrolling, just before it. + +## Algorithm + +### Step 1: Loop analysis + +For each detected loop (reuse `tcc_ir_detect_loops()`): + +1. Identify all **basic induction variables** (reuse `find_induction_vars()`) +2. Determine **trip count** (same as loop unrolling: constant init, limit, step) +3. Verify **single exit** from loop header + +### Step 2: Classify loop body vregs + +Scan all non-NOP instructions in the loop body. 
For each VAR vreg `V` defined +in the loop, classify it: + +- **Basic IV**: `V = V + const_step` (already identified) +- **Constant accumulator**: `V = V + const` or `V = V - const` + (where const does not depend on any loop-variant value) +- **Constant overwrite**: `V = const` (same constant every iteration) +- **Non-reducible**: anything else (memory store, function call, etc.) + +A loop is **fully reducible** if: +- Every instruction is either a NOP, an IV increment, a reducible accumulator + update, or a branch instruction (CMP/JMP) for loop control +- There are no STORE, CALL, or other side-effecting instructions + +### Step 3: Compute closed-form values + +For each reducible accumulator: + +| Pattern | Closed Form | +|---------|------------| +| `V = V + C` (accumulator) | `V_final = V_init + trip_count * C` | +| `V = V - C` | `V_final = V_init - trip_count * C` | +| `V = C` (overwrite) | `V_final = C` | +| IV `V += step` | `V_final = V_init + trip_count * step` | + +Compute `trip_count * C` at compile time (both are constants). If the result +overflows 32 bits, bail out (preserve runtime semantics). + +### Step 4: Replace loop with assignments + +1. NOP out all instructions from loop preheader through loop end +2. At the loop start position, emit: + - For each reducible VAR: `V <-- #closed_form_value` + - Fall through to the original exit target +3. If any VAR is used after the loop, make sure its final value is set + +### Step 5: Dead IV cleanup + +The IV initialization and any IV-only uses become dead. Existing DCE handles +this automatically. + +## API + +```c +/* In ir/opt.h */ + +/* Attempt to replace loops with closed-form scalar computations. + * Returns number of loops eliminated. 
*/ +int tcc_ir_opt_scalar_evolution(TCCIRState *ir); + +/* Variant using pre-detected loops */ +int tcc_ir_opt_scalar_evolution_with_loops(TCCIRState *ir, IRLoops *loops); +``` + +## Data Structures + +```c +/* Accumulator pattern found in a loop body */ +typedef struct LoopAccumulator { + int vreg; /* VAR vreg being accumulated */ + int init_val; /* Initial value (from preheader) */ + int stride; /* Constant added per iteration */ + int init_idx; /* Instruction index of initialization */ + int update_idx; /* Instruction index of accumulation in loop */ + enum { + ACCUM_ADD, /* V = V + C */ + ACCUM_SUB, /* V = V - C */ + ACCUM_ASSIGN, /* V = C (constant overwrite) */ + } kind; +} LoopAccumulator; + +#define MAX_ACCUMULATORS 8 +``` + +## Configuration + +Reuse `opt_loop_unroll` flag or add a separate `opt_scalar_evol` flag. +Enable at `-O1`. + +## Testing Strategy + +1. **Primary test**: `100_pure_func_strlen.c` - loop eliminated, sum = 25 +2. **New tests**: + - `sum += 3` over 10 iterations → sum = 30 + - `sum += i` (NOT reducible with initial impl - should fall through to + unrolling or remain as loop) + - Two accumulators in same loop: `sum1 += 2; sum2 += 3;` + - Loop with memory store in body (should NOT be eliminated) + - Trip count = 0 (loop never executes, preserve init values) + - Accumulator with negative stride: `sum -= 1` + - Overflow edge case: `sum += 0x40000000` over 8 iterations + +## Risks and Mitigations + +| Risk | Mitigation | +|------|-----------| +| Incorrect trip count for edge conditions | Handle `<`, `<=`, `!=` separately; test boundary values | +| Overflow semantics mismatch | Use 32-bit wrapping arithmetic (matches C unsigned); bail for signed overflow | +| Dead code after elimination | Existing DCE handles cleanup | +| Interaction with IV strength reduction | Eliminated loops have no IVs; SR skips them naturally | +| Missing a side effect in the loop | Conservative: any STORE/CALL/volatile makes loop non-reducible | + +## Implementation 
Steps + +1. Write `tcc_ir_opt_scalar_evolution()` in `ir/opt.c`: + a. Detect loops, find IVs, compute trip counts + b. Scan body for accumulator patterns + c. Check full reducibility (no side effects) + d. Compute closed-form values + e. Replace loop with constant assignments +2. Wire into pipeline before loop unrolling +3. Re-run Phase 1 constant prop after both passes +4. Add tests +5. Verify no regressions diff --git a/docs/fixes/omit_frame_pointer.md b/docs/fixes/omit_frame_pointer.md new file mode 100644 index 00000000..4d74f6ac --- /dev/null +++ b/docs/fixes/omit_frame_pointer.md @@ -0,0 +1,170 @@ +# Plan: Omit Frame Pointer When Safe + +**Goal**: Eliminate unnecessary frame pointer (R7) setup in functions where SP +is statically known, saving 2-3 instructions per function and freeing R7 for +register allocation. + +**Current state**: GCC `-O2` omits the frame pointer for `main` in +`hello_inline.txt` (16 instructions), while TCC always emits it (20 instructions). + +## Problem + +In `arm-thumb-gen.c:6828`, the frame pointer decision is: + +```c +const int need_fp = (tcc_state->force_frame_pointer + || tcc_state->need_frame_pointer + || (stack_size > 0)); // <-- too conservative +``` + +Any function with locals or spills gets a frame pointer. The `stack_size > 0` +condition exists because **SP moves dynamically** during function calls: + +- `func_call_mop` does `gadd_sp(-stack_size)` before each call to reserve + outgoing stack args, then `gadd_sp(stack_size)` after (lines 8574-8577, + 8644-8648). +- Nested call preservation pushes R0-R3 onto the stack (lines 8566-8569). + +When SP moves, SP-relative offsets to locals become invalid. The frame pointer +provides a stable base. Without it, removing `stack_size > 0` causes widespread +test failures. 
+ +## Key Insight + +The IR already pre-computes the maximum outgoing call argument area: + +- `ir->call_outgoing_size` — max bytes needed across all calls (`tccir.h:454`) +- `ir->call_outgoing_base` — frame offset of the reserved area (`tccir.h:453`) +- `ir/codegen.c:1329-1336` reserves this space in the stack frame layout + +But the backend ignores this and still does per-call dynamic SP adjustments. + +## Implementation Plan + +### Phase 1: Use Pre-Reserved Outgoing Area for Stack Args + +**Files**: `arm-thumb-gen.c` + +1. **Replace `gadd_sp(-stack_size)` with offset-based stores in `func_call_mop`** + - Currently (line 8574): `gadd_sp(-stack_size)` lowers SP, then + `store_word_to_stack(reg, stack_offset)` stores relative to the new SP. + - Change: compute `outgoing_base = ir->call_outgoing_base` (FP-relative + offset). Store stack args at `[base_reg + outgoing_base + stack_offset]` + where `base_reg` is FP or SP depending on `need_frame_pointer`. + - Remove the `gadd_sp(-stack_size)` / `gadd_sp(stack_size)` pair. + +2. **Adapt `store_word_to_stack` and `place_stack_arg_*` functions** + - These currently store at `[SP + offset]` assuming SP was already lowered. + - Change them to accept a base register + base offset, or pass the outgoing + base through the `CallGenContext`. + +3. **Handle nested call R0-R3 preservation without PUSH/POP** + - Currently `th_push(arg_regs_push_mask)` / `th_pop(...)` dynamically moves SP. + - Option A: Reserve slots for R0-R3 preservation in the frame (alongside + outgoing area). Store/load explicitly instead of push/pop. + - Option B: Move the nested-call saves to callee-saved spill slots allocated + during register allocation. (More complex, may not be needed initially.) + +### Phase 2: Remove `stack_size > 0` from Frame Pointer Decision + +**Files**: `arm-thumb-gen.c` + +4. 
**Update the `need_fp` condition** (line 6828): + ```c + const int need_fp = (tcc_state->force_frame_pointer + || tcc_state->need_frame_pointer); + ``` + The remaining conditions (`force_frame_pointer`, variadic, `force_lr_save`) + already cover the cases that truly need FP. + +5. **Verify `fp_adjust_local_offset`** (line 192): + - This adjusts local offsets by `callee_push_size` for FP-relative access. + - When FP is omitted, locals are SP-relative. The offset calculation changes: + SP points at the bottom of the frame (below outgoing area), so local offset + from SP = `stack_size + local_offset` (where `local_offset` is negative + from frame top). + - Verify that all ~15 sites using `tcc_state->need_frame_pointer ? R_FP : R_SP` + compute the correct offset in the SP case. + +### Phase 3: Account for Outgoing Area in SP-Relative Offsets + +6. **When `need_fp == 0` and `call_outgoing_size > 0`**: + - SP is at `frame_bottom - call_outgoing_size` after prologue. + - All SP-relative local accesses need an additional + `+ call_outgoing_size` offset. + - This adjustment should happen in `fp_adjust_local_offset` or at each + `base_reg` selection site. + +### Phase 4: Prologue/Epilogue Updates + +7. **Prologue** (around line 6894): + - When `need_fp == 0`: skip `MOV R7, SP` and R7 push. + - Still emit `SUB SP, #stack_size` for locals + outgoing area. + +8. **Epilogue** (around line 7298): + - When `need_fp == 0`: skip `MOV SP, R7` restore. + - Use `ADD SP, #stack_size` instead. + +## Risks and Edge Cases + +- **VLA / `alloca`**: Already covered by `force_frame_pointer = 1` in `tccgen.c`. +- **Variadic functions**: Already force FP via `func_var` check (line 6821). +- **`__builtin_return_address`**: Already forces FP via `force_lr_save` (line 6825). +- **Debug info (DWARF)**: `tccdbg.c:2969` checks `need_frame_pointer` for CFA + tracking. Needs testing — CFA may need to switch to SP-based when FP is omitted. 
+- **Nested functions / static chain**: Use R10 for chain, may reference FP for + parent frame access. Check `tcc_gen_machine_set_chain`. +- **Scratch register saves**: `get_scratch_reg_with_save` does PUSH/POP of + scratch registers mid-function. These also move SP. If these happen while + accessing locals, SP offsets break. Need to verify these never overlap with + local accesses, or track their adjustment. +- **Software FP library calls**: Lines 6025-6332 do `sub sp` for softfloat call + frames. These are internal helpers and may need the same treatment. + +## Testing Strategy + +1. `make test -j16` — IR test suite (primary) +2. Manual inspection of `hello_inline.txt` output to verify FP is omitted +3. Compare instruction counts before/after across the full test suite + +## TODO + +### Phase 1: Use Pre-Reserved Outgoing Area +- [ ] Add `outgoing_base` field to `CallGenContext` sourced from `ir->call_outgoing_base` +- [ ] Change `place_stack_arg_32bit` / `place_stack_arg_64bit` / `place_stack_arg_struct` to store at `[base_reg + outgoing_base + stack_offset]` instead of `[SP + stack_offset]` +- [ ] Remove `gadd_sp(-stack_size)` / `gadd_sp(stack_size)` from `func_call_mop` +- [ ] Replace R0-R3 nested call `th_push`/`th_pop` with explicit STR/LDR to reserved frame slots +- [ ] Remove `used_stack_size` tracking (no longer needed) +- [ ] Adapt softfloat helper call frames (lines 6025-6332) to use reserved area + +### Phase 2: Remove `stack_size > 0` Condition +- [ ] Change `need_fp` condition at line 6828 to `(force_frame_pointer || need_frame_pointer)` +- [ ] Verify all `force_frame_pointer = 1` sites in `tccgen.c` cover VLA/alloca/varargs + +### Phase 3: Fix SP-Relative Offsets +- [ ] Update `fp_adjust_local_offset` to add `call_outgoing_size` when FP is omitted +- [ ] Audit all ~15 `need_frame_pointer ? 
R_FP : R_SP` sites for correct offset math +- [ ] Handle `MACH_OP_PARAM_STACK` offset calculation (incoming args above frame) + +### Phase 4: Prologue/Epilogue +- [ ] Skip R7 push/pop and `MOV R7, SP` / `MOV SP, R7` when `need_fp == 0` +- [ ] Use `ADD SP, #stack_size` in epilogue instead of `MOV SP, R7` +- [ ] Update DWARF CFA tracking in `tccdbg.c` for SP-based frames + +### Phase 5: Edge Cases +- [ ] Audit `get_scratch_reg_with_save` PUSH/POP — verify no local access overlap +- [ ] Test nested functions / static chain with FP omitted +- [ ] Verify R9 (GOT base) save/restore in yasos text-data-separation mode + +### Phase 6: Testing +- [ ] `make test -j16` — IR tests pass +- [ ] `make test-asm -j16` — assembly tests pass +- [ ] `make test-gcc-torture-compile` — GCC torture tests pass +- [ ] Verify `hello_inline.txt` shows FP omitted for `main` +- [ ] Compare instruction count regressions across test suite + +## Expected Impact + +- Saves 2-4 instructions per non-leaf function (push/pop R7 + MOV R7,SP + MOV SP,R7) +- Frees R7 for general register allocation (significant for register pressure) +- Closer parity with GCC `-O2` output diff --git a/docs/plan_closing_gcc_gap.md b/docs/plan_closing_gcc_gap.md new file mode 100644 index 00000000..ec1fb93e --- /dev/null +++ b/docs/plan_closing_gcc_gap.md @@ -0,0 +1,269 @@ +# Plan: Closing the TCC–GCC Code Size Gap + +## Current State + +Benchmark of TCC -O2 vs GCC -O2 across IR test suite (ARM Thumb-2, Cortex-M33): + +| Test / Function | TCC | GCC | Ratio | Root Cause | +|-------------------------------|-----|-----|--------|--------------------------| +| test_llong_load_unsigned/main | 102 | 8 | 12.75x | Inlining + const fold | +| test_u64_shift_add/main | 117 | 26 | 4.50x | Inlining + const fold | +| test_fp_offset_cache/mixed | 15 | 5 | 3.00x | Const fold + DCE | +| test_return64/main | 38 | 14 | 2.71x | Inlining + const fold | +| test_dcmp/main | 21 | 8 | 2.62x | Inlining + const fold | +| test_fp_offset_cache/loop | 61 
| 27 | 2.26x | Loop opts + addr reuse | +| test_double_arith/main | 49 | 22 | 2.23x | Inlining + const fold | +| test_fp_offset_cache/swap | 52 | 27 | 1.93x | Loop opts + cond exec | +| bubble_sort | 44 | 27 | 1.63x | Addr modes + cond exec | +| test_f2d_bits/main | 48 | 30 | 1.60x | Inlining | + +TCC already matches or beats GCC on leaf functions: test_simple_return (1.00x), +test_llong_mul_unsigned (0.88x), test_semihosting (0.60x), test_aeabi_dneg (0.65x). + +### What GCC does for the 12.75x case + +`test_llong_load_unsigned` defines `load_through_ptr`, `store_through_ptr`, `check_u64` +(all static, <20 lines) and calls them from `main` with known global/constant args. + +GCC: inlines everything → propagates `load_through_ptr(&g1) == g1` → folds +`check_u64("g1", g1, g1)` to return 0 → eliminates all dead branches → only +two `puts` calls and `return 0` remain (8 instructions). + +### What TCC does today + +Token-stream auto-inlining IS working: `load_through_ptr` (len=13) and `check_u64` +(len=54) are registered as inline candidates and replayed at call sites. + +Constant evaluation also works for calls with all-VT_CONST args: +- `load_through_ptr(&g1)` → evaluated, folded ✓ (first two calls) +- `load_through_ptr(&arr[0])` → FAILS: stack address not VT_CONST ✗ +- `check_u64("g1", <inlined>, g1)` → FAILS: inlined result in register, not VT_CONST ✗ + +`store_through_ptr` is not appearing in the inline candidate list (cause TBD — likely +the void return + VT_LLONG param combination). + +After token-replay inlining, the full check_u64 body (including the printf error +path) stays in the IR. The IR optimizer cannot prove the comparison always succeeds +because it lacks store-load forwarding through memory: `arr[0] = g1; *(&arr[0])` +does not resolve to `g1` at the IR level. + +--- + +## Step 1: Improve Post-Inline Constant Propagation + +**Goal:** After token-replay inlining of `check_u64`, fold `got != exp` to false +when both operands trace back to the same value. 
+ +**What to do:** +1. In `ir/opt.c`, extend `tcc_ir_opt_const_prop` to handle the pattern: + `STORE val → addr` followed by `LOAD addr → tmp` → replace tmp with val. + This is store-load forwarding for the *same* basic block (intra-BB). +2. Extend the existing `tcc_ir_opt_sl_forward` to handle 64-bit (LLONG) values + stored/loaded via `strd`/`ldrd` patterns. +3. After forwarding, existing branch folding + DCE eliminates the dead printf path. + +**Test:** `test_llong_load_unsigned` — first two `check_u64` calls (with global +addresses) should be fully eliminated from the IR. + +**Expected improvement:** 12.75x → ~4x (eliminates 2 of 5 check blocks). + +**Files:** `ir/opt.c` (store-load forwarding), `tccir.h` (if new flags needed) + +--- + +## Step 2: Propagate Constants Through Local Arrays + +**Goal:** After `arr[0] = g1`, resolve `load_through_ptr(&arr[0])` to `g1`. + +**What to do:** +1. Track stores to local array elements with constant indices in a shadow map + during constant propagation: `stack_offset + idx*size → stored_value`. +2. When a LOAD from a known stack address matches a previous STORE to the same + address (no intervening aliasing store), forward the value. +3. Handle the specific pattern: `LEA(stack, offset)` passed as arg to inlined + `load_through_ptr` which does `LOAD(arg)` — after inlining, this becomes + `LOAD(LEA(stack, offset))` which can resolve via the shadow map. + +**Test:** `test_llong_load_unsigned` — all `check_u64` calls with arr elements +should be eliminated. + +**Expected improvement:** 12.75x → ~2x (eliminates arr-based checks, only +`store_through_ptr` + final check remain). + +**Files:** `ir/opt.c` + +--- + +## Step 3: Fix store_through_ptr Not Being Inlined + +**Goal:** Ensure void functions with VT_LLONG parameters are auto-inlined. + +**What to do:** +1. Add INLINE_STRUCT logging around `auto_inline_sig_ok` rejection path to + identify exactly why `store_through_ptr` is being skipped. +2. 
Fix the rejection (likely in `auto_inline_sig_ok` parameter loop or the + void+LLONG combination). +3. After inlining `store_through_ptr(&local, arr[2])`, Step 2's forwarding can + propagate `local == 0xffffffffffffffff` to the final `check_u64`. + +**Test:** `test_llong_load_unsigned` — final code should match GCC: two `puts` +calls + `return 0`. + +**Expected improvement:** 12.75x → ~1.0x for this specific test. + +**Files:** `tccgen.c` (auto_inline_sig_ok, call-site inline logic) + +--- + +## Step 4: Fix LICM Instruction Index Bug + +**Goal:** Re-enable loop-invariant code motion. + +**Current state:** LICM is disabled at `tccgen.c:25176`. The old pattern-based +`hoist_from_loop` returns 0 unconditionally (`licm.c:590`). A new dominance-based +`tcc_ir_opt_licm_ex` exists but the old pass is dead. The bug is documented: +> instruction indices are not adjusted by total_inserted when reading original +> instructions during the insertion loop, causing operand_base corruption + +**What to do:** +1. The dominance-based LICM (`tcc_ir_opt_licm_ex`) is already implemented with + CFG + dominator tree. Verify it handles instruction index adjustment correctly. +2. Remove the `return 0` guard in `hoist_from_loop` OR remove the old pass + entirely and rely on the dominance-based version. +3. Enable LICM by removing the comment/guard at `tccgen.c:25176` (set + `opt_licm=1` at `-O1`+). +4. Run full test suite to validate: `make test -j16 && make test-gcc-torture-compile`. + +**Test:** `test_fp_offset_cache/test_loop_access` (2.26x), bubble_sort (1.63x). + +**Expected improvement:** ~15-25% reduction in loop-heavy functions. + +**Files:** `ir/licm.c`, `tccgen.c` (optimization pipeline) + +--- + +## Step 5: Copy Coalescing in Register Allocator + +**Goal:** Eliminate redundant `mov` instructions from ASSIGN IR ops. + +**Current state:** The linear scan allocator in `tccls.c` assigns physical registers +independently. 
The optimized IR contains many identity assigns like: +``` +R0(T1) <-- R5(V0) [ASSIGN] → mov r0, r5 +R1(T9) <-- R4(V0) [ASSIGN] → mov r1, r4 +``` + +**What to do:** +1. After liveness analysis (`ir/live.c`), add a coalescing pre-pass that merges + virtual register live ranges connected by ASSIGN when they don't interfere. +2. Specifically: for `Tx <-- Vy [ASSIGN]`, if Tx and Vy have non-overlapping live + ranges (or Vy dies at this instruction), assign the same physical register. +3. After coalescing, the ASSIGN becomes a no-op and can be eliminated by DCE. + +Alternative lighter approach: add a post-regalloc peephole in `arm-thumb-gen.c` +that eliminates `mov Rx, Rx` (same register). + +**Test:** Every function — count `mov` instructions before/after. + +**Expected improvement:** ~15-20% across the board. In bubble_sort: 44 → ~35. + +**Files:** `tccls.c` (register allocator), `ir/live.c` (liveness) + +--- + +## Step 6: If-Conversion for Small Conditional Blocks (IT Blocks) + +**Goal:** Replace short branch-over patterns with ARM IT conditional execution. + +**Current state:** TCC generates full branch diamonds even for single-instruction +if-then bodies. GCC uses IT blocks: +``` +; GCC bubble sort swap: +cmp r2, r1 +it gt +strdgt r1, r2, [r3, #-4] ; 1 conditional instruction, no branch + +; TCC bubble sort swap: +cmp r1, r2 +ble .skip +; ... 10 instructions for swap ... +.skip: +``` + +**What to do:** +1. Add an IR-level if-conversion pass that detects diamond/triangle patterns where + the "then" block has 1-4 instructions and no side effects beyond stores. +2. Convert to `SELECT` IR ops (already defined in `tccir.h`) or emit IT blocks + directly in `arm-thumb-gen.c`. +3. ARM Thumb-2 IT blocks support up to 4 conditional instructions. Focus on the + common pattern: compare + conditional store (swap, min/max). + +**Test:** bubble_sort, test_swap_pattern, any conditional move patterns. + +**Expected improvement:** ~10-15% in branch-heavy inner loops. 
Bubble sort: 35 → ~28. + +**Files:** `ir/opt.c` (new pass), `arm-thumb-gen.c` (IT block emission) + +--- + +## Step 7: Improved Induction Variable Strength Reduction + +**Goal:** Convert `base + i*4` recomputed each iteration into pointer increment. + +**Current state:** IV strength reduction exists (`tcc_ir_opt_iv_strength_reduction`) +but doesn't catch all patterns, especially when the same array index is used +multiple times in a loop body (like swap: `arr[j]`, `arr[j+1]` used in load, store, +and recomputed independently). + +**What to do:** +1. Extend IV SR to identify groups of array accesses sharing the same base and + induction variable: `arr[j]`, `arr[j+1]` → single pointer `p` with `p[0]`, + `p[1]`, incremented once per iteration. +2. After the pointer is introduced, existing indexed load fusion + (`LOAD_INDEXED`) handles the rest. +3. Requires LICM (Step 4) to hoist the base address first. + +**Test:** bubble_sort, test_loop_access, test_swap_pattern. + +**Expected improvement:** ~10% additional on loop-heavy code. + +**Files:** `ir/opt.c` (IV strength reduction) + +--- + +## Execution Order & Dependencies + +``` +Step 1 ──→ Step 2 ──→ Step 3 (inlining + const prop chain) + │ + │ Step 4 ──→ Step 7 (LICM enables better IV SR) + │ + │ Step 5 (independent: regalloc) + │ + │ Step 6 (independent: if-conversion) + ↓ + Steps 4-7 can run in parallel with Steps 1-3 +``` + +Steps 1-3 are the highest leverage: they address the 12.75x/4.50x/2.71x outliers. +Steps 4-7 improve the 1.5x-2.3x cases (loops, branches, register pressure). 
+ +## Validation + +After each step, run: +```bash +make test -j16 # IR tests pass +make test-gcc-torture-compile # no regressions +python3 scripts/compare_disasm.py tests/ir_tests/test_llong_load_unsigned.c # track ratio +python3 scripts/compare_disasm.py bubble # track ratio +``` + +## Target + +| Test | Current | After Steps 1-3 | After All | +|-------------------------------|---------|------------------|-----------| +| test_llong_load_unsigned/main | 12.75x | ~1.0x | ~1.0x | +| test_u64_shift_add/main | 4.50x | ~2.0x | ~1.5x | +| test_return64/main | 2.71x | ~1.2x | ~1.0x | +| test_fp_offset_cache/loop | 2.26x | ~2.26x | ~1.3x | +| bubble_sort | 1.63x | ~1.63x | ~1.1x | diff --git a/docs/plan_iv_sr_rotated_loop.md b/docs/plan_iv_sr_rotated_loop.md new file mode 100644 index 00000000..8d9c5170 --- /dev/null +++ b/docs/plan_iv_sr_rotated_loop.md @@ -0,0 +1,228 @@ +# Plan: IV Strength Reduction for Rotated Loops with `arr[i*const]` + +## Context + +`test_llong_relops::run_signed` and `run_unsigned` are ~1.39x and ~1.41x larger +than GCC's output (139 vs 100, 128 vs 91). The gap is dominated by: + +1. The loop counter `i` is spilled to `[sp, #36]` and the address + `&cases[i]` is recomputed each iteration via `mla r9, r0, r1, r2`. +2. GCC instead uses a pointer-IV: `r4 = &cases[0]` in the preheader, + `r4 += 40` in the latch, eliminating both the multiply and an `i` reload. + +TCC already has an IV strength reduction pass +([`tcc_ir_opt_iv_strength_reduction`](ir/opt.c:20889)) that's designed for +exactly this pattern — but it doesn't fire in `test_llong_relops`. This plan +covers what blocks it and how to fix it. + +## Root Cause + +The fix has two distinct blockers. Either one alone keeps the pointer-IV +transform from firing. + +### Blocker 1: pre-SSA MLA fusion rejects immediate multipliers + +[`tcc_ir_opt_fusion_pass`](ir/opt.c:14461) fuses `T = a * b; V = base + T` +into `V = a MLA b + base`. 
The gate at [ir/opt.c:14523-14524](ir/opt.c#L14523) +excludes the case where `a` or `b` is an immediate: + +```c +!irop_is_immediate(ms1) && !irop_is_immediate(ms2) && ir_opt_du_uses(...) == 1 +``` + +For `T = i * 40; V = base + T`, `ms2` is `#40` (immediate), so MLA fusion +skips it. The MUL+ADD form survives until the ARM-specific SSA-stage MLA +fusion in [`arch/arm/ssa_opt_arm.c:100`](arch/arm/ssa_opt_arm.c#L100) — but +**that runs after IV-SR**, so IV-SR never sees an MLA to operate on. + +The pre-SSA gate was presumably added because MUL-by-power-of-2 gets +strength-reduced to SHL later, which would render the MLA wasteful. But for +non-power-of-2 immediates (40, 12, etc.) the strength reducer at +[ir/opt.c:18846](ir/opt.c#L18846) bails out (multi-instruction patterns +aren't supported), so the MUL stays as MUL and MLA fusion was the right call +all along. + +### Blocker 2: `loop->body_instrs` is too narrow for TCC's rotated layout + +`find_derived_ivs` ([ir/opt.c:19115](ir/opt.c#L19115)) has two scan passes: + +| Pass | What it finds | Scan range | +|------|---------------|------------| +| 1 (line 19164) | `ADD` with MUL/SHL src — i.e. unfused MUL+ADD | `loop->body_instrs` | +| 2 (line 19400) | `MLA` directly | `mla_scan_start..mla_scan_end` (extended) | + +The extended range walks forward jumps iteratively past the back-edge — it's +specifically designed to catch rotated loops with the body proper *after* the +latch in instruction order. But it's only wired to pass 2 (MLA-detection). + +In `test_llong_relops`, loop rotation produces: + +``` +op 3: CMP i, 10 ← header +op 4: JMP if >=U exit +op 5: JMP to 10 ← into body +op 6: T = i + 1 ← latch (increment) +op 8: i = T ← latch (write-back) +op 9: JMP to 3 ← back to header +op 10: T3 = i * 40 ← body proper (MUL) +op 11: V1 = base + T3 ← body proper (ADD) — this is the derived IV! +...
+op 110: JMP to 6 ← back-edge to the latch +``` + +LICM's body detector ([ir/licm.c:228-264](ir/licm.c#L228-L264)) only follows +forward jumps one level deep when extending the body range, so +`loop->body_instrs` for this loop is `{2, 3, 4, 5, 6, 7, 8}` — it never +reaches op 11. Pass 1 misses the MUL+ADD. + +Once Blocker 1 is fixed (so the MUL+ADD becomes an MLA), Pass 2 catches +it, because Pass 2 uses the extended scan range. + +## What I Tried — and Why It Failed + +I lifted the immediate-operand gate on pre-SSA MLA fusion. IV-SR then *did* +fire and produced the textbook pointer-IV in the IR dump: + +``` +0002: R4(T27) <-- Addr[StackLoc[-48]] [ASSIGN] ← preheader: p = base +... +0013: R4(T27) <-- R4(T27) ADD #12 ← latch: p += stride +``` + +But the **emitted assembly didn't match the IR**: +[`bug_struct_array_index_mul_clobber`](tests/ir_tests/bug_struct_array_index_mul_clobber.c) +crashed in QEMU because `main`'s emitted code loaded from `[r4, #0]` without +ever initializing r4. The preheader `ASSIGN R4 <- Addr[...]` was in the IR +but absent from the machine code. The latch `R4 += 12` was also missing. + +So there's a third blocker hiding behind the first two: when IV-SR inserts +new instructions *outside the original loop range* (specifically into the +preheader/latch), something in the codegen path doesn't pick them up. + +I reverted the MLA fusion change. The peephole improvement in commit +`e76cee04` (which is an unrelated, smaller win) stands. + +## The Real Fix + +Three changes, in order. Land each on its own commit and run the full IR +suite (1026 tests) plus a regression-disasm diff between each. + +### Step 1 — Verify and fix the codegen-doesn't-honor-inserted-instructions bug + +Without this, Steps 2-3 produce miscompiles. + +1. Reproduce with a minimal case. Apply the immediate-allowing MLA fusion + from this session (`git show e76cee04^..HEAD` is the wrong base — apply + the change as a separate scratch commit).
Compile + `tests/ir_tests/bug_struct_array_index_mul_clobber.c` with `-O2 -dump-ir`. + The "AFTER OPTIMIZATIONS" IR dump for `main` will show + `R4(T27) <-- Addr[StackLoc[-48]]` near the top and `R4 += 12` in the + latch. +2. Confirm the disassembly is missing both: there's no `add r4, sp, #N` in + `main`'s preheader and no `adds r4, #12` in the loop's bottom block. +3. Hypothesis: IV-SR's `transform_derived_iv` + ([ir/opt.c:~19500](ir/opt.c) — search for it) inserts via + `insert_instr_at` at `loop->preheader_idx + 1` and at the latch position. + Those inserts shift indices. Either: + - the inserts land in an IR slot that codegen skips (NOP-classified, or + marked unreachable), or + - the inserts happen *after* the SSA-renaming snapshot codegen uses, and + codegen runs from the pre-IV-SR snapshot. +4. The way to find out is to instrument `tcc_ir_codegen_generate` to print + `(i, op, dest_vreg, dest_alloc.r0)` for every IR instruction it dispatches + on, and compare against the dumped IR. The first divergence is the bug. + +Most likely fix is in `transform_derived_iv` (it needs to mark new +instructions with the right flags), or in the SSA construction pass (it +needs to rebuild after IV-SR runs). Don't guess — the trace will say. + +### Step 2 — Relax pre-SSA MLA fusion to accept non-power-of-2 immediates + +Once Step 1 is done, re-land the immediate-allowing MLA fusion. The patch +in [ir/opt.c:14523](ir/opt.c#L14523): + +```diff ++ int ms1_imm = irop_is_immediate(ms1); ++ int ms2_imm = irop_is_immediate(ms2); ++ int allow_one_imm = (ms1_imm ^ ms2_imm); ++ if (allow_one_imm) { ++ int64_t mval = ms1_imm ? irop_get_imm64_ex(ir, ms1) ++ : irop_get_imm64_ex(ir, ms2); ++ if (is_power_of_2(mval) >= 0 || mval == 0 || mval == 1) ++ allow_one_imm = 0; /* leave for strength reduction */ ++ } + if (... && +- !irop_is_immediate(ms1) && !irop_is_immediate(ms2) && ...) { ++ (allow_one_imm || (!ms1_imm && !ms2_imm)) && ...) 
{ +``` + +Forward-declare `is_power_of_2` near the top of `ir/opt.c`. + +Do **not** also drop the `STACKOFF && !is_lval` accumulator exclusion. That +exclusion is load-bearing (dropping it breaks `test_llong_relops` and +`bug_bitfield_packed10` in different ways — distinct from Step 1's bug). + +### Step 3 — Optional: extend Pass 1 of `find_derived_ivs` to the MLA scan range + +After Step 2, the test_llong_relops MUL+ADD becomes an MLA in pre-SSA, so +Pass 2 catches it. But other callers / code shapes may still have unfused +MUL+ADD outside `body_instrs`. The cleanest follow-up is to teach Pass 1 to +walk `mla_scan_start..mla_scan_end` as well, gated to only consider ADDs +whose matched MUL/SHL is *also* in the extended range. This preserves the +"don't extend body for SHR/AND chains" guarantee the comment at +[ir/opt.c:19126-19131](ir/opt.c#L19126-L19131) warns about. + +This is genuinely optional — Step 2 alone should close the test_llong_relops +gap once Step 1 is in place. + +## Expected Impact + +| Function | Before | After Steps 1-2 | GCC | +|---|---|---|---| +| `test_llong_relops::run_signed` | 138 | ~115 (-23) | 100 | +| `test_llong_relops::run_unsigned` | 127 | ~104 (-23) | 91 | +| (`bug_ull_mul10_loop`, others with `arr[i*c]`) | — | likely improves | — | + +The 23-instruction estimate per function comes from: +- Eliminate `mla r9, r0, r1, r2` plus its prep (`movs r1, #40; add r2, sp, + #40`) per iter → -3 insns in body, but body executes ×10/8 → counted as + static body shrink. +- Eliminate `i` spill (`str/ldr` to `[sp, #36]` ~6 times per iter once `i` + fits in a callee-saved reg, since one register is freed by the IV-SR + collapse) → ~6 insns gone from body. +- Net ~9 insns saved in the body, plus 14 in the prologue/preheader once the + computed-each-iter MLA collapses to a single preheader init + latch ADD. 
+ +This won't close the gap entirely (GCC also uses cleaner long-long +relational comparisons — `sbcs`/`ite` patterns that TCC already produces but +spills around for the last comparison; see todo #3 from the original +analysis: `ne_s`/`ne_u` regalloc collision). + +## Out of Scope + +- The regalloc collision causing `ne_s`/`ne_u` to spill `got` and `exp` to + `[sp, #32]`/`[sp, #28]` (separate fix, ~6-8 insns). +- The dead intermediate `[sp, #24]` store from `i++` (would require DSE on + the post-codegen stack slot, or IR-level coalescing of T54 with T51). +- LICM body detection fix in `ir/licm.c` (a more thorough fix to Blocker 2 + but with broader regression surface — Step 3 above is the targeted + alternative). + +## Validation + +Per step: + +```bash +make cross +cd tests/ir_tests && source .venv/bin/activate +python -m pytest test_qemu.py -n auto # 1026 tests must pass +cd /home/mateusz/repos/tinycc +python scripts/regression_disasm.py --suite=ir -O2 # check function-level deltas +``` + +Specifically watch: +- `test_llong_relops::{run_signed,run_unsigned}` (target test) +- `bug_struct_array_index_mul_clobber::main` (Step 1 canary) +- `bug_bitfield_packed10::{check,main}` (was broken by dropping STACKOFF + exclusion — must stay passing) +- `110_iv_strength_reduction::*` (existing IV-SR test surface) diff --git a/docs/plan_opt_modularization.md b/docs/plan_opt_modularization.md new file mode 100644 index 00000000..f4a0b162 --- /dev/null +++ b/docs/plan_opt_modularization.md @@ -0,0 +1,494 @@ +# Pre-SSA Optimization: Engine + Modularization Plan + +## Progress checklist + +### Phase 0 — Delete dead code +- [x] Remove `tcc_ir_opt_run_by_name` stub (opt.c, opt.h) +- [x] Remove `tcc_ir_opt_run_all` stub (opt.c, opt.h) +- [x] Remove `tcc_ir_opt_return` stub + call site in tccgen.c +- [x] Remove `opt_return_value` flag (tcc.h, libtcc.c) — was the only consumer of the deleted stub + +### Phase 1 — Extract shared analysis & primitives +- [x] **1.1** `ir/opt_du.{h,c}` — 
`IROptDU` + `ir_opt_du_build/idx/def/uses` +- [x] **1.2** `ir/opt_xform.{h,c}` — `ir_xform_nop` (inline), `ir_xform_same_block` (5/6 call sites migrated; 1 site keeps non-canonical NOP-boundary semantics) +- [x] **1.3** `ir/opt_utils.{h,c}` — constant evaluators, BB/CFG helpers, purity tables, expression equality, call-param helpers +- [x] **1.4** `ir/opt_alias.{h,c}` — stack-slot aliasing helpers +- [x] **1.5** `ir/opt_loop_utils.{h,c}` — IV analysis, loop bounds, loop transforms + +### Phase 2 — Build the pre-SSA engine +- [x] **2.1** `ir/opt_engine.{h,c}` — `IROptCtx`, `IROptGen`, `tcc_ir_opt_run_gens`, lazy analysis cache +- [x] **2.2** Build-only verify (no rules wired yet) + +### Phase 3 — Convert pass groups to generator tables +- [x] **3.1** Fusion group → `ir/opt_gens_fusion.c` (7 converted: rotate, mla, indexed_mem, deref_indexed, disp, indexed_chain, indexed_pair_reorder; hand-written: postinc, lea_fold, assign_fuse) +- [x] **3.2** Branch-folding group → `ir/opt_gens_branch.c` (branch_folding + setif_branch_fuse converted to generators; or_bool_diamond, stack_addr_nonnull_fold, stack_bool_diamond stay hand-written — flow-sensitive/CFG patterns) +- [x] **3.3** Boolean simplification → `ir/opt_gens_bool.c` (bool_idempotent + bool_simplify + idempotent half of bool_pass) +- [x] **3.4** BB-scoped hash CSE — `cse_bool` converted to `IROptHashTable`; remaining passes (cse_global_load, globalsym_cse, cse_param_add, local_load_cse, local_alu_cse, stackoff_addr_cse) use ≤32-entry flat arrays where linear scan is faster than hash overhead — no conversion needed +- [x] **3.5** Call-result dead group → `ir/opt_gens_call_result.c` (dead_call_result_elim, dead_sret_call_elim, fold_call_result_store converted; dead_init_via_call stays in opt.c — FWS dependency) + +### Phase 4 — Generic hash table +- [x] **4.1** `ir/opt_hash.{h,c}` — `IROptHashTable`, bump-allocated entry pool, applied to `bool_cse` (replaces malloc-per-entry `BoolCSEEntry`); remaining CSE passes use 
flat arrays that don't benefit from hashing + +### Phase 5 — Collect-then-transform engine variant (optional) +- [x] **5.1** `IROptCollectGen` 2-phase dispatch — evaluated and skipped: candidate passes (const_var_prop, dead_var_store_elim, redundant_var_assign) each use unique per-pass state types that can't be shared through a generic interface; shared boilerplate is only ~5 lines of iteration loop per pass, not worth a new abstraction + +### Phase 6 — Theme-based file split (optional, zero flash savings) +- [x] **6.1** Theme-based split started: `opt_loop.c` (1,052 lines — strength reduction, IV, unroll, rotation, decrement-to-zero), `opt_memory.c` (3,259 lines — sl_forward, entry_store_prop, store_redundant, deref_fwd); `opt.c` reduced from 28,973 → 17,861 lines + +--- + +## Current State (2026-05) + +`ir/opt.c` is **28,973 lines** containing **81 pass functions**. It is the single largest source file in the project. The SSA optimization engine (`ir/opt/`, 8,500 lines across 13 files) has been built and runs on SSA-renamed IR before SSA destruction — but it did **not** displace the pre-SSA monolith. Both layers exist in production and the pre-SSA layer keeps growing as new post-destruction peepholes are needed for address materialization, indexed-mode fusion, and stack-aware patterns. + +### Why the monolith keeps growing + +The expectation in the original plan — "as SSA passes mature, pre-SSA equivalents are removed" — has not held. The pre-SSA layer operates on flat IR after SSA destruction, where vregs are no longer single-assignment and stack/local layout is materialized. 
Several optimization classes only make sense at this layer: + +- ARM addressing-mode fusion (`LOAD_INDEXED`, `LOAD_POSTINC`, `MLA`, displacement folding) +- Stack-slot aliasing and forwarding (`sl_forward`, `stack_addr_cse`) +- 64-bit register-pair tracking (`pack64`, `pack64_tautology`) +- Call-result lifetime analysis (`dead_call_result_elim`, `dead_init_via_call`, `dead_sret_call_elim`, `fold_call_result_store`) + +Since the original plan was written, 21 new pre-SSA passes have been added (full list in the census below). The pre-SSA optimizer is **permanent infrastructure**, not a migration bridge. + +### Two goals driving this rewrite + +1. **Save flash memory.** The compiler ships on flash-constrained embedded targets. Each pass has ~30–50 lines of duplicated iteration boilerplate (forward loop, NOP skip, BB-boundary check, local DU-table build). Across 81 passes that's roughly **3,000–4,000 lines** of redundant code, plus 4 hand-rolled hash tables and 6+ inlined "same-block check" loops. +2. **Combine passes into single forward loops.** Many passes only differ in their trigger opcode and pattern body. Today the pipeline runs 7+ separate fusion forward-scans back-to-back (each rebuilding the DU table); they could all run in one scan. + +The SSA engine has already proven the answer: a generator-based dispatch (`IRSSAOptGen` in [ir/opt/ssa_opt.h:62-66](ir/opt/ssa_opt.h#L62-L66), `ssa_opt_run_gens` in [ir/opt/ssa_opt.c:604-622](ir/opt/ssa_opt.c#L604-L622)) lets a single `O(n)` engine pass dispatch dozens of rules. The pre-SSA layer needs the same shape, with a context that survives the dispatch loop and caches analyses. 
+ +--- + +## Pass Census (current) + +`opt.c` pass functions, grouped by pattern affinity: + +### Cleanup / DCE +`dce`, `compact_nops`, `dead_var_store_elim`, `dead_addrvar_elim`, `redundant_var_assign`, `redundant_init_elim`, `dse`, `dead_loop_elim`, `dead_call_result_elim`, `dead_init_via_call`, `dead_sret_call_elim` + +### Constant / value propagation +`const_var_prop`, `global_init_prop`, `const_prop`, `const_prop_tmp`, `value_tracking`, `complex_const_param_fold`, `param_addrof_const_fold`, `local_addrof_const_fold`, `add_reassoc`, `cmp_expr_fold` + +### Memory +`sl_forward`, `entry_store_prop`, `store_redundant`, `block_copy_init`, `deref_fwd`, `fold_call_result_store` + +### Fusion & addressing +`fusion_pass` (mla+indexed), `rotate_fusion`, `deref_indexed_fusion`, `disp_fusion`, `lea_fold`, `postinc_fusion`, `loop_postinc_fusion`, `indexed_chain`, `indexed_pair_reorder`, `add_deref_fold`, `stackoff_addr_cse`, `call_chain_rename`, `assign_fuse` + +### CSE / copy propagation +`copy_prop`, `cse_global_load`, `globalsym_cse`, `cse_param_add`, `local_load_cse`, `local_alu_cse`, `stack_addr_cse` + +### Branch / boolean +`branch_folding`, `setif_branch_fuse`, `stack_addr_nonnull_fold`, `stack_bool_diamond`, `or_bool_diamond`, `nonneg_branch_fold`, `float_branch_fold`, `bool_idempotent`, `bool_simplify`, `bool_pass` + +### Loop +`loop_unroll`, `loop_rotation`, `loop_bound_remat`, `iv_strength_reduction`, `iv_strength_reduction_with_loops`, `decrement_to_zero`, `redundant_loop_check`, `backedge_phi_hoist` + +### Other / peephole +`vrp`, `var_tmp_fwd`, `var_to_tmp`, `float_narrowing`, `strength_reduction`, `select`, `postinc_assign_fold`, `returnvalue_merge`, `const_string_calls`, `const_call_replace`, `pack64`, `pack64_tautology`, `fp_cache_*` + +### Stubs (delete in Phase 0) +`tcc_ir_opt_return`, `tcc_ir_opt_run_by_name` + +The original plan's `tcc_ir_opt_run_all` is already gone. 
`opt_jump_thread.c` already lives outside `opt.c` and provides `tcc_ir_opt_jump_threading` + `tcc_ir_opt_eliminate_fallthrough`. + +--- + +## Architecture: mirror the SSA engine for pre-SSA + +``` +┌──────────────────────────── Pipeline (tccgen.c) ─────────────────────────────┐ +│ │ +│ SSA layer: IRSSAOptCtx + IRSSAOptGen + ssa_opt_run_gens() │ +│ ✓ shipped: 13 passes, generator-based dispatch │ +│ │ +│ Pre-SSA layer (this plan): │ +│ IROptCtx + IROptGen + tcc_ir_opt_run_gens() │ +│ one engine, ~25 fusion/branch/bool peepholes registered as gens │ +│ ~55 remaining passes call into shared infra but stay bespoke │ +│ │ +├─────────────────────────── Shared analysis cache ────────────────────────────┤ +│ IROptCtx { du, bb_starts, pred_count, merge_bitmap } — lazy, generational │ +├─────────────────────────────── Libraries ────────────────────────────────────┤ +│ opt_du opt_utils opt_alias opt_loop_utils opt_hash opt_xform │ +├──────────────────────────────── IR core ─────────────────────────────────────┤ +│ core.c ir.h cfg.c ssa.c vreg.c pool.c machine_op.c │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` + +The pre-SSA engine deliberately mirrors the SSA engine's type and function naming: + +| SSA layer | Pre-SSA mirror | +|--------------------------|----------------------------| +| `IRSSAOptCtx` | `IROptCtx` | +| `IRSSAOptGen` | `IROptGen` | +| `ssa_opt_run_gens()` | `tcc_ir_opt_run_gens()` | +| `ssa_gen_*` functions | `ir_gen_*` functions | +| `ssa_opt_()` | `tcc_ir_opt_()` | + +Contributors who know one layer learn the other for free, and one implementation informs the other. + +--- + +## Flash savings estimate + +| Source of saving | Approx. 
lines removed | +|-------------------------------------------------------------------------|-----------------------| +| Iteration-loop boilerplate deduplicated across ~25 peephole passes | ~2,500 | +| DU-table builds: 20+ inline `ir_opt_du_build` call-sites → cache lookup | ~300 | +| Same-block check: 6+ inlined `for (j=...) if (JUMP/JUMPIF)` loops | ~200 | +| Pool-slot grow loops in fusion passes (`while (count <= n) pool_add`) | ~100 | +| `IROptHashTable` collapsing 4 hand-rolled CSE hash tables | ~400 | +| Constants in 2 idempotent/simplify boolean passes merged into one scan | ~150 | +| Branch-folding family (5 JUMPIF-triggered passes) merged into one scan | ~400 | +| **Total estimate** | **~4,000 lines (~14% of opt.c)** | + +Conservative because it counts only what duplication clearly costs; the engine creates new abstraction surface (~600 lines) that must be subtracted. **Net ~3,400 lines / ~12%.** + +The other win — not visible in line count — is **fewer O(n) scans** through the IR. The fusion group alone goes from 7+ separate forward scans (each rebuilding DU) to 1 scan with 1 DU build. For a function with 10,000 instructions that's 60,000–70,000 fewer dispatch-loop iterations per compile. + +--- + +## Migration phases + +The phase order has changed from the original plan. **Engine work goes first** because it produces all the flash savings; theme-based file splitting goes last because it produces zero flash savings (only readability). + +### Phase 0 — Delete dead code (15 min) + +1. Remove `tcc_ir_opt_run_by_name` ([opt.c:15131](ir/opt.c#L15131)) — empty stub. +2. Remove `tcc_ir_opt_return` ([opt.c:11202](ir/opt.c#L11202)) — 5-line stub never called from any pipeline path that needs it. +3. Delete `ir/opt_embedded_deref.c` if still present on disk (orphaned, not in `Makefile`). +4. Remove matching declarations from `ir/opt.h`. + +**Verify:** `make cross && make test -j16`. 
+ +--- + +### Phase 1 — Extract shared analysis & primitives (4–6 h) + +This is the highest-leverage phase for flash savings. All subsequent phases depend on the libraries created here. + +#### 1.1 `ir/opt_du.h` + `ir/opt_du.c` (~200 lines) +- Move `IROptDU`, `ir_opt_du_build/def/uses/idx` from `opt.c`. +- Used by 20+ pass sites today; each currently writes its own `IROptDU du; ir_opt_du_build(ir, &du); …; tcc_free(du.def)` block (~10–15 lines per site). +- After extraction these collapse to `const IROptDU *du = tcc_ir_opt_ctx_require_du(&ctx);`. + +#### 1.2 `ir/opt_xform.h` + `ir/opt_xform.c` (~150 lines) +Six primitives that mirror the most-duplicated patterns: +```c +static inline void ir_xform_nop(TCCIRState *ir, int idx); /* 81 sites */ +void ir_xform_replace_with_assign(TCCIRState *ir, int idx, IROperand src); /* ~40 sites */ +void ir_xform_replace_with_imm(TCCIRState *ir, int idx, int64_t v, int btype); +int ir_xform_same_block(TCCIRState *ir, int from, int to); /* 6+ sites */ +int ir_xform_alloc_pool(TCCIRState *ir, int n_slots); /* every fusion pass */ +void ir_xform_nop_with_du(TCCIRState *ir, int idx, IROptDU *du); +``` + +#### 1.3 `ir/opt_utils.h` + `ir/opt_utils.c` (~1,500 lines) +Extract from `opt.c`: +- Constant evaluators: `ir_opt_eval_const_u64`, `ir_opt_eval_const_string`, `evaluate_compare_condition`, `is_power_of_2`, condition-token helpers (`invert_cond_token`, `vrp_swap_cmp_tok`, `vrp_negate_cmp_tok`). +- BB / CFG helpers: `ir_opt_build_merge_bitmap`, `ir_opt_mark_block_starts`, `ir_opt_next_non_nop`, `ir_skip_nops_forward`, `ir_has_other_jump_to`, `ir_negate_condition`, `invert_condition`. +- Purity tables: `ir_opt_is_pure_helper_name`, `ir_opt_is_flag_cmp_helper_name`, `ir_opt_is_pure_fallthrough_instruction`, `tcc_ir_is_pure_aeabi`. +- Expression equality: `ir_opt_pure_expr_equal`, `ir_opt_pure_def_equal`, `ir_opt_nonvreg_expr_equal`.
+- Call-param helpers: `ir_opt_get_call_param_operand` (27 sites), `ir_opt_nop_call_params` (15 sites), `ir_opt_nop_call_param`, `ir_opt_change_call_argc`. + +#### 1.4 `ir/opt_alias.h` + `ir/opt_alias.c` (~600 lines) +- `ir_opt_store_btype_size_bytes`, `ir_opt_stack_slot_range_for_offset`, `stackoff_same_slot`, `operand_references_slot`, `is_stack_address_operand`, `find_deref_use_operand`. + +#### 1.5 `ir/opt_loop_utils.h` + `ir/opt_loop_utils.c` (~1,800 lines) +- IV analysis (`find_induction_vars_ex`, `find_derived_ivs`, `transform_derived_iv`, `iv_strength_reduction_core`). +- Loop bounds (`find_loop_exit_condition`, `compute_trip_count`, `collect_body_instructions`). +- Loop transforms (`try_eliminate_loop`, `try_unroll_loop`, `try_rotate_loop`). +- Structs `InductionVar`, `DerivedIV`. + +**At end of Phase 1:** `opt.c` shrinks from 28,973 to ~24,000 lines. No pass logic moves yet; only their shared helpers. `static` → `extern` for everything pulled out. Build is verified after each step. 
+ +--- + +### Phase 2 — Build the engine (3–4 h) + +#### 2.1 `ir/opt_engine.h` + `ir/opt_engine.c` + +Mirror the SSA engine's shape: + +```c +typedef struct IROptCtx { + TCCIRState *ir; + int n; /* cached ir->next_instruction_index */ + uint32_t generation; /* bumped on invalidation */ + + /* Lazy-built analyses — accessor builds on first use */ + IROptDU du; + uint32_t du_gen; + + int *pred_count; + uint32_t pred_gen; + + uint8_t *merge_bitmap; + uint32_t merge_gen; + + int changes; +} IROptCtx; + +typedef int (*ir_opt_gen_fn)(IROptCtx *ctx, int instr_idx); + +typedef struct IROptGen { + int op; /* trigger opcode; -1 = match any */ + ir_opt_gen_fn fn; + const char *name; + uint8_t needs_du; /* engine builds DU before dispatch if any gen requires */ + uint8_t same_block; /* engine wraps fn with same-BB check */ +} IROptGen; + +/* Lifecycle */ +void tcc_ir_opt_ctx_init(IROptCtx *ctx, TCCIRState *ir); +void tcc_ir_opt_ctx_free(IROptCtx *ctx); +void tcc_ir_opt_ctx_invalidate(IROptCtx *ctx); + +/* Lazy analysis accessors */ +const IROptDU *tcc_ir_opt_ctx_require_du(IROptCtx *ctx); +const int *tcc_ir_opt_ctx_require_pred(IROptCtx *ctx); +const uint8_t *tcc_ir_opt_ctx_require_merge(IROptCtx *ctx); + +/* Run a table of generators in a single forward pass */ +int tcc_ir_opt_run_gens(IROptCtx *ctx, const IROptGen *gens, int count); +``` + +Engine loop (mirrors `ssa_opt_run_gens` shape): +```c +int tcc_ir_opt_run_gens(IROptCtx *ctx, const IROptGen *gens, int count) +{ + TCCIRState *ir = ctx->ir; + int changes = 0; + + /* Ensure analyses are built once if any rule needs them */ + int any_du = 0; + for (int g = 0; g < count; g++) if (gens[g].needs_du) { any_du = 1; break; } + if (any_du) tcc_ir_opt_ctx_require_du(ctx); + + for (int i = 0; i < ir->next_instruction_index; i++) { + int op = ir->compact_instructions[i].op; + if (op == TCCIR_OP_NOP) continue; + for (int g = 0; g < count; g++) { + if (gens[g].op >= 0 && gens[g].op != op) continue; + int d = gens[g].fn(ctx, i); + if 
(d > 0) { changes += d; break; } /* first-match-wins */ + } + } + return changes; +} +``` + +**Same-block check:** When `gens[g].same_block` is set, the generator is wrapped by a helper that calls the user's `fn`, captures the matched instruction range, and calls `ir_xform_same_block` before allowing the transform. The cleanest place to put this check is inside the generator (it knows which range to test); a helper macro `IR_OPT_REQUIRE_SAME_BLOCK(ctx, from, to)` makes it one line. + +#### 2.2 Verify +Build only — no rules yet. Add `opt_engine.c`/`opt_du.c`/`opt_xform.c` to `Makefile` `IR_FILES`. Both engines coexist; pre-SSA passes still call the old way. + +--- + +### Phase 3 — Convert pass groups to generator tables + +Order is by **density of duplication** (highest payoff first), not by file location. + +#### 3.1 Fusion group → `ir/opt_gens_fusion.c` (4–6 h) + +Convert 7+ fusion passes into generators sharing one engine run. Current passes: + +| Pass | Trigger | Today's lines | After (match+transform) | +|---------------------------|----------------------|---------------|-------------------------| +| `fusion_pass` (mla+indexed) | `ADD`, `LOAD`, `STORE` | ~300 | ~120 | +| `rotate_fusion` | `ADD`/`OR` patterns | ~260 | ~100 | +| `deref_indexed_fusion` | ALU with deref | ~215 | ~100 | +| `disp_fusion` | `LOAD`/`STORE`/`ASSIGN` | ~260 | ~90 | +| `postinc_fusion` | `LOAD`/`STORE` | ~280 | ~90 | +| `lea_fold` | any deref source | ~420 | ~120 | +| `indexed_chain` | `LOAD_INDEXED`/`STORE_INDEXED` | ~150 | ~60 | +| `indexed_pair_reorder` | `LOAD_INDEXED` pairs | ~200 | ~70 | +| `assign_fuse` | `ASSIGN` chain | ~190 | ~70 | + +Hand-written exceptions: +- `add_deref_fold` (inserts new instructions, can't fit a same-index forward engine). +- `loop_postinc_fusion` (needs loop structure from `IRLoops`). +- `stackoff_addr_cse`, `call_chain_rename` (BB-scoped hash, see Phase 3.4). 
+ +**Pipeline integration:** +```c +/* Before — 8 separate forward scans, 8 DU builds */ +tcc_ir_opt_rotate_fusion(ir); +tcc_ir_opt_fusion_pass(ir, opt_mla, opt_indexed); +tcc_ir_opt_deref_indexed_fusion(ir); +tcc_ir_opt_disp_fusion(ir); +tcc_ir_opt_indexed_chain(ir); +tcc_ir_opt_indexed_pair_reorder(ir); +tcc_ir_opt_assign_fuse(ir); +tcc_ir_opt_lea_fold(ir); +tcc_ir_opt_postinc_fusion(ir); + +/* After — 1 scan, 1 DU build */ +IROptCtx ctx; +tcc_ir_opt_ctx_init(&ctx, ir); +tcc_ir_opt_run_gens(&ctx, fusion_gens, FUSION_GENS_COUNT); +tcc_ir_opt_ctx_free(&ctx); + +tcc_ir_opt_add_deref_fold(ir); /* inserts → hand-written */ +tcc_ir_opt_loop_postinc_fusion(ir); /* needs IRLoops → hand-written */ +``` + +Convert one generator at a time, run `make test -j16` after each. Use existing IR tests (`tests/ir_tests/`) that exercise each pattern to catch ordering regressions. + +#### 3.2 Branch-folding group → `ir/opt_gens_branch.c` (3–4 h) + +All these trigger on `JUMPIF` and inspect the backward def chain. Currently 5 separate forward scans: + +| Pass | Trigger | Today | After | +|---------------------------|-------------|-------|-------| +| `branch_folding` | `JUMPIF` | ~160 | ~55 | +| `setif_branch_fuse` | `JUMPIF` | ~130 | ~65 | +| `stack_addr_nonnull_fold` | `JUMPIF` | ~470 | keep hand-written *or* split simple cases (~120) into generator and leave deep def-chain tracing (~350) in a helper | +| `or_bool_diamond` | `JUMPIF` | ~230 | ~80 | +| `stack_bool_diamond` | CFG diamond | ~270 | keep hand-written (4-instruction CFG pattern doesn't fit single-trigger dispatch) | + +Hand-written exceptions: `nonneg_branch_fold`, `float_branch_fold` (need merge-bitmap value tracking that doesn't fit per-instruction dispatch). + +#### 3.3 Boolean simplification → `ir/opt_gens_bool.c` (1–2 h) + +`bool_idempotent` + `bool_simplify` + the idempotent half of `bool_pass` collapse into 2–3 generators triggered on `BOOL_AND`/`BOOL_OR`. 
The CSE half of `bool_pass` stays hand-written: it keeps its hash-table-based design but replaces its private table with the new generic `IROptHashTable` from Phase 4. + +#### 3.4 BB-scoped hash CSE → use `opt_hash` (3–4 h) + +`cse_global_load`, `globalsym_cse`, `cse_param_add`, `local_load_cse`, `local_alu_cse`, `stackoff_addr_cse`, `cse_bool` all maintain a hash table that resets at BB boundaries. They are too varied for a single engine but they all reinvent the same hash-table lifecycle. + +**Phase 4 builds a shared `IROptHashTable`** (see below) — these passes are then rewritten to use it. Body logic stays per-pass; only the hash-table alloc/lookup/insert/clear/free becomes shared. ~400 lines saved across the 7 passes. + +#### 3.5 Call-result dead group → `ir/opt_gens_call_result.c` (2 h) + +`dead_call_result_elim`, `dead_init_via_call`, `dead_sret_call_elim`, `fold_call_result_store` all trigger on `FUNCCALLVAL` / `RETURNVALUE` and inspect the result's use chain. Their collect-then-transform pattern fits the engine if a 2-phase variant is added (see Phase 5). + +--- + +### Phase 4 — Generic hash table (3–4 h) + +`ir/opt_hash.h` + `ir/opt_hash.c` (~200 lines) providing a bump-allocated CSE hash table.
Drop-in replacement for 4 hand-rolled tables in `opt.c`: + +| Pass | Local struct | Buckets | +|---------------------|--------------------|---------| +| `cse_arith` (in `local_alu_cse`) | `ArithCSEEntry` | 256 | +| `cse_bool` (in `bool_pass`) | `BoolCSEEntry` | 64 | +| `sl_forward` | `StoreEntry` | 128 | +| `globalsym_cse` | `GSymCSEEntry` | linear-16 | + +API mirrors what `ssa_opt_load_cse` uses internally: + +```c +typedef struct IROptHashEntry { + uint32_t hash; + int instruction_idx; + int32_t result_vr; + int extra[4]; /* pass-specific payload */ + struct IROptHashEntry *next; +} IROptHashEntry; + +typedef struct IROptHashTable { + IROptHashEntry **buckets; + int n_buckets; + IROptHashEntry *pool; /* bump-allocated */ + int pool_count; +} IROptHashTable; + +void ir_opt_hash_init(IROptHashTable *, int n_buckets, int max_entries); +void ir_opt_hash_clear(IROptHashTable *); /* O(n_buckets), not O(entries) */ +void ir_opt_hash_free(IROptHashTable *); +IROptHashEntry *ir_opt_hash_lookup(IROptHashTable *, uint32_t hash, + int (*eq)(const IROptHashEntry *, const void *), + const void *key); +IROptHashEntry *ir_opt_hash_insert(IROptHashTable *, uint32_t hash); +``` + +`sl_forward`'s store-entry table has alias semantics that don't fit; **don't** touch it. The other 3 are straight rewrites. + +--- + +### Phase 5 — Collect-then-transform engine variant (optional, 2–3 h) + +Several passes (`const_var_prop`, `dead_call_result_elim`, `redundant_var_assign`, `dead_var_store_elim`) follow the pattern: forward pass to collect metadata, finalize, forward pass to transform. 
A 2-phase engine collapses their boilerplate: + +```c +typedef struct IROptCollectGen { + const char *name; + int op; + int (*collect)(IROptCtx *, int idx); /* phase 1 */ + int (*transform)(IROptCtx *, int idx); /* phase 2 */ +} IROptCollectGen; + +int tcc_ir_opt_run_collect_gens(IROptCtx *, const IROptCollectGen *, int n); +``` + +This is **optional** and should only be done after Phase 3 if the collect-transform passes still show significant boilerplate. If they don't, keep them hand-written and skip this phase. + +--- + +### Phase 6 — Theme-based file split (3–5 h, optional, zero flash savings) + +After Phases 0–5 the pre-SSA layer is: +- `opt.c` core (~16,000 lines of hand-written passes that don't fit any engine variant) +- `opt_engine.c`, `opt_du.c`, `opt_xform.c`, `opt_utils.c`, `opt_alias.c`, `opt_loop_utils.c`, `opt_hash.c` +- `opt_gens_fusion.c`, `opt_gens_branch.c`, `opt_gens_bool.c`, `opt_gens_call_result.c` + +Splitting the remaining `opt.c` by theme (cleanup / constprop / memory / loop / promote / peephole) is a pure-readability change and produces **zero flash savings**. It is worth doing once everything else is stable, mostly to make merge conflicts less painful. Don't block any of the earlier phases on this. + +--- + +## Pipeline driver changes + +The optimization driver lives in `tccgen.c` (~lines 25227–26230). Most changes are local one-block replacements where 7 sequential pass calls become 1 engine call: + +- Fusion section (~25446–25478): 9 calls → 1 engine call + 2 hand-written holdouts. +- Branch section (~25277–25291 and ~25535–25589 inside iterative loop): 3–4 calls → 1 engine call. +- Boolean section (~25480–25484): 2 calls → 1 engine call + 1 hand-written CSE. + +Inside the iterative `do { changes += … } while (changes)` loop, each engine invocation creates and destroys its own `IROptCtx` — the analysis cache must not span iterations because `compact_nops` and `dce` between iterations renumber instructions. 
+ +--- + +## Risks + +- **Generator function-pointer dispatch overhead.** With ~10 fusion gens and 20K instructions, that's up to 200K indirect calls per engine run. Trigger-op filtering skips ~90% of gens per instruction. If profiling shows >5% overhead, switch to a `switch (op)` dispatch table generated at compile time. Mitigation already proven by `ssa_opt_run_gens` running in production with 14+ gens in `fold` alone. +- **Ordering changes when batching.** Today MLA fusion finishes the entire IR before disp fusion starts. After batching they run at the same instruction. First-match-wins + rule ordering (MLA before disp, indexed before plain disp, etc.) handles this, but every conversion needs a test verifying IR-dump equivalence on a representative input. +- **DU-table invalidation mid-pass.** When a generator changes `MUL→MLA` or `LOAD→LOAD_INDEXED`, the set of defined/used vregs around that index changes. NOP-only transforms preserve DU. Each generator must declare whether it changes opcodes; the engine refreshes DU between gens that need it. The SSA engine handles this via `tcc_ir_ssa_opt_rebuild` — borrow the same approach. +- **Pre-SSA passes that insert instructions.** `add_deref_fold` is the canonical example. Inserting shifts subsequent indices, invalidating the engine's loop counter. These stay hand-written and run **outside** the engine call. Document the rule: "generators must not change instruction count." 
+ +--- + +## Estimated effort + +| Phase | What | Time | Net lines removed | +|------:|---------------------------------------------------|----------|-------------------| +| 0 | Delete dead stubs | 15 min | ~30 | +| 1 | Libraries: opt_du / opt_xform / opt_utils / opt_alias / opt_loop_utils | 4–6 h | ~500 (dedup) | +| 2 | Engine: opt_engine.c | 3–4 h | -600 (added) | +| 3.1 | Fusion gens | 4–6 h | ~1,400 | +| 3.2 | Branch gens | 3–4 h | ~500 | +| 3.3 | Bool gens | 1–2 h | ~200 | +| 3.4 | BB hash CSE rewrites | 3–4 h | ~400 | +| 3.5 | Call-result gens | 2 h | ~300 | +| 4 | Generic IROptHashTable | 3–4 h | (counted in 3.4) | +| 5 | Collect-transform engine variant (optional) | 2–3 h | ~250 | +| 6 | Theme-based split of remaining opt.c (optional) | 3–5 h | 0 | +| | **Total (phases 0–4)** | **~20–28 h** | **~3,400 (~12%)** | + +Each phase produces a working build. Each can ship independently. If the project ships at any intermediate state, the result is strictly better than today. + +--- + +## Why this rewrite is different from the original plan + +| Original plan said… | This plan says… | +|---------------------------------------------------|--------------------------------------------------------------| +| opt.c is 22,712 lines, ~60 passes | opt.c is 28,973 lines, 81 passes (and growing) | +| Pre-SSA is a migration bridge — passes die as SSA matures | Pre-SSA is permanent infrastructure for post-destruction IR | +| Phase 4 (engine) is optional contingency | Phase 2 (engine) is the **primary** flash-saving mechanism | +| Phase 2 (theme split) first, then engine | Engine first; theme split last (or skip entirely) | +| Invent a fresh `IRPeepholeRule` API | **Mirror** the proven `IRSSAOptGen` / `ssa_opt_run_gens` API | +| Pass conversion is a 4–6 h side project | Pass conversion is **the whole point** — most of the work | \ No newline at end of file diff --git a/docs/plan_opt_split.md b/docs/plan_opt_split.md new file mode 100644 index 00000000..006a1968 --- /dev/null +++
b/docs/plan_opt_split.md @@ -0,0 +1,362 @@ +# Plan: Split `ir/opt.c` Into Themed Modules + +## Current State + +`ir/opt.c` is **17,861 lines** (down from 28,973 after Phase 6.1 extracted `opt_loop.c` and `opt_memory.c`). It still contains **67 functions** spanning 6+ distinct optimization themes. The already-extracted modules total ~10,750 lines across 14 files (sum of the table below) — so the remaining monolith is still the single largest source file. + +### Already extracted (for reference) + +| File | Lines | Contents | +|------|-------|----------| +| `opt_loop_utils.c` | 3,498 | IV analysis, loop bounds, loop transforms | +| `opt_memory.c` | 3,259 | sl_forward, entry_store_prop, store_redundant, deref_fwd | +| `opt_loop.c` | 1,052 | Strength reduction, unroll, rotation, decrement-to-zero | +| `opt_utils.c` | 978 | Constant evaluators, BB/CFG helpers, purity tables | +| `opt_gens_fusion.c` | 818 | Engine-based fusion generators | +| `opt_gens_call_result.c` | 301 | Dead call result generators | +| `opt_jump_thread.c` | 203 | Jump threading + fallthrough elimination | +| `opt_gens_branch.c` | 176 | Branch folding generators | +| `opt_alias.c` | 127 | Stack-slot aliasing helpers | +| `opt_engine.c` | 100 | IROptCtx, IROptGen, tcc_ir_opt_run_gens | +| `opt_du.c` | 98 | Def-use build/query | +| `opt_hash.c` | 63 | Generic hash table for CSE | +| `opt_gens_bool.c` | 57 | Boolean simplification generators | +| `opt_xform.c` | 24 | Transform primitives | + +--- + +## Proposed Split + +Split the remaining 17,861 lines into **8 new themed files** + a slim residual `opt.c` (~1,600 lines). + +--- + +### 1.
`ir/opt_dce.c` — Dead Code & Cleanup (~2,200 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `tcc_ir_opt_dce` | 122 | 97–218 | +| `tcc_ir_opt_compact_nops` | 203 | 219–421 | +| `tcc_ir_opt_dead_var_store_elim` | 131 | 2985–3115 | +| `tcc_ir_opt_dead_addrvar_elim` | 330 | 3348–3677 | +| `tcc_ir_opt_redundant_var_assign` | 157 | 3678–3834 | +| `tcc_ir_opt_redundant_init_elim` | 156 | 14531–14686 | +| `tcc_ir_opt_dead_loop_elim` | 228 | 15500–15727 | +| `tcc_ir_opt_dse` | 1,269 | 1716–2984 | + +**Rationale:** All these passes remove dead/redundant IR — NOPs, unreachable code, dead stores, dead variables. `dse` is the largest single pass (1,269 lines) and is purely elimination logic. Grouping gives a single file for "what can I safely delete." + +**Internal dependencies:** +- `dse` uses `ir_opt_build_def_count` (shared static helper → move or expose via `opt_du.h`) +- All use `ir_xform_nop` (already in `opt_xform.h`) +- `dead_addrvar_elim` and `dse` use alias helpers (already in `opt_alias.h`) + +--- + +### 2. `ir/opt_constprop.c` — Constant & Value Propagation (~4,100 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `tcc_ir_opt_const_var_prop` | 253 | 422–674 | +| `tcc_ir_opt_global_init_prop` | 137 | 675–811 | +| `tcc_ir_opt_complex_const_param_fold` | 177 | 812–988 | +| `tcc_ir_opt_const_prop` | 1,235 | 3835–5069 | +| `tcc_ir_opt_value_tracking` | 1,647 | 5070–6716 | +| `tcc_ir_opt_const_prop_tmp` | 368 | 7928–8295 | +| `tcc_ir_opt_add_reassoc` | 125 | 8330–8454 | +| `tcc_ir_opt_cmp_expr_fold` | 166 | 8455–8620 | +| `ir_opt_build_def_count` (static) | 34 | 8296–8329 | + +**Rationale:** These are the "what values do I know at this point" passes. `const_prop` (1,235 lines) and `value_tracking` (1,647 lines) are the two biggest passes remaining in opt.c and they share constant-evaluation infrastructure. Together they form the core analysis engine. 
+ +**Internal dependencies:** +- `const_prop` and `value_tracking` share evaluation helpers from `opt_utils.h` +- `ir_opt_build_def_count` is used by `add_reassoc` and `copy_prop` → make non-static, expose from header +- `value_tracking` uses VRP slot helpers (`vrp_get_slot`, `vrp_fold_cmp`) — move with it + +--- + +### 3. `ir/opt_copyprop.c` — Copy Propagation & CSE (~1,500 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `tcc_ir_opt_copy_prop` | 449 | 8621–9069 | +| `tcc_ir_opt_cse_global_load` | 214 | 9104–9317 | +| `tcc_ir_opt_globalsym_cse` | 133 | 9362–9494 | +| `gsym_cse_insert_before` (static) | 44 | 9318–9361 | +| `tcc_ir_opt_cse_param_add` | 194 | 9495–9688 | +| `tcc_ir_opt_local_load_cse` | 189 | 13737–13925 | +| `tcc_ir_opt_local_alu_cse` | 255 | 13926–14180 | +| `bool_cse_hash` / `bool_cse_eq` (statics) | 34 | 9070–9103 | + +**Rationale:** All these passes identify redundant computations (copy chains, repeated loads, repeated ALU ops) and eliminate them via forwarding or CSE. They share the same flat-array or hash-table BB-scoped pattern. + +**Internal dependencies:** +- Uses `IROptHashTable` from `opt_hash.h` +- `copy_prop` uses `ir_opt_build_def_count` (from opt_constprop.c or made public) +- `gsym_cse_insert_before` inserts instructions — unique to this group + +--- + +### 4. 
`ir/opt_branch.c` — Branch & Boolean Optimization (~2,200 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `tcc_ir_opt_float_branch_fold` | 252 | 7178–7429 | +| `ir_opt_match_zero_test` (static) | 35 | 7143–7177 | +| `tcc_ir_opt_vrp` | 330 | 7430–7759 | +| `vrp_get_slot` / `vrp_fold_cmp` (statics) | 29 | 6717–6745 | +| `tcc_ir_opt_nonneg_branch_fold` | 365 | 9720–10084 | +| `nonneg_func_names` / `flag_cmp_funcs` (tables) | 31 | 9689–9719 | +| `tcc_ir_opt_branch_folding` | 30 | 12447–12476 | +| `tcc_ir_opt_stack_addr_nonnull_fold` | 423 | 12477–12899 | +| `tcc_ir_opt_setif_branch_fuse` | 39 | 12900–12938 | +| `tcc_ir_opt_stack_bool_diamond` | 268 | 12939–13206 | +| `tcc_ir_opt_or_bool_diamond` | 232 | 13207–13438 | +| `tcc_ir_opt_bool_cse` | 75 | 12324–12398 | + +**Rationale:** All passes that reason about conditional branches, VRP (value-range propagation), boolean CSE, and control-flow diamonds. They share `JUMPIF`-triggered pattern matching and backward def-chain tracing. `vrp` and `nonneg_branch_fold` both use the VRP slot/fold helpers. + +**Internal dependencies:** +- `vrp` range tables are self-contained +- `nonneg_branch_fold` uses `change_callee_sym` (shared with float_narrowing → move to opt_utils or keep in residual) +- Branch passes use `ir_opt_match_zero_test` → move together + +--- + +### 5. 
`ir/opt_fusion.c` — Fusion & Addressing Mode (hand-written) (~2,050 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `tcc_ir_opt_add_deref_fold` | 232 | 3116–3347 | +| `tcc_ir_opt_postinc_fusion` | 278 | 10673–10950 | +| `tcc_ir_opt_loop_postinc_fusion` | 476 | 10951–11426 | +| `tcc_ir_barrel_shift_fusion` | 146 | 11427–11572 | +| `tcc_ir_opt_call_chain_rename` | 155 | 11573–11727 | +| `tcc_ir_opt_stackoff_addr_cse` | 176 | 11728–11903 | +| `tcc_ir_opt_lea_fold` | 420 | 11904–12323 | +| `tcc_ir_opt_assign_fuse` | 184 | 17486–17669 | + +**Rationale:** Hand-written fusion passes that couldn't be converted to engine generators (they insert instructions, need loop structure, or use BB-scoped hash tables). These are the ARM addressing-mode optimization passes — `LOAD_INDEXED`, `LOAD_POSTINC`, barrel-shift folding, LEA elimination, displacement fusion. Distinct from `opt_gens_fusion.c` which holds the engine-compatible generators. + +**Internal dependencies:** +- `loop_postinc_fusion` uses `IRLoops` from `opt_loop_utils.h` +- `lea_fold` uses def-use from `opt_du.h` +- `call_chain_rename` uses `change_callee_sym` helpers + +--- + +### 6. `ir/opt_promote.c` — Variable-to-Temp Promotion & Forwarding (~1,600 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `tcc_ir_opt_var_tmp_fwd` | 298 | 13439–13736 | +| `tcc_ir_opt_var_to_tmp` | 350 | 14181–14530 | +| `tcc_ir_opt_select` | 410 | 14687–15096 | +| `tcc_ir_opt_postinc_assign_fold` | 145 | 15303–15447 | +| `tcc_ir_opt_returnvalue_merge` | 52 | 15448–15499 | +| `tcc_ir_opt_backedge_phi_hoist` | 205 | 15920–16124 | +| `tcc_ir_opt_redundant_loop_check` | 168 | 7760–7927 | + +**Rationale:** These passes promote stack variables to temporaries, forward values through variable stores/loads, and select-ify simple if/else diamonds. 
They bridge the gap between flat variable-based IR (post-SSA destruction) and the register allocator which needs temporaries. `select` is the largest (410 lines) — it converts store-to-var-in-both-branches into a conditional move. + +--- + +### 7. `ir/opt_constfold.c` — Constant String/Call/Addrof Folding (~1,800 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `ir_opt_eval_const_string_operand` (static) | 70 | 6746–6815 | +| `ir_opt_fold_strcmp_result` (static) | 13 | 6816–6828 | +| `ir_opt_fold_strncmp_result` (static) | 16 | 6829–6844 | +| `ir_opt_fold_memcmp_result` (static) | 15 | 6845–6859 | +| `ir_opt_fold_memchr_offset` (static) | 20 | 6860–6879 | +| `tcc_ir_opt_const_string_calls` | 263 | 6880–7142 | +| `tcc_ir_opt_const_call_replace` | 90 | 15830–15919 | +| `tcc_ir_detect_const_result` | 73 | 15728–15800 | +| `tcc_ir_cache_const_result` | 15 | 15801–15815 | +| `tcc_ir_lookup_const_result` | 14 | 15816–15829 | +| `tcc_ir_opt_param_addrof_const_fold` | 435 | 16125–16559 | +| `tcc_ir_opt_local_addrof_const_fold` | 471 | 16560–17030 | +| `tcc_ir_opt_float_narrowing` | 307 | 10151–10457 | +| `float_narrow_table` / `change_callee_sym*` | 66 | 10085–10150 | + +**Rationale:** These passes evaluate calls and expressions at compile time when arguments are known constants — string library folding (`strcmp`, `strlen`, `memcmp`), memoized pure-function results, address-of-parameter constant propagation, and float type narrowing (e.g., `double→float` when precision allows). All share the "trace constant operands backward, fold result" pattern. + +**Internal dependencies:** +- `change_callee_sym` / `change_callee_sym_keep_type` → used by both `float_narrowing` and `nonneg_branch_fold`. Move to this file (it's defined here at line 10106) or to `opt_utils.c` if needed by `opt_branch.c` too. + +--- + +### 8. 
`ir/opt_pack64.c` — 64-bit Register Pair Optimization (~650 lines) + +Functions to move: + +| Function | Lines | Range | +|----------|-------|-------| +| `tcc_ir_opt_pack64` | 179 | 17031–17209 | +| `p64taut_trace_back` (static) | 51 | 17210–17260 | +| `tcc_ir_opt_pack64_tautology` | 225 | 17261–17485 | +| `tcc_ir_opt_cmp_narrow_64` | 192 | 17670–17861 | + +**Rationale:** ARM-specific 64-bit register-pair tracking. These passes combine/split `PACK64` pseudo-ops and eliminate redundant 64→32→64 conversions. Self-contained logic with no significant shared state. + +--- + +### 9. Residual `ir/opt.c` (~1,600 lines) + +What stays: + +| Function | Lines | Why stays | +|----------|-------|-----------| +| FP cache wrappers | 40 | Thin delegation layer, trivial | +| `tcc_ir_analyze_pure_via_sret` | 250 | Cross-cutting interprocedural analysis | +| FWS (func write summary) block | 400 | `fws_*` + `tcc_ir_compute_func_write_summary` — interprocedural, used by `dead_init_via_call` | +| `tcc_ir_opt_dead_init_via_call` | 116 | Depends on FWS, tight coupling | +| `tcc_ir_opt_stack_addr_cse` | 215 | Doesn't fit cleanly elsewhere (BB hash + stack aliasing hybrid) | +| `tcc_ir_opt_block_copy_init` | 206 | Memory/struct init hybrid | +| `tcc_ir_find_defining_instruction` | 18 | Small utility, widely used | +| `tcc_ir_vreg_has_single_use` | 30 | Small utility, widely used | +| Forward decls, includes, macros | ~50 | Boilerplate | + +The residual `opt.c` becomes a "miscellaneous + interprocedural" file. As these grow, they can be split further (e.g., `opt_interproc.c` for FWS + sret analysis). 
+ +--- + +## Dependency Graph + +``` +opt.c (residual, 1.6K) + ├── opt_dce.c (2.2K) → opt_xform, opt_alias, opt_utils + ├── opt_constprop.c (4.1K) → opt_utils, opt_du + ├── opt_copyprop.c (1.5K) → opt_hash, opt_du, opt_utils + ├── opt_branch.c (2.2K) → opt_utils, opt_du + ├── opt_fusion.c (2.0K) → opt_du, opt_loop_utils, opt_alias + ├── opt_promote.c (1.6K) → opt_du, opt_utils + ├── opt_constfold.c (1.8K) → opt_utils + └── opt_pack64.c (0.6K) → (self-contained) +``` + +No circular dependencies. Each new file includes `ir.h` (which pulls in `tccir.h` + core types) plus the specific `opt_*.h` headers it needs. + +--- + +## Shared Helpers To Expose + +Before splitting, these currently-`static` helpers need to become non-static (add to appropriate header): + +| Helper | Current location | Move to | +|--------|-----------------|---------| +| `ir_opt_build_def_count` | opt.c:8296 | `opt_du.h` / `opt_du.c` | +| `change_callee_sym` | opt.c:10106 | `opt_utils.h` / `opt_utils.c` | +| `change_callee_sym_keep_type` | opt.c:10133 | `opt_utils.h` / `opt_utils.c` | +| `vrp_get_slot` / `vrp_fold_cmp` | opt.c:6717 | `opt_branch.c` (file-local) | +| `ir_opt_match_zero_test` | opt.c:7143 | `opt_branch.c` (file-local) | +| `ir_opt_eval_const_string_operand` | opt.c:6746 | `opt_constfold.c` (file-local) | +| `ir_opt_fold_str*` / `ir_opt_fold_mem*` | opt.c:6816–6879 | `opt_constfold.c` (file-local) | +| `p64taut_trace_back` | opt.c:17210 | `opt_pack64.c` (file-local) | +| `gsym_cse_insert_before` | opt.c:9318 | `opt_copyprop.c` (file-local) | +| `bool_cse_hash` / `bool_cse_eq` | opt.c:9070 | `opt_copyprop.c` (file-local) | + +--- + +## Execution Plan + +### Step 1: Expose shared helpers (30 min) +- [ ] Move `ir_opt_build_def_count` → `opt_du.c` / `opt_du.h` +- [ ] Move `change_callee_sym` + `change_callee_sym_keep_type` → `opt_utils.c` / `opt_utils.h` +- [ ] Verify: `make cross && make test -j16` + +### Step 2: Extract `opt_pack64.c` (30 min) +- [ ] Create `ir/opt_pack64.c` with 
`#define USING_GLOBALS` + `#include "ir.h"` +- [ ] Move `tcc_ir_opt_pack64`, `p64taut_trace_back`, `tcc_ir_opt_pack64_tautology`, `tcc_ir_opt_cmp_narrow_64` +- [ ] Add to `Makefile` `IR_FILES` +- [ ] Verify: `make cross && make test -j16` + +### Step 3: Extract `opt_dce.c` (45 min) +- [ ] Create `ir/opt_dce.c` +- [ ] Move 8 functions: `dce`, `compact_nops`, `dead_var_store_elim`, `dead_addrvar_elim`, `redundant_var_assign`, `redundant_init_elim`, `dead_loop_elim`, `dse` +- [ ] Create `ir/opt_dce.h` with public declarations +- [ ] Verify: `make cross && make test -j16` + +### Step 4: Extract `opt_constfold.c` (45 min) +- [ ] Create `ir/opt_constfold.c` +- [ ] Move 14 functions: string fold helpers, `const_string_calls`, `const_call_replace`, `detect_const_result`, `cache_const_result`, `lookup_const_result`, `param_addrof_const_fold`, `local_addrof_const_fold`, `float_narrowing`, `float_narrow_table` +- [ ] Verify: `make cross && make test -j16` + +### Step 5: Extract `opt_branch.c` (45 min) +- [ ] Create `ir/opt_branch.c` +- [ ] Move 12 functions: `float_branch_fold`, `match_zero_test`, `vrp`, VRP statics, `nonneg_branch_fold`, name tables, `branch_folding`, `stack_addr_nonnull_fold`, `setif_branch_fuse`, `stack_bool_diamond`, `or_bool_diamond`, `bool_cse` +- [ ] Verify: `make cross && make test -j16` + +### Step 6: Extract `opt_copyprop.c` (45 min) +- [ ] Create `ir/opt_copyprop.c` +- [ ] Move 8 functions: `copy_prop`, `cse_global_load`, `globalsym_cse`, `gsym_cse_insert_before`, `cse_param_add`, `local_load_cse`, `local_alu_cse`, `bool_cse_hash`/`bool_cse_eq` +- [ ] Verify: `make cross && make test -j16` + +### Step 7: Extract `opt_fusion.c` (45 min) +- [ ] Create `ir/opt_fusion.c` +- [ ] Move 8 functions: `add_deref_fold`, `postinc_fusion`, `loop_postinc_fusion`, `barrel_shift_fusion`, `call_chain_rename`, `stackoff_addr_cse`, `lea_fold`, `assign_fuse` +- [ ] Verify: `make cross && make test -j16` + +### Step 8: Extract `opt_promote.c` (30 min) +- [ ] Create 
`ir/opt_promote.c` +- [ ] Move 7 functions: `var_tmp_fwd`, `var_to_tmp`, `select`, `postinc_assign_fold`, `returnvalue_merge`, `backedge_phi_hoist`, `redundant_loop_check` +- [ ] Verify: `make cross && make test -j16` + +### Step 9: Extract `opt_constprop.c` (45 min) +- [ ] Create `ir/opt_constprop.c` +- [ ] Move 8 functions: `const_var_prop`, `global_init_prop`, `complex_const_param_fold`, `const_prop`, `value_tracking`, `const_prop_tmp`, `add_reassoc`, `cmp_expr_fold` (`ir_opt_build_def_count` was already moved to `opt_du.c` in Step 1) +- [ ] Verify: `make cross && make test -j16` + +### Step 10: Final cleanup (30 min) +- [ ] Verify residual `opt.c` is ~1,600 lines +- [ ] Update `opt.h` — ensure all public function declarations reference correct headers +- [ ] Audit includes in each new file — remove unnecessary ones +- [ ] Final: `make cross && make test -j16 && make test-asm -j16` + +--- + +## Result Summary + +| File | Lines | Theme | +|------|-------|-------| +| `opt.c` (residual) | ~1,600 | Interprocedural (FWS, sret), misc | +| `opt_constprop.c` | ~4,100 | Constant/value propagation | +| `opt_dce.c` | ~2,200 | Dead code/store elimination | +| `opt_branch.c` | ~2,200 | Branch/VRP/boolean | +| `opt_fusion.c` | ~2,050 | Hand-written addressing-mode fusion | +| `opt_constfold.c` | ~1,800 | Compile-time call/string/addrof folding | +| `opt_promote.c` | ~1,600 | Variable→temp promotion | +| `opt_copyprop.c` | ~1,500 | Copy propagation & CSE | +| `opt_pack64.c` | ~650 | 64-bit register pair | + +**Total estimated effort: ~6 hours** (mechanical moves, no logic changes). + +**No flash savings** — this is purely a readability/maintainability refactor. The engine work (Phases 2–5 in the parent plan) is what saves flash. + +--- + +## Risks & Mitigations + +1. **Compilation unit boundaries change optimizer behavior.** Static functions that were previously inlinable across passes become extern calls. Mitigation: critical hot helpers stay `static inline` in headers (e.g., `ir_xform_nop` already is). + +2.
**Include order sensitivity.** `opt.c` currently relies on `#define USING_GLOBALS` at the top. Each new file needs this + `#include "ir.h"`. Verify with `-Werror` that no implicit declarations creep in. + +3. **`change_callee_sym` used by 2 target files.** Moving it to `opt_utils.c` means both `opt_branch.c` and `opt_constfold.c` can call it. Alternative: duplicate in each file (worse) or keep in residual `opt.c` (limits extraction). + +4. **Build time.** More `.o` files = more linker inputs but better incremental build (touching one pass doesn't recompile 17K lines). Net positive for development velocity. diff --git a/docs/plan_ssa.md b/docs/plan_ssa.md new file mode 100644 index 00000000..292b9f08 --- /dev/null +++ b/docs/plan_ssa.md @@ -0,0 +1,315 @@ +# SSA Conversion Plan + +## Goal + +Insert a mandatory SSA (Static Single Assignment) construction pass between IR generation and optimization. The current `ir/opt.c` will be rewritten against SSA form. This document covers only the SSA infrastructure — no new optimizations yet. + +## Current IR Summary + +- Flat array of `IRQuadCompact` instructions +- Three vreg namespaces: VAR (locals), TEMP (compiler-generated), PARAM (function args) +- VARs can be assigned multiple times (not SSA) +- TEMPs are mostly single-def but not enforced +- Basic block boundaries are implicit: an instruction that is a jump target (`is_jump_target` flag) or that follows a JUMP/JUMPIF starts a new block +- No explicit CFG data structure — passes scan linearly and track jump targets +- Operands stored in a pool indexed by `operand_base` + +## Design + +### Phase 1: CFG Construction + +Build an explicit control flow graph from the flat instruction stream.
+ +**Data structures:** + +```c +typedef struct IRBasicBlock { + int start_idx; /* first instruction index (inclusive) */ + int end_idx; /* last instruction index (inclusive) */ + int id; /* block index */ + + int *preds; /* predecessor block IDs */ + int nb_preds; + int *succs; /* successor block IDs */ + int nb_succs; + + int idom; /* immediate dominator block ID */ + int *dom_frontier; /* dominance frontier set */ + int nb_dom_frontier; + int *dom_children; /* children in dominator tree */ + int nb_dom_children; +} IRBasicBlock; +``` + +**Algorithm:** +1. Scan instruction array; every `is_jump_target` or instruction following a JUMP/JUMPIF/RETURNVALUE/RETURNVOID starts a new block +2. Build successor edges: JUMP → target block, JUMPIF → target + fallthrough, RETURN → (none), IJUMP → all possible targets +3. Build predecessor edges (reverse of successors) + +**File:** `ir/cfg.c` + +### Phase 2: Dominator Tree + +Compute immediate dominators using the Cooper-Harvey-Kennedy algorithm (simple iterative, efficient for reducible CFGs which TCC always produces). + +**Algorithm:** "A Simple, Fast Dominance Algorithm" (Keith D. Cooper, Timothy J. Harvey, Ken Kennedy, 2001) + +1. Initialize idom[entry] = entry, all others undefined +2. Iterate in reverse postorder until fixed point: + - For each block b (except entry), idom[b] = intersect(idom of all preds) +3. Compute dominance frontier from idom tree + +**File:** `ir/cfg.c` (same file, closely coupled with CFG) + +### Phase 3: SSA Construction + +Convert VARs and TEMPs into SSA form using the standard algorithm: + +1. **Phi placement** (iterated dominance frontier): + - For each variable v, find all blocks that define v + - Place phi nodes at the dominance frontier of those blocks + - Iterate until no new phis are added + +2. 
**Renaming** (dominator tree walk): + - Walk dominator tree in preorder + - Maintain a rename stack per variable + - At each use: replace vreg with current SSA name from stack + - At each def: push new SSA name onto stack + - At each phi in successor: fill the phi operand for this edge + +**Phi node representation:** + +```c +typedef struct IRPhiNode { + int32_t dest_vreg; /* SSA vreg being defined */ + int nb_operands; + struct { + int32_t vreg; /* SSA vreg from this predecessor */ + int pred_block_id; /* which predecessor edge */ + } *operands; +} IRPhiNode; +``` + +Phi nodes are stored per-block (array at the top of each `IRBasicBlock`), not as regular instructions. This avoids disturbing the compact instruction array. + +**What gets SSA-renamed:** +- VAR vregs (locals) — these are the primary multi-def case +- TEMP vregs — already mostly single-def, but SSA enforces it +- PARAM vregs — treated as a single def at function entry + +**What does NOT get SSA-renamed:** +- StackLoc stores/loads (memory operations through pointers) +- Global symbol references +- Immediate constants + +**File:** `ir/ssa.c` + +### Phase 4: SSA Destruction (before regalloc) + +Convert out of SSA form for the register allocator (`tccls.c`) which expects the current flat IR format. + +**Algorithm:** naive phi elimination (sufficient for now, can optimize later with copy coalescing): + +1. For each phi node `v_i = phi(v_a, v_b, ...)`: + - Insert `ASSIGN v_i ← v_a` at end of predecessor block for edge a + - Insert `ASSIGN v_i ← v_b` at end of predecessor block for edge b +2. Remove all phi nodes +3. Flatten CFG back to linear instruction array + +Lost-copy and swap problems are rare in practice with linear scan; can add parallel-copy resolution later if needed. 
+ +**File:** `ir/ssa.c` (destruction is the inverse of construction) + +## Integration Points + +### Pipeline position + +Current pipeline at -O1+ (SSA regalloc is default): +``` +tccgen.c (IR emission) + → ir/opt.c: pre-SSA optimizations (iterative loop) + → ir/regalloc.c: SSA-based register allocation + internally: build CFG → construct SSA → rename + → ir/opt/: SSA optimization engine (cprop → dce → target generators) + → build intervals → linear scan → phi resolution + → ir/codegen.c + arm-thumb-gen.c: code generation +``` + +Fallback pipeline at -O0 (or `-fno-ssa-regalloc`): +``` +tccgen.c (IR emission) + → ir/cfg.c + ir/ssa.c: construct SSA → rename + → ir/opt/: SSA optimization engine + → ir/ssa.c: destroy SSA + → ir/opt.c: pre-SSA optimizations + → tccls.c: legacy liveness + linear scan + → ir/codegen.c + arm-thumb-gen.c: code generation +``` + +Final pipeline (target state once step 7 is done — SSA is the only path, legacy removed): +``` +tccgen.c (IR emission) + → ir/opt.c: pre-SSA optimizations (iterative loop) + → ir/regalloc.c: SSA-based register allocation + internally: build CFG → construct SSA → rename + → ir/opt/: SSA optimization engine (SCCP, GVN, DCE, target generators) + → build intervals → linear scan → phi resolution + → ir/codegen.c + arm-thumb-gen.c: code generation +``` + +### Interface to existing code + +- `tccgen.c`: orchestrates SSA pipeline (build CFG → construct → rename → optimize → destroy) +- `ir/opt/`: SSA optimization engine — target-independent passes + registered target generators +- `arch/arm/ssa_opt_arm.c`: ARM target-specific generators, registered via `tcc_ir_ssa_opt_register_target()` +- `ir/opt.c`: pre-SSA optimization passes — run after SSA destruction on flat IR +- `tccls.c`: unchanged (receives flat IR after SSA destruction); replaced by `ir/regalloc.c` in step 5 +- `ir/codegen.c`: unchanged — operates post-regalloc + +### New API surface + +```c +/* ir/cfg.c */ +typedef struct IRCFG { ... 
} IRCFG; +IRCFG *tcc_ir_cfg_build(TCCIRState *ir); +void tcc_ir_cfg_free(IRCFG *cfg); + +/* ir/ssa.c */ +void tcc_ir_ssa_construct(TCCIRState *ir, IRCFG *cfg); +void tcc_ir_ssa_destroy(TCCIRState *ir, IRCFG *cfg); +``` + +### vreg numbering + +SSA creates new vregs (each def gets a unique name). Options: + +**Option A: Extend existing vreg encoding.** +Use TCCIR_VREG_TYPE_TEMP with new positions beyond the original max. Phi dests and renamed defs get fresh positions. Simple, no encoding changes. + +**Option B: New TCCIR_VREG_TYPE_SSA.** +Add a 4th vreg type. Cleaner separation, easier to assert "is this SSA?" but uses one of the few remaining type bits. + +Recommendation: **Option A** — reuse TEMP namespace. SSA vregs are just temps with the invariant that each position has exactly one def. No encoding changes needed. + +## Implementation Order + +### Done + +1. **`ir/cfg.c`** — CFG + dominator tree + dominance frontier ✓ + - CFG build, RPO, CHK dominators, dominance frontier all working + - Infinite-loop guard + bitset dedup optimization applied + - All tests pass with SSA phi placement enabled at -O1+ + +2. **`ir/ssa.c` phi placement** ✓ + - Only VARs with multi-block defs (skips TEMPs/PARAMs) + - Single-scan, bulk allocation, early-exit for trivial functions + - Wired into pipeline at -O1+ (`-fssa` / `-fno-ssa`) + +3. **SSA renaming** ✓ + - `tcc_ir_ssa_rename()` implemented and produces correct SSA form + - Enabled in pipeline with SSA construct → rename → optimize → destroy flow + - SSA destruction inserts phi-resolution copies at predecessor block ends + +4. 
**SSA optimization engine** ✓ (initial passes implemented) + - Modular engine in `ir/opt/` with generator-based dispatch (like `thop_*` instruction builders) + - Target-independent passes in `ir/opt/`, target-specific generators in `arch/arm/` + - Backend registers generators via `tcc_ir_ssa_opt_register_target()` — generic code knows nothing about the target + - **Infrastructure (`ir/opt/ssa_opt.h` + `ir/opt/ssa_opt.c`):** + - `IRSSAOptCtx` — shared context with use-def chains per TEMP vreg + - `IRSSAOptGen` — per-opcode generator descriptor (opcode → rewrite function) + - `IRSSAOptPass` — pass descriptor (custom function or generator table) + - Use-def chain builder: scans instructions + phi nodes in one pass + - Helpers: `ssa_opt_nop_instr()`, `ssa_opt_replace_all_uses()`, `ssa_opt_run_gens()` + - **DCE (`ir/opt/ssa_opt_dce.c`):** worklist-based, use-count == 0 → NOP defining instruction → cascade + - **Copy propagation (`ir/opt/ssa_opt_cprop.c`):** generators `ssa_gen_cprop_assign` (vreg→vreg) and `ssa_gen_cprop_imm` (vreg→immediate) + - **ARM generators (`arch/arm/ssa_opt_arm.c`):** `ssa_gen_arm_fuse_mul_add_to_mla`, `ssa_gen_arm_fuse_shl_add_to_load_indexed`, `ssa_gen_arm_fuse_shl_add_to_store_indexed`, `ssa_gen_arm_reduce_mul_to_shift` + +5. **SSA-based register allocator** ✓ + - `ir/regalloc.c` (1633 lines) — arch-independent SSA-aware linear scan + - `arch/arm/arm_regalloc.c` — ARM register tables (AAPCS, VFP) + - Consumes SSA-renamed IR + phi nodes directly (no SSA destruction step) + - Algorithm: linear scan on SSA with precoloring, call-crossing, 64-bit pairs + - Phi resolution: topological sort, cycle breaking, ASSIGN insertion + - Enabled at -O1+ via `-fssa-regalloc` (default on) + - SSA optimization engine now wired in: runs between SSA rename and interval building + +### Next + +6. 
**Port remaining opts to SSA** + - Constant propagation → sparse conditional constant propagation (SCCP) + - CSE → dominator-tree-based value numbering (GVN) + - Dead store elimination → SSA + alias analysis + - Dead pure call elimination → use-count on call result vreg + +7. **SSA default + legacy cleanup** + - Make SSA the mandatory path — remove `-fssa` / `-fno-ssa` toggle, SSA always runs + - Remove SSA destruction (`tcc_ir_ssa_destroy`) — regalloc consumes SSA directly + - Delete legacy allocator: `tccls.c`, `ir/live.c`, associated headers + - Delete pre-SSA passes replaced by SSA equivalents from `ir/opt.c`: + - `tcc_ir_opt_dce` (replaced by `ssa_opt_dce`) + - `tcc_ir_opt_copy_prop` (replaced by `ssa_opt_cprop`) + - `tcc_ir_opt_mla_fusion`, `tcc_ir_opt_indexed_memory_fusion` (replaced by ARM generators) + - `tcc_ir_opt_const_prop`, `tcc_ir_opt_const_prop_tmp`, `tcc_ir_opt_value_tracking` (replaced by SCCP) + - `tcc_ir_opt_cse_arith`, `tcc_ir_opt_cse_global_load` (replaced by GVN) + - Remove `IROptDU` infrastructure in `ir/opt.c` (superseded by `IRSSAVregInfo` use-def chains) + - Clean up `tccgen.c` pipeline: single path through SSA construct → optimize → regalloc → codegen + - Remove `opt_ssa` / `opt_ssa_regalloc` flags from `TCCState` + - Update Makefile: remove deleted files from `IR_FILES` / `CORE_FILES` + +## Complexity Estimates + +| Component | Lines (est.) 
| Algorithm complexity | Status | +|-----------|-------------|---------------------|--------| +| CFG build | ~150 | O(n) — single scan | ✓ | +| Dominator tree (CHK) | ~120 | O(n * d) — fast for structured code | ✓ | +| Dominance frontier | ~80 | O(n_blocks^2) worst case, O(n) typical | ✓ | +| Phi placement | ~100 | O(vars * blocks) | ✓ | +| SSA renaming | ~150 | O(instructions) | ✓ | +| SSA destruction | ~120 | O(phi_nodes) — interim until SSA regalloc | ✓ | +| SSA opt engine | ~400 | O(n * passes) — iterative convergence | ✓ | +| SSA opt DCE | ~80 | O(n) — worklist-based | ✓ | +| SSA opt copy prop | ~120 | O(n) — generator-based | ✓ | +| ARM generators | ~400 | O(n) — per-instruction pattern match | ✓ | +| SSA linear scan regalloc | ~400 | O(n) — single pass over live intervals | ✓ | +| SCCP | ~300 | O(n) — lattice-based worklist | | +| GVN | ~400 | O(n) — dominator-tree value numbering | | +| Legacy cleanup | negative | deletion of tccls.c, live.c, redundant opt.c passes | | +| **Total** | **~2820** | | | + +## Risks and Mitigations + +| Risk | Mitigation | +|------|-----------| +| IJUMP (computed goto) makes CFG imprecise | Already handled: functions with IJUMP skip advanced opts. For SSA, treat IJUMP as jumping to all known label targets (same as today). | +| Address-taken locals can't be SSA-renamed | Don't rename them. If a VAR has its address taken (LEA of that VAR), keep it as a memory operation. Only promote non-address-taken scalars to SSA vregs. | +| Critical edges (pred has multiple succs, succ has multiple preds) | Insert empty split blocks during phi elimination. Simple, adds at most O(edges) blocks. | +| Compile-time regression | All algorithms are near-linear. CHK dominators is O(n^2) worst case on irreducible CFGs, but TCC always generates reducible CFGs (no `goto` into loops from outside). | + +## Current Status (2026-05-04) + +All IR tests (`make test -j16`) and GCC torture tests pass. 
+ +**What is live in the pipeline at -O1+:** +- CFG construction + dominator tree + dominance frontier (`ir/cfg.c`) +- SSA phi placement + renaming for multi-block VAR defs (`ir/ssa.c`) +- SSA optimization engine (`ir/opt/`): copy propagation, DCE, ARM target generators +- SSA destruction with phi-resolution copies (`ir/ssa.c`) +- Pre-SSA optimizations including `opt_cse` / `cse_arith` (`ir/opt.c`) +- Existing liveness + linear scan register allocator (`tccls.c` + `ir/live.c`) + +**SSA optimization engine architecture:** +- Target-independent infrastructure in `ir/opt/` — use-def chains, generator dispatch, pass table +- Target-specific generators in `arch/arm/` — registered via `tcc_ir_ssa_opt_register_target()` +- Generic code has no knowledge of the underlying hardware +- Each generator is an explicit named function (like `thop_*` instruction builders) + +**Next steps:** +- Port remaining optimizations to SSA: SCCP, GVN (step 6) +- Legacy cleanup: make SSA default, remove tccls.c + ir/live.c + redundant opt.c passes (step 7) + +## Non-Goals (explicitly out of scope for current phase) + +- Mem2Reg / SROA (needed eventually, not for current phase) +- Pruned SSA (full SSA is simpler to implement, prune later) +- Incremental SSA updates (rebuild from scratch each time is fine) +- Spill weight heuristics (use simple "most uses = least spill priority" initially) diff --git a/docs/plan_ssa_regalloc.md b/docs/plan_ssa_regalloc.md new file mode 100644 index 00000000..b815801d --- /dev/null +++ b/docs/plan_ssa_regalloc.md @@ -0,0 +1,201 @@ +# SSA-Based Register Allocator — Implementation Plan + +## Context + +Step 5 of `plan_ssa.md`: replace `tcc_ir_liveness_analysis()` + `tcc_ls_allocate_registers()` with a clean SSA-aware register allocator. The current allocator (`tccls.c`) works on flat IR after SSA destruction. The new allocator operates directly on SSA-renamed IR with phi nodes — simpler liveness, no lossy SSA destruction, and cleanly separated from the old code. 
+ +## Pipeline + +Current: +``` +SSA construct → rename → destroy → optimize → liveness(ir/live.c) → allocate(tccls.c) → codegen +``` + +New (when `-fssa-regalloc` enabled): +``` +[SKIP first SSA pass] → optimize → [build SSA] → SSA regalloc → codegen +``` + +Skip the first SSA pass when SSA regalloc is enabled. Optimizations work without it (they did before SSA was added). After optimization, VARs still have multi-defs, and the existing `ir/ssa.c` handles VARs natively. + +When disabled: pipeline unchanged. + +## File Layout — Arch-Independent vs Arch-Dependent + +### Arch-independent: `ir/regalloc.c` + `ir/regalloc.h` + +Core SSA register allocator — no ARM-specific knowledge: + +- **SSA live interval building**: scan SSA instructions + phi nodes → `[start, end]` per vreg +- **Linear scan allocation**: sort intervals by start, sweep, assign from abstract register pools +- **Phi resolution**: sequentialize parallel copies, insert ASSIGN instructions +- **Instruction array rebuild**: fix jump targets, remap indices + +The allocator receives register constraints through an abstract interface: + +```c +/* Arch-independent register class descriptor */ +typedef struct RegAllocClass { + int num_regs; /* total registers in class */ + const int *caller_saved; /* caller-saved register list */ + int num_caller_saved; + const int *callee_saved; /* callee-saved register list */ + int num_callee_saved; + int pair_align; /* 1 = pairs must be even-aligned (AAPCS) */ +} RegAllocClass; + +/* Arch-independent allocation target */ +typedef struct RegAllocTarget { + RegAllocClass int_class; /* integer registers */ + RegAllocClass fp_class; /* float/VFP registers */ + int param_regs; /* number of parameter registers (e.g. 
4) */ + int static_chain_reg; /* -1 if none */ +} RegAllocTarget; +``` + +Entry point: +```c +void tcc_ir_ssa_regalloc(TCCIRState *ir, const RegAllocTarget *target, int spill_base); +``` + +### Arch-dependent: `arch/arm/arm_regalloc.c` + `arch/arm/arm_regalloc.h` + +ARM-specific register set definitions: + +```c +/* Provides the RegAllocTarget for ARM Thumb-2 */ +const RegAllocTarget *arm_get_regalloc_target(void); +``` + +Contains: +- R0-R3 as caller-saved, R4-R11 as callee-saved (AAPCS) +- VFP register set (S0-S15 caller-saved) +- Even-aligned pair rule for 64-bit (R0:R1, R2:R3, etc.) +- Parameter register count (4) +- Static chain register (R10) + +Small file (~50 lines) — just data tables, no algorithms. + +## Algorithm Details + +### SSA Live Interval Building + +For each vreg in SSA-renamed IR, compute `[start, end]`: + +1. **Scan instructions**: For each instruction `i`: + - Each USE vreg: extend `end = max(end, i)` + - Each DEF vreg: set `start = i` (single-def in SSA) + +2. **Process phi nodes**: For each block `b`, for each phi: + - `phi.dest_vreg`: set `start = b.start_idx` + - For each operand `(vreg_k, pred_k)`: extend `vreg_k.end = pred_block.end_idx - 1` + +3. **FUNCPARAMVAL chains**: Extend parameter vreg intervals from FUNCPARAMVAL to corresponding FUNCCALL + +4. **Call crossings**: Build call-site prefix-sum array, check if interval spans any call + +5. **PARAMs**: Start at instruction 0, precolored to parameter registers + +6. **Address-taken VARs**: Not SSA-renamed; mark `addrtaken=1`, force stack + +### Linear Scan Allocation + +New implementation, independent of `tccls.c`: + +1. Sort intervals by start point (params first for precoloring) +2. 
Sweep in order, maintain active set (sorted by end point): + - Expire intervals ending before current start → free their registers + - If address-taken: force spill to stack + - If crosses call: prefer callee-saved register + - If 64-bit: allocate aligned pair (from `RegAllocTarget` pair rules) + - If float: allocate from float register class + - If no register available: spill (evict interval with fewest uses / longest range) +3. Track dirty_registers bitmap for prologue/epilogue + +Output: write directly to `IRLiveInterval.allocation` (r0, r1, offset) via `tcc_ir_stack_reg_assign()` — same output format consumed by `machine_op_from_ir()`. + +### Phi Resolution (after allocation) + +For each predecessor block, collect all phi copies `(dest_reg, src_reg)`: +1. Filter identity copies (dest == src) +2. Topological sort for dependency order +3. For cycles: break with scratch register or temp stack slot +4. Insert ASSIGN instructions before block terminator + +### Instruction Array Rebuild + +Same pattern as `tcc_ir_ssa_destroy()`: +1. Build `old_to_new[]` index mapping +2. Fix JUMP/JUMPIF targets, switch table targets, `is_jump_target` flags +3. Remap `IRLiveInterval.start/end` +4. Build `live_regs_by_instruction` table from final intervals + +## Pipeline Integration (`tccgen.c`) + +```c +/* SSA for optimizations — skip when SSA regalloc handles it later */ +if (tcc_state->opt_ssa && !tcc_state->opt_ssa_regalloc) { + /* existing: construct → rename → destroy */ +} + +/* ... optimizations as today ... */ + +/* Register allocation */ +if (tcc_state->opt_ssa_regalloc) { + const RegAllocTarget *target = arm_get_regalloc_target(); + tcc_ir_ssa_regalloc(ir, target, loc); +} else { + tcc_ir_liveness_analysis(ir); + tcc_ls_allocate_registers(&ir->ls, ...); +} + +/* ... rest unchanged: move coalescing, patch, params, stack, codegen ... 
*/ +``` + +## Files to Create/Modify + +| File | Change | +|------|--------| +| `ir/regalloc.c` | **NEW** — arch-independent SSA regalloc (~400 lines) | +| `ir/regalloc.h` | **NEW** — `RegAllocTarget`, `tcc_ir_ssa_regalloc()` | +| `arch/arm/arm_regalloc.c` | **NEW** — ARM register set tables (~50 lines) | +| `arch/arm/arm_regalloc.h` | **NEW** — `arm_get_regalloc_target()` | +| `ir/ir.h` | Add `#include "regalloc.h"` | +| `tccgen.c` | Route to SSA regalloc when flag enabled (~20 lines) | +| `tcc.h` | Add `opt_ssa_regalloc` field to `TCCState` (near line 1144) | +| `libtcc.c` | Add `"ssa-regalloc"` to `-f` flag table (near line 1738) | +| `Makefile` | Add `ir/regalloc.c` + `arch/arm/arm_regalloc.c` to build | + +Files NOT modified: `tccls.c`, `ir/ssa.c`, `ir/cfg.c`, `ir/live.c`, `ir/codegen.c`, `arm-thumb-gen.c`, `ir/machine_op.c` + +## Functions to Reuse (read-only) + +- `tcc_ir_cfg_build()`, `tcc_ir_cfg_compute_dominators()`, `tcc_ir_cfg_compute_dom_frontiers()` — `ir/cfg.c` +- `tcc_ir_ssa_construct()`, `tcc_ir_ssa_rename()`, `tcc_ir_ssa_free()` — `ir/ssa.c` +- `tcc_ir_stack_reg_assign()` — `ir/stack.c` (writes `IRLiveInterval.allocation`) +- `tcc_ir_mark_return_value_incoming_regs()` — `ir/codegen.c` +- `tcc_ir_vreg_live_interval()` — `ir/vreg.c` +- `irop_config[]`, `tcc_ir_op_get_dest/src1/src2()`, `irop_get_vreg()` — `tccir_operand.h` + +## Implementation Order + +1. Create `arch/arm/arm_regalloc.h` + `arch/arm/arm_regalloc.c` — ARM register tables +2. Create `ir/regalloc.h` — `RegAllocTarget` structs + `tcc_ir_ssa_regalloc()` declaration +3. Create `ir/regalloc.c` — skeleton entry point, SSA build, live interval computation +4. Implement linear scan allocation (writes `IRLiveInterval.allocation` directly) +5. Implement phi resolution + instruction array rebuild +6. Wire into pipeline: `tccgen.c`, `tcc.h`, `libtcc.c`, `Makefile`, `ir/ir.h` +7. 
Test: `make test -j16`, `make test-gcc-torture-compile` + +## Verification + +```bash +make cross +# Test at -O0 with SSA regalloc +cd tests/ir_tests && python run.py -c 01_hello_world.c --cflags="-fssa-regalloc" +# Test at -O1 +cd tests/ir_tests && python run.py -c 01_hello_world.c --cflags="-O1 -fssa-regalloc" +# Full suites +make test -j16 +make test-gcc-torture-compile +``` diff --git a/docs/register_allocator_improvements.md b/docs/register_allocator_improvements.md new file mode 100644 index 00000000..3c1d93c4 --- /dev/null +++ b/docs/register_allocator_improvements.md @@ -0,0 +1,105 @@ +# Register Allocator Improvement Opportunities + +## Current State (25 vs 19 instructions for bench_array_sum) + +The remaining 6-instruction gap is entirely register allocation and stack layout quality: + +| Gap | TCC | GCC | Root Cause | +|---|---|---|---| +| 2 instr | `push/pop {r4}` | no callee-save | r4 used for inner loop temp; r12 not available | +| 2 instr | `add r3,sp,#8; add.w r3,#1024` | `add r1,sp,#1020` | End pointer computed in 2 instructions | +| 1 instr | `mov r0, r1` | sum already in r0 | Return value not in r0 | +| 1 instr | `subw sp,#1036` (wide) | `sub.w sp,#1024` | 12 extra bytes frame padding | + +--- + +## 1. R12 (IP) for Allocation + +### Goal +Add r12 to the allocator pool as a caller-saved register. This gives 5 caller-saved registers (r0-r3, r12) instead of 4, eliminating callee-save push/pop when register pressure is 5. + +### Current Blocker +~30 places in `arm-thumb-gen.c` hardcode `R_IP`/`R12`/`ARM_R12` without going through the scratch allocator. These would clobber any value the allocator placed in r12. 
+ +### Hardcoded R12 uses that need conversion to scratch allocator: + +**Stack manipulation (prologue/epilogue):** +- `arm-thumb-gen.c:3116-3117` — `MOV R_IP, R_SP` for dynamic stack alloc +- `arm-thumb-gen.c:3131-3132` — Load via R_IP for stack restore +- `arm-thumb-gen.c:7881-7892` — Argument area setup uses R12 directly +- `arm-thumb-gen.c:7910-7912` — Vararg store uses R_IP + +**Struct handling:** +- `arm-thumb-gen.c:8577-8590` — `get_struct_base_addr_mop` defaults to ARM_R12 +- `arm-thumb-gen.c:9035` — Same pattern in store path +- `arm-thumb-gen.c:9106` — Returns R_IP as fallback + +**Direct scratch use:** +- `arm-thumb-gen.c:8100` — `int temp = R_IP` for parameter copy +- `arm-thumb-gen.c:9654-9655` — Stack load uses ARM_R12 for offset + +**PIC/GOT/text-data separation:** +- `arm-thumb-gen.c:6721,7298,7376` — POP uses R12 for GOT reload + +### Required changes: +1. Convert each hardcoded R12 use to call `get_scratch_reg_with_save()` instead +2. Ensure each converted site properly saves/restores if r12 is live +3. Add r12 to `caller_saved_registers` bitmap +4. Change `registers_for_allocator = 13` +5. Cap `tcc_ls_assign_callee_saved_register` to r4-r11 (exclude r12) +6. Update `tcc_ls_assign_any_register` allocation order: r0-r3, r12, r4-r11 + +### Risk +High — each hardcoded site needs careful analysis of what registers are excluded and whether the scratch save/restore interacts with the surrounding code correctly. + +--- + +## 2. Return Value Precolor Priority (Eviction) + +### Goal +When the allocator processes a precolored interval (e.g., return value hinted to r0) and the preferred register is already taken by an uncolored interval, evict the uncolored interval to a different register. + +### Current Blocker +Linear scan processes intervals in start-point order. The return value vreg (V0, start=10) is processed AFTER the loop counter (V3, start=9). V3 gets r0 first. When V0 tries r0, it's taken and falls back to r1. Result: `mov r0, r1` at return. 
+ +### Failed Approach: Retroactive Eviction +Attempted: when precolored V0 can't get r0, find V3 in the active set, release r0, and reassign V3 to a different register. + +**Why it fails:** Retroactive reassignment changes the register for V3's ENTIRE interval. If another interval (V2) was assigned r1 during [7,12] while V3 was in r0 during [9,21], moving V3 to r1 creates an overlap [9,12] where both V3 and V2 are in r1. This produces incorrect codegen. + +### Correct Approaches (not yet implemented): + +**A. Interval Splitting:** +Split the conflicting interval at the eviction point. V3 stays in r0 for [9, eviction_point], then moves to r1 for [eviction_point, 21]. Requires inserting a MOV at the split point and managing two sub-intervals. + +**B. Priority-Based Sorting:** +Sort intervals so precolored ones are processed first among those with the same start point. Doesn't help when start points differ (V3=9 vs V0=10). + +**C. Second-Chance Allocation:** +After all intervals are processed, scan for precolored intervals that didn't get their preferred register. Try to swap with the conflicting interval if safe (no overlap with other intervals in the new register). + +**D. Graph Coloring:** +Replace linear scan with a graph-coloring allocator that handles preferences natively. Significant complexity increase. + +### Recommendation +Approach C (second-chance) is safest and simplest. After the main allocation loop, for each precolored interval that missed its hint: +1. Find the interval currently holding the desired register +2. Check if the desired register is free for the blocker's entire range (scan all intervals) +3. If safe, swap registers +4. If not safe, leave as-is + +--- + +## 3. Loop Bound Rematerialization Without Calls + +### Goal +The inner sum loop computes `end = SP+8+1024` in 2 instructions and keeps it in r3 for the entire loop. 
If rematerialized inside the loop (1 instruction per iteration), r3 is freed for the loaded value, avoiding r4 (callee-save). + +### Current State +`tcc_ir_opt_loop_bound_remat` only fires for loops containing function calls. The inner sum loop has no calls, so it's skipped. + +### Required Change +Relax the `has_calls` guard to also allow remat when register pressure exceeds caller-saved capacity (>4 simultaneous live values). Requires estimating live count at the IR level before register allocation. + +### Trade-off +Adds 1 instruction per inner loop iteration (the remat ADD) but saves 2 instructions total (push/pop r4). Net benefit depends on loop trip count — the extra ADD executes on every iteration while the push/pop saving is one-off, so this is always a code-size win but a run-time win only for loops with few iterations. diff --git a/docs/selfhost_miscompile_debugging.md b/docs/selfhost_miscompile_debugging.md new file mode 100644 index 00000000..a1a6b2dc --- /dev/null +++ b/docs/selfhost_miscompile_debugging.md @@ -0,0 +1,270 @@ +# Debugging self-host miscompiles (armv8m-tcc) + +A **self-host miscompile** is when the **cross** compiler (`bin/armv8m-tcc`, an x86 +binary built by gcc that *emits* ARM Thumb-2) compiles tinycc's own source into a +**native** compiler (the ARM `armv8m-tcc` that runs on the device) whose machine +code is subtly wrong. The source is correct — the same tinycc logic compiles a +test correctly when run as the cross, but wrong when run as the self-hosted +native binary. Symptom: a test program built **on the device** misbehaves +(infinite loop, wrong output, HardFault) even though the host cross builds it +fine. + +Most remaining `tests2` failures are this class. This guide is the repeatable +workflow to nail them. Worked example throughout: `09_do_while` (do-while loop +ran forever — fixed in `ir/regalloc.c ra_resolve_phis`). + +--- + +## 0. 
The mental model (read this first) + +``` +gcc ──compiles──> bin/armv8m-tcc (CROSS: x86 host binary, emits ARM) + │ + │ compiles tinycc's own *.c ← a bug HERE is the culprit + ▼ + native armv8m-tcc (rootfs/usr/bin/tcc: ARM, runs on device) + │ + │ compiles tests2/NN.c + ▼ + /tmp/NN (device binary that misbehaves) +``` + +Two independent facts pin it as a self-host bug: +1. **Host cross compiles the test correctly** — so the test source and tinycc + *logic* are fine. +2. **Device (native) compiles it wrong** — so the native binary's code for some + tinycc function `F` is wrong, i.e. **the cross miscompiled `F`**. + +There are two fix strategies (both valid, §6): +- **(A) Source workaround** in the tinycc function `F`: rewrite `F` so the cross + happens to compile it correctly. Fast, local, low-risk. (What `09` used.) +- **(B) Fix the cross codegen bug** itself: find the wrong ARM the cross emits and + fix the cross's optimizer/backend. Harder, but fixes *every* test that trips the + same bug at once. Prefer this when the same bug class recurs. + +--- + +## 1. Fast device round-trips: the FAT drive (use this, not RAM-scan) + +The slow/flaky way (`scripts/qemu_capture_yaff.py`) scans guest RAM for binaries. +The fast way is the host-readable FAT drive mounted at **`/mnt`** on the QEMU +guest — drop sources in, pull device-compiled binaries out, **no kernel rebuild**. +See [memory: yasos-qemu-fatdisk-host-drive] for the full design. One-liner: + +```bash +.qemu_smoke_venv/bin/python3 scripts/qemu_fatdisk_run.py \ + --put libs/tinycc/tests/tests2/09_do_while.c:IN.C \ + --cmd 'tcc -x c /mnt/IN.C -o /mnt/OUT; echo CC=$?; /mnt/OUT; echo RC=$?' \ + --get OUT:.cache/09_dev.elf \ + --backing .cache/bk.bin --img .cache/fd.img --boot-wait 7 --timeout 14 +``` + +- `--put HOST:FATNAME` puts a file on the drive; `--get FATNAME:HOST` pulls one out. +- `--cmd` runs on the guest shell; stdout/stderr stream live to the log (a runaway + guest is bounded by `--timeout`, not infinite). 
+- **8.3 UPPERCASE names only** (FatFs `FF_USE_LFN=0`): a source lands as `IN.C`; + tcc rejects `.C` → **always pass `tcc -x c`**. +- **Don't `ls /mnt`** — a kernel FatFs readdir bug panics ("invalid enum value"). + Compiling (open/read/write) is fine. +- It needs the QEMU kernel built with the `/mnt` drive support (already in tree: + `hal/.../ramflash.zig`, `linker_script.ld` fatdisk window, `main.zig` mount). + +Carve + disassemble the captured YAFF binary (`main` is after the crt0 stub — +look for `push {r4,...}` / `movs r4,#1`): + +```bash +python3 - <<'PY' +import struct; d=open('.cache/09_dev.elf','rb').read() +cl=struct.unpack_from('` +macro bug) and often won't even compile. Don't trust `-O0`-native as a bisector. + +--- + +## 3. Localize the miscompiled tinycc function + +This is the heart of the work. Narrow from "the test is wrong" to "tinycc +function `F`, this exact computation". + +### 3a. Narrow the *language feature* (cheap, FAT-drive) +Build one test program exercising several constructs and see which misbehaves. +`09` narrowed to **do-while only** (a `for`+`while`+`do-while` program: `for`/`while` +exited, `do-while` ran forever) → the bug is on the do-while codegen path. + +### 3b. See the IR and which *pass* transforms it (host, instant) +Build a **debug cross** (dumps IR; no device needed). 
Clean stale objects first — +a prior native build leaves ARM `.o`s that break the x86 cross link +("file in wrong format"): + +```bash +cd libs/tinycc +rm -rf armv8m-arch armv8m-ir armv8m-*.o *.o arm-eabi-*.o +SR=$PWD/../../rootfs +./configure --extra-cflags="-DTCC_DEBUG=1 -DCONFIG_TCC_DEBUG=1 -g -O1 -DTARGETOS_YasOS=1 -DCONFIG_TCC_BCHECK=0" \ + --enable-cross --config-asm=yes --config-pie=yes --config-pic=yes --debug --enable-O1 \ + --prefix=$PWD --sysroot=$SR --sysincludepaths="{B}/include:$SR/usr/include" \ + --crtprefix="$SR/usr/lib" --libpaths="$SR/usr/lib:$SR/lib" +make armv8m-tcc -j8 +./armv8m-tcc -dump-ir -c tests/tests2/09_do_while.c -o /tmp/x.o # 3 checkpoints +./armv8m-tcc -dump-ir-passes=all -c tests/tests2/09_do_while.c -o /tmp/x.o # after every pass +``` + +Diff the IR across passes to find the one that produces the wrong shape. For `09` +the inverted exit branch only appears in the **"AFTER OPTIMIZATIONS"** dump using +`R`-registers → it's introduced during **register allocation** (after the last +`-dump-ir-passes` checkpoint), specifically the phi-copy insertion in +`ra_resolve_phis`. (NB this debug cross is correct — it shows the *intended* IR, +e.g. exit target = 18. The device computes a different value; the gap localizes it.) + +### 3c. Get the *device's* actual values (one native rebuild) +When the IR transform is the suspect, add a one-off `fprintf(stderr, ...)` to the +relevant pass dumping the indices/targets it computes, rebuild the native tcc, +and run on the device via the FAT drive. For `09`, instrumenting +`tcc_ir_codegen_backpatch_jumps` printed `target_ir=15` (should be 18) for the +exit JUMPIF — proving the **target index in the IR was already wrong**, not the +address encoding. Remove the instrumentation afterwards. 
+ +Rebuild native + kernel (the device tcc lives in the incbin'd romfs): +```bash +rm -f libs/tinycc/.yasos-build/native-stage1.stamp libs/tinycc/.yasos-build/native-stage2.stamp +./build_rootfs.sh -o rootfs.img # cross unchanged → only native rebuilds (~3-5 min) +rm -rf .zig-cache && zig build -Doptimize=ReleaseSafe # re-embed romfs (~1 min) +``` +(If you changed a file compiled into the *cross* too, also `rm .yasos-build/cross.stamp` +and the whole thing rebuilds, ~8-10 min.) + +--- + +## 4. Spot the cross's miscompile (disassembly) + +Once you know function `F` (e.g. `ra_resolve_phis` in `ir/regalloc.c`), look at the +ARM the **cross** emits for it. The cross compiles each tinycc TU; reproduce that +exact compile and disassemble `F`: + +```bash +cd libs/tinycc +# flags taken from the native build log line "armv8m-tcc -o armv8m-... -c ir/regalloc.c ..." +./bin/armv8m-tcc -o /tmp/F.o -c ir/regalloc.c \ + -DCONFIG_TCC_CROSSPREFIX='"armv8m-"' -I. -I./ir -I./ir/opt -DTCC_DEBUG=0 -g -O1 \ + -DTCC_ARM_VFP -DTCC_ARM_EABI=1 -DCONFIG_TCC_BCHECK=0 -DTCC_ARM_HARDFLOAT \ + -DTCC_TARGET_ARM_ARCHV8M -DTARGETOS_YasOS=1 -DTCC_TARGET_ARM_THUMB -DTCC_TARGET_ARM \ + -DTCC_IS_NATIVE -I$PWD/../../rootfs/usr/include -fpie -fPIE -mcpu=cortex-m33 \ + -fvisibility=hidden -std=c11 -Wno-declaration-after-statement +arm-none-eabi-objdump -dr /tmp/F.o | awk '/:/{f=1} f{print} f&&/^$/{exit}' +``` + +**How to know which instruction is wrong** (you need a notion of "correct"): +- **Golden ARM reference**: compile the same TU with `arm-none-eabi-gcc -O1 -mcpu=cortex-m33` + and diff the disassembly of `F`. Divergence that changes semantics = the cross bug. +- **Cross at -O0 vs -O1**: `./bin/armv8m-tcc -O0 -c …` vs `-O1`; the bug usually + rides an optimization, so `-O0` shows the intended behavior. +- **Reason from source**: e.g. for `09` the wrong value implied a stale register + read of an address-taken local across a call. 
+ +Known good-vs-bad patterns already found this way (all in MEMORY.md): dropped +`< 0` branch + (~line 3168): a loop back-edge needing phi copies is rewritten from + `JUMPIF(cond)→top` into `JUMPIF(!cond)→exit; <phi copies>; JUMP→top`. +- **Wrong computation**: the skip/exit target was stored as + `skip_dest.u.imm32 = -(wp + 2)` **before** `ra_emit_scheduled_phi_copies(…,&wp,…)` + advanced `wp`. `wp` is an **address-taken local** (`&wp` passed to the call). +- **Cross bug**: the cross cached `wp` in a register and did **not reload it after + the call** for that one expression (the adjacent JUMP-write *did* reload it) → + native used the stale pre-copies `wp` → exit target landed mid-body (IR 15) not + the epilogue (18) → `bge 0xee` → infinite loop. +- **Fix (strategy A, source)**: move the skip-target store to **after** the JUMP + write, using the now-fresh `wp`: `skip_dest.u.imm32 = -(wp + 1)`. Logically + identical on the host; sidesteps the stale-register read on the device. +- The deeper cross bug (call not invalidating a cached address-taken local) is + **latent** — strategy B would fix it for all callers. + +--- + +## 6. Fix, then verify + +**Strategy A (source workaround)** — edit `F`, rebuild (§3c), FAT-run the test: +the program must now behave (e.g. `09` prints `1..89` then `RC=0`; log ~400 B, not +~800 KB of runaway output). + +**Strategy B (fix the cross)** — fix the cross's codegen/optimizer, `rm +.yasos-build/cross.stamp`, full rebuild, retest. This is preferred when the same +bug class blocks several tests: fix once, many tests pass. + +**Always regression-test** — the official suite, reusing the current build: +```bash +./scripts/run_qemu_smoke.sh --no-build tcc_suite_test.py # full suite +./scripts/run_qemu_smoke.sh --no-build tcc_suite_test.py -k 09_do_while # one test +``` +A regalloc/codegen fix can affect unrelated loops — run the whole suite, not just +the target. + +--- + +## 7.
Gotchas (each cost real time) + +- **`pkill -f qemu-system-arm` SELF-KILLS your shell** — the pattern string is in + the shell's own command line. Kill genuine QEMU by `comm`: + `ps -eo pid,comm | awk '$2=="qemu-system-arm"{print $1}' | xargs -r kill -9`. + Likewise never write `until ! pgrep -f qemu_fatdisk_run; do …` — the loop's own + cmdline matches the pattern, so it never exits. +- **Stale ARM objects break the x86 cross link** — after a native build, the cross + build fails with "file in wrong format". `rm -rf armv8m-arch armv8m-ir armv8m-*.o *.o`. +- **`config.mak` flips between cross and native** — `build_rootfs.sh` reconfigures + each as needed; if building manually, reconfigure for the mode you want + (`--enable-cross` for the cross). +- **Native rebuild is the slow loop** (~3-5 min) + kernel re-embed (~1 min). The + device tcc (~2 MB) does **not** fit the 1 MB `/mnt` window, so you can't swap + just the tcc binary — rebuild the romfs+kernel. Minimize native rebuilds: do all + the host-side localization (§2, §3b, §4) first. +- **`-O0` native shifts the bug** — don't use it as a clean bisector. +- **`NATIVE_TCC_OPT_OVERRIDE`** env var (added to `build_rootfs.sh`) overrides the + native opt level (default `-O1`) for experiments without editing the script. +- The bump commit is **not** automatically the cause — verify by reverting it; for + `09`, reverting `e65f29d0` did not fix it (long-standing bug). + +--- + +## 8. Checklist per test + +1. FAT-run the failing test; capture device binary + behavior (§1). +2. Confirm host cross is correct → self-host (§2). +3. Narrow the feature (§3a), then the pass via `-dump-ir-passes=all` on a debug + cross (§3b); if needed, instrument the pass for the device's actual values (§3c). +4. Disassemble `F` as the cross compiles it; find the wrong instruction vs a golden + reference (§4). +5. Fix (A source workaround, or B cross codegen) (§6). +6. FAT-verify the test, then run the **full** smoke suite (§6). +7. 
Update MEMORY.md / the per-bug memory with root cause + fix. diff --git a/docs/tcc_speedup_plan.md b/docs/tcc_speedup_plan.md new file mode 100644 index 00000000..3c31f526 --- /dev/null +++ b/docs/tcc_speedup_plan.md @@ -0,0 +1,91 @@ +# Plan — speed up device tcc by closing the inlining gap + +Companion to [tcc_vs_gcc_O2_codegen_report.md](./tcc_vs_gcc_O2_codegen_report.md). Goal: cut +device compile CPU by inlining the hot `static inline` helpers tcc currently emits out-of-line. + +## Facts the plan is built on + +- tcc has **no C-function inliner**; `static inline` → one out-of-line copy per TU, never inlined. +- `IROperand` is **9 bytes**, passed/returned **by value** → every accessor call does an sret + struct copy + table lookups + bounds checks, and none of it CSEs across calls. +- Call-site counts (the leverage): `irop_get_vreg` **1351**, `tcc_ir_op_get_src1` **924**, + `tcc_ir_op_get_dest` **871**, `tcc_ir_op_get_src2` **557**, `irop_make_imm32` **175**. +- The accessors are **branchy / multi-statement** (table lookup + bounds guard + sentinel + handling) — so they are *not* trivially macro-izable; a real inliner or careful + statement-expression macros are needed. +- `tccpp.c` (lexer/preprocess) is **~60% of compile CPU**; the IR accessors dominate the backend. + +## Build/validation harness (applies to every phase) + +- **★ Clean rebuild after header edits.** The tinycc Makefile has no header dependency tracking; + editing `tccir_operand.h` / `tccir.h` / `tcc.h` requires `rm *.o ir/*.o ir/opt/*.o` (or + `make distclean`) or you get stale-object SEGVs. (Known gotcha, see memory.) +- **CPU measurement:** `scripts/tcc_profile.py -n 30` (device-representative `Ir`), plus + `--save`/`--compare` for before/after deltas. Also profile `-O1`/`-O2` compiles, not just `-O0`. +- **Size:** `arm-none-eabi-nm -S bin/armv8m-tcc.elf` totals + per-helper copy counts. 
+- **Correctness:** QEMU smoke suite (must stay 412 pass / 0 undefined) + the tcc test suite; + confirm self-host rebuild is byte-stable (cross-built tcc and self-built tcc agree). + +## Phase 0 — Validate the lever (½ day, throwaway branch, no compiler change) + +Prove the predicted win before investing in an inliner. + +1. Force-inline the single hottest cluster only — `tcc_ir_op_get_src1/src2/get_dest` + + `irop_get_vreg` — by rewriting them as GNU statement-expression macros (`({ ... })`, which tcc + supports) **or** `__attribute__((always_inline))` if tcc honors it (check first; likely not). +2. `rm` objects, rebuild the **cross** `armv8m-tcc` (x86), re-run `scripts/tcc_profile.py + --compare base.json` on `129_scopes.c` at `-O0` and `-O1`. +3. **Decision gate:** if total `Ir` drops materially (expect several %), continue to Phase 1. + If not, the cost is elsewhere (struct-by-value ABI, table lookups) → pivot to Phase 1-B. + +Capture `base.json` from the *current* tree first so the comparison is honest. + +## Phase 1 — Pick the implementation path (decision gate after Phase 0) + +### Path A — minimal inliner in tcc (preferred if Phase 0 win is broad) +Highest leverage, compounds (an inlining tcc builds a faster tcc), fixes the 226 KB duplication +too. Higher risk given this fork's history of self-host miscompiles — so keep it **conservative +and gated**: +- Inline only functions that are: marked `inline`/`static inline`, single `return` or + straight-line + ≤1 branch, below an IR-instruction-count threshold, non-recursive, no varargs, + no address-taken. Everything else untouched. +- Implement at the IR/frontend boundary (where call lowering happens), behind a flag + (`-finline` / config define) defaulted off until validated, so it can be bisected like every + other opt pass in this tree. +- Validate with the full self-host + QEMU loop after **every** increment. 
+ +### Path B — targeted, no new pass (fallback / lower risk) +- Macro-ize (statement-expression) the top ~8 hottest accessors from the report: + `irop_get_vreg`, `irop_set_vreg`, `tcc_ir_op_get_src1/2`, `tcc_ir_op_get_dest`, `irop_get_tag`, + `irop_make_imm32`, `irop_init_phys_regs`. +- **Plus** the orthogonal ABI win: change the worst by-value-9-byte-struct accessors to take + `const IROperand *` / write through an out-pointer, killing the sret copy even where inlining + doesn't reach. (Invasive across call sites — script the rewrite, do one accessor at a time.) +- Do the lexer helpers too (`cstr_ccat`, `tok_str_add2`, `token_lookup_cache_find`, + `default_reallocator`) — they sit in the 60%-CPU bucket. + +Recommendation: **start Path B** (safe, incremental, immediately shippable), and pursue Path A +only if Phase 0 shows the general inliner is worth the miscompile risk. + +## Phase 2 — Correctness & stability + +- QEMU smoke 412/0; tcc suite green; self-host byte-stability check. +- Watch for the known traps: stale-object SEGVs (clean rebuild), `build_rootfs.sh` not + fail-fast on cross `-Werror` (grep build.log for `error:`), statement-expression macros + double-evaluating arguments with side effects (audit each macro's args). + +## Phase 3 — Measure, report, decide next lever + +- Before/after: profiler `Ir` (total + per-fn), `.text` size, helper copy counts, and a real + device compile-time round-trip on a representative source. +- Update the report with measured deltas. Next lever after inlining is the §4 +19% codegen + quality (jump tables for dense enum switches, machine-level CSE of struct-field reloads). 
+ +## Deliverables checklist + +- [ ] `base.json` profiler baseline committed/saved +- [ ] Phase 0 experiment branch + measured `Ir` delta +- [ ] Path decision recorded (A vs B) with the numbers behind it +- [ ] Implementation behind a flag, validated incrementally +- [ ] QEMU smoke + self-host stability green +- [ ] Report updated with before/after diff --git a/docs/tcc_vs_gcc_O2_codegen_report.md b/docs/tcc_vs_gcc_O2_codegen_report.md new file mode 100644 index 00000000..a48af09d --- /dev/null +++ b/docs/tcc_vs_gcc_O2_codegen_report.md @@ -0,0 +1,156 @@ +# tcc -O2 (self-host) vs arm-none-eabi-gcc -O2 — codegen comparison + +**Date:** 2026-06-23 · **Target:** Cortex-M33 / armv8m thumb · **Question:** where is the +device tcc leaving compile-time performance on the table, measured against a "good codegen" +reference? + +## Method + +The device compiler `bin/armv8m-tcc.elf` is built **by tcc compiling its own sources** with +`-O2 -mcpu=cortex-m33` (the self-host stage in `build_rootfs.sh`). To get a reference for how +good that codegen *could* be, I compiled the **same 81 translation units** (CORE + IR + arm +backend, from the Makefile's `armv8m_FILES`) with `arm-none-eabi-gcc -O2 -mcpu=cortex-m33 +-mthumb -fpie`, same TCC defines. The gcc build is **not linked or run** — it only exists to +diff codegen quality per function. All 81 TUs compiled (2 needed `-fpermissive` / a `dlfcn.h` +stub; neither is a hotspot). + +I then matched functions **by name across both builds** (the `.elf` carries ~3900 symbols incl. +libc/native code the gcc objects don't; comparing only the 1547 functions present in **both** +keeps it apples-to-apples) and weighted everything by `scripts/tcc_profile.py` — the +device-representative CPU profile (callgrind `Ir` on the x86 cross, which runs the identical +codegen path) for the default `-O0` compile of `129_scopes.c`. + +**Caveats (read before acting):** +- Code size is a *proxy* for cycles. 
On the M33 (no data cache) instruction-fetch ∝ size is a + fair proxy, but data traffic also costs — so the profiler `Ir` weighting, not raw size, is the + authority on "what's hot." +- gcc and tcc inline differently, which **confounds per-function size** (see §3). I call this out + where it matters rather than letting it mislead. +- The gcc build drops `TCC_IS_NATIVE` and forces `CONFIG_TCC_STATIC` / `CONFIG_TCC_SEMLOCK=0` to + build under newlib. These only touch `tcc_run`/threading glue — none of the hot codegen. + +## 1. Headline numbers + +| metric | value | +|---|---| +| `.text` of device `armv8m-tcc.elf` | **2.26 MB** | +| matched-function total, **gcc -O2** | 1,152,516 B | +| matched-function total, **tcc -O2** | 1,368,164 B | +| **tcc / gcc ratio** | **1.19×** (tcc emits +19% more code on equal functions) | +| `.text` that is **duplicated inline-helper copies** | **~226 KB (10% of .text)** | + +Two distinct, independently-actionable problems fall out: a **systemic inlining gap** (§2, +the big one) and a **per-function codegen-quality gap** (§4, the steady +19%). + +## 2. Root cause #1 — tcc has *no* function inliner (biggest lever) + +There is **no C-function inlining pass anywhere in tcc** (the IR optimizer's only "inline" +references are inline-*asm*). `static inline` in a header is compiled as an ordinary function: +**emitted once per TU that references it, and never inlined into a call site.** + +The IR operand layer (`tccir_operand.h`) is *designed* around tiny by-value struct accessors +that assume the compiler inlines them. It doesn't. 
Measured copies in the two binaries: + +| helper (`static inline`, hot IR loops) | tcc copies | gcc copies | +|---|---|---| +| `irop_set_vreg` | **42** | 0 (fully inlined) | +| `irop_init_phys_regs` | **37** | 0 (fully inlined) | +| `irop_get_vreg` | **53** | 14 | +| `tcc_ir_op_get_src1` | **55** | 20 | +| `irop_make_imm32` | **31** | 1 | + +Same function, per-function size blowups (tcc ÷ gcc): `irop_make_imm32` **49×**, +`tcc_ir_op_get_dest` **9.4×**, `tcc_ir_op_get_src2` **9.1×**, `irop_get_imm64_ex` **5.3×**, +`irop_get_vreg` **5.1×**. + +This costs **twice**: +1. **CPU (the point of this exercise):** every IR operand touched during codegen pays a real + `bl`/return + struct-by-value copy instead of a few inlined instructions. These accessors run + per-operand, per-instruction, across the whole backend — and the backend is run by the device + tcc on every compile. +2. **Flash:** ~226 KB of `.text` (10%) is redundant duplicated copies of 30 such helpers. + `thop_emit` alone is **128 KB across 27 copies**; the `irop_*`/`tcc_ir_op_*` accessors add + another ~70 KB. + +The same root cause explains why several **hot lexer functions look "smaller" in tcc** in §3 +(`next` 0.22×, `macro_subst_tok` 0.40×): gcc inlined their helpers *into* them (work shows up in +the caller), tcc left the helpers as out-of-line calls. It's the same missing optimization seen +from the other side — and the lexer/preprocessor is **>50% of device compile CPU** (§3), so it's +exactly where the call overhead hurts most. + +## 3. Hot functions: CPU weight vs codegen size + +Top of the device-representative profile (`-O0` compile, the default). `ratio` = tcc ÷ gcc size; +**<1 means gcc inlined helpers into the caller**, not that tcc is better. 
+ +``` +fn CPU% gccB tccB ratio note +next_nomacro 24.6% 4752 4396 0.93x +macro_subst_tok 11.5% 4092 1644 0.40x gcc inlined helpers in +tok_str_add2 8.0% 282 666 2.36x tcc bloat +next 6.5% 3428 764 0.22x gcc inlined helpers in +tccpp_new 6.5% 692 644 0.93x +macro_subst 4.5% 364 524 1.44x +parse_btype 3.2% 2348 3444 1.47x tcc bloat +cstr_ccat 2.5% 68 98 1.44x +token_lookup_cache_find 2.2% 76 108 1.42x +default_reallocator 2.2% 64 124 1.94x +post_type 1.8% 1660 2644 1.59x +svalue_to_iroperand 1.8% 1924 2548 1.32x +sym_push 1.4% 588 1180 2.01x +unary_funcall 1.4% 15392 20860 1.36x +``` + +Takeaway: **`tccpp.c` (lex + preprocess) is the CPU, by a wide margin** — `next_nomacro`, +`next`, `macro_subst_tok`, `macro_subst`, `tccpp_new`, `tok_str_add2` together are ~60% of the +profile. Whatever we do, it has to make the lexer hot path cheaper. + +## 4. Root cause #2 — steady +19% per-function codegen quality + +Beyond inlining, on functions where both builds emit one real copy, tcc is ~1.2–2× larger. The +gaps cluster around: +- **Dense switches over op/tag enums** compiled as linear compare chains instead of jump tables + (`tcc_ir_op_get_*`, `thumb_generate_opcode_for_data_processing` 3.2×). +- **Repeated struct-field reloads** — weak CSE/value-numbering at the machine level means a field + like `op->vr` is re-loaded instead of kept in a register across uses. +- **Spill-happy register allocation** in the big functions (`tcc_ir_codegen_generate` +10 KB, + `gen_function` +5.8 KB, `unary_funcall` +5.5 KB). + +This is the broad, always-on tax. Each fix is smaller per-unit than inlining but applies to the +whole binary (and to every program the device compiles). + +## 5. Recommendations, ranked by expected speedup ÷ effort + +1. 
**Inline the hot IR-operand accessors — do this first.** No new compiler pass required: + convert the handful of hottest `static inline` helpers in `tccir_operand.h` + (`irop_get_vreg`/`irop_set_vreg`, `irop_init_phys_regs`, `tcc_ir_op_get_src1/2/dest`, + `irop_get_tag`, `irop_make_imm32`) into **macros** (or hand-inline at the few hottest call + sites). tcc *will* emit macro bodies inline. Expected: removes the per-operand call+struct-copy + overhead from the entire backend **and** reclaims a chunk of the 226 KB. Low risk, mechanical. +2. **Inline the hot lexer helpers** the same way: `cstr_ccat`, `tok_str_add2`, + `token_lookup_cache_find`, `default_reallocator` are tiny, hot, and called in the >50%-CPU + lexer loop. gcc inlines them; tcc can via macro-ization. Targets the single biggest CPU bucket. +3. **A minimal real inliner** (medium effort, highest ceiling): inline single-return leaf + functions marked `inline`/`static inline` below an instruction-count threshold. This solves + #1 and #2 generally, eliminates the 226 KB duplication, and compounds — *a tcc that inlines + compiles a faster tcc*. Worth it if macro-ization proves too piecemeal. +4. **De-duplicate out-of-line copies** (link-time / single-definition fold). Reclaims ~226 KB + flash but **not** the call overhead — strictly worse than inlining for speed; do it only if + flash is the binding constraint and an inliner isn't. +5. **Jump tables for dense enum switches** in `tcc_ir_op_get_*` and the thumb opcode emitters — + attacks the §4 +19% at its largest contributors. + +The leverage multiplier worth remembering: the device tcc runs **its own compiled code**. Every +codegen improvement here makes the next self-host build of tcc itself faster, on top of speeding +up every user program it compiles. 
+ +## Reproduce + +```sh +# gcc -O2 reference objects (81 TUs) -> /tmp/gcc_tcc/*.o (see flags in this report's git history) +# per-function sizes: +arm-none-eabi-nm -S --defined-only /tmp/gcc_tcc/*.o | awk '$3~/[tT]/{print $2,$4}' > /tmp/gcc_sizes.txt +arm-none-eabi-nm -S --defined-only bin/armv8m-tcc.elf | awk '$3~/[tT]/{print $2,$4}' > /tmp/elf_sizes.txt +# device-representative hot list: +scripts/tcc_profile.py -n 30 +``` diff --git a/elf.h b/elf.h index c8b6906c..9c2da5cc 100644 --- a/elf.h +++ b/elf.h @@ -2538,6 +2538,12 @@ typedef Elf32_Addr Elf32_Conflict; 108 /* 32 bit offset relative to static \ TLS block */ #define R_ARM_THM_TLS_DESCSEQ 129 +/* YASOS RELRO: 32-bit offset of a symbol within .rodata (S - rodata base). + * Emitted for references to shared (pure-const) .rodata symbols; the runtime + * address is anchor(rodata base from a fixed GOT slot) + this offset. Resolved + * at link time and baked into the .text literal, so it never reaches the YAFF + * writer (like R_ARM_GOTOFF). Uses a free value in the 130-159 ABI gap. */ +#define R_ARM_RODATA_OFF 137 #define R_ARM_IRELATIVE 160 #define R_ARM_RXPC25 249 #define R_ARM_RSBREL32 250 diff --git a/include/complex.h b/include/complex.h index 88de3db7..7827ad4f 100644 --- a/include/complex.h +++ b/include/complex.h @@ -67,41 +67,13 @@ extern "C" { * when they are fully supported. */ -/* For now, these are inline implementations that access the components */ -static inline double creal(double _Complex z) -{ - return (double)z; /* Casting complex to real extracts real part */ -} - -static inline float crealf(float _Complex z) -{ - return (float)z; -} - -static inline long double creall(long double _Complex z) -{ - return (long double)z; -} - -/* - * Imaginary part access - these will be fully implemented - * when __imag__ operator support is complete. 
- */ -static inline double cimag(double _Complex z) -{ - /* Placeholder - full implementation needs __imag__ support */ - return 0.0; -} +extern double creal(double _Complex z); +extern float crealf(float _Complex z); +extern long double creall(long double _Complex z); -static inline float cimagf(float _Complex z) -{ - return 0.0f; -} - -static inline long double cimagl(long double _Complex z) -{ - return 0.0L; -} +extern double cimag(double _Complex z); +extern float cimagf(float _Complex z); +extern long double cimagl(long double _Complex z); /* * Conjugate functions - return the complex conjugate. diff --git a/include/libtcc.h b/include/libtcc.h index 5949c807..20f7d7e5 100644 --- a/include/libtcc.h +++ b/include/libtcc.h @@ -70,6 +70,7 @@ LIBTCCAPI int tcc_set_output_type(TCCState *s, int output_type); #define TCC_OUTPUT_DLL 4 /* dynamic library */ #define TCC_OUTPUT_OBJ 3 /* object file */ #define TCC_OUTPUT_PREPROCESS 5 /* only preprocess */ +#define TCC_OUTPUT_PCH 6 /* generate a precompiled header */ /* equivalent to -Lpath option */ LIBTCCAPI int tcc_add_library_path(TCCState *s, const char *pathname); diff --git a/include/stddef.h b/include/stddef.h index da9b9e0d..880fb062 100644 --- a/include/stddef.h +++ b/include/stddef.h @@ -23,19 +23,12 @@ typedef union { long long __ll; long double __ld; } max_align_t; void *alloca(size_t size); #endif -#endif - -/* Older glibc require a wint_t from <stddef.h> (when requested - by __need_wint_t, as otherwise stddef.h isn't allowed to - define this type). Note that this must be outside the normal - _STDDEF_H guard, so that it works even when we've included the file - already (without requiring wint_t). Some other libs define _WINT_T - if they've already provided that type, so we can use that as guard. - TCC defines __WINT_TYPE__ for us.
*/ -#if defined (__need_wint_t) -#ifndef _WINT_T -#define _WINT_T -typedef __WINT_TYPE__ wint_t; -#endif -#undef __need_wint_t +/* NOTE: nothing must follow the guard's #endif below -- it has to be the last + token before EOF so tcc's multiple-include optimization records _STDDEF_H + and skips re-reading this header. Upstream tcc kept a wint_t typedef + OUTSIDE the guard (gated behind __need_wint_t) for legacy glibc + partial-includes; that trailing content defeated the optimization and forced + a full re-read + re-tokenize on every #include (3x for one stdio.h compile). + YASOS never defines __need_wint_t, and wint_t is not a stddef.h type per C + anyway -- libc's <wchar.h> owns it -- so the block is dropped. */ #endif diff --git a/include/tccdefs.h b/include/tccdefs.h index bfc06175..c3c23df8 100644 --- a/include/tccdefs.h +++ b/include/tccdefs.h @@ -93,15 +93,15 @@ #define __WINT_TYPE__ int #endif -#if __STDC_VERSION__ >= 201112L -#define __STDC_NO_ATOMICS__ 1 -#define __STDC_NO_COMPLEX__ 1 -#define __STDC_NO_THREADS__ 1 + #if __STDC_VERSION__ >= 201112L + #define __STDC_NO_ATOMICS__ 1 + #define __STDC_NO_COMPLEX__ 1 + #define __STDC_NO_THREADS__ 1 #if !defined _WIN32 -#define __STDC_UTF_16__ 1 -#define __STDC_UTF_32__ 1 -#endif + #define __STDC_UTF_16__ 1 + #define __STDC_UTF_32__ 1 #endif + #endif #if defined _WIN32 #define __declspec(x) __attribute__((x)) @@ -201,30 +201,30 @@ These are indented with 4 spaces so that c2str stringifies the guards instead of emitting them as real host-preprocessor directives (which would cause the host GCC to strip the blocks).
*/ -#ifndef __INT8_MAX__ -#define __INT8_MAX__ 0x7f -#endif -#ifndef __INT16_MAX__ -#define __INT16_MAX__ 0x7fff -#endif -#ifndef __INT32_MAX__ -#define __INT32_MAX__ 0x7fffffff -#endif -#ifndef __INT64_MAX__ -#define __INT64_MAX__ 0x7fffffffffffffffLL -#endif -#ifndef __UINT8_MAX__ -#define __UINT8_MAX__ 0xff -#endif -#ifndef __UINT16_MAX__ -#define __UINT16_MAX__ 0xffff -#endif -#ifndef __UINT32_MAX__ -#define __UINT32_MAX__ 0xffffffffU -#endif -#ifndef __UINT64_MAX__ -#define __UINT64_MAX__ 0xffffffffffffffffULL -#endif + #ifndef __INT8_MAX__ + #define __INT8_MAX__ 0x7f + #endif + #ifndef __INT16_MAX__ + #define __INT16_MAX__ 0x7fff + #endif + #ifndef __INT32_MAX__ + #define __INT32_MAX__ 0x7fffffff + #endif + #ifndef __INT64_MAX__ + #define __INT64_MAX__ 0x7fffffffffffffffLL + #endif + #ifndef __UINT8_MAX__ + #define __UINT8_MAX__ 0xff + #endif + #ifndef __UINT16_MAX__ + #define __UINT16_MAX__ 0xffff + #endif + #ifndef __UINT32_MAX__ + #define __UINT32_MAX__ 0xffffffffU + #endif + #ifndef __UINT64_MAX__ + #define __UINT64_MAX__ 0xffffffffffffffffULL + #endif /* Floating point limits (IEEE 754). These match include/float.h values. 
*/ #define __FLT_MAX__ 3.40282347e+38F @@ -249,18 +249,11 @@ #define __LDBL_MAX_EXP__ 1024 #define __LDBL_MIN_EXP__ (-1021) -#ifdef __leading_underscore -#define __USER_LABEL_PREFIX__ _ -#else -#define __USER_LABEL_PREFIX__ -#endif -#if !defined _WIN32 -/* glibc defines */ -#define __REDIRECT(name, proto, alias) name proto __asm__(#alias) -#define __REDIRECT_NTH(name, proto, alias) name proto __asm__(#alias) __THROW -#define __REDIRECT_NTHNL(name, proto, alias) name proto __asm__(#alias) __THROWNL -#endif - + #ifdef __leading_underscore + #define __USER_LABEL_PREFIX__ _ + #else + #define __USER_LABEL_PREFIX__ + #endif /* not implemented */ #define __PRETTY_FUNCTION__ __FUNCTION__ #define __has_builtin(x) 0 diff --git a/ir/IMPLEMENTATION_SUMMARY.md b/ir/IMPLEMENTATION_SUMMARY.md deleted file mode 100644 index 1dc3a7b8..00000000 --- a/ir/IMPLEMENTATION_SUMMARY.md +++ /dev/null @@ -1,139 +0,0 @@ -# TCCIR Subdirectory Refactoring - Implementation Summary - -## Completed Work - -### 1. Created ir/ Subdirectory Structure - -``` -ir/ -├── README.md # Documentation -├── ir.h # Internal IR header (includes all modules) -├── type.h # Type helpers (is_float, is_64bit, etc.) -├── pool.h # Operand pool management -├── vreg.h # Virtual register management -├── live.h # Liveness analysis -├── stack.h # Stack layout, spill slots -├── mat.h # Value materialization -├── opt.h # Optimizations -├── codegen.h # Codegen helpers -├── dump.h # Debug dumping -└── operand.h # IROperand definitions (moved from root) -``` - -### 2. 
Consistent Naming Convention Established - -#### Public API Pattern: `tcc_ir__` - -| Module | Old Name | New Name | -|--------|----------|----------| -| Core | `tcc_ir_allocate_block()` | `tcc_ir_alloc()` | -| Core | `tcc_ir_release_block()` | `tcc_ir_free()` | -| Core | `tcc_ir_gen_opi()` | `tcc_ir_gen_i()` | -| Core | `tcc_ir_gen_opf()` | `tcc_ir_gen_f()` | -| VReg | `tcc_ir_get_vreg_temp()` | `tcc_ir_vreg_alloc_temp()` | -| VReg | `tcc_ir_set_float_type()` | `tcc_ir_vreg_type_set_fp()` | -| Live | `tcc_ir_liveness_analysis()` | `tcc_ir_live_analysis()` | -| Live | `tcc_ir_compute_live_intervals()` | `tcc_ir_live_intervals_compute()` | -| Stack | `tcc_ir_build_stack_layout()` | `tcc_ir_stack_layout_build()` | -| Mat | `tcc_ir_materialize_value()` | `tcc_ir_mat_value()` | -| Opt | `tcc_ir_dead_code_elimination()` | `tcc_ir_opt_dce()` | -| Opt | `tcc_ir_constant_propagation()` | `tcc_ir_opt_const_prop()` | -| Codegen | `tcc_ir_codegen_get_operand()` | `tcc_ir_codegen_operand_get()` | -| Dump | `tcc_ir_show()` | `tcc_ir_dump()` | - -### 3. Supporting Infrastructure Created - -#### tccmachine.h / tccmachine.c -- Abstract machine interface (vtable pattern) -- Opaque scratch register handles -- Architecture-independent materialization requests - -#### tccopt.h / tccopt.c -- FP offset materialization cache (moved from tccir.c) -- Pluggable optimization pass structure -- Optimization driver functions - -#### tccir.h Updates -- Added `TCCFPMatCache` forward declaration -- Added `opt_fp_mat_cache` field to `TCCIRState` - -### 4. Build System Updates - -#### Makefile -- Added `tccmachine.c` and `tccopt.c` to CORE_FILES -- Added corresponding headers - -### 5. 
Backward Compatibility - -- tccir.h remains the public API at the project root -- All existing code compiles without modification -- All 480 tests pass - -## Module Dependencies - -``` -type (no deps) - ↓ -pool (uses type) - ↓ -vreg (uses pool, type) - ↓ -stack (uses vreg) -live (uses vreg) - ↓ -core (uses pool, vreg, type) -mat (uses stack, vreg) - ↓ -codegen (uses mat, live) -opt (uses core) -dump (uses all) -``` - -## Next Steps (Future Work) - -### Phase 2: Split tccir.c Implementation - -1. Create `ir/type.c` with type helper implementations -2. Create `ir/pool.c` with pool management -3. Create `ir/vreg.c` with vreg operations -4. Continue with other modules... - -### Phase 3: Update Build System - -1. Add `ir/*.c` to Makefile compilation -2. Remove original `tccir.c` when complete - -### Phase 4: Implement New Machine Interface - -1. Create `arm-thumb-machine.c` implementing `TCCMachineInterface` -2. Migrate materialization code to use interface -3. Remove architecture-dependent code from IR layer - -## API Reference - -See individual header files in `ir/` for complete API documentation: -- `core.h` - IR block lifecycle, instruction insertion -- `vreg.h` - Virtual register allocation, type setting -- `live.h` - Liveness analysis, live intervals -- `stack.h` - Stack layout, spill slots -- `mat.h` - Value materialization -- `opt.h` - Optimization passes -- `codegen.h` - Code generation helpers -- `dump.h` - Debug output - -## Testing - -All tests pass: -- IR tests: 606/606 ✓ (+ GCC torture: 3310 passed, 79 skipped, 582 xfailed) -- Assembler tests: 156/156 ✓ -- Internal tests: 63/63 ✓ -- AEABI tests: 13/13 ✓ - -## Codegen Architecture - -`ir/codegen.c` uses a single unified two-pass loop (`for (pass = 0; pass < 2; pass++)`): -- **Pass 0 (dry-run)**: discovers scratch register needs, collects branch offsets — `ot()` is a no-op. -- **Inter-pass**: analyzes branch encodings, checks LR usage, runs scratch conflict fixup, emits prologue. 
-- **Pass 1 (real-run)**: emits actual Thumb-2 machine code using dry-run data for consistency checks. - -Both passes share a single `switch (cq->op)` dispatch. Pass-specific behavior uses `if (is_dry_run)` / `if (!is_dry_run)` guards. Adding a new IR op requires adding only one `case`. diff --git a/ir/cfg.c b/ir/cfg.c new file mode 100644 index 00000000..c00c858c --- /dev/null +++ b/ir/cfg.c @@ -0,0 +1,334 @@ +/* + * TCC IR - Control Flow Graph and Dominator Tree + * + * Copyright (c) 2025 Mateusz Stadnik + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation. + */ + +#define USING_GLOBALS +#include "ir.h" +#include "cfg.h" + +static void cfg_add_edge(IRCFG *cfg, int from, int to) +{ + IRBasicBlock *fb = &cfg->blocks[from]; + IRBasicBlock *tb = &cfg->blocks[to]; + /* Avoid duplicate successor edges */ + for (int i = 0; i < fb->num_succs; i++) + if (fb->succs[i] == to) + goto add_pred; + if (fb->num_succs >= fb->succs_cap) { + int nc = fb->succs_cap ? fb->succs_cap * 2 : 4; + fb->succs = tcc_realloc(fb->succs, nc * sizeof(int)); + fb->succs_cap = nc; + } + fb->succs[fb->num_succs++] = to; +add_pred: + if (tb->num_preds >= tb->preds_cap) { + int nc = tb->preds_cap ? tb->preds_cap * 2 : 4; + tb->preds = tcc_realloc(tb->preds, nc * sizeof(int)); + tb->preds_cap = nc; + } + tb->preds[tb->num_preds++] = from; +} + +IRCFG *tcc_ir_cfg_build(TCCIRState *ir) +{ + int n = ir->next_instruction_index; + if (n == 0) + return NULL; + + IRCFG *cfg = tcc_mallocz(sizeof(IRCFG)); + cfg->num_instrs = n; + + /* Mark leaders — recompute jump targets from scratch (don't trust + * stale is_jump_target flags from previous optimization passes). 
*/ + uint8_t *is_leader = tcc_mallocz(n); + is_leader[0] = 1; + for (int i = 0; i < n; i++) { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_JUMP || q->op == TCCIR_OP_JUMPIF) { + IROperand dest = tcc_ir_op_get_dest(ir, q); + int target = (int)irop_get_imm64_ex(ir, dest); + if (target >= 0 && target < n) { + is_leader[target] = 1; + } + } + if (q->op == TCCIR_OP_JUMP || q->op == TCCIR_OP_JUMPIF || + q->op == TCCIR_OP_RETURNVALUE || q->op == TCCIR_OP_RETURNVOID || + q->op == TCCIR_OP_IJUMP || q->op == TCCIR_OP_SWITCH_TABLE) { + if (i + 1 < n) { + is_leader[i + 1] = 1; + } + } + } + + /* Count blocks */ + int nb = 0; + for (int i = 0; i < n; i++) + if (is_leader[i]) + nb++; + + cfg->capacity = nb; + cfg->blocks = tcc_mallocz(nb * sizeof(IRBasicBlock)); + cfg->instr_to_block = tcc_mallocz(n * sizeof(int)); + + /* Create blocks */ + int bi = -1; + for (int i = 0; i < n; i++) { + if (is_leader[i]) { + if (bi >= 0) + cfg->blocks[bi].end_idx = i; + bi++; + cfg->blocks[bi].start_idx = i; + cfg->blocks[bi].idom = -1; + cfg->blocks[bi].rpo_number = -1; + } + cfg->instr_to_block[i] = bi; + } + if (bi >= 0) + cfg->blocks[bi].end_idx = n; + cfg->num_blocks = bi + 1; + + tcc_free(is_leader); + + /* Build edges */ + for (int b = 0; b < cfg->num_blocks; b++) { + int last = cfg->blocks[b].end_idx - 1; + if (last < cfg->blocks[b].start_idx) + continue; + IRQuadCompact *q = &ir->compact_instructions[last]; + + if (q->op == TCCIR_OP_JUMP) { + IROperand dest = tcc_ir_op_get_dest(ir, q); + int target = (int)irop_get_imm64_ex(ir, dest); + if (target >= 0 && target < n) + cfg_add_edge(cfg, b, cfg->instr_to_block[target]); + } + else if (q->op == TCCIR_OP_JUMPIF) { + IROperand dest = tcc_ir_op_get_dest(ir, q); + int target = (int)irop_get_imm64_ex(ir, dest); + if (target >= 0 && target < n) + cfg_add_edge(cfg, b, cfg->instr_to_block[target]); + if (b + 1 < cfg->num_blocks) + cfg_add_edge(cfg, b, b + 1); + } + else if (q->op == TCCIR_OP_SWITCH_TABLE) { + IROperand 
src2 = tcc_ir_op_get_src2(ir, q); + int table_id = (int)irop_get_imm64_ex(ir, src2); + if (table_id >= 0 && table_id < ir->num_switch_tables) { + TCCIRSwitchTable *table = &ir->switch_tables[table_id]; + for (int ti = 0; ti < table->num_entries; ti++) { + int target = table->targets[ti]; + if (target >= 0 && target < n) + cfg_add_edge(cfg, b, cfg->instr_to_block[target]); + } + } + } + else if (q->op == TCCIR_OP_RETURNVALUE || q->op == TCCIR_OP_RETURNVOID || + q->op == TCCIR_OP_IJUMP) { + /* no successors (IJUMP: conservative — skip loops containing it) */ + } + else { + if (b + 1 < cfg->num_blocks) + cfg_add_edge(cfg, b, b + 1); + } + } + + return cfg; +} + +void tcc_ir_cfg_free(IRCFG *cfg) +{ + if (!cfg) + return; + for (int i = 0; i < cfg->num_blocks; i++) { + tcc_free(cfg->blocks[i].succs); + tcc_free(cfg->blocks[i].preds); + tcc_free(cfg->blocks[i].dom_frontier); + tcc_free(cfg->blocks[i].dom_children); + } + tcc_free(cfg->blocks); + tcc_free(cfg->rpo_order); + tcc_free(cfg->instr_to_block); + tcc_free(cfg); +} + +/* Iterative DFS for reverse postorder */ +static void cfg_compute_rpo(IRCFG *cfg) +{ + int nb = cfg->num_blocks; + if (nb == 0) + return; + + uint8_t *visited = tcc_mallocz(nb); + int *postorder = tcc_mallocz(nb * sizeof(int)); + int po_count = 0; + + /* Iterative DFS using explicit stack: (block, child_index) */ + typedef struct { int block; int ci; } DFSFrame; + DFSFrame *stack = tcc_mallocz(nb * sizeof(DFSFrame)); + int sp = 0; + + visited[0] = 1; + stack[sp++] = (DFSFrame){0, 0}; + + while (sp > 0) { + DFSFrame *top = &stack[sp - 1]; + IRBasicBlock *bb = &cfg->blocks[top->block]; + if (top->ci < bb->num_succs) { + int s = bb->succs[top->ci]; + top->ci++; + if (s >= 0 && s < nb && !visited[s]) { + visited[s] = 1; + stack[sp++] = (DFSFrame){s, 0}; + } + } + else { + postorder[po_count++] = top->block; + sp--; + } + } + + /* Reverse postorder */ + cfg->rpo_order = tcc_mallocz(po_count * sizeof(int)); + cfg->rpo_count = po_count; + for (int i = 0; i 
< po_count; i++) { + int b = postorder[po_count - 1 - i]; + cfg->rpo_order[i] = b; + cfg->blocks[b].rpo_number = i; + } + + tcc_free(visited); + tcc_free(postorder); + tcc_free(stack); +} + +/* Cooper-Harvey-Kennedy dominator tree */ +static int cfg_intersect(IRCFG *cfg, int b1, int b2) +{ + while (b1 != b2) { + while (cfg->blocks[b1].rpo_number > cfg->blocks[b2].rpo_number) + b1 = cfg->blocks[b1].idom; + while (cfg->blocks[b2].rpo_number > cfg->blocks[b1].rpo_number) + b2 = cfg->blocks[b2].idom; + } + return b1; +} + +void tcc_ir_cfg_compute_dominators(IRCFG *cfg) +{ + if (!cfg || cfg->num_blocks == 0) + return; + + cfg_compute_rpo(cfg); + + /* Entry dominates itself */ + cfg->blocks[0].idom = 0; + + int changed = 1; + while (changed) { + changed = 0; + for (int ri = 0; ri < cfg->rpo_count; ri++) { + int b = cfg->rpo_order[ri]; + if (b == 0) + continue; + IRBasicBlock *bb = &cfg->blocks[b]; + int new_idom = -1; + for (int pi = 0; pi < bb->num_preds; pi++) { + int p = bb->preds[pi]; + if (cfg->blocks[p].idom == -1) + continue; + if (new_idom == -1) + new_idom = p; + else + new_idom = cfg_intersect(cfg, new_idom, p); + } + if (new_idom >= 0 && new_idom != bb->idom) { + bb->idom = new_idom; + changed = 1; + } + } + } +} + +int tcc_ir_cfg_dominates(IRCFG *cfg, int a, int b) +{ + if (!cfg || a < 0 || b < 0 || a >= cfg->num_blocks || b >= cfg->num_blocks) + return 0; + while (b >= 0) { + if (b == a) + return 1; + if (b == cfg->blocks[b].idom) + return b == a; + b = cfg->blocks[b].idom; + } + return 0; +} + +static void cfg_add_df(IRBasicBlock *bb, int df_block, uint8_t *df_seen) +{ + if (df_seen[df_block / 8] & (1 << (df_block % 8))) + return; + df_seen[df_block / 8] |= (1 << (df_block % 8)); + if (bb->num_df >= bb->df_cap) { + int nc = bb->df_cap ? 
bb->df_cap * 2 : 4; + bb->dom_frontier = tcc_realloc(bb->dom_frontier, nc * sizeof(int)); + bb->df_cap = nc; + } + bb->dom_frontier[bb->num_df++] = df_block; +} + +static void cfg_add_dom_child(IRBasicBlock *bb, int child) +{ + if (bb->num_dom_children >= bb->dom_children_cap) { + int nc = bb->dom_children_cap ? bb->dom_children_cap * 2 : 4; + bb->dom_children = tcc_realloc(bb->dom_children, nc * sizeof(int)); + bb->dom_children_cap = nc; + } + bb->dom_children[bb->num_dom_children++] = child; +} + +void tcc_ir_cfg_compute_dom_frontiers(IRCFG *cfg) +{ + if (!cfg || cfg->num_blocks == 0) + return; + + /* Build dominator tree children lists */ + for (int b = 1; b < cfg->num_blocks; b++) { + int idom = cfg->blocks[b].idom; + if (idom >= 0 && idom != b) + cfg_add_dom_child(&cfg->blocks[idom], b); + } + + /* Compute dominance frontier using the standard algorithm. + * Per-block bitset avoids O(n^2) duplicate checks in cfg_add_df. */ + int nb = cfg->num_blocks; + int df_seen_bytes = (nb + 7) / 8; + uint8_t *df_seen = tcc_mallocz(nb * df_seen_bytes); + + for (int b = 0; b < nb; b++) { + IRBasicBlock *bb = &cfg->blocks[b]; + if (bb->num_preds < 2) + continue; + if (bb->idom < 0) + continue; + for (int pi = 0; pi < bb->num_preds; pi++) { + int runner = bb->preds[pi]; + if (runner < 0 || cfg->blocks[runner].idom < 0) + continue; + int steps = 0; + while (runner != bb->idom && steps < nb) { + cfg_add_df(&cfg->blocks[runner], b, &df_seen[runner * df_seen_bytes]); + if (runner == cfg->blocks[runner].idom) + break; + runner = cfg->blocks[runner].idom; + steps++; + } + } + } + tcc_free(df_seen); +} diff --git a/ir/cfg.h b/ir/cfg.h new file mode 100644 index 00000000..75540b47 --- /dev/null +++ b/ir/cfg.h @@ -0,0 +1,43 @@ +#ifndef TCC_IR_CFG_H +#define TCC_IR_CFG_H + +struct TCCIRState; + +typedef struct IRBasicBlock +{ + int start_idx; + int end_idx; + int *succs; + int num_succs; + int succs_cap; + int *preds; + int num_preds; + int preds_cap; + int idom; + int rpo_number; + int 
*dom_frontier; + int num_df; + int df_cap; + int *dom_children; + int num_dom_children; + int dom_children_cap; +} IRBasicBlock; + +typedef struct IRCFG +{ + IRBasicBlock *blocks; + int num_blocks; + int capacity; + int *rpo_order; + int rpo_count; + int *instr_to_block; + int num_instrs; +} IRCFG; + +IRCFG *tcc_ir_cfg_build(struct TCCIRState *ir); +void tcc_ir_cfg_free(IRCFG *cfg); +void tcc_ir_cfg_compute_dominators(IRCFG *cfg); +void tcc_ir_cfg_compute_dom_frontiers(IRCFG *cfg); +int tcc_ir_cfg_dominates(IRCFG *cfg, int a, int b); + +#endif diff --git a/ir/codegen.c b/ir/codegen.c index 95dc1b5c..21ef60c4 100644 --- a/ir/codegen.c +++ b/ir/codegen.c @@ -323,6 +323,66 @@ void tcc_ir_mark_return_value_incoming_regs(TCCIRState *ir) else interval->incoming_reg1 = -1; } + + /* Mark the root source vreg of RETURNVALUE with incoming_reg0=0 + * as a hint for the post-allocation swap pass. Only at -O1+ to + * avoid interfering with -O0 codegen paths that read incoming_reg0. */ + if (tcc_state->optimize < 1) + return; + + for (int i = 0; i < ir->next_instruction_index; ++i) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op != TCCIR_OP_RETURNVALUE) + continue; + + const IROperand src = tcc_ir_op_get_src1(ir, q); + if (!irop_has_vreg(src) || irop_is_immediate(src)) + continue; + + int32_t vr = irop_get_vreg(src); + + for (int depth = 0; depth < 5; depth++) + { + if (vr < 0 || !tcc_ir_vreg_is_valid(ir, vr)) + break; + if (TCCIR_DECODE_VREG_TYPE(vr) == TCCIR_VREG_TYPE_PARAM) + break; + int found = 0; + for (int j = i - 1; j >= 0; j--) + { + IRQuadCompact *dq = &ir->compact_instructions[j]; + if (dq->op == TCCIR_OP_NOP) + continue; + if (!irop_config[dq->op].has_dest) + continue; + IROperand dd = tcc_ir_op_get_dest(ir, dq); + if (irop_get_vreg(dd) != vr) + continue; + if (dq->op == TCCIR_OP_LOAD || dq->op == TCCIR_OP_ASSIGN) + { + IROperand ds = tcc_ir_op_get_src1(ir, dq); + if (irop_has_vreg(ds) && + TCCIR_DECODE_VREG_TYPE(irop_get_vreg(ds)) != 
TCCIR_VREG_TYPE_PARAM) + { + vr = irop_get_vreg(ds); + found = 1; + } + } + break; + } + if (!found) + break; + } + + if (vr >= 0 && tcc_ir_vreg_is_valid(ir, vr) && + TCCIR_DECODE_VREG_TYPE(vr) != TCCIR_VREG_TYPE_PARAM) + { + IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, vr); + if (interval && interval->incoming_reg0 < 0) + interval->incoming_reg0 = 0; /* hint: prefer r0 */ + } + } } void tcc_ir_avoid_spilling_stack_passed_params(TCCIRState *ir) @@ -867,6 +927,16 @@ static void tcc_ir_codegen_inline_asm_by_id(TCCIRState *ir, int id) * allocatable "r" registers in IR mode. */ for (int i = 0; i < nb_operands; ++i) { + /* For an lvalue operand such as "+r"(*p), pr0_reg holds the ADDRESS of + * the value (the pointer), not the value itself. That pointer is read + * both by the prolog load (ldr op->reg,[ptr]) and the epilog store + * (str op->reg,[ptr]), so it must survive across the asm body. If we + * un-reserved it, the constraint solver could pick the same register + * for op->reg, and the prolog load would clobber the pointer before the + * store ran. Keep lvalue-operand registers reserved so the value gets a + * distinct register. */ + if (vals[i].r & VT_LVAL) + continue; if (!vals[i].pr0_spilled && vals[i].pr0_reg != PREG_REG_NONE && vals[i].pr0_reg < NB_ASM_REGS) reserved_regs[vals[i].pr0_reg] = 0; if (!vals[i].pr1_spilled && vals[i].pr1_reg != PREG_REG_NONE && vals[i].pr1_reg < NB_ASM_REGS) @@ -971,7 +1041,7 @@ static int try_reassign_scratch_conflict(TCCIRState *ir, int r, int insn_i) * R10 = static_chain_reg (= 10): reserved when function uses a static chain. 
*/ const uint32_t ALL_CALLEE_SAVED = 0x0FF0u; - const uint32_t ARM_FP_REG = 7u; /* R_FP = R7, defined in arm-thumb-opcodes.h */ + const uint32_t ARM_FP_REG = 7u; /* R_FP = R7, defined in thumb.h */ const uint32_t ARM_R9 = 9u; /* R9 = GOT base pointer when text_and_data_separation */ uint32_t reserved = (1u << ARM_FP_REG); /* always exclude frame pointer */ if (tcc_state->text_and_data_separation) @@ -1016,6 +1086,10 @@ static int try_reassign_scratch_conflict(TCCIRState *ir, int r, int insn_i) * cause the codegen to look in the wrong register after a call/entry. */ if (ir_iv->incoming_reg0 >= 0) return -1; + /* Skip phi-pinned intervals: their register is relied upon by identity phi + * resolution (no copy was emitted because src and dest share the same reg). */ + if (ir_iv->phi_pinned) + return -1; /* Compute the union of live register masks across [ls_iv->start .. ls_iv->end]. * Any register set in this union is occupied by some other live vreg and @@ -1035,6 +1109,9 @@ static int try_reassign_scratch_conflict(TCCIRState *ir, int r, int insn_i) int new_r = (int)__builtin_ctz(avail); /* lowest-numbered free callee-saved */ /* --- Apply the reassignment --- */ + if (tcc_state && tcc_state->verbose) + fprintf(stderr, "[phase3-fixup] insn=%d vreg=%d R%d->R%d (blocked=0x%x avail=0x%x)\n", insn_i, (int)ls_iv->vreg, r, + new_r, blocked, avail); /* 1. Update the IRLiveInterval (read by machine_op_from_ir). */ ir_iv->allocation.r0 = (uint16_t)new_r; @@ -1089,6 +1166,180 @@ static void mop_fixup_subcomponent(MachineOperand *mop, const IROperand *op, TCC } } +/* Check whether any live interval (other than skip_vreg) is allocated to + * physical register `reg` and overlaps the range [start, end]. Returns + * true if a conflict exists, meaning we cannot reassign skip_vreg to `reg`. 
*/ +static bool ir_reg_conflict(const TCCIRState *ir, int reg, uint32_t start, uint32_t end, int skip_vreg) +{ /* Scans the local-variable, temporary and parameter interval pools; the range [start, end] is inclusive at both ends. */ + const struct + { + const IRLiveInterval *arr; + int count; + } pools[] = { + {ir->variables_live_intervals, ir->next_local_variable}, + {ir->temporary_variables_live_intervals, ir->next_temporary_variable}, + {ir->parameters_live_intervals, ir->next_parameter}, + }; + for (int p = 0; p < 3; p++) + { + for (int k = 0; k < pools[p].count; k++) + { + const IRLiveInterval *other = &pools[p].arr[k]; + if (other->allocation.r0 != (uint16_t)reg && other->allocation.r1 != (uint16_t)reg) /* r1 is the second register of a 64-bit pair and occupies `reg` just like r0; checking it can only make the answer more conservative */ + continue; + /* Skip the vreg we're about to reassign */ + if (p == 0 && k == TCCIR_DECODE_VREG_POSITION(skip_vreg) && + TCCIR_DECODE_VREG_TYPE(skip_vreg) == TCCIR_VREG_TYPE_VAR) + continue; + if (p == 1 && k == TCCIR_DECODE_VREG_POSITION(skip_vreg) && + TCCIR_DECODE_VREG_TYPE(skip_vreg) == TCCIR_VREG_TYPE_TEMP) + continue; + if (p == 2 && k == TCCIR_DECODE_VREG_POSITION(skip_vreg) && + TCCIR_DECODE_VREG_TYPE(skip_vreg) == TCCIR_VREG_TYPE_PARAM) + continue; + /* Check overlap: intervals [other->start, other->end] ∩ [start, end] */ + if (other->start <= end && other->end >= start) + return true; + } + } + return false; +} + +/* ============================================================================ + * Pre-prologue FUNCPARAMVAL allocation patching + * + * When a LOAD/LOAD_INDEXED/LOAD_POSTINC/ASSIGN/SELECT/FUNCCALLVAL produces a value that is + * immediately consumed by FUNCPARAMVAL (param 0..3), the codegen peephole + * would load directly into the ABI register (R0-R3) instead of the + * allocator-assigned register. If the allocator assigned a callee-saved + * register, it becomes a ghost save. + * + * This pre-pass patches those allocations BEFORE prologue emission so that + * dirty_registers can be recomputed accurately. It mirrors the logic in + * ir_codegen_before_ret_peephole's FUNCPARAMVAL branch, but only patches + * allocations (no MachineOperand output needed).
+ * ============================================================================ */ +static void ir_codegen_pre_patch_funcparam_allocations(TCCIRState *ir) +{ /* For every qualifying producer whose result feeds FUNCPARAMVAL with param_idx 0..3, rewrites the producer's live-interval allocation.r0 to param_idx and clears allocation.offset. */ + for (int i = 0; i < ir->next_instruction_index; i++) + { + TccIrOp op = ir->compact_instructions[i].op; + /* Only instructions that produce a value and use peephole (.dest = 2) */ + if (op != TCCIR_OP_LOAD && op != TCCIR_OP_LOAD_INDEXED && op != TCCIR_OP_LOAD_POSTINC && op != TCCIR_OP_ASSIGN && + op != TCCIR_OP_SELECT && op != TCCIR_OP_FUNCCALLVAL) + continue; + + IROperand dest_ir = tcc_ir_op_get_dest(ir, &ir->compact_instructions[i]); + int dest_vr = irop_get_vreg(dest_ir); + if (dest_vr < 0) + continue; + + /* 64-bit values need register pairs — skip (peephole skips them too) */ + if (irop_needs_pair(dest_ir)) + continue; + + /* Find next non-NOP instruction */ + int j = i + 1; + while (j < ir->next_instruction_index && ir->compact_instructions[j].op == TCCIR_OP_NOP) + j++; + if (j >= ir->next_instruction_index) + continue; + + /* Must also not be a jump target (another path could reach it without + * executing instruction i, making the peephole unsafe). 
*/ + if (j < ir->next_instruction_index && ir->compact_instructions[j].is_jump_target) /* the bounds test repeats the early-continue just above and is always true here */ + continue; + + if (ir->compact_instructions[j].op != TCCIR_OP_FUNCPARAMVAL) + continue; + + IROperand nq_src1 = tcc_ir_op_get_src1(ir, &ir->compact_instructions[j]); + int next_vr = irop_get_vreg(nq_src1); + if (next_vr != dest_vr) + continue; + + if (irop_op_is_lval(nq_src1)) + continue; + + IRLiveInterval *li = tcc_ir_get_live_interval(ir, dest_vr); + if (!li || li->start != (uint32_t)i) /* the interval must start exactly at instruction i */ + continue; + + /* Find the CALL that consumes this parameter */ + int call_idx = -1; + for (int k = j + 1; k < ir->next_instruction_index; k++) + { + TccIrOp kop = ir->compact_instructions[k].op; + if (kop == TCCIR_OP_NOP || kop == TCCIR_OP_FUNCPARAMVAL) + continue; + if (kop == TCCIR_OP_FUNCCALLVAL || kop == TCCIR_OP_FUNCCALLVOID) + { + call_idx = k; + break; + } + break; /* any other opcode before a call: give up on this candidate */ + } + if (call_idx < 0 || li->end != (uint32_t)call_idx) /* the interval must end exactly at the call */ + continue; + + /* Decode parameter index */ + IROperand nq_src2 = tcc_ir_op_get_src2(ir, &ir->compact_instructions[j]); + uint32_t encoded = (uint32_t)irop_get_imm64_ex(ir, nq_src2); + int param_idx = TCCIR_DECODE_PARAM_IDX(encoded); + if (param_idx > 3) /* NOTE(review): a negative param_idx is not rejected here — confirm TCCIR_DECODE_PARAM_IDX cannot produce one */ + continue; + + int target_reg = param_idx; + + /* Check no register conflict */ + if (ir_reg_conflict(ir, target_reg, li->start, li->end > 0 ? li->end - 1 : 0, dest_vr)) /* checked range stops at li->end - 1, i.e. one instruction before the call */ + continue; + + /* Patch allocation to ABI register */ + li->allocation.r0 = (uint16_t)target_reg; + li->allocation.offset = 0; + } +} + +/* ============================================================================ + * Recompute dirty_registers from actual IRLiveInterval allocations + * + * After peephole/pre-patch optimizations change IRLiveInterval.allocation, + * the allocator's dirty_registers bitmap may contain callee-saved registers + * that are no longer referenced by any interval. Rebuild from ground truth. 
+ * ============================================================================ */ +static void ir_codegen_recompute_dirty_from_allocations(TCCIRState *ir) +{ /* Clears from ir->ls.dirty_registers every bit inside callee_mask that no parameter/variable/temporary interval names in allocation.r0 or allocation.r1. */ + uint64_t callee_mask = 0; + for (int r = 4; r <= 11; ++r) + callee_mask |= (1ULL << r); /* sets bits 4..11 */ + + /* Collect registers actually referenced by any interval allocation. */ + uint64_t used = 0; + +#define SCAN_INTERVALS(arr, count) \ + for (int _i = 0; _i < (count); ++_i) \ + { \ + const IRLiveInterval *_li = &(arr)[_i]; \ + uint16_t _r0 = _li->allocation.r0; \ + uint16_t _r1 = _li->allocation.r1; \ + if (!(_r0 & PREG_SPILLED) && _r0 != PREG_NONE && _r0 < 16) \ + used |= (1ULL << _r0); \ + if (!(_r1 & PREG_SPILLED) && _r1 != PREG_NONE && _r1 < 16) \ + used |= (1ULL << _r1); \ + } + + SCAN_INTERVALS(ir->parameters_live_intervals, ir->next_parameter); + SCAN_INTERVALS(ir->variables_live_intervals, ir->next_local_variable); + SCAN_INTERVALS(ir->temporary_variables_live_intervals, ir->next_temporary_variable); +#undef SCAN_INTERVALS + + uint64_t old_dirty = ir->ls.dirty_registers; + uint64_t non_callee = old_dirty & ~callee_mask; + uint64_t callee_dirty = old_dirty & callee_mask; + ir->ls.dirty_registers = non_callee | (callee_dirty & used); /* can only clear bits inside callee_mask; bits outside it are kept unchanged and no new bit is ever set */ +} + /* ============================================================================ * Before-Return Peephole * @@ -1102,39 +1353,211 @@ static void mop_fixup_subcomponent(MachineOperand *mop, const IROperand *op, TCC * accounting stays consistent. 
* ============================================================================ */ static bool ir_codegen_before_ret_peephole(TCCIRState *ir, int i, const IROperand *dest_ir, - const uint8_t *has_incoming_jump, MachineOperand *out_mop_dest) + MachineOperand *out_mop_dest) { - if (i + 1 >= ir->next_instruction_index) + int dest_vr = irop_get_vreg(*dest_ir); + if (dest_vr < 0) return false; - const IRQuadCompact *nq = &ir->compact_instructions[i + 1]; - if (nq->op != TCCIR_OP_RETURNVALUE || has_incoming_jump[i + 1]) + /* Find the next non-NOP instruction, skipping over dead code. + * NOPs are skipped regardless of is_jump_target — a branch landing on + * a NOP falls through without affecting register state, so the peephole + * assumption ("instruction i's result flows to j") remains valid. + * Only check is_jump_target on the actual consumer (non-NOP) — if a + * branch can reach it without executing instruction i, the peephole + * would produce wrong code. */ + int j = i + 1; + while (j < ir->next_instruction_index) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + break; + j++; + } /* NOTE(review): a branch that lands on one of the skipped NOPs also reaches j without executing i — confirm that case cannot carry a live dest_vr */ + if (j >= ir->next_instruction_index) return false; - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - int next_vr = irop_get_vreg(nq_src1); - int dest_vr = irop_get_vreg(*dest_ir); - if (next_vr != dest_vr || dest_vr < 0) + if (ir->compact_instructions[j].is_jump_target) + { return false; + } - IRLiveInterval *li = tcc_ir_get_live_interval(ir, dest_vr); + const IRQuadCompact *nq = &ir->compact_instructions[j]; const int needs_pair = irop_needs_pair(*dest_ir); - if (li) + + if (nq->op == TCCIR_OP_RETURNVALUE) { + IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); + int next_vr = irop_get_vreg(nq_src1); + if (next_vr != dest_vr) + return false; + + IRLiveInterval *li = tcc_ir_get_live_interval(ir, dest_vr); + if (!li || li->start != (uint32_t)i) + return false; + li->allocation.r0 = REG_IRET; li->allocation.offset = 0; if (needs_pair) li->allocation.r1 = REG_IRE2; + + *out_mop_dest 
= (MachineOperand){.kind = MACH_OP_REG, + .btype = irop_get_btype(*dest_ir), + .vreg = dest_vr, + .is_64bit = needs_pair, + .is_unsigned = dest_ir->is_unsigned, + .needs_deref = false, + .u.reg = {.r0 = REG_IRET, .r1 = needs_pair ? (int)REG_IRE2 : -1}}; + return true; + } + + /* Peephole: when the next instruction is an ASSIGN that just copies our dest + * vreg into another register, load directly into the ASSIGN's destination. + * This eliminates "ldr rT, [pc,#imm]; mov rD, rT" sequences. + * + * Safety: the dest vreg must die at the ASSIGN (end == j, the next non-NOP instruction), ensuring no + * other instruction reads the old allocation. */ + if (nq->op == TCCIR_OP_ASSIGN) + { + IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); + int next_vr = irop_get_vreg(nq_src1); + if (next_vr != dest_vr) + return false; + + /* dest vreg must be defined here and die at the ASSIGN — no earlier + * definitions and no later uses. If li->start < i, another instruction + * (e.g. conditional set on an alternate path) also defines the vreg; + * patching the global allocation would break that earlier instruction. */ + IRLiveInterval *li = tcc_ir_get_live_interval(ir, dest_vr); + if (!li || li->start != (uint32_t)i || li->end != (uint32_t)j) + return false; + + /* Get the ASSIGN's destination vreg and its register allocation */ + IROperand nq_dest = tcc_ir_op_get_dest(ir, nq); + int assign_dest_vr = irop_get_vreg(nq_dest); + if (assign_dest_vr < 0) + return false; + + /* Both source and destination must have matching pair requirements */ + int dest_needs_pair = irop_needs_pair(nq_dest); + if (dest_needs_pair != needs_pair) + return false; + + IRLiveInterval *dest_li = tcc_ir_get_live_interval(ir, assign_dest_vr); + if (!dest_li) + return false; + + int target_r0 = (int)dest_li->allocation.r0; + int target_r1 = needs_pair ? (int)dest_li->allocation.r1 : -1; /* NOTE(review): unlike the FUNCPARAMVAL branch, this branch makes no ir_reg_conflict() check on the target register(s) — confirm that is intentional */ + + /* Target must be a valid physical register, not spilled (r0 >= NB_REGS + * means spilled to stack — e.g. r0=63 is a spill sentinel). 
*/ + if (target_r0 >= NB_REGS) + return false; + + /* For 64-bit pairs, both halves must have valid registers */ + if (needs_pair && (target_r1 < 0 || target_r1 >= NB_REGS)) + return false; + + /* Patch the source vreg's allocation to the ASSIGN's destination register */ + li->allocation.r0 = (uint16_t)target_r0; + li->allocation.offset = 0; + if (needs_pair) + li->allocation.r1 = (uint16_t)target_r1; + + *out_mop_dest = (MachineOperand){.kind = MACH_OP_REG, + .btype = irop_get_btype(*dest_ir), + .vreg = dest_vr, + .is_64bit = needs_pair, + .is_unsigned = dest_ir->is_unsigned, + .needs_deref = false, + .u.reg = {.r0 = target_r0, .r1 = target_r1}}; + return true; + } + + /* Peephole: when the next non-NOP instruction is FUNCPARAMVAL using our dest + * vreg as a 32-bit scalar argument, load directly into the parameter register + * (R0+param_index). This eliminates "ldr rT, ...; mov r0, rT" sequences + * generated when a SELECT/LOAD result feeds a function call parameter. + * + * Only applies to simple 32-bit scalar arguments (param_index 0..3). */ + if (nq->op == TCCIR_OP_FUNCPARAMVAL && !needs_pair) + { + IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); + int next_vr = irop_get_vreg(nq_src1); + if (next_vr != dest_vr) + { + return false; + } + + if (irop_op_is_lval(nq_src1)) + { + return false; + } + + IRLiveInterval *li = tcc_ir_get_live_interval(ir, dest_vr); + if (!li) + { + return false; + } + + if (li->start != (uint32_t)i) + { + return false; + } + + /* Scan forward from j+1 to find the FUNCCALLVAL that consumes + * this parameter. The vreg must end at that CALL instruction. 
*/ + int call_idx = -1; + for (int k = j + 1; k < ir->next_instruction_index; k++) + { + TccIrOp kop = ir->compact_instructions[k].op; + if (kop == TCCIR_OP_NOP || kop == TCCIR_OP_FUNCPARAMVAL) + continue; + if (kop == TCCIR_OP_FUNCCALLVAL || kop == TCCIR_OP_FUNCCALLVOID) + { + call_idx = k; + break; + } + break; /* unexpected instruction — bail */ + } + if (call_idx < 0 || li->end != (uint32_t)call_idx) + { + return false; + } + + /* Decode parameter index from src2 (packed call_id | param_idx) */ + IROperand nq_src2 = tcc_ir_op_get_src2(ir, nq); + uint32_t encoded = (uint32_t)irop_get_imm64_ex(ir, nq_src2); + int param_idx = TCCIR_DECODE_PARAM_IDX(encoded); + + /* Only handle scalar register parameters (param 0..3 → R0..R3) */ + if (param_idx > 3) + return false; + + int target_reg = param_idx; /* R0=0, R1=1, R2=2, R3=3 */ + + /* Bail if another vreg already occupies target_reg during our live range. + * Use end-1: the value must be in the target reg up to (but not including) + * the call instruction where it is consumed; the call's return value (a + * different vreg) may start at exactly li->end in the same register. */ + if (ir_reg_conflict(ir, target_reg, li->start, li->end > 0 ? li->end - 1 : 0, dest_vr)) + return false; + + /* Patch allocation */ + li->allocation.r0 = (uint16_t)target_reg; + li->allocation.offset = 0; + + *out_mop_dest = (MachineOperand){.kind = MACH_OP_REG, + .btype = irop_get_btype(*dest_ir), + .vreg = dest_vr, + .is_64bit = false, + .is_unsigned = dest_ir->is_unsigned, + .needs_deref = false, + .u.reg = {.r0 = target_reg, .r1 = -1}}; + return true; } - *out_mop_dest = (MachineOperand){.kind = MACH_OP_REG, - .btype = irop_get_btype(*dest_ir), - .vreg = dest_vr, - .is_64bit = needs_pair, - .is_unsigned = dest_ir->is_unsigned, - .needs_deref = false, - .u.reg = {.r0 = REG_IRET, .r1 = needs_pair ? 
(int)REG_IRE2 : -1}}; - return true; + return false; } /* ============================================================================ @@ -1178,42 +1601,186 @@ static inline void ir_codegen_track_scratch(int is_dry_run, int i, TccIrOp op, i ir_codegen_check_scratch(i, op, dry_insn_scratch, dry_insn_saves); } +static int ir_codegen_count_vreg_uses(TCCIRState *ir, int32_t vreg) +{ + if (vreg < 0) + return 0; + + int uses = 0; + for (int i = 0; i < ir->next_instruction_index; i++) + { + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) + continue; + + if (irop_config[q->op].has_src1 && irop_get_vreg(tcc_ir_op_get_src1(ir, q)) == vreg) + uses++; + if (irop_config[q->op].has_src2 && irop_get_vreg(tcc_ir_op_get_src2(ir, q)) == vreg) + uses++; + if (q->op == TCCIR_OP_MLA && q->operand_base + 3 < ir->iroperand_pool_count && + irop_get_vreg(ir->iroperand_pool[q->operand_base + 3]) == vreg) + uses++; + } + return uses; +} + +/* True if `vreg` is referenced by any instruction OTHER than its def at + * `def_idx` and a single consumer at `use_idx`. Gates the MUL-const+ADD + * fusion, which leaves the MUL result holding only the PARTIAL product (the 
*/ +static int ir_codegen_vreg_used_elsewhere(TCCIRState *ir, int32_t vreg, int def_idx, int use_idx) +{ + if (vreg < 0) + return 0; + + for (int i = 0; i < ir->next_instruction_index; i++) + { + if (i == def_idx || i == use_idx) + continue; + IRQuadCompact *q = &ir->compact_instructions[i]; + if (q->op == TCCIR_OP_NOP) + continue; + + if (irop_config[q->op].has_dest && irop_get_vreg(tcc_ir_op_get_dest(ir, q)) == vreg) + return 1; + if (irop_config[q->op].has_src1 && irop_get_vreg(tcc_ir_op_get_src1(ir, q)) == vreg) + return 1; + if (irop_config[q->op].has_src2 && irop_get_vreg(tcc_ir_op_get_src2(ir, q)) == vreg) + return 1; + if (q->op == TCCIR_OP_MLA && q->operand_base + 3 < ir->iroperand_pool_count && + irop_get_vreg(ir->iroperand_pool[q->operand_base + 3]) == vreg) + return 1; + } + return 0; +} + +#ifdef TCC_REGALLOC_DEBUG +static void tcc_ir_debug_codegen_generate_entry(TCCIRState *ir) +{ + int local_count = ir->next_local_variable; + int temp_count = ir->next_temporary_variable; + int param_count = ir->next_parameter; + int total_vregs = local_count + temp_count + param_count; + if (total_vregs > 1000) /* Only print for large functions */ + fprintf(stderr, "[VREG STATS] locals=%d temps=%d params=%d total=%d (max_encoded=%d)\n", local_count, temp_count, + param_count, total_vregs, + (local_count > temp_count ? local_count : temp_count) > param_count + ? (local_count > temp_count ? local_count : temp_count) + : param_count); +} +#else +#define tcc_ir_debug_codegen_generate_entry(ir) ((void)0) +#endif + /* ============================================================================ - * Main Code Generation Loop + * Operand decode helper + * + * MopSpec encodes which MachineOperands to extract for a given IR instruction. + * decode_mop_args() performs all machine_op_from_ir / peephole / fixup calls + * once, returning a MopArgs struct. Switch cases then just forward to the + * appropriate backend function with the pre-decoded args. 
+ * + * dest modes: 0 = none, 1 = normal, 2 = with before-return peephole + * src1 modes: 0 = none, 1 = normal, 2 = normal + subcomponent fixup + * src2/scale/accum: 0 = none, 1 = extract * ============================================================================ */ -void tcc_ir_codegen_generate(TCCIRState *ir) +typedef struct { - IRQuadCompact *cq; - int drop_return_value = 0; + uint8_t dest; /* 0/1/2: none / normal / peephole */ + uint8_t src1; /* 0/1/2: none / normal / +subcomp fixup */ + uint8_t src2; /* 0/1: none / normal */ + uint8_t scale; /* 0/1: none / from scale slot */ + uint8_t accum; /* 0/1: none / from operand slot 3 (MLA) */ +} MopSpec; + +typedef struct +{ + MachineOperand dest, src1, src2, scale, accum; +} MopArgs; -#ifdef TCC_REGALLOC_DEBUG - /* Print vreg statistics for size optimization analysis */ +static MopArgs decode_mop_args(TCCIRState *ir, IRQuadCompact *cq, const IROperand *src1_ir, const IROperand *src2_ir, + const IROperand *dest_ir, int i, MopSpec spec) +{ + MopArgs a; + if (spec.dest) { - int local_count = ir->next_local_variable; - int temp_count = ir->next_temporary_variable; - int param_count = ir->next_parameter; - int total_vregs = local_count + temp_count + param_count; - if (total_vregs > 1000) /* Only print for large functions */ - fprintf(stderr, "[VREG STATS] locals=%d temps=%d params=%d total=%d (max_encoded=%d)\n", local_count, temp_count, - param_count, total_vregs, - (local_count > temp_count ? local_count : temp_count) > param_count - ? (local_count > temp_count ? local_count : temp_count) - : param_count); + if (spec.dest == 2 && ir_codegen_before_ret_peephole(ir, i, dest_ir, &a.dest)) + ; /* peephole patched the allocation — use synthesised MachineOperand */ + else + a.dest = machine_op_from_ir(ir, dest_ir); } -#endif - /* `&&label` stores label positions as IR indices BEFORE DCE/compaction. - * Build a mapping for original indices, not just the compacted array indices. 
- */ - int max_orig_index = -1; - for (int i = 0; i < ir->next_instruction_index; i++) + if (spec.src1 >= 1) + { + a.src1 = machine_op_from_ir(ir, src1_ir); + if (spec.src1 == 2) + mop_fixup_subcomponent(&a.src1, src1_ir, ir); + } + + if (spec.src2) + a.src2 = machine_op_from_ir(ir, src2_ir); + if (spec.scale) { - if (ir->compact_instructions[i].orig_index > max_orig_index) - max_orig_index = ir->compact_instructions[i].orig_index; + IROperand scale_ir = tcc_ir_op_get_scale(ir, cq); + a.scale = machine_op_from_ir(ir, &scale_ir); } - if (max_orig_index < 0) - max_orig_index = 0; + if (spec.accum) + { + IROperand accum_ir = ir->iroperand_pool[cq->operand_base + 3]; + a.accum = machine_op_from_ir(ir, &accum_ir); + } + return a; +} + +/* ============================================================================ + * OPTION B: MopArgs cache helper + * ============================================================================ + * During the dry-run, decoded dest/src1/src2 operands are stored in mop_cache. + * During the real-run (when the cache is valid), they are read back directly, + * skipping the interval-table lookups in decode_mop_args. + * + * Instructions that use scale or accum (indexed loads/stores, MLA) are rare; + * those slots are not cached — they fall through to a full decode in both passes. + * ============================================================================ */ +static inline MopArgs ir_decode_cached(int is_dry_run, int use_mop_cache, MopArgs *mop_cache, int i, TCCIRState *ir, + IRQuadCompact *cq, const IROperand *src1_ir, const IROperand *src2_ir, + const IROperand *dest_ir, MopSpec spec) +{ + /* Real-run cache hit: scale/accum not needed, cache is valid. 
*/ + if (!is_dry_run && use_mop_cache && !spec.scale && !spec.accum) + return mop_cache[i]; + + MopArgs a = decode_mop_args(ir, cq, src1_ir, src2_ir, dest_ir, i, spec); + + /* Dry-run: store decoded dest/src1/src2 for reuse, unless scale/accum are + * involved (those instructions re-decode cheaply in the real-run). */ + if (is_dry_run && mop_cache && !spec.scale && !spec.accum) + mop_cache[i] = a; + + return a; +} + +void tcc_ir_codegen_generate(TCCIRState *ir) +{ + IRQuadCompact *cq; + + tcc_ir_debug_codegen_generate_entry(ir); + + if (getenv("DUMP_IR_CG")) { printf("==== POST-OPT IR AT CODEGEN ====\n"); tcc_ir_show(ir); fflush(stdout); } + + /* `&&label` stores label positions as IR indices BEFORE DCE/compaction. + * max_orig_index and is_jump_target flags are maintained incrementally + * during IR construction (tcc_ir_put / tcc_ir_backpatch), so no pre-pass + * is needed here. */ + int max_orig_index = ir->max_orig_index; /* +1 to include epilogue when needed. * Keep this mapping available after codegen (e.g. for &&label). */ @@ -1243,83 +1810,565 @@ void tcc_ir_codegen_generate(TCCIRState *ir) int *return_jump_addrs = tcc_malloc(sizeof(int) * ir->next_instruction_index); int num_return_jumps = 0; + /* --- DEBUG: catch codegen-time corruption of a spilled temp's allocation.r0. + * The HW-only 90_struct c[1].y=5 bug: a temp that regalloc spilled + * (allocation.r0 == 0x3f) is overwritten to a register number during codegen, + * so machine_op_from_ir later reads it as "lives in R8". Snapshot now + * (post-regalloc) and report the first instruction at which any spilled temp + * flips to a register. 
--- */ + static uint8_t *dbg_alloc_snap = NULL; + static int dbg_alloc_snap_n = 0; + static int dbg_alloc_active = 0; + static int dbg_alloc_reported = 0; + dbg_alloc_active = 0; + if (funcname && !strcmp((const char *)funcname, "test_init_struct_from_struct")) + { + dbg_alloc_snap_n = ir->temporary_variables_live_intervals_size; + dbg_alloc_snap = tcc_realloc(dbg_alloc_snap, (size_t)dbg_alloc_snap_n + 1); + for (int p = 0; p < dbg_alloc_snap_n; p++) + dbg_alloc_snap[p] = (uint8_t)ir->temporary_variables_live_intervals[p].allocation.r0; + dbg_alloc_active = 1; + dbg_alloc_reported = 0; + fprintf(stderr, "ALLOCSNAP n=%d\n", dbg_alloc_snap_n); + /* Snapshot the liveness bitmap at the printf-arg LEA indices at codegen + * START. Compare with the FSR trace (printed at the find_free call): if + * these are correct here but wrong at find_free, the bitmap is corrupted + * during codegen; if already wrong here, ra_build_live_regs_bitmap + * miscomputed it. */ + uint32_t *lrb = ir->ls.live_regs_by_instruction; + int lrbn = ir->ls.live_regs_by_instruction_size; + fprintf(stderr, "LRBSNAP arr=%p sz=%d [70]=0x%x [72]=0x%x [75]=0x%x [80]=0x%x\n", (void *)lrb, lrbn, + (lrb && 70 < lrbn) ? lrb[70] : 0xDEADu, (lrb && 72 < lrbn) ? lrb[72] : 0xDEADu, + (lrb && 75 < lrbn) ? lrb[75] : 0xDEADu, (lrb && 80 < lrbn) ? lrb[80] : 0xDEADu); + } + /* Clear spill cache at function start */ tcc_ir_spill_cache_clear(&ir->spill_cache); - /* Some peephole optimizations (LOAD/ASSIGN -> RETURNVALUE in R0, and skipping - * RETURNVALUE moves) are only valid when RETURNVALUE is reached by straight-line - * fallthrough from the immediately preceding instruction. - * - * If RETURNVALUE is a jump target (a control-flow merge), those peepholes can - * become incorrect: the preceding instruction might not execute on all paths, - * leaving the return value in a non-return register. - * - * Track which IR instruction indices are jump targets to guard these peepholes. 
+ /* ============================================================================ + * PRE-SCAN: Compute maximum outgoing call stack argument size + * ============================================================================ + * Scan all FUNCCALLVAL/FUNCCALLVOID instructions to find the maximum stack + * argument area needed across all calls. This allows us to pre-reserve the + * area in the frame and avoid dynamic SP adjustments at each call site. */ - uint8_t *has_incoming_jump = tcc_mallocz(ir->next_instruction_index ? ir->next_instruction_index : 1); - for (int i = 0; i < ir->next_instruction_index; ++i) { - IRQuadCompact *p = &ir->compact_instructions[i]; - if (p->op == TCCIR_OP_JUMP || p->op == TCCIR_OP_JUMPIF) + int max_outgoing = 0; + int call_count = 0; + int has_softfloat_ops = 0; + int max_nested_save_regs = 0; + for (int i = 0; i < ir->next_instruction_index; i++) { - /* Read jump target from IROperand pool */ - IROperand dest_irop = tcc_ir_op_get_dest(ir, p); - int target = (int)dest_irop.u.imm32; - if (target >= 0 && target < ir->next_instruction_index) - has_incoming_jump[target] = 1; - } - } + const IRQuadCompact *q = &ir->compact_instructions[i]; - /* Reserve outgoing call stack args area at the very bottom of the frame. - * This ensures prepared-call stack args are at call-time SP. - */ - if (ir->call_outgoing_size > 0) - { - loc -= ir->call_outgoing_size; - ir->call_outgoing_base = loc; - } + /* Detect soft-float operations that temporarily adjust SP. + * These require FP to keep frame-relative offsets stable. */ + if (q->op >= TCCIR_OP_FADD && q->op <= TCCIR_OP_CVT_FTOI) + has_softfloat_ops = 1; - int stack_size = (-loc + 7) & ~7; // align to 8 bytes + /* VLA functions dynamically move SP (sub sp, vla_size). Without FP, + * saved-SP references and local variable offsets break. Force FP. 
*/ + if (q->op == TCCIR_OP_VLA_ALLOC) + { + tcc_state->need_frame_pointer = 1; + tcc_state->func_dynamic_sp = 1; + } - /* ============================================================================ - * DRY RUN PASS: Analyze scratch register needs before emitting prologue - * ============================================================================ - * This discovers what scratch registers will be needed during code generation, - * allowing us to include them in the prologue (avoiding push/pop in loops). - */ - int original_leaffunc = ir->leaffunc; - uint32_t extra_prologue_regs = 0; + if (q->op != TCCIR_OP_FUNCCALLVAL && q->op != TCCIR_OP_FUNCCALLVOID) + continue; - /* If this function has a static chain (nested function), reserve R10 - * as callee-saved so the parent's static chain is preserved. - * R10 is the static chain register per architecture_config.static_chain_reg. */ - if (ir->has_static_chain) - { - extra_prologue_regs |= (1 << architecture_config.static_chain_reg); - } + call_count++; + const IROperand call_id_op = tcc_ir_get_src2(ir, i); + if (irop_is_none(call_id_op)) + continue; - /* Phase-3 per-instruction scratch constraint recording. - * Allocated once per function; indexed by instruction index. - * dry_insn_scratch[i] = number of mach_alloc_scratch() calls at instruction i. - * dry_insn_saves[i] = bitmask of registers that would be PUSH'd at instruction i. - * Both arrays are declared before #if so they are visible in both passes. 
*/ - int *dry_insn_scratch = tcc_mallocz(ir->next_instruction_index * sizeof(int)); - uint16_t *dry_insn_saves = tcc_mallocz(ir->next_instruction_index * sizeof(uint16_t)); + const int call_id = TCCIR_DECODE_CALL_ID((uint32_t)call_id_op.u.imm32); + const int argc_hint = TCCIR_DECODE_CALL_ARGC((uint32_t)call_id_op.u.imm32); - /* ============================================================================ - * TWO-PASS CODE GENERATION - * ============================================================================ - * Pass 0 (dry-run): Discover scratch register needs without emitting code. - * - ot() is a no-op; ind advances but no bytes are written. + /* Compute ABI layout to determine stack arg size (no MOP allocation). */ + TCCAbiCallLayout layout; + memset(&layout, 0, sizeof(layout)); + TCCAbiArgLoc inline_locs[16]; + layout.locs = inline_locs; + layout.capacity = 16; + + TCCAbiArgLoc *heap_locs = NULL; + if (argc_hint > 16) + { + heap_locs = tcc_mallocz(sizeof(TCCAbiArgLoc) * argc_hint); + layout.locs = heap_locs; + layout.capacity = argc_hint; + } + + int argc = thumb_build_call_layout_from_ir(ir, i, call_id, argc_hint, &layout, NULL, NULL); + int stack = (argc > 0) ? (int)layout.stack_size : 0; + stack = (stack + 7) & ~7; /* 8-byte align (AAPCS) */ + if (stack > max_outgoing) + max_outgoing = stack; + + /* Compute actual nested-call save needs using liveness data. + * Only R0-R3 that hold values live BEFORE argument setup AND those + * values survive past the call actually need saving. + * + * Check liveness at the first FUNCPARAM for this call (before arg + * setup clobbers R0-R3). If R0-R3 aren't live there, the call + * doesn't need nested register saves. + * + * Checking at i+1 (after call) is wrong: a new definition at i+1 + * (e.g., R0 <-- #34) makes R0 appear "live" even though it's a fresh + * value, not one that needs preserving across the call. 
*/ + { + uint32_t reg_arg_mask = 0; + for (int a = 0; a < argc; a++) + { + const TCCAbiArgLoc *al = &layout.locs[a]; + if (al->kind == TCC_ABI_LOC_REG || al->kind == TCC_ABI_LOC_REG_STACK) + { + for (int w = 0; w < al->reg_count; w++) + reg_arg_mask |= (1u << (al->reg_base + w)); + } + } + /* Find the first FUNCPARAM for this call by scanning backward. + * Liveness at that point reflects the pre-arg-setup state. */ + uint32_t need_save = reg_arg_mask & 0x0F; + if (!ir->ls.live_regs_by_instruction) + { + /* No liveness table means the register allocator assigned no physical + * registers — all values are materialized on-the-fly. Nothing can be + * live across calls, so no saves are needed. */ + need_save = 0; + } + else + { + int first_param = i; /* fallback to call instruction */ + /* Scan backward for the earliest FUNCPARAM of this call. + * Param-value computation may sit between FUNCPARAMs (e.g. PARAM0, + * then some ASSIGNs setting up R1, then PARAM1), so do NOT stop at + * non-PARAM instructions — only stop at the previous CALL or at + * function entry. */ + for (int k = i - 1; k >= 0; k--) + { + const IRQuadCompact *pk = &ir->compact_instructions[k]; + if (pk->op == TCCIR_OP_NOP) + continue; + if (pk->op == TCCIR_OP_FUNCCALLVAL || pk->op == TCCIR_OP_FUNCCALLVOID) + break; /* hit the previous call — done */ + if (pk->op == TCCIR_OP_FUNCPARAMVAL) + { + const IROperand param_src2 = tcc_ir_get_src2(ir, k); + if (!irop_is_none(param_src2)) + { + int param_call_id = TCCIR_DECODE_CALL_ID((uint32_t)param_src2.u.imm32); + if (param_call_id == call_id) + first_param = k; + } + } + /* Non-PARAM, non-CALL: keep scanning past arg-value computation. */ + } + /* A register needs saving across this call iff some interval holds + * a value in it BEFORE the call setup begins AND that value is still + * needed AFTER the call returns. 
Checking live_before alone falsely + * counts arg-passing intervals of the *previous* call (their ranges + * end exactly at that CALL, which is often first_param - 1). + * Intersecting with live_after_call removes them: their intervals + * do not extend past the previous call. Cross-call live values, by + * definition, are live at both points. */ + if (first_param == 0) + { + need_save = 0; + } + else if (first_param - 1 < ir->ls.live_regs_by_instruction_size) + { + uint32_t live_before = ir->ls.live_regs_by_instruction[first_param - 1]; + uint32_t live_after_call = (i + 1 < ir->ls.live_regs_by_instruction_size) + ? ir->ls.live_regs_by_instruction[i + 1] + : 0; + need_save &= live_before & live_after_call; + } + else + { + need_save = 0; + } + } + const int save_count = __builtin_popcount(need_save); + if (save_count > max_nested_save_regs) + max_nested_save_regs = save_count; + } + + if (heap_locs) + tcc_free(heap_locs); + if (layout.locs != inline_locs && layout.locs) + tcc_free(layout.locs); + } + ir->call_outgoing_size = max_outgoing; + + /* Disable tail-call optimization if the call needs stack arguments or if + * text_and_data_separation requires R9 save/restore around the call. + * With stack args, the pre-reserved outgoing area would need to be set up + * before the branch, complicating frame teardown. */ + if (ir->tail_call_only && (max_outgoing > 0 || tcc_state->text_and_data_separation)) + { + ir->tail_call_only = 0; + ir->leaffunc = 0; + } + + /* Reserve nested-call register save area for functions with multiple calls. + * Size based on actual max R0-R3 usage across calls (+ R9 if needed). 
*/ + if (call_count > 1 && max_nested_save_regs > 0) + { + int save_regs = max_nested_save_regs; + if (tcc_state->text_and_data_separation) + save_regs++; /* R9 */ + ir->call_nested_save_size = save_regs * 4; + } + else if (call_count >= 1 && tcc_state->text_and_data_separation) + { + ir->call_nested_save_size = 4; /* R9 only */ + } + else + { + ir->call_nested_save_size = 0; + } + + /* Soft-float helpers temporarily lower SP (sub sp, #N ... add sp, #N) + * to save intermediate values. This breaks SP-relative local offsets + * when FP is omitted. Force FP for functions with soft-float ops. */ + if (has_softfloat_ops) + tcc_state->need_frame_pointer = 1; + + if (ir->has_static_chain) + tcc_state->need_frame_pointer = 1; + + if (call_count > 0) + { + for (int p = 0; p < ir->next_parameter; p++) + { + if (ir->parameters_live_intervals[p].incoming_reg0 < 0) + { + tcc_state->need_frame_pointer = 1; + break; + } + } + } + } + + /* Reserve nested-call save area above the outgoing area. */ + if (ir->call_nested_save_size > 0) + { + loc -= ir->call_nested_save_size; + ir->call_nested_save_base = loc; + } + + /* Reserve outgoing call stack args area at the very bottom of the frame. + * This ensures prepared-call stack args are at call-time SP. + */ + if (ir->call_outgoing_size > 0) + { + loc -= ir->call_outgoing_size; + ir->call_outgoing_base = loc; + } + + ir->scratch_save_size = 0; + ir->scratch_save_base = 0; + + int stack_size = (-loc + 7) & ~7; // align to 8 bytes + + /* Disable tail-call if the function needs any stack frame or frame pointer. + * Tail-call tears down the frame before branching, but arguments to the tail + * call may reference stack-relative addresses (struct copies, spilled values) + * that would become invalid after the teardown. 
*/ + if (ir->tail_call_only && + (stack_size > 0 || tcc_state->need_frame_pointer || tcc_state->force_frame_pointer)) + { + ir->tail_call_only = 0; + ir->leaffunc = 0; + } + + /* ============================================================================ + * DRY RUN PASS: Analyze scratch register needs before emitting prologue + * ============================================================================ + * This discovers what scratch registers will be needed during code generation, + * allowing us to include them in the prologue (avoiding push/pop in loops). + */ + int original_leaffunc = ir->leaffunc; + uint32_t extra_prologue_regs = 0; + + /* If this function has a static chain (nested function), reserve R10 + * as callee-saved so the parent's static chain is preserved. + * R10 is the static chain register per architecture_config.static_chain_reg. */ + if (ir->has_static_chain) + { + extra_prologue_regs |= (1 << architecture_config.static_chain_reg); + } + + /* Phase-3 per-instruction scratch constraint recording. + * Allocated once per function; indexed by instruction index. + * dry_insn_scratch[i] = number of mach_alloc_scratch() calls at instruction i. + * dry_insn_saves[i] = bitmask of registers that would be PUSH'd at instruction i. + * Both arrays are declared before #if so they are visible in both passes. */ + int *dry_insn_scratch = tcc_mallocz(ir->next_instruction_index * sizeof(int)); + uint16_t *dry_insn_saves = tcc_mallocz(ir->next_instruction_index * sizeof(uint16_t)); + + /* ============================================================================ + * OPTION A: Skip dry-run for scratch-conflict-free functions + * ============================================================================ + * ARM has 13 allocatable integer registers (r0-r12) and 16 single-precision + * VFP registers (s0-s15). Scratch needs at most 2 of each simultaneously. 
+ * If enough registers are provably free at every program point, no scratch + * push/pop can occur, so the dry-run produces no useful information. + * + * When skipping: + * - dry_insn_scratch[] / dry_insn_saves[] stay zero (tcc_mallocz) — correct. + * - Phase-3 fixup is a no-op (all-zero dry_insn_saves). + * - LR: no scratch push means no surprise LR push; leaffunc already correct. + * - Branch optimizer falls back to 32-bit encodings for all branches + * (2 bytes wasted per branch; acceptable tradeoff). + * ============================================================================ */ + const int can_skip_dry_run = + __builtin_popcountll(ir->ls.dirty_registers) <= (unsigned)(tcc_state->registers_for_allocator - 2) && + __builtin_popcountll(ir->ls.dirty_float_registers) <= (unsigned)(tcc_state->float_registers_for_allocator - 2); + + if (can_skip_dry_run) + { + /* When FP is omitted and the dry run is skipped, allocate a safety-net + * scratch save area. Even with few dirty registers, exclude_regs can + * make all free registers unavailable for scratch, forcing a PUSH that + * would break SP-relative addressing. The area costs only 8 bytes of + * stack and allows get_scratch_reg_with_save() to use STR/LDR. + * + * Skip when the function has no SP-relative accesses at all: no locals, + * no spills, no outgoing args (stack_size == 0), AND no stack-passed + * parameters (whose offsets are also SP-relative via offset_to_args). */ + int has_stack_params = 0; + for (int p = 0; p < ir->next_parameter; p++) + { + if (ir->parameters_live_intervals[p].incoming_reg0 < 0) + { + has_stack_params = 1; + break; + } + } + if (!tcc_state->need_frame_pointer && !tcc_state->force_frame_pointer && (stack_size > 0 || has_stack_params)) + { + /* Scratch save area is a safety net for get_scratch_reg_with_save() + * paths that PUSH/STR into the area when no free register is found. + * The skip-dry-run path doesn't know max_scratch_depth, so it reserves + * conservatively. 
But a pre-scan over the IR can rule out the + * scratch-requiring ops entirely for simple functions (e.g. integer + * code with no FP / 64-bit / div / inline-asm), avoiding the dead + * reservation. */ + int might_need_scratch = 0; + int has_any_op = 0; + for (int i = 0; i < ir->next_instruction_index; i++) + { + int op = ir->compact_instructions[i].op; + if (op == TCCIR_OP_NOP) + continue; + has_any_op = 1; + /* FP/double ops invoke soft-float helpers (or VFP) with multi-reg + * scratch needs; 64-bit ints are emulated as pairs and may need + * scratch for the high half; div/mod call helpers; block-copy and + * VLA touch SP/memcpy; inline asm and indexed memory ops are + * unconstrained. Any of these forces the safety net. */ + if (op >= TCCIR_OP_FADD && op <= TCCIR_OP_CVT_FTOI) { might_need_scratch = 1; break; } + switch (op) + { + case TCCIR_OP_DIV: + case TCCIR_OP_UDIV: + case TCCIR_OP_PDIV: + case TCCIR_OP_UMOD: + case TCCIR_OP_IMOD: + case TCCIR_OP_UMULL: + case TCCIR_OP_SMULL: + case TCCIR_OP_BLOCK_COPY: + case TCCIR_OP_VLA_ALLOC: + case TCCIR_OP_VLA_SP_SAVE: + case TCCIR_OP_VLA_SP_RESTORE: + case TCCIR_OP_INLINE_ASM: + case TCCIR_OP_ASM_INPUT: + case TCCIR_OP_ASM_OUTPUT: + case TCCIR_OP_SET_CHAIN: + case TCCIR_OP_LOAD_INDEXED: + case TCCIR_OP_STORE_INDEXED: + case TCCIR_OP_IJUMP: + case TCCIR_OP_SWITCH_TABLE: + case TCCIR_OP_SWITCH_LOAD: + might_need_scratch = 1; + break; + default: + break; + } + if (might_need_scratch) + break; + /* 64-bit operand on any op: emulated as a pair, may need scratch. */ + IROperand d = tcc_ir_op_get_dest(ir, &ir->compact_instructions[i]); + IROperand s1 = tcc_ir_op_get_src1(ir, &ir->compact_instructions[i]); + IROperand s2 = tcc_ir_op_get_src2(ir, &ir->compact_instructions[i]); + if (irop_is_64bit(d) || irop_is_64bit(s1) || irop_is_64bit(s2)) + { + might_need_scratch = 1; + break; + } + } + /* Calls with stack-passed args: the call-site setup may need scratch + * to materialise the argument values into SP-relative slots. 
*/ + if (!might_need_scratch && ir->call_outgoing_size > 0) + might_need_scratch = 1; + /* Incoming stack params: reads from [sp + offset_to_args] may collide + * with live argument registers, forcing get_scratch_reg_with_save to + * STR the register into the reserved area before the load. + * Only relevant if some non-NOP op actually runs — a fully-NOP'd body + * (useless_function_body) never loads those params. */ + if (!might_need_scratch && has_stack_params && has_any_op) + might_need_scratch = 1; + /* Large frames need scratch to materialise SP-relative offsets that + * exceed the immediate-encoding range of Thumb-2 LDR/STR. A simple + * 124-byte threshold matches the LDR rt,[sp,#imm5*4] limit; above + * that, individual access sites may need an extra register. */ + if (!might_need_scratch && stack_size > 124) + might_need_scratch = 1; + + if (might_need_scratch) + { + ir->scratch_save_size = 16; /* 4 slots — 64-bit ops on 32-bit ARM can need 3+ simultaneous scratch saves */ + loc -= ir->scratch_save_size; + /* The outgoing call-arg area must stay at the very bottom of the + * frame (stack args are stored at literal [SP, #stack_off]), and the + * nested-call save area (R0-R3/R9 saves around calls) is addressed + * literally at [SP + call_outgoing_size + n*4] directly above it. + * The scratch area must therefore sit ABOVE BOTH: putting it lower + * maps scratch saves onto already-written argument slots or onto the + * saved R9/GOT base (restoring r9 = scratch garbage after the call). */ + if (ir->call_outgoing_size > 0 || ir->call_nested_save_size > 0) + { + ir->call_outgoing_base = loc; + ir->call_nested_save_base = loc + ir->call_outgoing_size; + ir->scratch_save_base = loc + ir->call_outgoing_size + ir->call_nested_save_size; + } + else + { + ir->scratch_save_base = loc; + } + stack_size = (-loc + 7) & ~7; + } + } + + /* Mirror the dry-run finalisation: init branch opt (sets 32-bit fallback), + * reset scratch/spill/fp state, then emit prologue immediately. 
*/ + tcc_gen_machine_branch_opt_init(); + tcc_gen_machine_reset_scratch_state(); + tcc_ir_spill_cache_clear(&ir->spill_cache); + tcc_ir_opt_fp_cache_clear(ir); + /* Pre-patch allocations for FUNCPARAMVAL fusion, then trim ghost + * callee-saved registers from dirty_registers before prologue. */ + ir_codegen_pre_patch_funcparam_allocations(ir); + ir_codegen_recompute_dirty_from_allocations(ir); + if (!ir->naked) + tcc_gen_machine_prolog(ir->leaffunc, ir->ls.dirty_registers, stack_size, extra_prologue_regs); + if (!ir->naked) + tcc_debug_prolog_epilog(tcc_state, 0); + } + + /* ============================================================================ + * TWO-PASS CODE GENERATION + * ============================================================================ + * Pass 0 (dry-run): Discover scratch register needs without emitting code. + * - ot() is a no-op; ind advances but no bytes are written. * - Records per-instruction scratch counts in dry_insn_scratch[]. * - Branch optimizer collects offset data. * Pass 1 (real-run): Emit actual Thumb-2 machine code. * - Uses dry-run data for scratch consistency checks. * - Emits debug info, epilogue jumps, inline asm. + * When can_skip_dry_run: pass 0 is skipped entirely, prologue already emitted. * ============================================================================ */ - for (int pass = 0; pass < 2; pass++) + /* Option B: allocate per-instruction MopArgs cache for the dry-run. + * Not used when the dry-run is skipped (can_skip_dry_run). */ + MopArgs *mop_cache = (!can_skip_dry_run && ir->next_instruction_index > 0) + ? tcc_malloc(ir->next_instruction_index * sizeof(MopArgs)) + : NULL; + int use_mop_cache = 0; + + const int pass_start = can_skip_dry_run ? 1 : 0; + uint32_t *cbz_dry_mapping = NULL; + + /* Branch-target reset map for the materialisation cache (imm_cache). + * + * imm_cache persists a register's cached constant / symbol address across + * straight-line IR boundaries (dead registers keep their value). 
This is + * only sound when control reaches the instruction linearly: at a control-flow + * merge an alternate predecessor may have clobbered the register. The shared + * `is_jump_target` flag covers most merges, but at -O0 backward (loop) branch + * targets are not always flagged, so cache a complete target set here and + * reset at those points too. Kept local to codegen so the wider + * `is_jump_target` semantics (and the peephole fusions keyed on it) are + * untouched. Mirrors the target enumeration in tcc_ir_codegen_backpatch_jumps. */ + uint8_t *branch_target_reset = NULL; + if (ir->next_instruction_index > 0) + { + branch_target_reset = tcc_mallocz((size_t)ir->next_instruction_index); + int has_indirect_jump = 0; + for (int bi = 0; bi < ir->next_instruction_index; bi++) + { + IRQuadCompact *bq = &ir->compact_instructions[bi]; + if (bq->op == TCCIR_OP_JUMP || bq->op == TCCIR_OP_JUMPIF) + { + IROperand bdest = tcc_ir_op_get_dest(ir, bq); + int btgt = irop_is_none(bdest) ? -1 : (int)bdest.u.imm32; + if (btgt >= 0 && btgt < ir->next_instruction_index) + branch_target_reset[btgt] = 1; + } + else if (bq->op == TCCIR_OP_IJUMP) + { + /* Computed goto: lands on an address-taken label that is not a static + * JUMP target and cannot be cheaply enumerated from the register- + * indirect jump. Conservatively disable cross-boundary cache + * persistence for the whole function (computed goto is rare). */ + has_indirect_jump = 1; + } + } + /* Switch-table targets (data-driven jumps). 
*/ + for (int st = 0; st < ir->num_switch_tables; st++) + { + TCCIRSwitchTable *tbl = &ir->switch_tables[st]; + for (int je = 0; je < tbl->num_entries; je++) + { + int btgt = tbl->targets[je]; + if (btgt >= 0 && btgt < ir->next_instruction_index) + branch_target_reset[btgt] = 1; + } + } + if (has_indirect_jump) + memset(branch_target_reset, 1, (size_t)ir->next_instruction_index); + } + + for (int pass = pass_start; pass < 2; pass++) { const int is_dry_run = (pass == 0); + int codegen_skip_cmp = -1; + int codegen_skip_select = -1; /* SUBS+IT peephole: skip this SELECT (CMP already emitted SUBS+IT+MOVNE in its slot). */ + int codegen_cbz_reg = -1; /* pending CBZ: physical register for compare */ + int codegen_cbz_nonzero = 0; /* pending CBZ: 0=CBZ (EQ), 1=CBNZ (NE) */ + /* CBZ/CBNZ peephole: fuse `CMP rN,#0; JUMPIF EQ/NE` into a single 16-bit + * CBZ/CBNZ. DISABLED — it is unsound and crashes the backend. + * + * CBZ/CBNZ are forward-only with a 0..126-byte range, and the peephole + * commits the 2-byte encoding irrevocably while only ESTIMATING the forward + * distance (the target is not yet emitted in the single forward real pass). + * Both estimators are unsound: + * - can_skip_dry_run path: `ir_gap*10 + pending_pool_size <= 126` assumes + * ~10 bytes/IR-op, but a single op can emit far more (64-bit arithmetic, + * literal-pool loads, block copies), so the real distance overflows 126 + * (e.g. offset=166). + * - dry-mapping path: distances from a NO-CBZ dry run diverge from the + * real layout once literal-pool flush points shift between the passes, + * producing wildly wrong (even negative) final offsets (e.g. -1192). + * When the real offset does not fit, th_patch_call() has no way to widen a + * committed 2-byte CBZ in place and aborts with + * "CBZ/CBNZ target out of range". Falling back to the always-correct + * CMP rN,#0 + B.W (full +/-1MB range) costs only 4 bytes/branch and + * never crashes. 
Re-enable only behind a proper iterative branch- + * relaxation pass that re-emits out-of-range CBZ candidates as wide. */ + const int cbz_enabled = 0; /* ---- Pass-specific initialisation ---- */ if (is_dry_run) @@ -1336,24 +2385,87 @@ void tcc_ir_codegen_generate(TCCIRState *ir) int saved_codegen_idx = ir->codegen_instruction_idx; int saved_loc = loc; int saved_call_outgoing_base = ir->call_outgoing_base; + int saved_call_nested_save_base = ir->call_nested_save_base; /* ---- Instruction loop ---- */ for (int i = 0; i < ir->next_instruction_index; i++) { - drop_return_value = 0; cq = &ir->compact_instructions[i]; /* Default: no extra scratch constraints for this instruction. */ ir->codegen_materialize_scratch_flags = 0; + /* At jump targets, flags from a prior CMP are not guaranteed live. + * The STR→LDR peephole tracker is also invalidated, since a branch can + * reach this IR op from a path where the prior STR did not execute. */ + if (cq->is_jump_target) + { + ir->codegen_flags_live = 0; + ir->spill_cache.last_emit_kind = 0; + } + /* Track current instruction for scratch register allocation */ ir->codegen_instruction_idx = i; + /* DEBUG: report the first spilled temp whose allocation.r0 was overwritten + * to a register since codegen start (corruption happened at instr <= i-1, + * or in the dry-run pass if i is 0). 
*/ + if (dbg_alloc_active && !dbg_alloc_reported) + { + int lim = ir->temporary_variables_live_intervals_size; + if (lim > dbg_alloc_snap_n) + lim = dbg_alloc_snap_n; + for (int p = 0; p < lim; p++) + { + uint8_t now = (uint8_t)ir->temporary_variables_live_intervals[p].allocation.r0; + if (dbg_alloc_snap[p] == 0x3f && now != 0x3f) + { + fprintf(stderr, "ALLOCCORRUPT T%d r0 0x3f->0x%x by codegen idx<=%d (this op=%d)\n", + p, now, i, (int)cq->op); + dbg_alloc_reported = 1; + break; + } + } + } + /* Debug tracking: update current op for ot_check failure reporting */ g_debug_current_op = (int)cq->op; ir_to_code_mapping[i] = ind; + /* Reset the STR→LDR memory-reload cache at every IR instruction + * boundary (it tracks memory state, which an aliasing store on a + * jumped-from path could invalidate without an emit the tracker sees). + * + * The MOV-equivalence (GPR value) cache, by contrast, stays sound + * across straight-line IR-op boundaries: every emitted instruction + * updates it (invalidating its dest reg, with calls/unknown opcodes + * forcing a full reset), so register equivalences only become invalid + * at a real control-flow merge. Reset it only at jump targets; this + * lets cross-IR `mov` chains — e.g. a soft-float double result copied + * to its callee-saved home pair and then back to the next call's + * argument pair — coalesce away. */ + tcc_gen_machine_strldr_cache_reset(); + /* Like imm_cache below, the GPR-equivalence cache must also drop at + * backward (loop) branch targets that is_jump_target misses at -O0: + * an equivalence recorded before the loop (e.g. the prologue's + * `mov r4, r0` param save) is not re-established on the back edge, + * and eliding a call-argument `mov r0, r4` on that basis passes + * garbage from the previous iteration (gcc_execute/990128-1 stored + * through such a garbage pointer into the kernel vector table). 
*/ + if (cq->is_jump_target || (branch_target_reset && branch_target_reset[i])) + tcc_gen_machine_mov_equiv_reset(); + + /* Invalidate imm_cache for registers assigned to live vregs. + * Free (dead) registers retain cached constants across IR boundaries. + * Full reset at jump targets / calls where control flow is non-linear. */ + if (cq->is_jump_target || (branch_target_reset && branch_target_reset[i]) || + cq->op == TCCIR_OP_FUNCCALLVAL || cq->op == TCCIR_OP_FUNCCALLVOID) + tcc_gen_machine_imm_cache_reset(); + else if (ir->ls.live_regs_by_instruction && + i < ir->ls.live_regs_by_instruction_size) + tcc_gen_machine_imm_cache_invalidate_live(ir->ls.live_regs_by_instruction[i]); + /* Real-run only: record original-index mapping and emit debug line info */ if (!is_dry_run) { @@ -1371,6 +2483,17 @@ void tcc_ir_codegen_generate(TCCIRState *ir) * table directly from the raw operand. All dispatch sites now use * MachineOperand-based (_mop) handlers unconditionally. */ +#define DECODE(...) \ + ir_decode_cached(is_dry_run, use_mop_cache, mop_cache, i, ir, cq, &src1_ir, &src2_ir, &dest_ir, \ + (MopSpec){__VA_ARGS__}) +#define SCRATCH_WRAP(call) \ + do \ + { \ + tcc_gen_machine_insn_scratch_reset(); \ + call; \ + ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); \ + } while (0) + switch (cq->op) { case TCCIR_OP_MUL: @@ -1378,56 +2501,476 @@ void tcc_ir_codegen_generate(TCCIRState *ir) case TCCIR_OP_UDIV: case TCCIR_OP_IMOD: case TCCIR_OP_UMOD: + { + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + + /* Peephole: MUL-by-const + ADD → fused shifted-add. + * When MUL result feeds directly into ADD, fuse the trailing + * shift into the ADD using ARM's flexible second operand. 
*/ + if (cq->op == TCCIR_OP_MUL && !a.src1.is_64bit && !a.dest.is_64bit) + { + const MachineOperand *imm_op = NULL, *var_op = NULL; + if (a.src2.kind == MACH_OP_IMM) + { + imm_op = &a.src2; + var_op = &a.src1; + } + else if (a.src1.kind == MACH_OP_IMM) + { + imm_op = &a.src1; + var_op = &a.src2; + } + if (imm_op) + { + int next_j = i + 1; + while (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_NOP) + next_j++; + if (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_ADD && + !ir->compact_instructions[next_j].is_jump_target) + { + IRQuadCompact *nq = &ir->compact_instructions[next_j]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_j, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1}); + + /* Identify which ADD operand is the MUL result and which is the base */ + MachineOperand *add_base = NULL; + int mul_dest_vreg = a.dest.vreg; + if (!b.src1.is_64bit && !b.src2.is_64bit && mul_dest_vreg >= 0) + { + if (b.src2.vreg == mul_dest_vreg && b.src2.kind == MACH_OP_REG && !b.src2.needs_deref) + add_base = &b.src1; + else if (b.src1.vreg == mul_dest_vreg && b.src1.kind == MACH_OP_REG && !b.src1.needs_deref) + add_base = &b.src2; + } + + /* Only safe when the ADD at next_j is the SOLE consumer of the MUL + * result: the fused helper leaves mul_dest holding the PARTIAL + * product (var*odd for a (2^a+1)*2^b or (2^a-1)*2^b constant), not + * the full var*C — the trailing < deferred free() HardFault; the same shape + * smashed cfg->blocks in the 02-08 self-host crashes.) Scan the IR + * directly rather than trust the live-interval `end`, which can + * under-approximate cross-block / loop-back-edge uses and let the + * fusion fire when mul_dest is in fact still live. 
*/ + if (add_base && mul_dest_vreg >= 0) + { + if (ir_codegen_vreg_used_elsewhere(ir, mul_dest_vreg, i, next_j)) + add_base = NULL; + } + + if (add_base) + { + tcc_gen_machine_insn_scratch_reset(); + int fused = tcc_gen_machine_mul_const_add_fused_mop(*var_op, imm_op->u.imm.val, a.dest, *add_base, + b.dest); + if (fused) + { + ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + i = next_j; + break; + } + } + } + } + } + + SCRATCH_WRAP(tcc_gen_machine_muldiv_mop(a.src1, a.src2, a.dest, cq->op)); + break; + } case TCCIR_OP_TEST_ZERO: { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_muldiv_mop(mop_src1, mop_src2, mop_dest, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + /* CBZ/CBNZ peephole for TEST_ZERO: same as CMP #0 pattern */ + if (cbz_enabled) + { + MopArgs cbz_a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + if (!cbz_a.src1.is_64bit && cbz_a.src1.kind == MACH_OP_REG && !cbz_a.src1.needs_deref && + cbz_a.src1.u.reg.r0 >= 0 && cbz_a.src1.u.reg.r0 <= 7) + { + int next_j = i + 1; + while (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_NOP) + next_j++; + if (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_JUMPIF) + { + IROperand jc = tcc_ir_op_get_src1(ir, &ir->compact_instructions[next_j]); + int ct = (int)irop_get_imm64_ex(ir, jc); + if (ct == 0x94 || ct == 0x95) + { + IROperand jdest = tcc_ir_op_get_dest(ir, &ir->compact_instructions[next_j]); + int target_ir = irop_is_none(jdest) ? 
-1 : (int)jdest.u.imm32; + if (target_ir > i && target_ir < (int)ir->ir_to_code_mapping_size) + { + int cbz_in_range = 0; + if (cbz_dry_mapping) + { + int estimated_dist = (int)(ir_to_code_mapping[target_ir] - ind); + int dry_dist = (int)(cbz_dry_mapping[target_ir] - cbz_dry_mapping[i]); + cbz_in_range = (dry_dist >= 4 && dry_dist <= 126 && estimated_dist >= 0); + } + else + { + int ir_gap = target_ir - i; + int est = ir_gap * 10 + tcc_gen_machine_pending_pool_size(); + cbz_in_range = (ir_gap >= 1 && est <= 126); + } + if (cbz_in_range) + { + codegen_cbz_reg = cbz_a.src1.u.reg.r0; + codegen_cbz_nonzero = (ct == 0x95); + break; + } + } + } + } + } + } + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + SCRATCH_WRAP(tcc_gen_machine_muldiv_mop(a.src1, a.src2, a.dest, cq->op)); break; } case TCCIR_OP_MLA: { - IROperand accum_ir = ir->iroperand_pool[cq->operand_base + 3]; - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_accum = machine_op_from_ir(ir, &accum_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_mla_mop(mop_src1, mop_src2, mop_dest, mop_accum); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1, .accum = 1); + if (TCC_LOG_LS) { + IROperand accum_ir_dbg = ir->iroperand_pool[cq->operand_base + 3]; + int vr_dbg = irop_get_vreg(accum_ir_dbg); + IRLiveInterval *li_dbg = (vr_dbg > 0 && tcc_ir_vreg_is_valid(ir, vr_dbg)) ? tcc_ir_vreg_live_interval(ir, vr_dbg) : (IRLiveInterval*)0; + LOG_LS("MLA accum: vreg=0x%x type=%d pos=%d tag=%d alloc.r0=%d alloc.off=%d mop.kind=%d mop.off=%d", + vr_dbg, TCCIR_DECODE_VREG_TYPE(vr_dbg), TCCIR_DECODE_VREG_POSITION(vr_dbg), + irop_get_tag(accum_ir_dbg), + li_dbg ? li_dbg->allocation.r0 : -99, + li_dbg ? 
li_dbg->allocation.offset : -99, + a.accum.kind, a.accum.kind == MACH_OP_SPILL ? a.accum.u.spill.offset : -99); + } + if (a.dest.is_64bit) + { + SCRATCH_WRAP({ + int fused = tcc_gen_machine_mlal_accum_mop(a.src1, a.src2, a.accum, a.dest, !a.dest.is_unsigned); + if (!fused) + tcc_error("compiler_error: unable to lower 64-bit MLA"); + }); + } + else + { + SCRATCH_WRAP(tcc_gen_machine_mla_mop(a.src1, a.src2, a.dest, a.accum)); + } break; } case TCCIR_OP_UMULL: + case TCCIR_OP_SMULL: { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_umull_mop(mop_src1, mop_src2, mop_dest); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + + /* Peephole: (S/U)MULL feeding a single 64-bit ADD into the same + * accumulator pair maps directly to (S/U)MLAL. 
*/ + if (a.dest.vreg >= 0 && ir_codegen_count_vreg_uses(ir, a.dest.vreg) == 1) + { + int next_j = i + 1; + while (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_NOP) + next_j++; + if (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_ADD && + !ir->compact_instructions[next_j].is_jump_target) + { + IRQuadCompact *nq = &ir->compact_instructions[next_j]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_j, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1}); + + MachineOperand *accum = NULL; + if (b.src1.vreg == a.dest.vreg) + accum = &b.src2; + else if (b.src2.vreg == a.dest.vreg) + accum = &b.src1; + + if (accum && b.dest.is_64bit && accum->is_64bit) + { + tcc_gen_machine_insn_scratch_reset(); + int fused = tcc_gen_machine_mlal_accum_mop(a.src1, a.src2, *accum, b.dest, cq->op == TCCIR_OP_SMULL); + if (fused) + { + ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + i = next_j; + break; + } + } + + if (accum && accum->is_64bit && irop_get_vreg(n_dest_ir) >= 0) + { + int store_j = next_j + 1; + while (store_j < ir->next_instruction_index && ir->compact_instructions[store_j].op == TCCIR_OP_NOP) + store_j++; + if (store_j < ir->next_instruction_index && ir->compact_instructions[store_j].op == TCCIR_OP_STORE && + !ir->compact_instructions[store_j].is_jump_target) + { + IRQuadCompact *sq = &ir->compact_instructions[store_j]; + IROperand st_src_ir = tcc_ir_op_get_src1(ir, sq); + IROperand st_dest_ir = tcc_ir_op_get_dest(ir, sq); + if (irop_get_vreg(st_src_ir) == irop_get_vreg(n_dest_ir) && + irop_get_vreg(st_dest_ir) == accum->vreg && + ir_codegen_count_vreg_uses(ir, irop_get_vreg(n_dest_ir)) == 1) + { + tcc_gen_machine_insn_scratch_reset(); + int fused = + 
tcc_gen_machine_mlal_accum_mop(a.src1, a.src2, *accum, *accum, cq->op == TCCIR_OP_SMULL); + if (fused) + { + ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + i = store_j; + break; + } + } + } + } + } + } + + if (cq->op == TCCIR_OP_UMULL) + SCRATCH_WRAP(tcc_gen_machine_umull_mop(a.src1, a.src2, a.dest)); + else + SCRATCH_WRAP(tcc_gen_machine_smull_mop(a.src1, a.src2, a.dest)); break; } case TCCIR_OP_ADD: case TCCIR_OP_SUB: + { + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + /* Peephole: if next instruction is CMP #0 of the same dest vreg, + * force flag-setting encoding for this ADD/SUB and skip the CMP. + * ARM Thumb SUBS/ADDS sets Z flag which replaces CMP Rd, #0. + * Don't NOP the CMP — use skip index so both dry/real runs agree. */ + if (i + 1 < ir->next_instruction_index) + { + IRQuadCompact *nq = &ir->compact_instructions[i + 1]; + if (nq->op == TCCIR_OP_CMP) + { + IROperand cmp_s1 = tcc_ir_op_get_src1(ir, nq); + IROperand cmp_s2 = tcc_ir_op_get_src2(ir, nq); + /* Only safe for EQ/NE conditions (Z flag only). */ + int next_jmpif_idx = i + 2; + while (next_jmpif_idx < ir->next_instruction_index && + ir->compact_instructions[next_jmpif_idx].op == TCCIR_OP_NOP) + next_jmpif_idx++; + int cond_safe = 0; + if (next_jmpif_idx < ir->next_instruction_index && + ir->compact_instructions[next_jmpif_idx].op == TCCIR_OP_JUMPIF) + { + IROperand jc = tcc_ir_op_get_src1(ir, &ir->compact_instructions[next_jmpif_idx]); + int ct = (int)irop_get_imm64_ex(ir, jc); + cond_safe = (ct == 0x94 || ct == 0x95); /* TOK_EQ or TOK_NE */ + } + /* Block when CMP src1 is a TEMP used as a pointer dereference + * (is_lval + TEMP type = *ptr, tests memory not the pointer). + * Allow VAR operands with is_lval (load from stack = same value). 
*/ + int cmp_is_ptr_deref = irop_op_is_lval(cmp_s1) && + TCCIR_DECODE_VREG_TYPE(irop_get_vreg(cmp_s1)) == TCCIR_VREG_TYPE_TEMP; + /* 64-bit only: a flag-setting 64-bit SUB/ADD lowers to + * `subs lo; sbc hi` (or adds/adc) where only the low-word op sets + * flags — `sbc`/`adc` do not. So Z reflects only the low word and + * cannot replace a full-width `CMP Rd,#0` for an EQ/NE branch + * (miscompile: 920501-6's `for(b=0,s=t; b++,(s>>=1)!=0;)` exited + * after one iteration). Keep the CMP, which the 64-bit EQ/NE + * peephole below lowers correctly via cmp_eq64. */ + if (cond_safe && !cmp_is_ptr_deref && + !a.src1.is_64bit && !a.dest.is_64bit && + irop_is_immediate(cmp_s2) && irop_get_imm64_ex(ir, cmp_s2) == 0 && + irop_has_vreg(cmp_s1) && + irop_get_vreg(cmp_s1) == irop_get_vreg(dest_ir)) + { + SCRATCH_WRAP(tcc_gen_machine_data_processing_mop_flags(a.src1, a.src2, a.dest, cq->op)); + codegen_skip_cmp = i + 1; + ir->codegen_flags_live = 1; + break; + } + } + } + { + uint32_t bs = ir->barrel_shifts ? ir->barrel_shifts[cq->orig_index] : 0; + SCRATCH_WRAP(tcc_gen_machine_data_processing_mop(a.src1, a.src2, a.dest, cq->op, bs)); + } + break; + } case TCCIR_OP_CMP: + if (i == codegen_skip_cmp) + { + codegen_skip_cmp = -1; + break; + } + /* CBZ/CBNZ peephole: CMP rN, #0 followed by JUMPIF EQ/NE. + * Only in real pass with valid dry-run distance estimates. 
*/ + if (cbz_enabled) + { + MopArgs cbz_a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + if (cbz_a.src2.kind == MACH_OP_IMM && cbz_a.src2.u.imm.val == 0 && !cbz_a.src1.is_64bit && + cbz_a.src1.kind == MACH_OP_REG && !cbz_a.src1.needs_deref && cbz_a.src1.u.reg.r0 >= 0 && + cbz_a.src1.u.reg.r0 <= 7) + { + int next_j = i + 1; + while (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_NOP) + next_j++; + if (next_j < ir->next_instruction_index && ir->compact_instructions[next_j].op == TCCIR_OP_JUMPIF) + { + IROperand jc = tcc_ir_op_get_src1(ir, &ir->compact_instructions[next_j]); + int ct = (int)irop_get_imm64_ex(ir, jc); + if (ct == 0x94 || ct == 0x95) /* TOK_EQ or TOK_NE */ + { + IROperand jdest = tcc_ir_op_get_dest(ir, &ir->compact_instructions[next_j]); + int target_ir = irop_is_none(jdest) ? -1 : (int)jdest.u.imm32; + if (target_ir > i && target_ir < (int)ir->ir_to_code_mapping_size) + { + int cbz_in_range = 0; + if (cbz_dry_mapping) + { + int estimated_dist = (int)(ir_to_code_mapping[target_ir] - ind); + int dry_dist = (int)(cbz_dry_mapping[target_ir] - cbz_dry_mapping[i]); + cbz_in_range = (dry_dist >= 4 && dry_dist <= 126 && estimated_dist >= 0); + } + else + { + int ir_gap = target_ir - i; + int est = ir_gap * 10 + tcc_gen_machine_pending_pool_size(); + cbz_in_range = (ir_gap >= 1 && est <= 126); + } + if (cbz_in_range) + { + codegen_cbz_reg = cbz_a.src1.u.reg.r0; + codegen_cbz_nonzero = (ct == 0x95); /* NE → CBNZ */ + break; /* skip emitting CMP */ + } + } + } + } + } + } + /* 64-bit EQ/NE peephole: CMP pair followed by SETIF/JUMPIF/SELECT EQ/NE. + * Use CMP+IT+CMPEQ instead of CMP+SBCS for correct Z flag. */ + { + MopArgs eq_a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + if (eq_a.src1.is_64bit) + { + /* Skip NOPs and flag-neutral register copies (ASSIGN lowers to + * `mov`, which preserves flags) when searching for the condition + * consumer. 
After const-prop folds `CMP; SETIF; TEST_ZERO; JUMPIF` + * into `CMP; JUMPIF`, phi-resolution ASSIGNs for loop-carried + * variables get scheduled between the CMP and the JUMPIF; without + * skipping them this peephole would miss the EQ/NE consumer and + * fall back to the relational SBCS lowering, whose Z flag reflects + * only the high word — wrong for a 64-bit equality test + * (920501-6: `for(b=0,s=t; b++,(s>>=1)!=0;)` exited after one + * iteration). The relational path already relies on these ASSIGNs + * preserving the CMP's flags up to the branch, so skipping them + * here is consistent. */ + int next_j = i + 1; + while (next_j < ir->next_instruction_index && + (ir->compact_instructions[next_j].op == TCCIR_OP_NOP || + ir->compact_instructions[next_j].op == TCCIR_OP_ASSIGN)) + next_j++; + if (next_j < ir->next_instruction_index) + { + TccIrOp next_op = ir->compact_instructions[next_j].op; + IROperand nc; + int has_cond = 0; + if (next_op == TCCIR_OP_SETIF || next_op == TCCIR_OP_JUMPIF) + { + nc = tcc_ir_op_get_src1(ir, &ir->compact_instructions[next_j]); + has_cond = 1; + } + else if (next_op == TCCIR_OP_SELECT) + { + nc = tcc_ir_op_get_cond(ir, &ir->compact_instructions[next_j]); + has_cond = 1; + } + if (has_cond) + { + int next_cond = (int)irop_get_imm64_ex(ir, nc); + if (next_cond == TOK_EQ || next_cond == TOK_NE) + { + SCRATCH_WRAP(tcc_gen_machine_cmp_eq64_mop(eq_a.src1, eq_a.src2)); + break; + } + } + } + } + } + /* SUBS+IT peephole: CMP x, #K immediately followed by + * `T <-- #1 SELECT #0` (cond=NE) or `T <-- #0 SELECT #1` (cond=EQ) + * collapses cmp+ite+movne+moveq (4 instr) into subs+it+movne (3 instr). + * The SUBS sets flags AND result in one shot: on EQ the result is 0 + * (matches the "else" arm), on NE the IT-MOVNE overwrites with 1. 
*/ + { + MopArgs subs_a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + if (!subs_a.src1.is_64bit && subs_a.src2.kind == MACH_OP_IMM && + subs_a.src1.kind == MACH_OP_REG && !subs_a.src1.needs_deref) { + int next_j = i + 1; + while (next_j < ir->next_instruction_index && + ir->compact_instructions[next_j].op == TCCIR_OP_NOP) + next_j++; + if (next_j < ir->next_instruction_index && + ir->compact_instructions[next_j].op == TCCIR_OP_SELECT) { + IRQuadCompact *sq = &ir->compact_instructions[next_j]; + IROperand sel_s1 = tcc_ir_op_get_src1(ir, sq); + IROperand sel_s2 = tcc_ir_op_get_src2(ir, sq); + IROperand sel_cond = tcc_ir_op_get_cond(ir, sq); + int sel_cc = (int)irop_get_imm64_ex(ir, sel_cond); + int v1 = irop_is_immediate(sel_s1) ? (int)irop_get_imm64_ex(ir, sel_s1) : -1; + int v2 = irop_is_immediate(sel_s2) ? (int)irop_get_imm64_ex(ir, sel_s2) : -1; + int matches = (v1 == 1 && v2 == 0 && sel_cc == TOK_NE) || + (v1 == 0 && v2 == 1 && sel_cc == TOK_EQ); + if (matches) { + IROperand sel_dest = tcc_ir_op_get_dest(ir, sq); + MachineOperand sd = machine_op_from_ir(ir, &sel_dest); + if (sd.kind == MACH_OP_REG && !sd.needs_deref && sd.u.reg.r0 >= 0) { + if (tcc_gen_machine_subs_eq_select_01(subs_a.src1, subs_a.src2, sd)) { + ir->codegen_flags_live = 0; + codegen_skip_select = next_j; + break; /* skip normal CMP emission */ + } + } + } + } + } + } + /* fall through */ case TCCIR_OP_SHL: case TCCIR_OP_SHR: case TCCIR_OP_SAR: + case TCCIR_OP_ROR: case TCCIR_OP_OR: case TCCIR_OP_AND: case TCCIR_OP_XOR: case TCCIR_OP_ADC_GEN: case TCCIR_OP_ADC_USE: { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_data_processing_mop(mop_src1, mop_src2, mop_dest, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 
1); + { + uint32_t bs = ir->barrel_shifts ? ir->barrel_shifts[cq->orig_index] : 0; + /* For 64-bit shifts, pass dead-half annotations in bits 16-17 so the + * emitter can skip the dead low/high word write. */ + if ((cq->op == TCCIR_OP_SHL || cq->op == TCCIR_OP_SHR || cq->op == TCCIR_OP_SAR) && + ir->shift64_dead_half) + bs |= (uint32_t)ir->shift64_dead_half[cq->orig_index] << 16; + SCRATCH_WRAP(tcc_gen_machine_data_processing_mop(a.src1, a.src2, a.dest, cq->op, bs)); + } + break; + } + case TCCIR_OP_UBFX: + { + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + SCRATCH_WRAP(tcc_gen_machine_ubfx_mop(a.src1, a.src2, a.dest)); + break; + } + case TCCIR_OP_BFI: + { + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + uint32_t params = ir->bfi_params ? ir->bfi_params[cq->orig_index] : 0; + SCRATCH_WRAP(tcc_gen_machine_bfi_mop(a.src1, a.src2, a.dest, params)); break; } case TCCIR_OP_FADD: @@ -1440,151 +2983,1017 @@ void tcc_ir_codegen_generate(TCCIRState *ir) case TCCIR_OP_CVT_ITOF: case TCCIR_OP_CVT_FTOI: { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_fp_mop(mop_src1, mop_src2, mop_dest, cq->op, src1_ir.is_complex || dest_ir.is_complex); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + tcc_gen_machine_fp_mop(a.src1, a.src2, a.dest, cq->op, src1_ir.is_complex || dest_ir.is_complex); break; } case TCCIR_OP_LOAD: { - MachineOperand mop_dest; - if (!ir_codegen_before_ret_peephole(ir, i, &dest_ir, has_incoming_jump, &mop_dest)) - mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - mop_fixup_subcomponent(&mop_src, &src1_ir, ir); - if (mop_dest.kind == MACH_OP_NONE || mop_src.kind == MACH_OP_NONE) + MopArgs a = DECODE(.dest = 2, .src1 = 2); + if (a.dest.kind == MACH_OP_NONE || a.src1.kind == MACH_OP_NONE) tcc_error("compiler_error: LOAD operand 
produced MACH_OP_NONE (i=%d dest_kind=%d src_kind=%d)", i, - mop_dest.kind, mop_src.kind); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_mop(mop_src, mop_dest, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + a.dest.kind, a.src1.kind); + + /* Block copy peephole: consecutive LOAD-from-spill + STORE-to-spill pairs + * with sequential offsets → single LDM/STM block copy. + * Safety: all loads must use the same destination register, proving each + * loaded value is dead after the store (just a temporary for the copy). + * If different registers are used, the values are live past the copy. */ + if (a.dest.kind == MACH_OP_REG && !a.dest.needs_deref && + a.src1.kind == MACH_OP_SPILL && !a.src1.needs_deref && !a.src1.is_64bit && + (a.src1.btype == IROP_BTYPE_INT32 || a.src1.btype == IROP_BTYPE_FLOAT32) && + (a.src1.u.spill.offset & 3) == 0) + { + int first_load_reg = a.dest.u.reg.r0; + int store_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + store_i = j; + break; + } + } + if (store_i >= 0 && ir->compact_instructions[store_i].op == TCCIR_OP_STORE && + !ir->compact_instructions[store_i].is_jump_target) + { + IRQuadCompact *sq = &ir->compact_instructions[store_i]; + IROperand s_src1 = tcc_ir_op_get_src1(ir, sq); + IROperand s_src2 = tcc_ir_op_get_src2(ir, sq); + IROperand s_dest = tcc_ir_op_get_dest(ir, sq); + MopArgs sa = ir_decode_cached(is_dry_run, 0, NULL, store_i, ir, sq, + &s_src1, &s_src2, &s_dest, + (MopSpec){.dest = 1, .src1 = 2}); + + if (sa.dest.kind == MACH_OP_SPILL && !sa.dest.needs_deref && !sa.src1.is_64bit && + sa.src1.kind == MACH_OP_REG && sa.src1.u.reg.r0 == first_load_reg && + (sa.dest.btype == IROP_BTYPE_INT32 || sa.dest.btype == IROP_BTYPE_FLOAT32) && + (sa.dest.u.spill.offset & 3) == 0) + { + int32_t src_base = a.src1.u.spill.offset; + int32_t dst_base = sa.dest.u.spill.offset; + int count = 1; + int 
last_i = store_i; + + while (count < 32) + { + int next_load_i = -1; + for (int j = last_i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_load_i = j; + break; + } + } + if (next_load_i < 0 || ir->compact_instructions[next_load_i].op != TCCIR_OP_LOAD || + ir->compact_instructions[next_load_i].is_jump_target) + break; + + IRQuadCompact *lq = &ir->compact_instructions[next_load_i]; + IROperand l_src1 = tcc_ir_op_get_src1(ir, lq); + IROperand l_src2 = tcc_ir_op_get_src2(ir, lq); + IROperand l_dest = tcc_ir_op_get_dest(ir, lq); + MopArgs la = ir_decode_cached(is_dry_run, 0, NULL, next_load_i, ir, lq, + &l_src1, &l_src2, &l_dest, + (MopSpec){.dest = 1, .src1 = 2}); + + if (la.src1.kind != MACH_OP_SPILL || la.src1.needs_deref || la.src1.is_64bit || + la.src1.u.spill.offset != src_base + count * 4 || + (la.src1.btype != IROP_BTYPE_INT32 && la.src1.btype != IROP_BTYPE_FLOAT32)) + break; + + if (la.dest.kind != MACH_OP_REG || la.dest.u.reg.r0 != first_load_reg) + break; + + int next_store_i = -1; + for (int j = next_load_i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_store_i = j; + break; + } + } + if (next_store_i < 0 || ir->compact_instructions[next_store_i].op != TCCIR_OP_STORE || + ir->compact_instructions[next_store_i].is_jump_target) + break; + + IRQuadCompact *sq2 = &ir->compact_instructions[next_store_i]; + IROperand s2_src1 = tcc_ir_op_get_src1(ir, sq2); + IROperand s2_src2 = tcc_ir_op_get_src2(ir, sq2); + IROperand s2_dest = tcc_ir_op_get_dest(ir, sq2); + MopArgs sa2 = ir_decode_cached(is_dry_run, 0, NULL, next_store_i, ir, sq2, + &s2_src1, &s2_src2, &s2_dest, + (MopSpec){.dest = 1, .src1 = 2}); + + if (sa2.dest.kind != MACH_OP_SPILL || sa2.dest.needs_deref || sa2.src1.is_64bit || + sa2.dest.u.spill.offset != dst_base + count * 4 || + sa2.src1.kind != MACH_OP_REG || sa2.src1.u.reg.r0 != first_load_reg || + (sa2.dest.btype != 
IROP_BTYPE_INT32 && sa2.dest.btype != IROP_BTYPE_FLOAT32)) + break; + + count++; + last_i = next_store_i; + } + + if (count >= 8) + { + SCRATCH_WRAP(tcc_gen_machine_spill_block_copy(src_base, dst_base, count)); + i = last_i; + break; + } + } + } + } + + SCRATCH_WRAP(tcc_gen_machine_load_mop(a.src1, a.dest, cq->op)); break; } case TCCIR_OP_STORE: { - MachineOperand mop_dest_s = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src_s = machine_op_from_ir(ir, &src1_ir); - mop_fixup_subcomponent(&mop_src_s, &src1_ir, ir); - if (mop_dest_s.kind == MACH_OP_NONE || mop_src_s.kind == MACH_OP_NONE) + MopArgs a = DECODE(.dest = 1, .src1 = 2); + if (a.dest.kind == MACH_OP_NONE || a.src1.kind == MACH_OP_NONE) tcc_error("compiler_error: STORE operand produced MACH_OP_NONE (i=%d dest_kind=%d src_kind=%d)", i, - mop_dest_s.kind, mop_src_s.kind); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_mop(mop_dest_s, mop_src_s, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + a.dest.kind, a.src1.kind); + + /* STRD peephole: if this is a 32-bit store to a spill slot and the + * very next non-NOP instruction is also a 32-bit store to an adjacent + * (+4) spill slot, emit STRD for both and skip the second. 
*/ + if (a.dest.kind == MACH_OP_SPILL && !a.dest.needs_deref && + a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && + (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32) && + (a.dest.u.spill.offset & 3) == 0) + { + /* Find next non-NOP instruction */ + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE && + !ir->compact_instructions[next_i].is_jump_target) + { + /* Decode the next store's operands */ + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, + &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 2}); + + if (b.dest.kind == MACH_OP_SPILL && !b.dest.needs_deref && + b.src1.kind == MACH_OP_REG && !b.src1.is_64bit && + (b.dest.btype == IROP_BTYPE_INT32 || b.dest.btype == IROP_BTYPE_FLOAT32) && + (b.dest.u.spill.offset & 3) == 0) + { + int32_t off1 = a.dest.u.spill.offset; + int32_t off2 = b.dest.u.spill.offset; + int reg1 = a.src1.u.reg.r0; + int reg2 = b.src1.u.reg.r0; + + if (off1 + 4 == off2) + { + if (tcc_gen_machine_try_strd_spill(reg1, reg2, off1, off2)) + { + /* Skip the next store — advance i past NOPs and the paired store */ + i = next_i; + break; + } + } + else if (off2 + 4 == off1) + { + if (tcc_gen_machine_try_strd_spill(reg2, reg1, off2, off1)) + { + i = next_i; + break; + } + } + } + } + } + + /* STRD peephole (immediate-to-spill form): two consecutive stores of + * immediate constants to adjacent spill slots → single STRD. + * The helper materializes the constants into scratch registers. 
*/ + if (a.dest.kind == MACH_OP_SPILL && !a.dest.needs_deref && + a.src1.kind == MACH_OP_IMM && !a.src1.is_64bit && + (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32) && + (a.dest.u.spill.offset & 3) == 0) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE && + !ir->compact_instructions[next_i].is_jump_target) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, + &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 2}); + + if (b.dest.kind == MACH_OP_SPILL && !b.dest.needs_deref && + b.src1.kind == MACH_OP_IMM && !b.src1.is_64bit && + (b.dest.btype == IROP_BTYPE_INT32 || b.dest.btype == IROP_BTYPE_FLOAT32) && + (b.dest.u.spill.offset & 3) == 0) + { + int32_t off1 = a.dest.u.spill.offset; + int32_t off2 = b.dest.u.spill.offset; + int64_t val1 = a.src1.u.imm.val; + int64_t val2 = b.src1.u.imm.val; + int strd_ok = 0; + + if (off1 + 4 == off2) + { + SCRATCH_WRAP(strd_ok = tcc_gen_machine_try_strd_imm_spill(val1, val2, off1, off2)); + } + else if (off2 + 4 == off1) + { + SCRATCH_WRAP(strd_ok = tcc_gen_machine_try_strd_imm_spill(val2, val1, off2, off1)); + } + if (strd_ok) + { + i = next_i; + break; + } + } + } + } + + /* STRD peephole (deref-through-vreg form): pair a plain STORE + * through a register-deref destination (offset 0 implicit) with an + * immediately-following STORE_INDEXED through the same base vreg + * at offset +4. Mirrors the spill-slot STRD peephole above; the + * disp-fusion turns "ADD base+N; STORE *(...) 
<- v" into + * STORE_INDEXED, but the off=0 store stays plain STORE — so the + * existing STORE_INDEXED-only peephole misses the pair. */ + if (a.dest.kind == MACH_OP_REG && a.dest.needs_deref && + a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && + (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32)) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + /* is_jump_target misses some branch targets (see branch_target_reset); + * consuming a branch-target store removes the label's only emission + * point, so branches to it backpatch against code address 0. */ + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE_INDEXED && + !ir->compact_instructions[next_i].is_jump_target && + !(branch_target_reset && branch_target_reset[next_i])) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); + + if (!b.src1.is_64bit && b.src1.kind == MACH_OP_REG && + b.scale.kind == MACH_OP_IMM && b.scale.u.imm.val == 0 && + b.src2.kind == MACH_OP_IMM && + b.dest.kind == MACH_OP_REG && !b.dest.needs_deref && + (b.src1.btype == IROP_BTYPE_INT32 || b.src1.btype == IROP_BTYPE_FLOAT32) && + a.dest.u.reg.r0 == b.dest.u.reg.r0) + { + int reg1 = a.src1.u.reg.r0; + int reg2 = b.src1.u.reg.r0; + int base_reg = a.dest.u.reg.r0; + int32_t off2 = (int32_t)b.src2.u.imm.val; + + if (off2 == 4) + { + if (tcc_gen_machine_try_strd_base(reg1, reg2, base_reg, 0)) + { + i = next_i; + break; + } + } + } + } + } + + /* STRD peephole (deref-through-vreg form, IMM sources): pair a plain + * STORE of an immediate through a 
register-deref destination (offset 0) + * with an immediately-following STORE_INDEXED of an immediate through + * the same base vreg at offset +4. Mirrors the REG-source variant + * above; here both values are constants materialised into scratch regs + * before the paired store. */ + if (a.dest.kind == MACH_OP_REG && a.dest.needs_deref && + a.src1.kind == MACH_OP_IMM && !a.src1.is_64bit && + (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32)) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + /* is_jump_target misses some branch targets (see branch_target_reset); + * consuming a branch-target store removes the label's only emission + * point, so branches to it backpatch against code address 0. */ + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE_INDEXED && + !ir->compact_instructions[next_i].is_jump_target && + !(branch_target_reset && branch_target_reset[next_i])) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); + + if (b.src1.kind == MACH_OP_IMM && !b.src1.is_64bit && + b.scale.kind == MACH_OP_IMM && b.scale.u.imm.val == 0 && + b.src2.kind == MACH_OP_IMM && + b.dest.kind == MACH_OP_REG && !b.dest.needs_deref && + (b.src1.btype == IROP_BTYPE_INT32 || b.src1.btype == IROP_BTYPE_FLOAT32) && + a.dest.u.reg.r0 == b.dest.u.reg.r0) + { + int32_t off2 = (int32_t)b.src2.u.imm.val; + if (off2 == 4) + { + if (tcc_gen_machine_try_strd_imm_base(a.src1.u.imm.val, b.src1.u.imm.val, + a.dest.u.reg.r0, 0)) + { + i = next_i; + break; + } + } + } + } + } + + /* Store-load forwarding: 
STORE reg → spill followed immediately by + * LOAD from the same spill → same reg. The value is still in the + * register, so emit the store but skip the redundant load. */ + if (a.dest.kind == MACH_OP_SPILL && !a.dest.needs_deref && + a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && + (a.dest.u.spill.offset & 3) == 0) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_LOAD && + !ir->compact_instructions[next_i].is_jump_target) + { + IRQuadCompact *lq = &ir->compact_instructions[next_i]; + IROperand l_src1 = tcc_ir_op_get_src1(ir, lq); + IROperand l_src2 = tcc_ir_op_get_src2(ir, lq); + IROperand l_dest = tcc_ir_op_get_dest(ir, lq); + MopArgs la = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, lq, + &l_src1, &l_src2, &l_dest, + (MopSpec){.dest = 1, .src1 = 1}); + + if (la.src1.kind == MACH_OP_SPILL && !la.src1.needs_deref && + la.src1.u.spill.offset == a.dest.u.spill.offset && + la.dest.kind == MACH_OP_REG && !la.dest.is_64bit && + la.dest.u.reg.r0 == a.src1.u.reg.r0 && + la.src1.btype == a.dest.btype) + { + SCRATCH_WRAP(tcc_gen_machine_store_mop(a.dest, a.src1, cq->op)); + i = next_i; + break; + } + } + } + + SCRATCH_WRAP(tcc_gen_machine_store_mop(a.dest, a.src1, cq->op)); break; } case TCCIR_OP_LOAD_INDEXED: { - MachineOperand mop_dest; - if (!ir_codegen_before_ret_peephole(ir, i, &dest_ir, has_incoming_jump, &mop_dest)) - mop_dest = machine_op_from_ir(ir, &dest_ir); - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_base = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_indexed_mop(mop_dest, mop_base, mop_index, mop_scale, cq->op); - ir_codegen_track_scratch(is_dry_run, i, 
cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 2, .src1 = 1, .src2 = 1, .scale = 1); + + /* LDRD pairing: two adjacent 32-bit LOAD_INDEXED ops with the same + * base register, scale=0, and constant offsets differing by 4 can + * fold to a single LDRD. Mirrors the SPILL-slot LDRD peephole + * above; the offset is a generic [base, #imm] so we use the + * non-spill `try_ldrd_base` wrapper. */ + if (!a.dest.is_64bit && a.dest.kind == MACH_OP_REG && + a.scale.kind == MACH_OP_IMM && a.scale.u.imm.val == 0 && + a.src2.kind == MACH_OP_IMM && + a.src1.kind == MACH_OP_REG && !a.src1.needs_deref && + (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32)) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_LOAD_INDEXED && + !ir->compact_instructions[next_i].is_jump_target) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 2, .src1 = 1, .src2 = 1, .scale = 1}); + + if (!b.dest.is_64bit && b.dest.kind == MACH_OP_REG && + b.scale.kind == MACH_OP_IMM && b.scale.u.imm.val == 0 && + b.src2.kind == MACH_OP_IMM && + b.src1.kind == MACH_OP_REG && !b.src1.needs_deref && + (b.dest.btype == IROP_BTYPE_INT32 || b.dest.btype == IROP_BTYPE_FLOAT32) && + a.src1.u.reg.r0 == b.src1.u.reg.r0) + { + int32_t off1 = (int32_t)a.src2.u.imm.val; + int32_t off2 = (int32_t)b.src2.u.imm.val; + int reg1 = a.dest.u.reg.r0; + int reg2 = b.dest.u.reg.r0; + int base_reg = a.src1.u.reg.r0; + + /* LDRD writes Rt before Rt2; if Rt overlaps the base reg the + * second load reads from a clobbered 
base. Punt those cases. */ + if (reg1 != reg2 && reg1 != base_reg && reg2 != base_reg) + { + if ((off1 & 3) == 0 && off1 + 4 == off2) + { + if (tcc_gen_machine_try_ldrd_base(reg1, reg2, base_reg, off1)) + { + i = next_i; + break; + } + } + else if ((off2 & 3) == 0 && off2 + 4 == off1) + { + if (tcc_gen_machine_try_ldrd_base(reg2, reg1, base_reg, off2)) + { + i = next_i; + break; + } + } + } + } + } + } + + SCRATCH_WRAP(tcc_gen_machine_load_indexed_mop(a.dest, a.src1, a.src2, a.scale, cq->op)); break; } case TCCIR_OP_STORE_INDEXED: { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_base = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_indexed_mop(mop_base, mop_index, mop_scale, mop_value, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1, .scale = 1); + + /* STRD pairing peephole: two adjacent 32-bit STORE_INDEXED ops with + * same base, scale=0, offsets differing by 4 → single STRD. + * Only for REG sources — IMM STRD through generic base registers is + * unsafe because STRD requires 4-byte aligned addresses while + * individual STR tolerates unaligned access on ARMv8-M. 
*/ + if (!a.src1.is_64bit && a.src1.kind == MACH_OP_REG && + a.scale.kind == MACH_OP_IMM && a.scale.u.imm.val == 0 && + a.src2.kind == MACH_OP_IMM && + a.dest.kind == MACH_OP_REG && !a.dest.needs_deref && + (a.src1.btype == IROP_BTYPE_INT32 || a.src1.btype == IROP_BTYPE_FLOAT32)) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + int jop = ir->compact_instructions[j].op; + if (jop == TCCIR_OP_NOP) + continue; + /* An ASSIGN or pure-vreg LOAD whose src and dst materialise to + * the same physical register emits no code (mov elision in + * load/assign codegen). Skip these so adjacent STORE_INDEXEDs + * can still pair as STRD even with a no-op copy between them + * (move-coalesced inlined swap_adjacent / similar patterns). */ + if (jop == TCCIR_OP_ASSIGN || jop == TCCIR_OP_LOAD) { + IRQuadCompact *jq = &ir->compact_instructions[j]; + IROperand jds = tcc_ir_op_get_src1(ir, jq); + IROperand jdd = tcc_ir_op_get_dest(ir, jq); + /* Identity check: src is a vreg / stack-local-as-mov, dst is + * a vreg, both end up in the same hw register. */ + if (jdd.tag == IROP_TAG_VREG && !jdd.is_lval && + (jds.tag == IROP_TAG_VREG || + (jds.tag == IROP_TAG_STACKOFF && jds.is_local)) && + !jds.is_llocal && !jds.is_sym) { + MachineOperand sm = machine_op_from_ir(ir, &jds); + MachineOperand dm = machine_op_from_ir(ir, &jdd); + if (sm.kind == MACH_OP_REG && dm.kind == MACH_OP_REG && + !sm.needs_deref && !dm.needs_deref && + sm.u.reg.r0 == dm.u.reg.r0 && sm.u.reg.r0 >= 0) + continue; /* identity move — skip */ + } + } + next_i = j; + break; + } + /* is_jump_target misses some branch targets (see branch_target_reset); + * consuming a branch-target store removes the label's only emission + * point, so branches to it backpatch against code address 0. 
*/ + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE_INDEXED && + !ir->compact_instructions[next_i].is_jump_target && + !(branch_target_reset && branch_target_reset[next_i])) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); + + if (!b.src1.is_64bit && b.src1.kind == MACH_OP_REG && + b.scale.kind == MACH_OP_IMM && b.scale.u.imm.val == 0 && + b.src2.kind == MACH_OP_IMM && + b.dest.kind == MACH_OP_REG && !b.dest.needs_deref && + (b.src1.btype == IROP_BTYPE_INT32 || b.src1.btype == IROP_BTYPE_FLOAT32) && + a.dest.u.reg.r0 == b.dest.u.reg.r0) + { + int32_t off1 = (int32_t)a.src2.u.imm.val; + int32_t off2 = (int32_t)b.src2.u.imm.val; + int reg1 = a.src1.u.reg.r0; + int reg2 = b.src1.u.reg.r0; + int base_reg = a.dest.u.reg.r0; + + if ((off1 & 3) == 0 && off1 + 4 == off2) + { + if (tcc_gen_machine_try_strd_base(reg1, reg2, base_reg, off1)) + { + i = next_i; + break; + } + } + else if ((off2 & 3) == 0 && off2 + 4 == off1) + { + if (tcc_gen_machine_try_strd_base(reg2, reg1, base_reg, off2)) + { + i = next_i; + break; + } + } + } + } + } + + /* STRD pairing peephole for IMM-source STORE_INDEXED ops: two adjacent + * stores of immediate values to consecutive word-aligned offsets from + * the same base register → materialise constants into scratch regs, + * emit a single STRD. Mirrors the REG-source peephole above. 
*/ + if (a.src1.kind == MACH_OP_IMM && !a.src1.is_64bit && + a.scale.kind == MACH_OP_IMM && a.scale.u.imm.val == 0 && + a.src2.kind == MACH_OP_IMM && + a.dest.kind == MACH_OP_REG && !a.dest.needs_deref && + (a.src1.btype == IROP_BTYPE_INT32 || a.src1.btype == IROP_BTYPE_FLOAT32)) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + /* is_jump_target misses some branch targets (see branch_target_reset); + * consuming a branch-target store removes the label's only emission + * point, so branches to it backpatch against code address 0. */ + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_STORE_INDEXED && + !ir->compact_instructions[next_i].is_jump_target && + !(branch_target_reset && branch_target_reset[next_i])) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); + + if (b.src1.kind == MACH_OP_IMM && !b.src1.is_64bit && + b.scale.kind == MACH_OP_IMM && b.scale.u.imm.val == 0 && + b.src2.kind == MACH_OP_IMM && + b.dest.kind == MACH_OP_REG && !b.dest.needs_deref && + (b.src1.btype == IROP_BTYPE_INT32 || b.src1.btype == IROP_BTYPE_FLOAT32) && + a.dest.u.reg.r0 == b.dest.u.reg.r0) + { + int32_t off1 = (int32_t)a.src2.u.imm.val; + int32_t off2 = (int32_t)b.src2.u.imm.val; + int base_reg = a.dest.u.reg.r0; + + if ((off1 & 3) == 0 && off1 + 4 == off2) + { + if (tcc_gen_machine_try_strd_imm_base(a.src1.u.imm.val, b.src1.u.imm.val, + base_reg, off1)) + { + i = next_i; + break; + } + } + else if ((off2 & 3) == 0 && off2 + 4 == off1) + { + if (tcc_gen_machine_try_strd_imm_base(b.src1.u.imm.val, a.src1.u.imm.val, + 
base_reg, off2)) + { + i = next_i; + break; + } + } + } + } + } + + /* Byte-to-word coalescing peephole: four consecutive byte + * STORE_INDEXEDs with immediate sources to word-aligned consecutive + * offsets on the same base → single word store of the packed constant. + * Saves 3 constant loads + 3 strb → 1 movs + 1 str. */ + if (a.src1.kind == MACH_OP_IMM && !a.src1.is_64bit && + a.src1.btype == IROP_BTYPE_INT8 && + a.scale.kind == MACH_OP_IMM && a.scale.u.imm.val == 0 && + a.src2.kind == MACH_OP_IMM && + ((int32_t)a.src2.u.imm.val & 3) == 0) + { + int32_t base_off = (int32_t)a.src2.u.imm.val; + uint32_t combined = (uint32_t)(a.src1.u.imm.val & 0xFF); + int last_i = i; + int found = 0; + + for (int k = 1; k <= 3; k++) + { + int next_i = -1; + for (int j = last_i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + if (next_i < 0 || + ir->compact_instructions[next_i].op != TCCIR_OP_STORE_INDEXED || + ir->compact_instructions[next_i].is_jump_target) + break; + + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, + &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); + + if (b.src1.kind != MACH_OP_IMM || b.src1.is_64bit || + b.src1.btype != IROP_BTYPE_INT8 || + b.scale.kind != MACH_OP_IMM || b.scale.u.imm.val != 0 || + b.src2.kind != MACH_OP_IMM || + (int32_t)b.src2.u.imm.val != base_off + k) + break; + + if (b.dest.kind != a.dest.kind) + break; + if (a.dest.kind == MACH_OP_REG && + (b.dest.u.reg.r0 != a.dest.u.reg.r0 || b.dest.needs_deref != a.dest.needs_deref)) + break; + if (a.dest.kind == MACH_OP_FRAME_ADDR && + b.dest.u.frame.offset != a.dest.u.frame.offset) + break; + + combined |= (uint32_t)(b.src1.u.imm.val & 
0xFF) << (k * 8); + last_i = next_i; + found++; + } + + if (found == 3) + { + uint32_t combined2 = 0; + int last_i2 = last_i; + int found2 = 0; + for (int k = 0; k <= 3; k++) + { + int next_i2 = -1; + for (int j = last_i2 + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i2 = j; + break; + } + } + if (next_i2 < 0 || + ir->compact_instructions[next_i2].op != TCCIR_OP_STORE_INDEXED || + ir->compact_instructions[next_i2].is_jump_target) + break; + + IRQuadCompact *nq2 = &ir->compact_instructions[next_i2]; + IROperand ns1 = tcc_ir_op_get_src1(ir, nq2); + IROperand ns2 = tcc_ir_op_get_src2(ir, nq2); + IROperand nd = tcc_ir_op_get_dest(ir, nq2); + MopArgs c = ir_decode_cached(is_dry_run, 0, NULL, next_i2, ir, nq2, + &ns1, &ns2, &nd, + (MopSpec){.dest = 1, .src1 = 1, .src2 = 1, .scale = 1}); + + if (c.src1.kind != MACH_OP_IMM || c.src1.is_64bit || + c.src1.btype != IROP_BTYPE_INT8 || + c.scale.kind != MACH_OP_IMM || c.scale.u.imm.val != 0 || + c.src2.kind != MACH_OP_IMM || + (int32_t)c.src2.u.imm.val != base_off + 4 + k) + break; + if (c.dest.kind != a.dest.kind) + break; + if (a.dest.kind == MACH_OP_REG && + (c.dest.u.reg.r0 != a.dest.u.reg.r0 || c.dest.needs_deref != a.dest.needs_deref)) + break; + if (a.dest.kind == MACH_OP_FRAME_ADDR && + c.dest.u.frame.offset != a.dest.u.frame.offset) + break; + + combined2 |= (uint32_t)(c.src1.u.imm.val & 0xFF) << (k * 8); + last_i2 = next_i2; + found2++; + } + + if (found2 == 4) + { + /* All 8 bytes coalesced. Emit TWO 32-bit STRs, NOT an STRD: + * these stores originate from INT8 writes, so the destination + * has byte (1) alignment — e.g. zero-initialising an element of + * an array of 9-byte structs, where the base register holds + * `arr + i*9` and is unaligned for odd i. 
On ARMv7-M/v8-M a + * single STR tolerates an unaligned address (CCR.UNALIGN_TRP=0 + * by default) but STRD/LDRD ALWAYS fault when unaligned, so + * pairing into STRD off a register base (try_unroll_loop_ex's + * struct-array zero-init miscompiled this way) is unsafe. */ + MachineOperand wv1 = a.src1; + wv1.btype = IROP_BTYPE_INT32; + wv1.u.imm.val = (int64_t)(int32_t)combined; + SCRATCH_WRAP(tcc_gen_machine_store_indexed_mop(a.dest, a.src2, a.scale, wv1, cq->op)); + + MachineOperand off2 = a.src2; + off2.u.imm.val = base_off + 4; + MachineOperand wv2 = a.src1; + wv2.btype = IROP_BTYPE_INT32; + wv2.u.imm.val = (int64_t)(int32_t)combined2; + SCRATCH_WRAP(tcc_gen_machine_store_indexed_mop(a.dest, off2, a.scale, wv2, cq->op)); + i = last_i2; + break; + } + + MachineOperand word_val = a.src1; + word_val.btype = IROP_BTYPE_INT32; + word_val.u.imm.val = (int64_t)(int32_t)combined; + SCRATCH_WRAP(tcc_gen_machine_store_indexed_mop(a.dest, a.src2, a.scale, word_val, cq->op)); + i = last_i; + break; + } + } + + SCRATCH_WRAP(tcc_gen_machine_store_indexed_mop(a.dest, a.src2, a.scale, a.src1, cq->op)); break; } case TCCIR_OP_LOAD_POSTINC: { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_ptr = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_postinc_mop(mop_dest, mop_ptr, mop_offset, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 2, .src1 = 1, .scale = 1); + SCRATCH_WRAP(tcc_gen_machine_load_postinc_mop(a.dest, a.src1, a.scale, cq->op)); break; } case TCCIR_OP_STORE_POSTINC: { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_ptr = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, 
&offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_postinc_mop(mop_ptr, mop_value, mop_offset, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .scale = 1); + SCRATCH_WRAP(tcc_gen_machine_store_postinc_mop(a.dest, a.src1, a.scale, cq->op)); break; } case TCCIR_OP_RETURNVALUE: { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_return_value_mop(mop_src, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_return_value_mop(a.src1, cq->op)); } /* fall through to RETURNVOID */ case TCCIR_OP_RETURNVOID: /* Real-run: emit jump to epilogue (backpatched later). - * Dry-run: no-op (we don't track return_jump_addrs). */ - if (!is_dry_run && i != ir->next_instruction_index - 1) + * Dry-run: no-op (we don't track return_jump_addrs). + * Skip the jump if all remaining instructions are NOPs — + * the epilogue immediately follows, so the branch is a no-op. */ { - return_jump_addrs[num_return_jumps++] = ind; - tcc_gen_machine_jump_mop(cq->op, irop_get_imm32(dest_ir), i); + int has_trailing_code = 0; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + has_trailing_code = 1; + break; + } + } + if (!is_dry_run && has_trailing_code) + { + /* Pass -1 as target: return jumps go forward to the epilogue + * (backpatched later), so they must not be narrowed. 
*/ + int ret_branch_size = tcc_gen_machine_jump_mop(cq->op, -1, i); + return_jump_addrs[num_return_jumps++] = ind - ret_branch_size; + } } break; case TCCIR_OP_ASSIGN: { - MachineOperand mop_dest; - if (!ir_codegen_before_ret_peephole(ir, i, &dest_ir, has_incoming_jump, &mop_dest)) - mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_assign_mop(mop_src, mop_dest, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 2, .src1 = 1); + + /* LDRD peephole: two adjacent 32-bit assigns loading from adjacent + * spill slots into registers → single LDRD instruction. */ + if (a.src1.kind == MACH_OP_SPILL && !a.src1.needs_deref && + a.dest.kind == MACH_OP_REG && !a.dest.is_64bit && + (a.src1.btype == IROP_BTYPE_INT32 || a.src1.btype == IROP_BTYPE_FLOAT32) && + (a.src1.u.spill.offset & 3) == 0) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_ASSIGN && + !ir->compact_instructions[next_i].is_jump_target) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, + &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 2, .src1 = 1}); + + if (b.src1.kind == MACH_OP_SPILL && !b.src1.needs_deref && + b.dest.kind == MACH_OP_REG && !b.dest.is_64bit && + (b.src1.btype == IROP_BTYPE_INT32 || b.src1.btype == IROP_BTYPE_FLOAT32) && + (b.src1.u.spill.offset & 3) == 0) + { + int32_t off1 = a.src1.u.spill.offset; + int32_t off2 = b.src1.u.spill.offset; + int reg1 = a.dest.u.reg.r0; + int reg2 = 
b.dest.u.reg.r0; + + if (reg1 != reg2 && off1 + 4 == off2) + { + if (tcc_gen_machine_try_ldrd_spill(reg1, off1, reg2, off2)) + { + i = next_i; + break; + } + } + else if (reg1 != reg2 && off2 + 4 == off1) + { + if (tcc_gen_machine_try_ldrd_spill(reg2, off2, reg1, off1)) + { + i = next_i; + break; + } + } + } + } + } + + /* STRD peephole: two adjacent 32-bit assigns storing registers to + * adjacent spill slots → single STRD instruction. */ + if (a.dest.kind == MACH_OP_SPILL && !a.dest.needs_deref && + a.src1.kind == MACH_OP_REG && !a.src1.is_64bit && + (a.dest.btype == IROP_BTYPE_INT32 || a.dest.btype == IROP_BTYPE_FLOAT32) && + (a.dest.u.spill.offset & 3) == 0) + { + int next_i = -1; + for (int j = i + 1; j < ir->next_instruction_index; j++) + { + if (ir->compact_instructions[j].op != TCCIR_OP_NOP) + { + next_i = j; + break; + } + } + if (next_i >= 0 && ir->compact_instructions[next_i].op == TCCIR_OP_ASSIGN && + !ir->compact_instructions[next_i].is_jump_target) + { + IRQuadCompact *nq = &ir->compact_instructions[next_i]; + IROperand n_src1_ir = tcc_ir_op_get_src1(ir, nq); + IROperand n_src2_ir = tcc_ir_op_get_src2(ir, nq); + IROperand n_dest_ir = tcc_ir_op_get_dest(ir, nq); + MopArgs b = ir_decode_cached(is_dry_run, 0, NULL, next_i, ir, nq, + &n_src1_ir, &n_src2_ir, &n_dest_ir, + (MopSpec){.dest = 2, .src1 = 1}); + + if (b.dest.kind == MACH_OP_SPILL && !b.dest.needs_deref && + b.src1.kind == MACH_OP_REG && !b.src1.is_64bit && + (b.dest.btype == IROP_BTYPE_INT32 || b.dest.btype == IROP_BTYPE_FLOAT32) && + (b.dest.u.spill.offset & 3) == 0) + { + int32_t off1 = a.dest.u.spill.offset; + int32_t off2 = b.dest.u.spill.offset; + int reg1 = a.src1.u.reg.r0; + int reg2 = b.src1.u.reg.r0; + + if (reg1 != reg2 && off1 + 4 == off2) + { + if (tcc_gen_machine_try_strd_spill(reg1, off1, reg2, off2)) + { + i = next_i; + break; + } + } + else if (reg1 != reg2 && off2 + 4 == off1) + { + if (tcc_gen_machine_try_strd_spill(reg2, off2, reg1, off1)) + { + i = next_i; + break; + } + } + 
} + } + } + + SCRATCH_WRAP(tcc_gen_machine_assign_mop(a.src1, a.dest, cq->op)); + break; + } + case TCCIR_OP_ZEXT: + { + /* Zero-extension: lower like ASSIGN — assign_mop already emits the + * u32-src→u64-dest widening (low = src, high = 0). The point of + * ZEXT as a distinct opcode is to be opaque to the IR optimizer's + * value-tracking, which would otherwise sign-extend the source. */ + MopArgs a = DECODE(.dest = 2, .src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_assign_mop(a.src1, a.dest, TCCIR_OP_ASSIGN)); + break; + } + case TCCIR_OP_PACK64: + { + /* Pack two u32s into a u64: dest_lo = src1, dest_hi = src2. Lower + * to two 32-bit assigns to the dest's halves. */ + MopArgs a = DECODE(.dest = 2, .src1 = 1, .src2 = 1); + SCRATCH_WRAP(tcc_gen_machine_pack64_mop(a.src1, a.src2, a.dest)); break; } case TCCIR_OP_LEA: { - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_lea_mop(mop_dest, mop_src); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_lea_mop(a.dest, a.src1)); break; } case TCCIR_OP_FUNCPARAMVAL: case TCCIR_OP_FUNCPARAMVOID: { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - tcc_gen_machine_func_parameter_mop(mop_src1, mop_src2, cq->op); + MopArgs a = DECODE(.src1 = 1, .src2 = 1); + tcc_gen_machine_func_parameter_mop(a.src1, a.src2, cq->op); break; } case TCCIR_OP_JUMP: - tcc_gen_machine_jump_mop(cq->op, irop_get_imm32(dest_ir), i); + { + int branch_size = tcc_gen_machine_jump_mop(cq->op, irop_get_imm32(dest_ir), i); if (!is_dry_run) - ir_to_code_mapping[i] = ind - (tcc_gen_machine_branch_opt_get_encoding(i) == 16 ? 
2 : 4); + ir_to_code_mapping[i] = ind - branch_size; tcc_ir_spill_cache_clear(&ir->spill_cache); break; + } case TCCIR_OP_JUMPIF: - tcc_gen_machine_conditional_jump_mop(src1_ir.u.imm32, cq->op, irop_get_imm32(dest_ir), i); + { + int branch_size; + if (codegen_cbz_reg >= 0) + { + branch_size = tcc_gen_machine_cbz_jump_mop(codegen_cbz_reg, codegen_cbz_nonzero, irop_get_imm32(dest_ir), i); + codegen_cbz_reg = -1; + } + else + { + branch_size = tcc_gen_machine_conditional_jump_mop(src1_ir.u.imm32, cq->op, irop_get_imm32(dest_ir), i); + } if (!is_dry_run) - ir_to_code_mapping[i] = ind - (tcc_gen_machine_branch_opt_get_encoding(i) == 16 ? 2 : 4); + ir_to_code_mapping[i] = ind - branch_size; tcc_ir_spill_cache_clear(&ir->spill_cache); break; + } case TCCIR_OP_IJUMP: { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_indirect_jump_mop(mop_src, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_indirect_jump_mop(a.src1, cq->op)); tcc_ir_spill_cache_clear(&ir->spill_cache); break; } @@ -1592,62 +4001,71 @@ void tcc_ir_codegen_generate(TCCIRState *ir) { int table_id = (int)irop_get_imm64_ex(ir, src2_ir); TCCIRSwitchTable *table = &ir->switch_tables[table_id]; + MopArgs a = DECODE(.src1 = 1); + /* Flush any pending literal pool before the dispatch+table block so it + * cannot be flushed in the middle of the preamble (which would relocate + * the terminal `ADD Rt,PC; BX Rt` past the pool and break the switch- + * table offset backpatch). Done in both passes — with the same byte + * count — so dry-run size estimates and real-run addresses agree. */ + tcc_gen_machine_reserve_pool_bytes(tcc_gen_machine_switch_table_dry_run_size(table->num_entries)); if (is_dry_run) { - /* Compute exact table size so branch offsets are accurate. 
- * Layout: ADD.W(4) + LDR.W(4) + ADD.W(4) + BX(2) = 14 bytes preamble - * + 4 bytes per table entry (32-bit signed PC-relative offsets). */ - int table_data_size = table->num_entries * 4; - ind += 14; - ind += table_data_size; + ind += tcc_gen_machine_switch_table_dry_run_size(table->num_entries); } else { - MachineOperand mop_idx = machine_op_from_ir(ir, &src1_ir); tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_switch_table_mop(mop_idx, table, ir, i); + tcc_gen_machine_switch_table_mop(a.src1, table, ir, i); + } + tcc_ir_spill_cache_clear(&ir->spill_cache); + break; + } + case TCCIR_OP_SWITCH_LOAD: + { + int vt_id = (int)irop_get_imm64_ex(ir, src2_ir); + TCCIRSwitchValueTable *vtab = &ir->switch_value_tables[vt_id]; + MopArgs a = DECODE(.dest = 1, .src1 = 1); + if (is_dry_run) + { + ind += tcc_gen_machine_switch_load_dry_run_size(vtab->num_entries); + } + else + { + tcc_gen_machine_insn_scratch_reset(); + tcc_gen_machine_switch_load_mop(a.src1, a.dest, vtab, ir, i); } tcc_ir_spill_cache_clear(&ir->spill_cache); break; } case TCCIR_OP_SETIF: { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_setif_mop(mop_src, mop_dest, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_setif_mop(a.src1, a.dest, cq->op)); break; } case TCCIR_OP_BOOL_OR: case TCCIR_OP_BOOL_AND: { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_bool_mop(mop_src1, mop_src2, mop_dest, cq->op); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + 
SCRATCH_WRAP(tcc_gen_machine_bool_mop(a.src1, a.src2, a.dest, cq->op)); break; } case TCCIR_OP_VLA_ALLOC: case TCCIR_OP_VLA_SP_SAVE: case TCCIR_OP_VLA_SP_RESTORE: { - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - tcc_gen_machine_vla_mop(mop_dest, mop_src1, mop_src2, cq->op); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + tcc_gen_machine_vla_mop(a.dest, a.src1, a.src2, cq->op); break; } case TCCIR_OP_FUNCCALLVOID: - drop_return_value = 1; - /* fall through */ case TCCIR_OP_FUNCCALLVAL: { - MachineOperand func_mop = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_func_call_mop(func_mop, src2_ir, mop_dest, drop_return_value, ir, i); + int drop_return_value = (cq->op == TCCIR_OP_FUNCCALLVOID); + MopArgs a = DECODE(.dest = 2, .src1 = 1); + tcc_gen_machine_insn_scratch_reset(); + tcc_gen_machine_func_call_mop(a.src1, src2_ir, a.dest, drop_return_value, ir, i); + ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); tcc_ir_spill_cache_clear(&ir->spill_cache); if (ir->has_static_chain) tcc_gen_machine_restore_chain(); @@ -1657,12 +4075,9 @@ void tcc_ir_codegen_generate(TCCIRState *ir) break; case TCCIR_OP_PREFETCH: { - MachineOperand mop_addr = machine_op_from_ir(ir, &src1_ir); + MopArgs a = DECODE(.src1 = 1); /* src2 holds the rw hint: 0 = read (PLD), 1 = write (PLDW) */ - int rw = (int)irop_get_imm64_ex(ir, src2_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_prefetch_mop(mop_addr, rw); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + SCRATCH_WRAP(tcc_gen_machine_prefetch_mop(a.src1, (int)irop_get_imm64_ex(ir, src2_ir))); break; } case TCCIR_OP_TRAP: @@ -1670,54 +4085,38 @@ void tcc_ir_codegen_generate(TCCIRState *ir) break; case TCCIR_OP_SETJMP: { - MachineOperand mop_buf = 
machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_setjmp_mop(mop_buf, mop_dest); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + SCRATCH_WRAP(tcc_gen_machine_setjmp_mop(a.src1, a.src2, a.dest)); break; } case TCCIR_OP_LONGJMP: { - MachineOperand mop_buf = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_longjmp_mop(mop_buf); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_longjmp_mop(a.src1)); break; } case TCCIR_OP_NL_SETJMP: { - MachineOperand mop_buf = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_nl_setjmp_mop(mop_buf, mop_dest); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_nl_setjmp_mop(a.src1, a.dest)); break; } case TCCIR_OP_NL_LONGJMP: { - MachineOperand mop_buf = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_nl_longjmp_mop(mop_buf); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.src1 = 1); + SCRATCH_WRAP(tcc_gen_machine_nl_longjmp_mop(a.src1)); break; } case TCCIR_OP_BUILTIN_APPLY_ARGS: { - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_builtin_apply_args_mop(mop_dest); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1); + SCRATCH_WRAP(tcc_gen_machine_builtin_apply_args_mop(a.dest)); break; } case TCCIR_OP_BUILTIN_APPLY: { - MachineOperand mop_fn = 
machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_args = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_builtin_apply_mop(mop_fn, mop_args, mop_dest); - ir_codegen_track_scratch(is_dry_run, i, cq->op, dry_insn_scratch, dry_insn_saves); + MopArgs a = DECODE(.dest = 1, .src1 = 1, .src2 = 1); + SCRATCH_WRAP(tcc_gen_machine_builtin_apply_mop(a.src1, a.src2, a.dest)); tcc_ir_spill_cache_clear(&ir->spill_cache); break; } @@ -1744,6 +4143,32 @@ void tcc_ir_codegen_generate(TCCIRState *ir) #endif } break; + case TCCIR_OP_BLOCK_COPY: + { + /* dest=stack offset, src1=symbol ref, src2=size immediate. + * No vregs involved - pass raw IROperands to the backend. */ + IROperand bc_dest = tcc_ir_op_get_dest(ir, cq); + IROperand bc_src = tcc_ir_op_get_src1(ir, cq); + int bc_size = (int)irop_get_imm64_ex(ir, src2_ir); + tcc_gen_machine_block_copy_mop(ir, bc_dest, bc_src, bc_size); + break; + } + case TCCIR_OP_SELECT: + { + /* Skip if the preceding CMP's SUBS+IT peephole already emitted the + * full sequence (subs + it ne + movne #1) in this slot. */ + if (i == codegen_skip_select) { + codegen_skip_select = -1; + break; + } + /* Conditional select: dest = (cond) ? src1 : src2 + * Condition code stored in 4th pool entry as IMM32. */ + MopArgs a = DECODE(.dest = 2, .src1 = 1, .src2 = 1); + IROperand cond_op = tcc_ir_op_get_cond(ir, cq); + int cond_code = (int)irop_get_imm64_ex(ir, cond_op); + SCRATCH_WRAP(tcc_gen_machine_select_mop(a.src1, a.src2, a.dest, cond_code)); + break; + } default: if (!is_dry_run) { @@ -1760,6 +4185,18 @@ void tcc_ir_codegen_generate(TCCIRState *ir) break; }; +#undef DECODE +#undef SCRATCH_WRAP + + /* Track condition-flag liveness for the backend: after a CMP the flags + * are live until consumed by a JUMPIF. Any control-flow instruction + * (JUMP, IJUMP, SWITCH_TABLE) also kills the pending flags. 
*/ + if (cq->op == TCCIR_OP_CMP || cq->op == TCCIR_OP_TEST_ZERO) + ir->codegen_flags_live = 1; + else if (cq->op == TCCIR_OP_JUMPIF || cq->op == TCCIR_OP_JUMP || + cq->op == TCCIR_OP_IJUMP || cq->op == TCCIR_OP_SWITCH_TABLE) + ir->codegen_flags_live = 0; + /* Clean up scratch register state at end of each IR instruction. * This restores any pushed scratch registers and resets the global exclude mask. */ tcc_gen_machine_end_instruction(); @@ -1774,6 +4211,15 @@ void tcc_ir_codegen_generate(TCCIRState *ir) /* Analyze branch offsets and select optimal encodings */ tcc_gen_machine_branch_opt_analyze(ir_to_code_mapping, ir->next_instruction_index); + /* Save dry-run mapping for CBZ distance estimation in the real pass. + * The real pass overwrites ir_to_code_mapping as it goes, but CBZ needs + * the dry-run distance (source→target within the same run) to avoid + * literal-pool-timing divergence between runs. */ + if (cbz_dry_mapping) + tcc_free(cbz_dry_mapping); + cbz_dry_mapping = tcc_malloc(ir->ir_to_code_mapping_size * sizeof(uint32_t)); + memcpy(cbz_dry_mapping, ir_to_code_mapping, ir->ir_to_code_mapping_size * sizeof(uint32_t)); + /* Check if LR was pushed during dry run in a leaf function */ if (original_leaffunc && tcc_gen_machine_dry_run_get_lr_push_count() > 0) { @@ -1784,11 +4230,18 @@ void tcc_ir_codegen_generate(TCCIRState *ir) ind = saved_ind; loc = saved_loc; ir->call_outgoing_base = saved_call_outgoing_base; + ir->call_nested_save_base = saved_call_nested_save_base; ir->codegen_instruction_idx = saved_codegen_idx; /* Phase-3 scratch conflict fixup. * For each instruction where the dry run needed to PUSH a register, - * try to move the blocking vreg to a free callee-saved register. */ + * try to move the blocking vreg to a free callee-saved register. + * + * If the specific register from dry_insn_saves can't be freed (e.g. it + * holds a function parameter pinned by the ABI), try freeing any other + * R0-R3 register that is occupied at this instruction. 
Low registers + * use 16-bit Thumb encoding for PUSH/POP and most ALU ops, so freeing + * one avoids the push/pop entirely AND keeps instructions compact. */ { int any_fixup = 0; for (int i = 0; i < ir->next_instruction_index; i++) @@ -1796,6 +4249,7 @@ void tcc_ir_codegen_generate(TCCIRState *ir) uint16_t saves = dry_insn_saves[i]; if (!saves) continue; + int all_fixed = 1; while (saves) { int r = (int)__builtin_ctz(saves); @@ -1803,13 +4257,108 @@ void tcc_ir_codegen_generate(TCCIRState *ir) int new_r = try_reassign_scratch_conflict(ir, r, i); if (new_r >= 0) { - dry_insn_scratch[i] = 0; any_fixup = 1; } + else + { + /* The recorded register couldn't be freed. Try to free any + * other R0-R3 at this instruction — if one is already free or + * can be freed, tcc_ls_find_free_scratch_reg will find it during + * the real run and no push/pop will be needed. */ + int alt_fixed = 0; + if (ir->ls.live_regs_by_instruction && i < ir->ls.live_regs_by_instruction_size) + { + uint32_t live = ir->ls.live_regs_by_instruction[i]; + /* If any R0-R3 is already free, the real run will use it. */ + if ((~live & 0xFu) & ~(1u << r)) + { + alt_fixed = 1; + } + else + { + /* All R0-R3 occupied — try to reassign one to callee-saved. */ + for (int ar = 0; ar <= 3; ar++) + { + if (ar == r) + continue; + if (try_reassign_scratch_conflict(ir, ar, i) >= 0) + { + any_fixup = 1; + alt_fixed = 1; + break; + } + } + } + } + if (!alt_fixed) + all_fixed = 0; + } } + if (all_fixed) + dry_insn_scratch[i] = 0; } if (any_fixup) + { tcc_ls_reset_scratch_cache(&ir->ls); + /* Interval table was mutated: cached MopArgs are stale, discard. */ + tcc_free(mop_cache); + mop_cache = NULL; + } + use_mop_cache = (mop_cache != NULL); + } + + /* Allocate scratch save area if the dry run detected scratch pushes. + * When FP is omitted, scratch PUSH/POP would move SP and break + * SP-relative addressing. Instead, reserve stack slots so that + * get_scratch_reg_with_save() can use STR/LDR to fixed offsets. 
+ * Only allocate when actually needed (detected by dry run). */ + if (!tcc_state->need_frame_pointer) + { + int max_scratch_depth = 0; + /* Check per-instruction saves from dry run */ + for (int i = 0; i < ir->next_instruction_index; i++) + { + if (dry_insn_saves[i]) + { + int depth = __builtin_popcount(dry_insn_saves[i]); + if (depth > max_scratch_depth) + max_scratch_depth = depth; + } + } + /* Also check the global bitmap as safety net for dry/real divergence */ + { + uint32_t global_bitmap = tcc_gen_machine_dry_run_get_scratch_regs_pushed(); + if (global_bitmap) + { + int depth = __builtin_popcount(global_bitmap); + if (depth > max_scratch_depth) + max_scratch_depth = depth; + } + } + if (max_scratch_depth > 0) + { + /* Round up to 8 so the frame's alignment padding (and with it the + * SP-literal addressing of the outgoing/nested areas) is unchanged + * relative to the no-scratch layout. */ + ir->scratch_save_size = (max_scratch_depth * 4 + 7) & ~7; + loc -= ir->scratch_save_size; + /* Keep the outgoing call-arg area at the very bottom of the frame + * and the nested-call save area directly above it (both are + * addressed with literal SP offsets); see the matching re-slot in + * the might_need_scratch reservation above. */ + if (ir->call_outgoing_size > 0 || ir->call_nested_save_size > 0) + { + ir->call_outgoing_base = loc; + ir->call_nested_save_base = loc + ir->call_outgoing_size; + ir->scratch_save_base = loc + ir->call_outgoing_size + ir->call_nested_save_size; + } + else + { + ir->scratch_save_base = loc; + } + /* Recompute stack_size with scratch area included */ + stack_size = (-loc + 7) & ~7; + } } /* Reset scratch state for real pass */ @@ -1817,8 +4366,13 @@ void tcc_ir_codegen_generate(TCCIRState *ir) tcc_ir_spill_cache_clear(&ir->spill_cache); tcc_ir_opt_fp_cache_clear(ir); - /* Emit prologue before real pass */ + /* Emit prologue before real pass. 
+ * The dry-run peephole already patched some allocations, but re-run + * the pre-patch for any cases the peephole missed (e.g. if the + * dispatch didn't trigger for certain ops). */ (void)original_leaffunc; + ir_codegen_pre_patch_funcparam_allocations(ir); + ir_codegen_recompute_dirty_from_allocations(ir); if (!ir->naked) tcc_gen_machine_prolog(ir->leaffunc, ir->ls.dirty_registers, stack_size, extra_prologue_regs); if (!ir->naked) @@ -1826,6 +4380,12 @@ void tcc_ir_codegen_generate(TCCIRState *ir) } } + tcc_free(mop_cache); + if (cbz_dry_mapping) + tcc_free(cbz_dry_mapping); + if (branch_target_reset) + tcc_free(branch_target_reset); + ir_to_code_mapping[ir->next_instruction_index] = ind; orig_ir_to_code_mapping[ir->orig_ir_to_code_mapping_size - 1] = ind; @@ -1844,7 +4404,12 @@ void tcc_ir_codegen_generate(TCCIRState *ir) } if (!ir->naked) - tcc_gen_machine_epilog(ir->leaffunc); + { + if (!ir->noreturn && !ir->tail_call_only) + tcc_gen_machine_epilog(ir->leaffunc); + else + tcc_gen_machine_finish_noreturn(); + } tcc_ir_codegen_backpatch_jumps(ir, ir_to_code_mapping); /* Backpatch return jumps to point to epilogue */ @@ -1857,7 +4422,6 @@ void tcc_ir_codegen_generate(TCCIRState *ir) tcc_free(return_jump_addrs); tcc_free(dry_insn_saves); tcc_free(dry_insn_scratch); - tcc_free(has_incoming_jump); } /* ============================================================================ diff --git a/ir/codegen.c.assign_only b/ir/codegen.c.assign_only deleted file mode 100644 index e64751cb..00000000 --- a/ir/codegen.c.assign_only +++ /dev/null @@ -1,3068 +0,0 @@ -/* - * TCC IR - Code Generation Helpers Implementation - * - * Copyright (c) 2025 Mateusz Stadnik - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation. 
- */ - -#define USING_GLOBALS -#include "ir.h" - -/* Debug tracking variable (defined in arm-thumb-gen.c) */ -extern int g_debug_current_op; - -/* ============================================================================ - * Register Fill (Apply Allocation to Operands) - * ============================================================================ */ - -void tcc_ir_fill_registers(TCCIRState *ir, SValue *sv) -{ - int old_r = sv->r; - int old_v = old_r & VT_VALMASK; - - /* VT_LOCAL/VT_LLOCAL operands can mean either: - * - a concrete stack slot (vr == -1), e.g. VLA save slots, or - * - a logical local tracked as a vreg by the IR (vr != -1). - * - * For concrete stack slots, do not rewrite them into registers here; doing - * so can create uninitialized register reads at runtime. - * - * For locals that do carry a vreg, they must participate in register - * allocation so that defs/uses stay consistent. - */ - if ((old_v == VT_LOCAL || old_v == VT_LLOCAL) && sv->vr == -1) - { - sv->pr0_reg = PREG_REG_NONE; - sv->pr0_spilled = 0; - sv->pr1_reg = PREG_REG_NONE; - sv->pr1_spilled = 0; - return; - } - if (tcc_ir_vreg_is_valid(ir, sv->vr)) - { - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, sv->vr); - - /* Stack-passed parameters: if not allocated to a register, treat them as - * residing in the incoming argument area (VT_PARAM) rather than forcing a - * separate local spill slot. - * - * This is safe under AAPCS: the caller's argument stack area remains valid - * for the duration of the call, and it also provides a correct addressable - * home for '&param' semantics.
- */ - if (TCCIR_DECODE_VREG_TYPE(sv->vr) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 < 0 && - interval->allocation.r0 == PREG_NONE && interval->allocation.offset == 0) - { - sv->pr0_reg = PREG_REG_NONE; - sv->pr0_spilled = 0; - sv->pr1_reg = PREG_REG_NONE; - sv->pr1_spilled = 0; - sv->c.i = interval->original_offset; - - int need_lval = (old_r & VT_LVAL); - if (old_v < VT_CONST && old_v != VT_LOCAL && old_v != VT_LLOCAL && interval->is_lvalue) - need_lval = VT_LVAL; - - sv->r = VT_LOCAL | need_lval | VT_PARAM; - return; - } - - /* Register-passed parameters: if allocated to a register (not spilled), - * clear VT_LVAL. The value is already in the register, no dereference needed. - * VT_LVAL is only used on parameters for address-of operations (&param) or - * when they're on the stack (VT_LOCAL). - */ - int is_register_param = - (TCCIR_DECODE_VREG_TYPE(sv->vr) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 >= 0); - - sv->pr0_reg = interval->allocation.r0 & PREG_REG_NONE; - sv->pr0_spilled = (interval->allocation.r0 & PREG_SPILLED) != 0; - sv->pr1_reg = interval->allocation.r1 & PREG_REG_NONE; - sv->pr1_spilled = (interval->allocation.r1 & PREG_SPILLED) != 0; - sv->c.i = interval->allocation.offset; - - /* Determine if we should preserve VT_LVAL: - * - If old_r was VT_LOCAL|VT_LVAL (local variable on stack), and now - * it's allocated to a register, we should NOT preserve VT_LVAL because - * the value is already in the register, no load needed. - * - If old_r has VT_LVAL but (old_r & VT_VALMASK) < VT_CONST, it means - * the vreg holds a pointer that needs dereferencing - preserve VT_LVAL. - * - Register parameters: do NOT preserve VT_LVAL when allocated to a register. - * VT_LVAL on parameters is only needed for stack params (VT_LOCAL) or for - * address-of operations. - * - If old_r does NOT have VT_LVAL, this is an address-of operation - * (we want the address, not the value). Do NOT add VT_LVAL.
*/ - int preserve_flags = old_r & VT_PARAM; /* Always preserve VT_PARAM */ - if ((old_r & VT_LVAL) && old_v < VT_CONST && old_v != VT_LOCAL && old_v != VT_LLOCAL && !is_register_param) - { - /* The vreg holds a pointer that needs dereferencing. - * Note: VT_LOCAL/VT_LLOCAL use VT_LVAL to mean "load from stack slot". - * When such a local/param is promoted to a register, we must NOT - * preserve VT_LVAL, otherwise we turn a plain value into a pointer - * dereference (double-indirection bugs). - */ - preserve_flags |= VT_LVAL; - } - - if ((interval->allocation.r0 & PREG_SPILLED) || interval->allocation.offset != 0) - { - /* Spilled to stack - treat as local. - * For computed values (old_r was 0 or a register), add VT_LVAL to load the value. - * For address-of expressions (old_r == VT_LOCAL without VT_LVAL), don't add VT_LVAL. - * If original had VT_LVAL (pointer dereference), preserve it. - * - * DOUBLE INDIRECTION CASE: If old_r has VT_LVAL AND the original was NOT - * already a local variable (VT_LOCAL), then the code wants to DEREFERENCE - * the value held in this vreg. If that value is spilled: - * - Spill slot contains a POINTER value (e.g., result of ADD on address) - * - Need to: (1) load pointer from spill, (2) dereference it - * Use VT_LLOCAL to encode this double-indirection requirement. - * - * But if old_v == VT_LOCAL, the VT_LVAL means "load/store from/to this stack slot" - * which is standard local variable access - do NOT use VT_LLOCAL. - * - * ADDRESS-OF CASE: If old_v == VT_LOCAL and old_r does NOT have VT_LVAL, - * this is an address-of operation (&var). We want the ADDRESS of the spill - * slot, not its contents. Do NOT add VT_LVAL in this case. - * - * COMPUTED VALUE CASE: If old_v was a register (computed value that got - * spilled), we ALWAYS need VT_LVAL to load the value from the spill slot. 
*/ - int need_lval; - if (old_v == VT_LOCAL || old_v == VT_LLOCAL) - { - /* Local variable: preserve VT_LVAL to distinguish load vs address-of */ - need_lval = (old_r & VT_LVAL); - } - else - { - /* Computed value (was in register): always need VT_LVAL to load from spill */ - need_lval = VT_LVAL; - } - int base_kind = VT_LOCAL; - if ((old_r & VT_LVAL) && old_v != VT_LOCAL && old_v != VT_LLOCAL) - { - /* The original use wants to dereference the value in this vreg. - * Since the value is spilled, we need double indirection: - * load pointer from spill slot, then dereference it. - * Note: We exclude VT_LOCAL/VT_LLOCAL because their VT_LVAL means - * "access this stack slot" not "dereference pointer in vreg". */ - base_kind = VT_LLOCAL; - } - /* Only preserve VT_PARAM for stack-passed parameters (incoming_reg0 < 0). - * Register-passed parameters that are spilled to local stack should NOT - * have VT_PARAM set, because VT_PARAM causes load_to_dest to add - * offset_to_args (for accessing caller's argument area), but spilled - * register params live in the callee's local stack area (negative FP offset). */ - int spilled_param_flag = 0; - if ((old_r & VT_PARAM) && interval->incoming_reg0 < 0) - { - spilled_param_flag = VT_PARAM; - } - sv->r = base_kind | need_lval | spilled_param_flag; - } - else if (interval->allocation.r0 != PREG_NONE) - { - /* In a register - set r to the register number, preserving VT_LVAL only for pointer derefs */ - sv->r = interval->allocation.r0 | preserve_flags; - } - } - else if ((sv->vr == -1 || sv->vr == 0 || TCCIR_DECODE_VREG_TYPE(sv->vr) == 0) && - (sv->r == -1 || sv->r == PREG_REG_NONE || (old_v >= VT_CONST))) - { - /* No valid vreg and either invalid .r or a constant - preserve important flags. - * This handles global symbol references (VT_CONST | VT_SYM) and plain constants. 
*/ - int flags = sv->r & (VT_LVAL | VT_SYM); - sv->r = VT_CONST | flags; - } - else if (sv->vr == -1 && old_r == 0 && sv->sym) - { - /* Special case: old_r=0 but has a symbol - this is a function symbol reference - * that wasn't marked as VT_CONST. Preserve the symbol. */ - sv->r = VT_CONST | VT_SYM; - } -} - -void tcc_ir_fill_registers_ir(TCCIRState *ir, IROperand *op) -{ - const int old_is_local = op->is_local; - const int old_is_llocal = op->is_llocal; - const int old_is_const = op->is_const; - const int old_is_lval = op->is_lval; - const int old_is_param = op->is_param; - - const int vreg = irop_get_vreg(*op); - - /* VT_LOCAL/VT_LLOCAL operands can mean either: - * - a concrete stack slot (vr == -1), e.g. VLA save slots, or - * - a temp local for type-punning casts (vr <= -2, VR_TEMP_LOCAL), or - * - a logical local tracked as a vreg by the IR (vr > 0). - * - * For concrete stack slots and temp locals, do not rewrite them into - * registers here; doing so can create uninitialized register reads - * at runtime. */ - if ((old_is_local || old_is_llocal) && vreg < 0) - { - op->pr0_reg = PREG_REG_NONE; - op->pr0_spilled = 0; - op->pr1_reg = PREG_REG_NONE; - op->pr1_spilled = 0; - return; - } - - if (tcc_ir_vreg_is_valid(ir, vreg)) - { - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, vreg); - int32_t old_stackoff = 0; - if (op->btype != IROP_BTYPE_STRUCT && irop_get_tag(*op) == IROP_TAG_STACKOFF) - old_stackoff = op->u.imm32; - - /* Stack-passed parameters: if not allocated to a register, treat them as - * residing in the incoming argument area (VT_PARAM) rather than forcing a - * separate local spill slot. 
*/ - if (TCCIR_DECODE_VREG_TYPE(vreg) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 < 0 && - interval->allocation.r0 == PREG_NONE && interval->allocation.offset == 0) - { - op->pr0_reg = PREG_REG_NONE; - op->pr0_spilled = 0; - op->pr1_reg = PREG_REG_NONE; - op->pr1_spilled = 0; - /* For STRUCT types, preserve ctype_idx in the split encoding */ - if (op->btype == IROP_BTYPE_STRUCT) - { - op->u.s.aux_data = interval->original_offset; - } - else - { - op->u.imm32 = interval->original_offset; - } - op->tag = IROP_TAG_STACKOFF; - - int need_lval = old_is_lval; - /* old_v < VT_CONST && old_v != VT_LOCAL && old_v != VT_LLOCAL → reg kind operand */ - if (!old_is_const && !old_is_local && !old_is_llocal && interval->is_lvalue) - need_lval = 1; - - op->is_local = 1; - op->is_llocal = 0; - op->is_const = 0; - op->is_lval = need_lval; - op->is_param = 1; - return; - } - - /* Register-passed parameters: if allocated to a register (not spilled), - * clear VT_LVAL. The value is already in the register, no dereference needed. 
*/ - int is_register_param = - (TCCIR_DECODE_VREG_TYPE(vreg) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 >= 0); - - op->pr0_reg = interval->allocation.r0 & PREG_REG_NONE; - op->pr0_spilled = (interval->allocation.r0 & PREG_SPILLED) != 0; - op->pr1_reg = interval->allocation.r1 & PREG_REG_NONE; - op->pr1_spilled = (interval->allocation.r1 & PREG_SPILLED) != 0; - /* For STRUCT types, preserve ctype_idx in the split encoding */ - if (op->btype == IROP_BTYPE_STRUCT) - { - op->u.s.aux_data = interval->allocation.offset; - } - else - { - if ((old_is_local || old_is_llocal) && !old_is_param && irop_get_tag(*op) == IROP_TAG_STACKOFF) - { - int32_t delta = old_stackoff - interval->original_offset; - op->u.imm32 = interval->allocation.offset + delta; - } - else - { - op->u.imm32 = interval->allocation.offset; - } - } - - /* Determine if we should preserve is_lval: - * - If was local|lval and now in register, do NOT preserve is_lval - * - If was lval with reg-kind operand (pointer deref), preserve is_lval - * - Register parameters: do NOT preserve is_lval when in register */ - int preserve_param = old_is_param; - int preserve_lval = 0; - if (old_is_lval && !old_is_const && !old_is_local && !old_is_llocal && !is_register_param) - { - preserve_lval = 1; - } - - if ((interval->allocation.r0 & PREG_SPILLED) || interval->allocation.offset != 0) - { - /* Spilled to stack */ - int need_lval; - if (old_is_local || old_is_llocal) - { - need_lval = old_is_lval; - } - else - { - /* Computed value (was in register): always need lval to load from spill */ - need_lval = 1; - } - - int use_llocal = 0; - if (old_is_lval && !old_is_local && !old_is_llocal) - { - /* Double indirection: spilled pointer that needs dereferencing */ - use_llocal = 1; - } - - /* Only preserve is_param for stack-passed parameters (incoming_reg0 < 0). - * Register-passed parameters spilled to local stack should NOT have is_param. 
*/ - int spilled_param = 0; - if (old_is_param && interval->incoming_reg0 < 0) - { - spilled_param = 1; - } - - op->is_local = 1; - op->is_llocal = use_llocal; - op->is_const = 0; - op->is_lval = need_lval; - op->is_param = spilled_param; - op->tag = IROP_TAG_STACKOFF; - } - else if (interval->allocation.r0 != PREG_NONE) - { - /* In a register */ - op->is_local = 0; - op->is_llocal = 0; - op->is_const = 0; - op->is_lval = preserve_lval; - op->is_param = preserve_param; - op->tag = IROP_TAG_VREG; - } - } - /* No valid vreg: constants, symbols, etc. - IROperand already has the right encoding - * from the pool. Nothing to do for register allocation. */ -} - -/* ============================================================================ - * Parameter Register Allocation - * ============================================================================ */ - -void tcc_ir_register_allocation_params(TCCIRState *ir) -{ - /* For leaf functions: parameters can stay in registers r0-r3, UNLESS - * the linear scan allocator already spilled them due to register pressure. - * For non-leaf functions: parameters arrive in registers but must be - * stored to stack since r0-r3 are caller-saved. - * In both cases, we need to track which register each parameter arrives in. - */ - int argno = 0; // current register number (r0-r3) - for (int vreg = 0; vreg < ir->next_parameter; ++vreg) - { - const int encoded_vreg = (TCCIR_VREG_TYPE_PARAM << 28) | vreg; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, encoded_vreg); - /* is_double for soft-float (LS_REG_TYPE_DOUBLE_SOFT) or is_llong for 64-bit - */ - int is_64bit = interval && (interval->is_double || interval->is_llong || interval->is_complex); - - /* If the ABI incoming registers were already set (e.g., by the - * parameter handling in tcc_ir_add_function_parameters), respect them - * and only advance argno for subsequent parameters. 
- */ - if (interval && (interval->incoming_reg0 >= 0 || interval->incoming_reg1 >= 0)) - { - argno += is_64bit ? 2 : 1; - continue; - } - - /* AAPCS: 64-bit values must be aligned to even register pairs */ - if (is_64bit && (argno & 1)) - { - argno++; /* skip odd register to align to even */ - } - - if (is_64bit) - { - /* 64-bit value (double or long long) takes r0+r1 or r2+r3 */ - if (argno <= 2) - { - /* Parameter arrives in registers */ - interval->incoming_reg0 = argno; - interval->incoming_reg1 = argno + 1; - /* NOTE: For leaf functions, the linear scanner has already assigned registers. - * Don't overwrite interval->allocation here - it would clobber the correct allocation - * with argno (parameter index), which is NOT the same as the physical register number. - * The prolog will use incoming_reg0/1 to know which registers the parameter arrives in. */ - } - else - { - /* Spilled to caller's stack frame - parameter passed on stack */ - interval->incoming_reg0 = -1; - interval->incoming_reg1 = -1; - /* Record where the parameter arrives on the caller's stack frame. - * Use original_offset if already set by tcc_ir_set_original_offset - * (from the ABI layout), otherwise compute from argno. - * The ABI-derived offset is more accurate for complex cases like - * split structs (REG_STACK) where argno doesn't account for - * stack words that don't have PARAM vregs. - */ - if (interval->original_offset == 0) - interval->original_offset = (argno - 4) * 4; - /* See 64-bit case above: do not overwrite allocator spill slots with - * caller-stack offsets. 
- */ - interval->allocation.r0 = PREG_NONE; - interval->allocation.r1 = PREG_NONE; - interval->allocation.offset = 0; - } - argno += 2; - } - else - { - if (argno <= 3) - { - interval->incoming_reg0 = argno; - interval->incoming_reg1 = -1; - } - else - { - /* Spilled to caller's stack frame - parameter passed on stack */ - interval->incoming_reg0 = -1; - interval->incoming_reg1 = -1; - /* Record where the parameter arrives on the caller's stack frame. - * Use original_offset if already set by tcc_ir_set_original_offset - * (from the ABI layout), otherwise compute from argno. - */ - if (interval->original_offset == 0) - interval->original_offset = (argno - 4) * 4; - /* See 64-bit case above: do not overwrite allocator spill slots with - * caller-stack offsets. - */ - interval->allocation.r0 = PREG_NONE; - interval->allocation.r1 = PREG_NONE; - interval->allocation.offset = 0; - } - argno++; - } - } -} - -void tcc_ir_mark_return_value_incoming_regs(TCCIRState *ir) -{ - if (!ir) - return; - - /* Scan all instructions to find FUNCCALLVAL that produce return values */ - for (int i = 0; i < ir->next_instruction_index; ++i) - { - IRQuadCompact *q = &ir->compact_instructions[i]; - if (q->op != TCCIR_OP_FUNCCALLVAL) - continue; - - /* dest is the vreg that receives the return value */ - const IROperand dest = tcc_ir_op_get_dest(ir, q); - if (dest.vr < 0 || !tcc_ir_vreg_is_valid(ir, dest.vr)) - continue; - - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, dest.vr); - if (!interval) - continue; - - /* Mark that this vreg arrives in r0 (or r0+r1 for 64-bit returns) */ - interval->incoming_reg0 = 0; /* r0 */ - if (interval->is_llong || interval->is_double || interval->is_complex) - interval->incoming_reg1 = 1; /* r1 */ - else - interval->incoming_reg1 = -1; - } -} - -void tcc_ir_avoid_spilling_stack_passed_params(TCCIRState *ir) -{ - if (!ir) - return; - - /* Compute which PARAM vregs are stack-passed under AAPCS. 
- * We intentionally do this before patching IRLiveInterval allocations, - * operating on the linear-scan table so we can also shrink `loc`/frame size. - */ - const int param_count = ir->next_parameter; - if (param_count <= 0) - return; - - uint8_t *is_stack_passed = tcc_mallocz((size_t)param_count); - int argno = 0; - for (int vreg = 0; vreg < param_count; ++vreg) - { - const int encoded_vreg = (TCCIR_VREG_TYPE_PARAM << 28) | vreg; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, encoded_vreg); - if (!interval) - continue; - - const int is_64bit = interval->is_double || interval->is_llong; - if (is_64bit && (argno & 1)) - argno++; /* align 64-bit to even reg pair */ - - const int in_regs = is_64bit ? (argno <= 2) : (argno <= 3); - if (!in_regs) - is_stack_passed[vreg] = 1; - - argno += is_64bit ? 2 : 1; - } - - /* Rewrite linear-scan results: stack-passed params already have an incoming - * memory home (caller arg area), so if the allocator spilled them, drop the - * local spill slot. Also force address-taken stack params to remain in - * memory (we can use the incoming slot as their addressable home). - */ - for (int i = 0; i < ir->ls.next_interval_index; ++i) - { - LSLiveInterval *ls = &ir->ls.intervals[i]; - if (TCCIR_DECODE_VREG_TYPE((int)ls->vreg) != TCCIR_VREG_TYPE_PARAM) - continue; - const int pidx = TCCIR_DECODE_VREG_POSITION((int)ls->vreg); - if (pidx < 0 || pidx >= param_count) - continue; - if (!is_stack_passed[pidx]) - continue; - - /* Stack-passed params live in the caller's argument area. If linear-scan - * assigned them a register (without spilling), the prolog won't load them - * into that register, causing incorrect code. Always reset r0/r1 to force - * them to use the incoming stack location via VT_PARAM path. 
*/ - ls->r0 = PREG_NONE; - ls->r1 = PREG_NONE; - ls->stack_location = 0; - } - - tcc_free(is_stack_passed); -} - -/* ============================================================================ - * Code Generation Helpers - * ============================================================================ */ - -IROperand tcc_ir_codegen_dest_get(TCCIRState *ir, const IRQuadCompact *q) -{ - if (!irop_config[q->op].has_dest) - { - IROperand empty = {0}; - return empty; - } - return ir->iroperand_pool[q->operand_base + 0]; -} - -IROperand tcc_ir_codegen_src1_get(TCCIRState *ir, const IRQuadCompact *q) -{ - int off = irop_config[q->op].has_dest; - if (!irop_config[q->op].has_src1) - { - IROperand empty = {0}; - return empty; - } - return ir->iroperand_pool[q->operand_base + off]; -} - -IROperand tcc_ir_codegen_src2_get(TCCIRState *ir, const IRQuadCompact *q) -{ - int off = irop_config[q->op].has_dest + irop_config[q->op].has_src1; - if (!irop_config[q->op].has_src2) - { - IROperand empty = {0}; - return empty; - } - return ir->iroperand_pool[q->operand_base + off]; -} - -void tcc_ir_codegen_dest_set(TCCIRState *ir, const IRQuadCompact *q, IROperand irop) -{ - if (!irop_config[q->op].has_dest) - return; - ir->iroperand_pool[q->operand_base + 0] = irop; -} - -int tcc_ir_codegen_reg_get(TCCIRState *ir, int vreg) -{ - if (!ir || !tcc_ir_vreg_is_valid(ir, vreg)) - return PREG_NONE; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, vreg); - if (!interval) - return PREG_NONE; - return interval->allocation.r0; -} - -void tcc_ir_codegen_reg_set(TCCIRState *ir, int vreg, int preg) -{ - if (!ir || !tcc_ir_vreg_is_valid(ir, vreg)) - return; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, vreg); - if (interval) - interval->allocation.r0 = preg; -} - -void tcc_ir_codegen_params_setup(TCCIRState *ir) -{ - tcc_ir_register_allocation_params(ir); -} - -void tcc_ir_codegen_cmp_jmp_set(TCCIRState *ir) -{ - if (ir == NULL) - return; - /* Guard against invalid vtop - can 
happen with empty structs */ - extern SValue _vstack[]; - if (vtop < _vstack + 1) /* vstack is defined as (_vstack + 1) */ - return; - int v = vtop->r & VT_VALMASK; - if (v == VT_CMP) - { - SValue src, dest; - int jtrue = vtop->jtrue; - int jfalse = vtop->jfalse; - svalue_init(&src); - svalue_init(&dest); - dest.vr = tcc_ir_get_vreg_temp(ir); - dest.type.t = VT_INT; - dest.pr0_reg = PREG_REG_NONE; - dest.pr0_spilled = 0; - dest.pr1_reg = PREG_REG_NONE; - dest.pr1_spilled = 0; - - if (jtrue >= 0 || jfalse >= 0) - { - /* We have pending jump chains - need to merge them with the comparison */ - SValue jump_dest; - svalue_init(&jump_dest); - jump_dest.vr = -1; - jump_dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - - /* Generate SETIF for the comparison part */ - src.vr = -1; - src.r = VT_CONST; - src.c.i = vtop->cmp_op; - tcc_ir_put(ir, TCCIR_OP_SETIF, &src, NULL, &dest); - - /* Jump to end */ - jump_dest.c.i = -1; /* will be patched */ - int end_jump = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &jump_dest); - - /* Patch jtrue chain to here - set dest = 1 */ - if (jtrue >= 0) - { - tcc_ir_backpatch_to_here(ir, jtrue); - src.r = VT_CONST; - src.c.i = 1; - src.pr0_reg = PREG_REG_NONE; - src.pr0_spilled = 0; - src.pr1_reg = PREG_REG_NONE; - src.pr1_spilled = 0; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src, NULL, &dest); - if (jfalse >= 0) - { - /* Jump over the jfalse handler */ - jump_dest.c.i = -1; /* will be patched */ - int skip_jump = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &jump_dest); - /* Patch jfalse chain to here - set dest = 0 */ - tcc_ir_backpatch_to_here(ir, jfalse); - src.r = VT_CONST; - src.c.i = 0; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src, NULL, &dest); - /* Patch skip_jump to end */ - tcc_ir_set_dest_jump_target(ir, skip_jump, ir->next_instruction_index); - } - } - else if (jfalse >= 0) - { - tcc_ir_backpatch_to_here(ir, jfalse); - src.r = VT_CONST; - src.c.i = 0; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src, NULL, &dest); - } - - /* 
Patch end_jump to here */ - tcc_ir_set_dest_jump_target(ir, end_jump, ir->next_instruction_index); - tcc_ir_codegen_bb_start(ir); - } - else - { - /* Simple case - just SETIF */ - src.vr = -1; - src.r = VT_CONST; - src.c.i = vtop->cmp_op; - tcc_ir_put(ir, TCCIR_OP_SETIF, &src, NULL, &dest); - } - - vtop->vr = dest.vr; - vtop->r = 0; - } - else if ((v & ~1) == VT_JMP) - { - SValue dest, src1; - SValue jump_dest; - int t; - svalue_init(&src1); - svalue_init(&dest); - svalue_init(&jump_dest); - dest.vr = tcc_ir_get_vreg_temp(ir); - dest.type.t = VT_INT; - src1.vr = -1; - src1.r = VT_CONST; - t = v & 1; - src1.c.i = t; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src1, NULL, &dest); - - /* Default path: result already set to `t`. Skip the alternate assignment. - If the jump chain is taken, execution lands at the alternate assignment - which flips the result to `t ^ 1`. */ - jump_dest.vr = -1; - jump_dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - jump_dest.c.i = -1; /* patched to end */ - int end_jump = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &jump_dest); - - tcc_ir_backpatch_to_here(ir, vtop->c.i); - src1.c.i = t ^ 1; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src1, NULL, &dest); - IROperand end_dest = tcc_ir_op_get_dest(ir, &ir->compact_instructions[end_jump]); - end_dest.u.imm32 = ir->next_instruction_index; - tcc_ir_op_set_dest(ir, &ir->compact_instructions[end_jump], end_dest); - vtop->vr = dest.vr; - vtop->r = 0; - } -} - -void tcc_ir_codegen_backpatch(TCCIRState *ir, int jump_idx, int target_address) -{ - tcc_ir_backpatch(ir, jump_idx, target_address); -} - -void tcc_ir_codegen_backpatch_here(TCCIRState *ir, int jump_idx) -{ - tcc_ir_backpatch_to_here(ir, jump_idx); -} - -void tcc_ir_codegen_backpatch_first(TCCIRState *ir, int jump_idx, int target_address) -{ - tcc_ir_backpatch_first(ir, jump_idx, target_address); -} - -int tcc_ir_codegen_jump_append(TCCIRState *ir, int chain, int jump) -{ - return tcc_ir_gjmp_append(ir, chain, jump); -} - -int 
tcc_ir_codegen_test_gen(TCCIRState *ir, int invert, int test) -{ - int v; - v = vtop->r & VT_VALMASK; - if (v == VT_CMP) - { - SValue src, dest; - int jtrue = vtop->jtrue; - int jfalse = vtop->jfalse; - - svalue_init(&src); - svalue_init(&dest); - src.vr = -1; - src.r = VT_CONST; - /* Use cmp_op and invert if needed. In TCC, comparison tokens are designed - * so that XORing with 1 inverts them (e.g., TOK_EQ ^ 1 = TOK_NE) */ - int cond = vtop->cmp_op ^ invert; - /* Validate condition is a valid comparison token */ - src.c.i = cond; - dest.vr = -1; - dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - dest.c.i = test; - test = tcc_ir_put(ir, TCCIR_OP_JUMPIF, &src, NULL, &dest); - - /* Handle pending jump chains - merge with the appropriate chain */ - if (invert) - { - /* inv=1: we want to jump when condition is false */ - /* Merge any existing "jump-on-false" chain with the new jump. - * Patch the opposite chain (jump-on-true) to fall through here. */ - if (jfalse >= 0) - { - tcc_ir_backpatch_first(ir, jfalse, test); - test = jfalse; - } - if (jtrue >= 0) - { - tcc_ir_backpatch_to_here(ir, jtrue); - } - } - else - { - /* inv=0: we want to jump when condition is true */ - /* Merge any existing "jump-on-true" chain with the new jump. - * Patch the opposite chain (jump-on-false) to fall through here. 
*/ - if (jtrue >= 0) - { - tcc_ir_backpatch_first(ir, jtrue, test); - test = jtrue; - } - if (jfalse >= 0) - { - tcc_ir_backpatch_to_here(ir, jfalse); - } - } - } - else if (v == VT_JMP || v == VT_JMPI) - { - if ((v & 1) == invert) - { - if (vtop->c.i == -1) - { - vtop->c.i = test; - } - else - { - if (test != -1) - { - tcc_ir_backpatch_first(ir, vtop->c.i, test); - } - test = vtop->c.i; - } - } - else - { - SValue dest; - svalue_init(&dest); - dest.vr = -1; - dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - dest.c.i = test; - test = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &dest); - tcc_ir_backpatch_to_here(ir, vtop->c.i); - } - } - else - { - if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) - { - if ((vtop->c.i != 0) != invert) - { - SValue dest; - svalue_init(&dest); - dest.vr = -1; - dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - dest.c.i = test; - test = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &dest); - /* Unconditional jump for a compile-time constant condition: - * code after this point is unreachable. Must mirror gjmp_acs() - * which calls CODE_OFF() so that data/code suppression works - * correctly for dead branches (e.g. if(0) { ... }). - * CODE_OFF_BIT = 0x20000000 (defined in tccgen.c). */ - if (!nocode_wanted) - nocode_wanted |= 0x20000000; - } - } - else - { - /* If we're testing a memory lvalue (e.g. tabl[i]), load the value first. - * Otherwise we end up testing the address, which is almost always non-zero - * and can lead to invalid indirect calls. 
- */ - tcc_ir_put(ir, TCCIR_OP_TEST_ZERO, &vtop[0], NULL, NULL); - vtop->r = VT_CMP; - vtop->cmp_op = TOK_NE; - vtop->jtrue = -1; /* -1 = no chain */ - vtop->jfalse = -1; /* -1 = no chain */ - return tcc_ir_codegen_test_gen(ir, invert, test); - } - } - --vtop; - return test; -} - -void tcc_ir_codegen_bb_start(TCCIRState *ir) -{ - if (ir) - ir->basic_block_start = 1; -} - -/* ============================================================================ - * Return Value Handling - * ============================================================================ */ - -void tcc_ir_codegen_drop_return(TCCIRState *ir) -{ - if (ir->next_instruction_index == 0) - { - return; - } - IRQuadCompact *last_instr = &ir->compact_instructions[ir->next_instruction_index - 1]; - - if (last_instr->op == TCCIR_OP_FUNCCALLVAL) - { - /* Only drop return values that are assigned to temporaries. - * If coalescing redirected the dest to a VAR, the value IS used - * and should not be dropped. */ - IROperand dest = tcc_ir_op_get_dest(ir, last_instr); - if (TCCIR_DECODE_VREG_TYPE(dest.vr) == TCCIR_VREG_TYPE_TEMP) - { - if (tcc_ir_vreg_is_valid(ir, dest.vr)) - { - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest.vr); - interval->start = INTERVAL_NOT_STARTED; - interval->end = 0; - } - irop_set_vreg(&dest, -1); - dest.vr = -1; - tcc_ir_op_set_dest(ir, last_instr, dest); - } - } -} - -/* ============================================================================ - * Inline Assembly Code Generation - * ============================================================================ */ - -#ifdef CONFIG_TCC_ASM - -static void tcc_ir_codegen_inline_asm_by_id(TCCIRState *ir, int id) -{ - if (!ir) - return; - if (id < 0 || id >= ir->inline_asm_count) - tcc_error("IR: invalid inline asm id"); - - TCCIRInlineAsm *ia = &ir->inline_asms[id]; - if (!ia->asm_str) - tcc_error("IR: inline asm payload missing"); - - const int nb_operands = ia->nb_operands; - const int nb_labels = ia->nb_labels; - if 
(nb_operands < 0 || nb_operands > MAX_ASM_OPERANDS || nb_operands + nb_labels > MAX_ASM_OPERANDS) - tcc_error("IR: invalid asm operand count"); - - ASMOperand ops[MAX_ASM_OPERANDS]; - SValue vals[MAX_ASM_OPERANDS]; - memset(ops, 0, sizeof(ops)); - memset(vals, 0, sizeof(vals)); - - memcpy(ops, ia->operands, sizeof(ASMOperand) * (nb_operands + nb_labels)); - for (int i = 0; i < nb_operands; ++i) - { - vals[i] = ia->values[i]; - tcc_ir_fill_registers(ir, &vals[i]); - ops[i].vt = &vals[i]; - } - for (int i = nb_operands; i < nb_operands + nb_labels; ++i) - ops[i].vt = NULL; - - uint8_t clobber_regs[NB_ASM_REGS]; - memcpy(clobber_regs, ia->clobber_regs, sizeof(clobber_regs)); - - /* Compute reserved_regs: physical registers of vregs that are live at this - * INLINE_ASM instruction but are NOT asm operands. The constraint solver - * must avoid these registers when picking registers for "r" constraints, - * otherwise the operand load will clobber the live value. - * - * Unlike clobber_regs, reserved_regs only affect constraint allocation — - * they do NOT trigger save/restore in asm_gen_code prolog/epilog. 
*/ - uint8_t reserved_regs[NB_ASM_REGS]; - memset(reserved_regs, 0, sizeof(reserved_regs)); - { - int asm_instr_idx = ir->codegen_instruction_idx; - struct - { - IRLiveInterval *intervals; - int count; - } groups[3] = { - {ir->variables_live_intervals, ir->variables_live_intervals_size}, - {ir->temporary_variables_live_intervals, ir->temporary_variables_live_intervals_size}, - {ir->parameters_live_intervals, ir->parameters_live_intervals_size}, - }; - - for (int g = 0; g < 3; g++) - { - for (int j = 0; j < groups[g].count; j++) - { - IRLiveInterval *interval = &groups[g].intervals[j]; - if (interval->start == INTERVAL_NOT_STARTED) - continue; - if ((int)interval->start > asm_instr_idx || (int)interval->end < asm_instr_idx) - continue; - - int r0 = interval->allocation.r0; - if (r0 & PREG_SPILLED) - continue; - int phys_reg = r0 & PREG_REG_NONE; - if (phys_reg == PREG_REG_NONE) - continue; - if (phys_reg < NB_ASM_REGS) - reserved_regs[phys_reg] = 1; - - int r1 = interval->allocation.r1; - if (!(r1 & PREG_SPILLED)) - { - int phys_reg1 = r1 & PREG_REG_NONE; - if (phys_reg1 != PREG_REG_NONE && phys_reg1 < NB_ASM_REGS) - reserved_regs[phys_reg1] = 1; - } - } - } - } - - tcc_asm_emit_inline(ops, nb_operands, ia->nb_outputs, nb_labels, clobber_regs, reserved_regs, ia->asm_str, - ia->asm_len, ia->must_subst); -} - -static void tcc_ir_codegen_inline_asm_ir(TCCIRState *ir, IROperand dest_irop) -{ - if (!ir) - return; - const int id = (int)irop_get_imm64_ex(ir, dest_irop); - tcc_ir_codegen_inline_asm_by_id(ir, id); -} -#endif - -/* ============================================================================ - * Jump Backpatching - * ============================================================================ */ - -static void tcc_ir_codegen_backpatch_jumps(TCCIRState *ir, uint32_t *ir_to_code_mapping) -{ - IRQuadCompact *q; - for (int i = 0; i < ir->next_instruction_index; i++) - { - q = &ir->compact_instructions[i]; - if (q->op == TCCIR_OP_JUMP || q->op == TCCIR_OP_JUMPIF) - 
{ - IROperand dest = tcc_ir_op_get_dest(ir, q); - int target_ir = irop_is_none(dest) ? -1 : (int)dest.u.imm32; - /* Skip unpatched jumps (target is -1 or truly out of range) - * Note: target_ir == ir->next_instruction_index is valid (epilogue) */ - if (target_ir < 0 || target_ir > ir->next_instruction_index) - continue; - const int instruction_address = ir_to_code_mapping[i]; - const int target_address = ir_to_code_mapping[target_ir]; - tcc_gen_machine_backpatch_jump(instruction_address, target_address); - } - } - - /* Backpatch switch table entries. - * Table entries are 32-bit signed PC-relative offsets with Thumb bit. - * The reference point is table_start, which is the PC value when - * the 16-bit ADD Rt, PC instruction at ind+10 reads PC (= ind+10+4 = ind+14 = table_start). - * Formula: table[i] = (target_addr | 1) - table_start - * This must happen after all code is generated so forward targets are mapped. */ - for (int t = 0; t < ir->num_switch_tables; t++) - { - TCCIRSwitchTable *table = &ir->switch_tables[t]; - int table_start = table->table_code_addr; - if (table_start <= 0) - continue; /* Table not emitted (e.g. 
dead code) */ - int ref_point = table_start; /* PC value at the 16-bit ADD Rt, PC instruction (at ind+10, PC=ind+14=table_start) */ - for (int j = 0; j < table->num_entries; j++) - { - int target_ir = table->targets[j]; - int entry_addr = table_start + j * 4; /* 4 bytes per entry */ - int target_addr; - if (target_ir >= 0 && target_ir < (int)ir->ir_to_code_mapping_size) - target_addr = ir_to_code_mapping[target_ir]; - else - target_addr = ir_to_code_mapping[ir->next_instruction_index]; /* epilogue */ - int32_t offset = (int32_t)((target_addr | 1) - ref_point); - write32le(cur_text_section->data + entry_addr, (uint32_t)offset); - } - } -} - -/* ============================================================================ - * Phase-3 scratch conflict fixup - * ============================================================================ - * - * After the dry run has identified which instructions would push a register - * to the stack (no free scratch register available), this function tries to - * move the vreg currently occupying that register to a free callee-saved - * register. This eliminates the push/pop overhead for those instructions. - * - * Parameters: - * ir - current function IR state - * r - physical register that would be pushed at instruction insn_i - * insn_i - the instruction index where the push was noted - * - * Returns the new physical register on success, -1 if no reassignment could - * be made (e.g. all callee-saved registers are already occupied over the - * vreg's live range, or the interval is complex / 64-bit / float). - */ -static int try_reassign_scratch_conflict(TCCIRState *ir, int r, int insn_i) -{ - LSLiveIntervalState *ls = &ir->ls; - - /* Callee-saved registers R4-R11 (bits 4..11 = 0x0FF0), minus reserved - * special-purpose registers: - * R7 = R_FP (= 7): always reserved as frame pointer by the ARM backend. - * arm-thumb-gen.c: "Always reserve R7 (FP) and never allocate it as a - * general register." 
The linear-scan allocator never assigns vregs to R7, - * so it never appears in live_regs_by_instruction. We must exclude it - * here as well, otherwise we would clobber the frame pointer. - * R10 = static_chain_reg (= 10): reserved when function uses a static chain. - */ - const uint32_t ALL_CALLEE_SAVED = 0x0FF0u; - const uint32_t ARM_FP_REG = 7u; /* R_FP = R7, defined in arm-thumb-opcodes.h */ - uint32_t reserved = (1u << ARM_FP_REG); /* always exclude frame pointer */ - if (ir->has_static_chain) - reserved |= (1u << (uint32_t)architecture_config.static_chain_reg); - const uint32_t CALLEE_SAVED = ALL_CALLEE_SAVED & ~reserved; - - /* Find the LSLiveInterval holding r at instruction insn_i. */ - LSLiveInterval *ls_iv = NULL; - for (int k = 0; k < ls->next_interval_index; k++) - { - LSLiveInterval *iv = &ls->intervals[k]; - /* Only handle plain integer register allocations. */ - if (iv->reg_type != LS_REG_TYPE_INT) - continue; - if (iv->addrtaken || iv->stack_location != 0) - continue; - /* Skip 64-bit pairs — they need two adjacent registers. */ - if (iv->r1 >= 0 && iv->r1 < 16) - continue; - if (iv->r0 != r) - continue; - if ((int)iv->start > insn_i || (int)iv->end < insn_i) - continue; - ls_iv = iv; - break; - } - if (!ls_iv) - return -1; - - /* Get the IRLiveInterval for the same vreg to check for float/double/llong. */ - IRLiveInterval *ir_iv = tcc_ir_get_live_interval(ir, (int)ls_iv->vreg); - if (!ir_iv) - return -1; - /* Skip floating-point and 64-bit intervals. */ - if (ir_iv->is_float || ir_iv->is_double || ir_iv->is_llong || ir_iv->is_complex || ir_iv->use_vfp) - return -1; - /* Skip ABI-pinned intervals: function parameters and call return values have - * incoming_reg0 >= 0, meaning the hardware places the value in a specific - * register dictated by the calling convention. Changing the allocation would - * cause the codegen to look in the wrong register after a call/entry. 
*/ - if (ir_iv->incoming_reg0 >= 0) - return -1; - - /* Compute the union of live register masks across [ls_iv->start .. ls_iv->end]. - * Any register set in this union is occupied by some other live vreg and - * cannot be used as the reassignment target. */ - uint32_t blocked = 0; - if (ls->live_regs_by_instruction) - { - for (int j = (int)ls_iv->start; j <= (int)ls_iv->end && j < ls->live_regs_by_instruction_size; j++) - blocked |= ls->live_regs_by_instruction[j]; - } - blocked |= (1u << r); /* keep r itself blocked so we don't choose it */ - - uint32_t avail = CALLEE_SAVED & ~blocked; - if (!avail) - return -1; - - int new_r = (int)__builtin_ctz(avail); /* lowest-numbered free callee-saved */ - - /* --- Apply the reassignment --- */ - - /* 1. Update the IRLiveInterval (read by tcc_ir_fill_registers_ir). */ - ir_iv->allocation.r0 = (uint16_t)new_r; - - /* 2. Update the LSLiveInterval (read by tcc_ls_build_live_regs_by_instruction - * and tcc_ls_find_free_scratch_reg). */ - ls_iv->r0 = (int16_t)new_r; - - /* 3. Patch live_regs_by_instruction for the interval's full range. */ - if (ls->live_regs_by_instruction) - { - for (int j = (int)ls_iv->start; j <= (int)ls_iv->end && j < ls->live_regs_by_instruction_size; j++) - { - ls->live_regs_by_instruction[j] &= ~(1u << r); - ls->live_regs_by_instruction[j] |= (1u << new_r); - } - } - - /* 4. Mark new_r as dirty so the prologue will save/restore it. */ - ls->dirty_registers |= (1ull << new_r); - - return new_r; -} - -/* ============================================================================ - * Helper: fill a single operand from register allocation results. - * Only called at old-path dispatch sites (MOP path fills via machine_op_from_ir). 
- * ============================================================================ */ -static void ir_fill_op(TCCIRState *ir, IROperand *op) -{ - if (irop_get_tag(*op) != IROP_TAG_NONE) - tcc_ir_fill_registers_ir(ir, op); -} - -/* ============================================================================ - * Main Code Generation Loop - * ============================================================================ */ - -void tcc_ir_codegen_generate(TCCIRState *ir) -{ - IRQuadCompact *cq; - int drop_return_value = 0; - -#ifdef TCC_REGALLOC_DEBUG - int _dbg_trace_all = 0; - { - extern const char *funcname; - fprintf(stderr, "[RA-FUNC] %s (insts=%d)\n", funcname ? funcname : "?", ir->next_instruction_index); - /* Enable full instruction trace for the target function */ - if (funcname && ir->next_instruction_index == 295) - { - const char *_target = "tcc_gen_machine_func_call_op"; - const char *_fn = funcname; - int _match = 1; - while (*_target && *_fn) - { - if (*_target++ != *_fn++) - { - _match = 0; - break; - } - } - if (_match && *_target == 0 && *_fn == 0) - _dbg_trace_all = 1; - } - } -#endif - -#ifdef TCC_REGALLOC_DEBUG - /* Print vreg statistics for size optimization analysis */ - { - int local_count = ir->next_local_variable; - int temp_count = ir->next_temporary_variable; - int param_count = ir->next_parameter; - int total_vregs = local_count + temp_count + param_count; - if (total_vregs > 1000) /* Only print for large functions */ - fprintf(stderr, "[VREG STATS] locals=%d temps=%d params=%d total=%d (max_encoded=%d)\n", local_count, temp_count, - param_count, total_vregs, - (local_count > temp_count ? local_count : temp_count) > param_count - ? (local_count > temp_count ? local_count : temp_count) - : param_count); - } -#endif - - /* `&&label` stores label positions as IR indices BEFORE DCE/compaction. - * Build a mapping for original indices, not just the compacted array indices. 
- */ - int max_orig_index = -1; - for (int i = 0; i < ir->next_instruction_index; i++) - { - if (ir->compact_instructions[i].orig_index > max_orig_index) - max_orig_index = ir->compact_instructions[i].orig_index; - } - if (max_orig_index < 0) - max_orig_index = 0; - - /* +1 to include epilogue when needed. - * Keep this mapping available after codegen (e.g. for &&label). */ - if (ir->ir_to_code_mapping) - { - tcc_free(ir->ir_to_code_mapping); - ir->ir_to_code_mapping = NULL; - ir->ir_to_code_mapping_size = 0; - } - ir->ir_to_code_mapping_size = ir->next_instruction_index + 1; - ir->ir_to_code_mapping = tcc_mallocz(sizeof(uint32_t) * ir->ir_to_code_mapping_size); - uint32_t *ir_to_code_mapping = ir->ir_to_code_mapping; - - if (ir->orig_ir_to_code_mapping) - { - tcc_free(ir->orig_ir_to_code_mapping); - ir->orig_ir_to_code_mapping = NULL; - ir->orig_ir_to_code_mapping_size = 0; - } - /* +1 extra slot for a synthetic epilogue mapping. - * Use 0xFFFFFFFF sentinel to distinguish "unmapped" from offset 0. */ - ir->orig_ir_to_code_mapping_size = max_orig_index + 2; - ir->orig_ir_to_code_mapping = tcc_malloc(sizeof(uint32_t) * ir->orig_ir_to_code_mapping_size); - uint32_t *orig_ir_to_code_mapping = ir->orig_ir_to_code_mapping; - memset(orig_ir_to_code_mapping, 0xFF, sizeof(uint32_t) * ir->orig_ir_to_code_mapping_size); - /* Track addresses of return jumps for later backpatching to epilogue */ - int *return_jump_addrs = tcc_malloc(sizeof(int) * ir->next_instruction_index); - int num_return_jumps = 0; - - /* Clear spill cache at function start */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - - /* Some peephole optimizations (LOAD/ASSIGN -> RETURNVALUE in R0, and skipping - * RETURNVALUE moves) are only valid when RETURNVALUE is reached by straight-line - * fallthrough from the immediately preceding instruction. 
- * - * If RETURNVALUE is a jump target (a control-flow merge), those peepholes can - * become incorrect: the preceding instruction might not execute on all paths, - * leaving the return value in a non-return register. - * - * Track which IR instruction indices are jump targets to guard these peepholes. - */ - uint8_t *has_incoming_jump = tcc_mallocz(ir->next_instruction_index ? ir->next_instruction_index : 1); - for (int i = 0; i < ir->next_instruction_index; ++i) - { - IRQuadCompact *p = &ir->compact_instructions[i]; - if (p->op == TCCIR_OP_JUMP || p->op == TCCIR_OP_JUMPIF) - { - /* Read jump target from IROperand pool */ - IROperand dest_irop = tcc_ir_op_get_dest(ir, p); - int target = (int)dest_irop.u.imm32; - if (target >= 0 && target < ir->next_instruction_index) - has_incoming_jump[target] = 1; - } - } - - /* Reserve outgoing call stack args area at the very bottom of the frame. - * This ensures prepared-call stack args are at call-time SP. - */ - if (ir->call_outgoing_size > 0) - { - loc -= ir->call_outgoing_size; - ir->call_outgoing_base = loc; - } - - int stack_size = (-loc + 7) & ~7; // align to 8 bytes - - /* ============================================================================ - * DRY RUN PASS: Analyze scratch register needs before emitting prologue - * ============================================================================ - * This discovers what scratch registers will be needed during code generation, - * allowing us to include them in the prologue (avoiding push/pop in loops). - */ - int original_leaffunc = ir->leaffunc; - uint32_t extra_prologue_regs = 0; - - /* If this function has a static chain (nested function), reserve R10 - * as callee-saved so the parent's static chain is preserved. - * R10 is the static chain register per architecture_config.static_chain_reg. */ - if (ir->has_static_chain) - { - extra_prologue_regs |= (1 << architecture_config.static_chain_reg); - } - - /* Phase-3 per-instruction scratch constraint recording. 
- * Allocated once per function; indexed by instruction index. - * dry_insn_scratch[i] = number of mach_alloc_scratch() calls at instruction i. - * dry_insn_saves[i] = bitmask of registers that would be PUSH'd at instruction i. - * Both arrays are declared before #if so they are visible in both passes. */ - int *dry_insn_scratch = tcc_mallocz(ir->next_instruction_index * sizeof(int)); - uint16_t *dry_insn_saves = tcc_mallocz(ir->next_instruction_index * sizeof(uint16_t)); - -#if 1 /* DRY_RUN_ENABLED */ - - /* Initialize dry-run state and branch optimization */ - tcc_gen_machine_dry_run_init(); - tcc_gen_machine_branch_opt_init(); - tcc_gen_machine_dry_run_start(); - - /* Reset scratch state for clean dry-run */ - tcc_gen_machine_reset_scratch_state(); - tcc_ir_spill_cache_clear(&ir->spill_cache); - - /* Save state that will be modified during dry run */ - int saved_ind = ind; - int saved_codegen_idx = ir->codegen_instruction_idx; - int saved_loc = loc; - int saved_call_outgoing_base = ir->call_outgoing_base; - - /* Run through all instructions without emitting. - * We call the actual codegen functions, but ot() is a no-op during dry-run. - * This ensures we exercise the exact same code paths for scratch allocation. 
*/ - for (int i = 0; i < ir->next_instruction_index; i++) - { - ir->codegen_instruction_idx = i; - cq = &ir->compact_instructions[i]; - - /* Debug tracking: update current op for ot_check failure reporting */ - g_debug_current_op = (int)cq->op; - - /* Record address mapping for branch optimizer analysis */ - ir_to_code_mapping[i] = ind; - - /* Skip marker ops */ - if (cq->op == TCCIR_OP_ASM_INPUT || cq->op == TCCIR_OP_ASM_OUTPUT || cq->op == TCCIR_OP_NOP || - cq->op == TCCIR_OP_INLINE_ASM) - continue; - - /* Get operand copies from iroperand_pool */ - IROperand src1_ir = tcc_ir_op_get_src1(ir, cq); - IROperand src2_ir = tcc_ir_op_get_src2(ir, cq); - IROperand dest_ir = tcc_ir_op_get_dest(ir, cq); - - /* Operands are filled lazily: machine_op_from_ir fills via ir_fill_op for - * MOP-path operands; old-path dispatch sites call ir_fill_op explicitly. */ - - /* Mop path: use MachineOperand-based dispatch for simple 32-bit ops; - * the mach_* helpers in arm-thumb-gen.c handle all materialization. 
*/ - bool use_mop_dp = false; - bool use_mop_assign = false; - bool use_mop_setif = false; - bool use_mop_bool = false; - bool use_mop_load = false; - bool use_mop_store = false; - bool use_mop_load_indexed = false; - bool use_mop_store_indexed = false; - bool use_mop_load_postinc = false; - bool use_mop_store_postinc = false; - bool use_mop_ijump = false; - bool use_mop_funcparam = false; - bool use_mop_returnvalue = false; - bool use_mop_muldiv = false; - bool use_mop_fp = false; - bool use_mop_vla = false; - bool use_mop_func_call = false; - switch (cq->op) - { - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_AND: - case TCCIR_OP_OR: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_dp = true; - break; - case TCCIR_OP_ASSIGN: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_assign = true; - break; - case TCCIR_OP_SETIF: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_setif = true; - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_bool = true; - break; - case TCCIR_OP_LOAD: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_load = true; - break; - case TCCIR_OP_STORE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store = true; - break; - case TCCIR_OP_LOAD_INDEXED: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_indexed = true; - break; - case TCCIR_OP_STORE_INDEXED: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_indexed = true; - break; - case TCCIR_OP_LOAD_POSTINC: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_postinc = true; - break; - 
case TCCIR_OP_STORE_POSTINC: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_postinc = true; - break; - case TCCIR_OP_IJUMP: - if (!ir->has_static_chain) - use_mop_ijump = true; - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - use_mop_funcparam = true; - break; - case TCCIR_OP_RETURNVALUE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_returnvalue = true; - break; - case TCCIR_OP_MUL: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_TEST_ZERO: - if (!irop_needs_pair(src1_ir) && !irop_is_64bit(src1_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: - case TCCIR_OP_CVT_FTOI: - if (!src1_ir.is_complex && !dest_ir.is_complex && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && - !irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_fp = true; - break; - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (!ir->has_static_chain) - use_mop_vla = true; - break; - case TCCIR_OP_FUNCCALLVAL: - case TCCIR_OP_FUNCCALLVOID: - if (!irop_needs_pair(dest_ir) && !dest_ir.is_complex && !ir->has_static_chain) - use_mop_func_call = true; - break; - default: - break; - } - - /* Call the actual codegen function - ot() will be a no-op in dry-run mode, - * but scratch allocation inside these functions will still be recorded */ - switch (cq->op) - { - case TCCIR_OP_LOAD: - { - bool load_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - load_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load && !load_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - - /* Sub-component access on register pairs (e.g., __imag__ on _Complex float). - * When a STACKOFF operand with a component offset gets rewritten to VREG by - * fill_registers_ir, the byte-offset delta is preserved in u.imm32: - * u.imm32 == 0 → first element (pr0_reg, e.g. real part) - * u.imm32 > 0 → second element (pr1_reg, e.g. imaginary part) - * This ONLY applies to LOAD sources — DP/ASSIGN operands must not be - * rewritten because a 64-bit interval allocated as a register pair - * can also have pr1_reg set with a non-zero u.imm32 (delta from - * fill_registers_ir), which is not a sub-component access. 
*/ - if (mop_src.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src.u.reg.r1 = -1; - mop_src.needs_deref = false; - } - - if (mop_dest.kind == MACH_OP_REG && !mop_dest.needs_deref && mop_dest.u.reg.r0 != (int)PREG_REG_NONE) - { - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_mop(mop_src, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - break; - } - case TCCIR_OP_STORE: - { - if (use_mop_store) - { - MachineOperand mop_dest_s = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src_s = machine_op_from_ir(ir, &src1_ir); - /* Sub-component fixup for STORE value — same logic as LOAD source. */ - if (mop_src_s.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src_s.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src_s.u.reg.r1 = -1; - mop_src_s.needs_deref = false; - } - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_mop(mop_dest_s, mop_src_s, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_store_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_LOAD_INDEXED: - { - bool load_indexed_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - load_indexed_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load_indexed && !load_indexed_before_ret) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_base = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_indexed_mop(mop_dest, mop_base, mop_index, mop_scale, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand base_op = src1_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &index_op); - tcc_gen_machine_load_indexed_op(dest_ir, base_op, index_op, scale_op); - } - break; - } - case TCCIR_OP_STORE_INDEXED: - { - if (use_mop_store_indexed) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_base = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_indexed_mop(mop_base, mop_index, mop_scale, mop_value, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand base_op = dest_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &index_op); - 
ir_fill_op(ir, &src1_ir); - tcc_gen_machine_store_indexed_op(base_op, index_op, scale_op, src1_ir); - } - break; - } - case TCCIR_OP_LOAD_POSTINC: - { - if (use_mop_load_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_ptr = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_postinc_mop(mop_dest, mop_ptr, mop_offset, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand ptr_op = src1_ir; - IROperand offset_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &ptr_op); - tcc_gen_machine_load_postinc_op(dest_ir, ptr_op, offset_op); - } - break; - } - case TCCIR_OP_STORE_POSTINC: - { - if (use_mop_store_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_ptr = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_postinc_mop(mop_ptr, mop_value, mop_offset, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand ptr_op = dest_ir; - IROperand value_op = src1_ir; - IROperand offset_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &ptr_op); - ir_fill_op(ir, &value_op); - tcc_gen_machine_store_postinc_op(ptr_op, value_op, offset_op); - } - break; - } - case TCCIR_OP_LEA: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_lea_op(dest_ir, src1_ir, cq->op); - break; - case TCCIR_OP_ASSIGN: - { - /* Skip MOP path when next instruction is RETURNVALUE targeting same vreg, - * because the real-run 
applies a peephole (dest→R0) that doesn't exist in - * the dry-run — the resulting dry/real scratch mismatch would corrupt the - * Phase-3 fixup. The has_incoming_jump guard mirrors the real-run peephole - * condition so both passes make the same MOP/legacy decision. */ - bool assign_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - assign_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_assign && !assign_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_assign_mop(mop_src, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - TCC_MACH_DBG( - "[DBG-ASSIGN] i=%d dest btype=%d pr0=%d pr1=%d is64=%d needs_pair=%d src btype=%d pr0=%d pr1=%d is64=%d\n", - i, irop_get_btype(dest_ir), dest_ir.pr0_reg, dest_ir.pr1_reg, irop_is_64bit(dest_ir), - irop_needs_pair(dest_ir), irop_get_btype(src1_ir), src1_ir.pr0_reg, src1_ir.pr1_reg, - irop_is_64bit(src1_ir)); - tcc_gen_machine_assign_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_RETURNVALUE: - if (use_mop_returnvalue) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_return_value_mop(mop_src, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_return_value_op(src1_ir, cq->op); - } - break; - case TCCIR_OP_RETURNVOID: - /* No scratch allocation 
needed */ - break; - case TCCIR_OP_JUMP: - /* Record branch for optimization analysis (ot() is no-op during dry-run) */ - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_jump_op(cq->op, dest_ir, i); - break; - case TCCIR_OP_JUMPIF: - /* Record branch for optimization analysis (ot() is no-op during dry-run) */ - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_conditional_jump_op(src1_ir, cq->op, dest_ir, i); - break; - case TCCIR_OP_MUL: - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - case TCCIR_OP_TEST_ZERO: - if (use_mop_muldiv) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_muldiv_mop(mop_src1, mop_src2, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_MLA: - case TCCIR_OP_UMULL: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - break; - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_OR: - case TCCIR_OP_AND: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (use_mop_dp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_data_processing_mop(mop_src1, mop_src2, mop_dest, cq->op); - 
dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_IJUMP: - if (use_mop_ijump) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_indirect_jump_mop(mop_src, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_indirect_jump_op(src1_ir); - } - break; - case TCCIR_OP_SWITCH_TABLE: - { - /* Dry-run: compute exact table size so branch offsets are accurate. - * Layout: ADD.W(4) + LDR.W(4) + ADD.W(4) + BX(2) = 14 bytes preamble - * + 4 bytes per table entry (32-bit signed PC-relative offsets). */ - int table_id = (int)irop_get_imm64_ex(ir, src2_ir); - TCCIRSwitchTable *table = &ir->switch_tables[table_id]; - int table_data_size = table->num_entries * 4; /* 4 bytes per entry */ - ind += 14; /* preamble instructions */ - ind += table_data_size; /* Jump table entries */ - break; - } - case TCCIR_OP_SETIF: - if (use_mop_setif) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_setif_mop(mop_src, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_setif_op(dest_ir, src1_ir, cq->op); - } - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (use_mop_bool) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, 
&src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_bool_mop(mop_src1, mop_src2, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_bool_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_FUNCCALLVOID: - case TCCIR_OP_FUNCCALLVAL: - if (use_mop_func_call) - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_func_call_mop(src1_ir, src2_ir, mop_dest, 0, ir, i); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_func_call_op(src1_ir, src2_ir, dest_ir, 0, ir, i); - } - if (ir->has_static_chain) - tcc_gen_machine_restore_chain(); - break; - case TCCIR_OP_SET_CHAIN: - /* Static chain setup: move FP to static chain register */ - tcc_gen_machine_set_chain(); - break; - case TCCIR_OP_INIT_CHAIN_SLOT: - /* Store parent FP into chain slot for nested function trampoline */ - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_init_chain_slot(src1_ir); - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - if (use_mop_funcparam) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - /* No scratch tracking: FUNCPARAM does not allocate scratch registers */ - tcc_gen_machine_func_parameter_mop(mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_func_parameter_op(src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: 
- case TCCIR_OP_CVT_FTOI: - if (use_mop_fp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_fp_mop(mop_src1, mop_src2, mop_dest, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_fp_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (use_mop_vla) - { - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - tcc_gen_machine_vla_mop(mop_dest, mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_vla_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_TRAP: - tcc_gen_machine_trap_op(); - break; - default: - /* Unknown op - skip */ - break; - } - - /* Clean up scratch register state */ - tcc_gen_machine_end_instruction(); - } - - /* End dry-run and analyze results */ - tcc_gen_machine_dry_run_end(); - - /* Analyze branch offsets and select optimal encodings */ - tcc_gen_machine_branch_opt_analyze(ir_to_code_mapping, ir->next_instruction_index); - - /* Check if LR was pushed during dry run in a leaf function */ - if (original_leaffunc && tcc_gen_machine_dry_run_get_lr_push_count() > 0) - { - /* LR was pushed in loop - save at prologue instead */ - extra_prologue_regs |= (1 << 14); /* R_LR */ - /* NOTE: We don't modify ir->leaffunc here because optimizations may depend on it. - * The extra_prologue_regs will ensure LR is pushed in the prologue, making it - * available as scratch without push/pop in loops, which is the main goal. 
*/ - } - - /* Restore state for real code generation */ - ind = saved_ind; - loc = saved_loc; - ir->call_outgoing_base = saved_call_outgoing_base; - ir->codegen_instruction_idx = saved_codegen_idx; - - /* Phase-3 scratch conflict fixup. - * For each mop instruction where the dry run needed to PUSH a register - * (because no caller-saved scratch was free), try to move the blocking vreg - * to a free callee-saved register. This eliminates the push/pop at that - * instruction at the cost of one extra callee-saved register in the prologue. - */ - { - int any_fixup = 0; - for (int i = 0; i < ir->next_instruction_index; i++) - { - uint16_t saves = dry_insn_saves[i]; - if (!saves) - continue; - while (saves) - { - int r = (int)__builtin_ctz(saves); - saves = (uint16_t)(saves & (saves - 1u)); - int new_r = try_reassign_scratch_conflict(ir, r, i); - if (new_r >= 0) - { - /* Clear the recorded dry-run scratch count for this instruction so - * the debug consistency check accepts the improved real-emit count. */ - dry_insn_scratch[i] = 0; - any_fixup = 1; - } - } - } - if (any_fixup) - { - /* Invalidate the liveness cache so real-emit sees the new assignments. 
*/ - tcc_ls_reset_scratch_cache(&ir->ls); - } - } - - /* Reset scratch state for real pass */ - tcc_gen_machine_reset_scratch_state(); - - /* Clear caches for fresh start - dry-run may have recorded entries - * but the actual instructions were never emitted */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - tcc_ir_opt_fp_cache_clear(ir); -#endif /* DRY_RUN_DISABLED */ - - /* ============================================================================ - * REAL CODE GENERATION PASS - * ============================================================================ - */ - - // generate prolog (with extra registers if needed) - (void)original_leaffunc; /* May be unused when dry-run is disabled */ - if (!ir->naked) - tcc_gen_machine_prolog(ir->leaffunc, ir->ls.dirty_registers, stack_size, extra_prologue_regs); - - /* Emit DWARF prologue_end AFTER machine prolog so the debugger knows - * where the prologue ends and sets breakpoints at the correct address. - * Previously this was emitted in tccgen.c before any machine code existed, - * causing breakpoints to land far from the actual prolog. */ - if (!ir->naked) - tcc_debug_prolog_epilog(tcc_state, 0); - - for (int i = 0; i < ir->next_instruction_index; i++) - { - drop_return_value = 0; - cq = &ir->compact_instructions[i]; - - /* Default: no extra scratch constraints for this instruction. 
*/ - ir->codegen_materialize_scratch_flags = 0; - - /* Track current instruction for scratch register allocation */ - ir->codegen_instruction_idx = i; - - /* Debug tracking: let ot_check print the current IR op on failure */ - g_debug_current_op = (int)cq->op; - - ir_to_code_mapping[i] = ind; - - if (cq->orig_index >= 0 && cq->orig_index < ir->orig_ir_to_code_mapping_size) - orig_ir_to_code_mapping[cq->orig_index] = ind; - - // emit debug line info for this IR instruction AFTER recording ind - tcc_debug_line_num(tcc_state, cq->line_num); - - /* Get operand copies from iroperand_pool (compact representation) */ - IROperand src1_ir = tcc_ir_op_get_src1(ir, cq); - IROperand src2_ir = tcc_ir_op_get_src2(ir, cq); - IROperand dest_ir = tcc_ir_op_get_dest(ir, cq); - - /* Peephole for LOAD/ASSIGN/LOAD_INDEXED followed by RETURNVALUE: - * Update the live interval to use R0 BEFORE register allocation. - * This ensures the load result goes directly to the return register. - */ - if (cq->op == TCCIR_OP_LOAD || cq->op == TCCIR_OP_ASSIGN || cq->op == TCCIR_OP_LOAD_INDEXED) - { - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand next_src1 = tcc_ir_op_get_src1(ir, ir_next); - int next_vr = irop_get_vreg(next_src1); - int dest_vr = irop_get_vreg(dest_ir); - if (next_vr == dest_vr && next_vr >= 0) - { - IRLiveInterval *li = tcc_ir_get_live_interval(ir, dest_vr); - if (li && li->allocation.r0 != REG_IRET) - { -#ifdef TCC_REGALLOC_DEBUG - fprintf(stderr, "[RA-PEEPHOLE] i=%d op=%d dest_vr=0x%x old_r0=%d -> R0 (RETURNVALUE next)\n", i, cq->op, - dest_vr, li->allocation.r0); -#endif - li->allocation.r0 = REG_IRET; - li->allocation.offset = 0; - if (li->is_llong || li->is_double) - li->allocation.r1 = REG_IRE2; - } - } - } - } - - /* Operands are filled lazily: machine_op_from_ir fills via ir_fill_op for - * MOP-path operands; old-path dispatch sites call ir_fill_op explicitly. */ - - /* Mop path: use MachineOperand-based dispatch for simple 32-bit ops; - * the mach_* helpers in arm-thumb-gen.c handle all materialization. 
*/ - bool use_mop_dp = false; - bool use_mop_assign = false; - bool use_mop_setif = false; - bool use_mop_bool = false; - bool use_mop_load = false; - bool use_mop_store = false; - bool use_mop_load_indexed = false; - bool use_mop_store_indexed = false; - bool use_mop_load_postinc = false; - bool use_mop_store_postinc = false; - bool use_mop_ijump = false; - bool use_mop_funcparam = false; - bool use_mop_returnvalue = false; - bool use_mop_muldiv = false; - bool use_mop_fp = false; - bool use_mop_vla = false; - bool use_mop_func_call = false; - switch (cq->op) - { - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_AND: - case TCCIR_OP_OR: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_dp = true; - break; - case TCCIR_OP_ASSIGN: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_assign = true; - break; - case TCCIR_OP_SETIF: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_setif = true; - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_bool = true; - break; - case TCCIR_OP_LOAD: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_load = true; - break; - case TCCIR_OP_STORE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store = true; - break; - case TCCIR_OP_LOAD_INDEXED: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_indexed = true; - break; - case TCCIR_OP_STORE_INDEXED: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_indexed = true; - break; - case TCCIR_OP_LOAD_POSTINC: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_postinc = true; - break; - 
case TCCIR_OP_STORE_POSTINC: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_postinc = true; - break; - case TCCIR_OP_IJUMP: - if (!ir->has_static_chain) - use_mop_ijump = true; - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - use_mop_funcparam = true; - break; - case TCCIR_OP_RETURNVALUE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_returnvalue = true; - break; - case TCCIR_OP_MUL: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_TEST_ZERO: - if (!irop_needs_pair(src1_ir) && !irop_is_64bit(src1_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: - case TCCIR_OP_CVT_FTOI: - if (!src1_ir.is_complex && !dest_ir.is_complex && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && - !irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_fp = true; - break; - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (!ir->has_static_chain) - use_mop_vla = true; - break; - case TCCIR_OP_FUNCCALLVAL: - case TCCIR_OP_FUNCCALLVOID: - if (!irop_needs_pair(dest_ir) && !dest_ir.is_complex && !ir->has_static_chain) - use_mop_func_call = true; - break; - default: - break; - } - -#ifdef TCC_REGALLOC_DEBUG - /* Trace reads register fields; fill is now lazy so create filled local copies. 
*/ - IROperand trc_s1 = src1_ir, trc_s2 = src2_ir, trc_d = dest_ir; - ir_fill_op(ir, &trc_s1); - ir_fill_op(ir, &trc_s2); - ir_fill_op(ir, &trc_d); - /* Full instruction trace for target function */ - if (_dbg_trace_all) - { - IROperand raw_s1 = tcc_ir_op_get_src1(ir, cq); - IROperand raw_s2 = tcc_ir_op_get_src2(ir, cq); - IROperand raw_d = tcc_ir_op_get_dest(ir, cq); - fprintf(stderr, - "[RA-TRACE] i=%d op=%d s1_vr=0x%x s1_pr0=%d s2_vr=0x%x s2_pr0=%d d_vr=0x%x d_pr0=%d s1_tag=%d d_tag=%d\n", - i, cq->op, irop_get_vreg(raw_s1), trc_s1.pr0_reg, irop_get_vreg(raw_s2), trc_s2.pr0_reg, - irop_get_vreg(raw_d), trc_d.pr0_reg, irop_get_tag(trc_s1), irop_get_tag(trc_d)); - } - - /* Diagnostic: for LOAD instructions, log ALL source vreg details */ - if (cq->op == TCCIR_OP_LOAD) - { - IROperand raw_src1 = tcc_ir_op_get_src1(ir, cq); - int raw_tag = irop_get_tag(raw_src1); - if (raw_tag == IROP_TAG_VREG || raw_tag == 2 /* IROP_TAG_VREG_LVAL */) - { - int src_vreg = irop_get_vreg(raw_src1); - if (src_vreg > 0) - { - IRLiveInterval *dbg_li = tcc_ir_get_live_interval(ir, src_vreg); - if (dbg_li) - fprintf( - stderr, - "[RA-LOAD] i=%d src_vreg=0x%x alloc.r0=%d pr0_reg=%d dest_pr0=%d tag=%d lval=%d local=%d spill=%d\n", i, - src_vreg, dbg_li->allocation.r0, trc_s1.pr0_reg, trc_d.pr0_reg, irop_get_tag(trc_s1), trc_s1.is_lval, - trc_s1.is_local, trc_s1.pr0_spilled); - } - } - } - /* Also log AND/OR/ADD operations that might show the register mismatch */ - if (cq->op == TCCIR_OP_AND || cq->op == TCCIR_OP_OR) - { - IROperand raw_dest = tcc_ir_op_get_dest(ir, cq); - IROperand raw_src1 = tcc_ir_op_get_src1(ir, cq); - fprintf( - stderr, - "[RA-ALU] i=%d op=%d src1_pr0=%d src2_pr0=%d dest_pr0=%d src1_tag=%d dest_tag=%d src1_vr=0x%x dest_vr=0x%x\n", - i, cq->op, trc_s1.pr0_reg, trc_s2.pr0_reg, trc_d.pr0_reg, irop_get_tag(trc_s1), irop_get_tag(trc_d), - irop_get_vreg(raw_src1), irop_get_vreg(raw_dest)); - } - /* Log ASSIGN operations */ - if (cq->op == TCCIR_OP_ASSIGN) - { - IROperand 
raw_dest = tcc_ir_op_get_dest(ir, cq); - IROperand raw_src1 = tcc_ir_op_get_src1(ir, cq); - fprintf(stderr, "[RA-ASSIGN] i=%d src1_pr0=%d dest_pr0=%d src1_tag=%d dest_tag=%d src1_vr=0x%x dest_vr=0x%x\n", i, - trc_s1.pr0_reg, trc_d.pr0_reg, irop_get_tag(trc_s1), irop_get_tag(trc_d), irop_get_vreg(raw_src1), - irop_get_vreg(raw_dest)); - } -#endif - - switch (cq->op) - { - case TCCIR_OP_MUL: - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - case TCCIR_OP_TEST_ZERO: - if (use_mop_muldiv) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_muldiv_mop(mop_src1, mop_src2, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_MLA: - case TCCIR_OP_UMULL: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - break; - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_OR: - case TCCIR_OP_AND: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (use_mop_dp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); 
- tcc_gen_machine_data_processing_mop(mop_src1, mop_src2, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - /* Phase-3 consistency check: dry-run and real-emit scratch counts must agree. - * A mismatch is expected (and acceptable) for instructions where the scratch - * conflict fixup was applied (dry_insn_saves != 0 means fixup was attempted). */ - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: - case TCCIR_OP_CVT_FTOI: - if (use_mop_fp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_fp_mop(mop_src1, mop_src2, mop_dest, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_fp_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_LOAD: - { - bool load_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - load_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load && !load_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - - /* Sub-component fixup for LOAD sources — see dry-run comment above. */ - if (mop_src.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src.u.reg.r1 = -1; - mop_src.needs_deref = false; - } - - if (mop_dest.kind == MACH_OP_REG && !mop_dest.needs_deref && mop_dest.u.reg.r0 != (int)PREG_REG_NONE) - { - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_mop(mop_src, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, - dry_insn_scratch[i], real_scratch); - } -#endif - } - else - { - /* Dest not a simple register: fall back to old path. */ - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - } - else - { - /* Old path with RETURNVALUE peephole */ - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - int ir_next_src1_vr = -1; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE) - { - IROperand next_src1_irop = tcc_ir_op_get_src1(ir, ir_next); - ir_next_src1_vr = irop_get_vreg(next_src1_irop); - } - const int dest_vreg = irop_get_vreg(dest_ir); - int is_64bit_load = irop_is_64bit(dest_ir); - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && ir_next_src1_vr == dest_vreg && !has_incoming_jump[i + 1]) - { - dest_ir.pr0_reg = REG_IRET; /* R0 */ - dest_ir.pr0_spilled = 0; - if (is_64bit_load) - { - dest_ir.pr1_reg = REG_IRE2; /* R1 */ - dest_ir.pr1_spilled = 0; - } - /* Also update the interval allocation so that RETURNVALUE's src1 gets the same registers */ - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest_vreg); - if (interval) - { - interval->allocation.r0 = REG_IRET; - if (is_64bit_load) - interval->allocation.r1 = REG_IRE2; - } - } - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - break; - } - case TCCIR_OP_STORE: - { - if (use_mop_store) - { - MachineOperand mop_dest_s = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src_s = machine_op_from_ir(ir, &src1_ir); - /* Sub-component fixup for STORE value — same logic as LOAD source. 
*/ - if (mop_src_s.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src_s.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src_s.u.reg.r1 = -1; - mop_src_s.needs_deref = false; - } - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_mop(mop_dest_s, mop_src_s, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_store_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_LOAD_INDEXED: - { - /* LOAD_INDEXED: dest = *(base + (index << scale)) */ - bool load_indexed_before_ret = false; - { - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, ir_next); - load_indexed_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load_indexed && !load_indexed_before_ret) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_base = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_indexed_mop(mop_dest, mop_base, mop_index, mop_scale, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, 
dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - /* Old path with RETURNVALUE peephole — load directly into R0 if next is RETURNVALUE */ - IROperand base_op = src1_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - const int dest_vreg = irop_get_vreg(dest_ir); - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && load_indexed_before_ret && !has_incoming_jump[i + 1]) - { - dest_ir.pr0_reg = REG_IRET; - dest_ir.pr0_spilled = 0; - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest_vreg); - if (interval) - interval->allocation.r0 = REG_IRET; - } - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &index_op); - tcc_gen_machine_load_indexed_op(dest_ir, base_op, index_op, scale_op); - } - break; - } - case TCCIR_OP_STORE_INDEXED: - { - /* STORE_INDEXED: *(base + (index << scale)) = value */ - if (use_mop_store_indexed) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_base = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_indexed_mop(mop_base, mop_index, mop_scale, mop_value, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - IROperand base_op = dest_ir; - IROperand value_op = src1_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &value_op); - ir_fill_op(ir, 
&index_op); - tcc_gen_machine_store_indexed_op(base_op, index_op, scale_op, value_op); - } - break; - } - case TCCIR_OP_LOAD_POSTINC: - { - /* LOAD_POSTINC: dest = *ptr; ptr += offset */ - if (use_mop_load_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_ptr = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_postinc_mop(mop_dest, mop_ptr, mop_offset, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - IROperand ptr_op = src1_ir; - IROperand offset_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &ptr_op); - tcc_gen_machine_load_postinc_op(dest_ir, ptr_op, offset_op); - } - break; - } - case TCCIR_OP_STORE_POSTINC: - { - /* STORE_POSTINC: *ptr = value; ptr += offset */ - if (use_mop_store_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_ptr = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_postinc_mop(mop_ptr, mop_value, mop_offset, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - IROperand ptr_op = dest_ir; - IROperand value_op = src1_ir; - IROperand offset_op = 
tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &ptr_op); - ir_fill_op(ir, &value_op); - tcc_gen_machine_store_postinc_op(ptr_op, value_op, offset_op); - } - break; - } - case TCCIR_OP_RETURNVALUE: - { - if (use_mop_returnvalue) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_return_value_mop(mop_src, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - /* Peephole: if previous instruction was LOAD/ASSIGN that already loaded to R0, - * skip the return value copy. */ - const IRQuadCompact *ir_prev = (i > 0) ? &ir->compact_instructions[i - 1] : NULL; - int skip_copy = 0; - if (!has_incoming_jump[i] && ir_prev && (ir_prev->op == TCCIR_OP_LOAD || ir_prev->op == TCCIR_OP_ASSIGN)) - { - IROperand prev_dest_irop = tcc_ir_op_get_dest(ir, ir_prev); - const int prev_dest_vreg = irop_get_vreg(prev_dest_irop); - const int src1_vreg = irop_get_vreg(src1_ir); - if (prev_dest_vreg == src1_vreg) - { - IRLiveInterval *prev_interval = tcc_ir_get_live_interval(ir, prev_dest_vreg); - if (prev_interval && prev_interval->allocation.r0 == REG_IRET) - skip_copy = 1; - } - } - if (!skip_copy) - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_return_value_op(src1_ir, cq->op); - } - } - } - case TCCIR_OP_RETURNVOID: - /* Emit jump to epilogue (will be backpatched later) */ - /* if return is last instruction, then jump is not needed */ - if (i != ir->next_instruction_index - 1) - { - return_jump_addrs[num_return_jumps++] = ind; - /* Return jumps target the epilogue (-1 indicates no IR target) */ - tcc_gen_machine_jump_op(cq->op, dest_ir, i); - } - break; - case TCCIR_OP_ASSIGN: - { - /* Peephole: if next instruction is RETURNVALUE using this ASSIGN's dest, - * 
assign directly to R0 to avoid an extra move */ - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - int ir_next_src1_vr = -1; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE) - { - IROperand next_src1_irop = tcc_ir_op_get_src1(ir, ir_next); - ir_next_src1_vr = irop_get_vreg(next_src1_irop); - } - const int assign_dest_vreg = irop_get_vreg(dest_ir); - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && ir_next_src1_vr == assign_dest_vreg && - !has_incoming_jump[i + 1]) - { - dest_ir.pr0_reg = REG_IRET; /* R0 */ - dest_ir.pr0_spilled = 0; - if (irop_is_64bit(dest_ir)) - { - dest_ir.pr1_reg = REG_IRE2; /* R1 */ - dest_ir.pr1_spilled = 0; - } - /* Update the interval allocation so RETURNVALUE sees the change */ - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, assign_dest_vreg); - if (interval) - { - interval->allocation.r0 = REG_IRET; - if (irop_is_64bit(dest_ir)) - interval->allocation.r1 = REG_IRE2; - } - } - /* Same assign_before_ret guard as the dry-run: keep both passes consistent. */ - bool assign_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - assign_before_ret = (irop_get_vreg(nq_src1) == assign_dest_vreg); - } - } - if (use_mop_assign && !assign_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_assign_mop(mop_src, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_assign_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_LEA: - /* Load Effective Address: compute address of src1 into dest */ - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_lea_op(dest_ir, src1_ir, cq->op); - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - { - if (use_mop_funcparam) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - /* No scratch tracking: FUNCPARAM does not allocate scratch registers */ - tcc_gen_machine_func_parameter_mop(mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_func_parameter_op(src1_ir, src2_ir, cq->op); - } - break; - } - case TCCIR_OP_JUMP: - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_jump_op(cq->op, dest_ir, i); - /* Update mapping to actual instruction address (may have shifted due to literal pool) */ - ir_to_code_mapping[i] = ind - (tcc_gen_machine_branch_opt_get_encoding(i) == 16 ? 
2 : 4); - /* Clear spill cache at branch - value may come from different path */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - case TCCIR_OP_JUMPIF: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_conditional_jump_op(src1_ir, cq->op, dest_ir, i); - /* Update mapping to actual instruction address (may have shifted due to literal pool) */ - ir_to_code_mapping[i] = ind - (tcc_gen_machine_branch_opt_get_encoding(i) == 16 ? 2 : 4); - /* Clear spill cache at conditional branch - target may have different values */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - case TCCIR_OP_IJUMP: - if (use_mop_ijump) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_indirect_jump_mop(mop_src, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_indirect_jump_op(src1_ir); - } - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - case TCCIR_OP_SWITCH_TABLE: - { - int table_id = (int)irop_get_imm64_ex(ir, src2_ir); - TCCIRSwitchTable *table = &ir->switch_tables[table_id]; - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_switch_table_op(src1_ir, table, ir, i); - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - } - case TCCIR_OP_SETIF: - if (use_mop_setif) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_setif_mop(mop_src, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d 
op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_setif_op(dest_ir, src1_ir, cq->op); - } - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (use_mop_bool) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_bool_mop(mop_src1, mop_src2, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_bool_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (use_mop_vla) - { - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - tcc_gen_machine_vla_mop(mop_dest, mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_vla_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_FUNCCALLVOID: - drop_return_value = 1; - /* fall through */ - case TCCIR_OP_FUNCCALLVAL: - { - if (use_mop_func_call) - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_func_call_mop(src1_ir, src2_ir, mop_dest, drop_return_value, ir, i); - } - else - { - ir_fill_op(ir, &src1_ir); - 
ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_func_call_op(src1_ir, src2_ir, dest_ir, drop_return_value, ir, i); - } - /* Clear spill cache after function call - callee may have modified memory */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - /* Restore R10 after call: trampoline calls for nested functions clobber R10. - * Re-load from the chain save slot at [FP, #-4] to keep R10 correct. */ - if (ir->has_static_chain) - tcc_gen_machine_restore_chain(); - break; - } - case TCCIR_OP_NOP: - /* No operation - skip silently */ - break; - case TCCIR_OP_TRAP: - /* Generate trap instruction */ - tcc_gen_machine_trap_op(); - break; - case TCCIR_OP_SET_CHAIN: - /* Static chain setup: move FP to static chain register */ - tcc_gen_machine_set_chain(); - break; - case TCCIR_OP_INIT_CHAIN_SLOT: - /* Store parent FP into chain slot for nested function trampoline */ - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_init_chain_slot(src1_ir); - break; - case TCCIR_OP_ASM_INPUT: - case TCCIR_OP_ASM_OUTPUT: - /* Marker ops only: regalloc/liveness uses them, codegen emits nothing. */ - break; - case TCCIR_OP_INLINE_ASM: - { -#ifdef CONFIG_TCC_ASM - ir_fill_op(ir, &src1_ir); - tcc_ir_codegen_inline_asm_ir(ir, src1_ir); - /* Inline asm may clobber registers/memory: treat as a full barrier. */ - tcc_ir_spill_cache_clear(&ir->spill_cache); -#else - tcc_error("inline asm not supported"); -#endif - break; - } - default: - { - printf("Unsupported operation in tcc_generate_code: %s\n", tcc_ir_get_op_name(cq->op)); - if (ir->ir_to_code_mapping) - { - tcc_free(ir->ir_to_code_mapping); - ir->ir_to_code_mapping = NULL; - ir->ir_to_code_mapping_size = 0; - } - tcc_free(return_jump_addrs); - exit(1); - } - }; - - /* Clean up scratch register state at end of each IR instruction. - * This restores any pushed scratch registers and resets the global exclude mask. 
*/ - tcc_gen_machine_end_instruction(); - } - - ir_to_code_mapping[ir->next_instruction_index] = ind; - orig_ir_to_code_mapping[ir->orig_ir_to_code_mapping_size - 1] = ind; - - /* Fill gaps for removed original indices: map them to the next reachable - * emitted code address (or epilogue). This keeps &&label stable even if the - * instruction at the exact original index was optimized away. */ - { - uint32_t last = orig_ir_to_code_mapping[ir->orig_ir_to_code_mapping_size - 1]; - for (int k = ir->orig_ir_to_code_mapping_size - 2; k >= 0; --k) - { - if (orig_ir_to_code_mapping[k] == 0xFFFFFFFFu) - orig_ir_to_code_mapping[k] = last; - else - last = orig_ir_to_code_mapping[k]; - } - } - - if (!ir->naked) - tcc_gen_machine_epilog(ir->leaffunc); - tcc_ir_codegen_backpatch_jumps(ir, ir_to_code_mapping); - - /* Backpatch return jumps to point to epilogue */ - int epilogue_addr = ir_to_code_mapping[ir->next_instruction_index]; - for (int i = 0; i < num_return_jumps; i++) - { - tcc_gen_machine_backpatch_jump(return_jump_addrs[i], epilogue_addr); - } - - tcc_free(return_jump_addrs); - tcc_free(dry_insn_saves); - tcc_free(dry_insn_scratch); - tcc_free(has_incoming_jump); -} - -/* ============================================================================ - * Legacy API Wrappers - * ============================================================================ */ - -/* Note: tcc_ir_generate_code legacy wrapper remains in tccir.c */ diff --git a/ir/codegen.c.bak b/ir/codegen.c.bak deleted file mode 100644 index e64751cb..00000000 --- a/ir/codegen.c.bak +++ /dev/null @@ -1,3068 +0,0 @@ -/* - * TCC IR - Code Generation Helpers Implementation - * - * Copyright (c) 2025 Mateusz Stadnik - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation. 
- */ - -#define USING_GLOBALS -#include "ir.h" - -/* Debug tracking variable (defined in arm-thumb-gen.c) */ -extern int g_debug_current_op; - -/* ============================================================================ - * Register Fill (Apply Allocation to Operands) - * ============================================================================ */ - -void tcc_ir_fill_registers(TCCIRState *ir, SValue *sv) -{ - int old_r = sv->r; - int old_v = old_r & VT_VALMASK; - - /* VT_LOCAL/VT_LLOCAL operands can mean either: - * - a concrete stack slot (vr == -1), e.g. VLA save slots, or - * - a logical local tracked as a vreg by the IR (vr != -1). - * - * For concrete stack slots, do not rewrite them into registers here; doing - * so can create uninitialized register reads at runtime. - * - * For locals that do carry a vreg, they must participate in register - * allocation so that defs/uses stay consistent. - */ - if ((old_v == VT_LOCAL || old_v == VT_LLOCAL) && sv->vr == -1) - { - sv->pr0_reg = PREG_REG_NONE; - sv->pr0_spilled = 0; - sv->pr1_reg = PREG_REG_NONE; - sv->pr1_spilled = 0; - return; - } - if (tcc_ir_vreg_is_valid(ir, sv->vr)) - { - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, sv->vr); - - /* Stack-passed parameters: if not allocated to a register, treat them as - * residing in the incoming argument area (VT_PARAM) rather than forcing a - * separate local spill slot. - * - * This is safe under AAPCS: the caller's argument stack area remains valid - * for the duration of the call, and it also provides a correct addressable - * home for '¶m' semantics. 
- */ - if (TCCIR_DECODE_VREG_TYPE(sv->vr) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 < 0 && - interval->allocation.r0 == PREG_NONE && interval->allocation.offset == 0) - { - sv->pr0_reg = PREG_REG_NONE; - sv->pr0_spilled = 0; - sv->pr1_reg = PREG_REG_NONE; - sv->pr1_spilled = 0; - sv->c.i = interval->original_offset; - - int need_lval = (old_r & VT_LVAL); - if (old_v < VT_CONST && old_v != VT_LOCAL && old_v != VT_LLOCAL && interval->is_lvalue) - need_lval = VT_LVAL; - - sv->r = VT_LOCAL | need_lval | VT_PARAM; - return; - } - - /* Register-passed parameters: if allocated to a register (not spilled), - * clear VT_LVAL. The value is already in the register, no dereference needed. - * VT_LVAL is only used on parameters for address-of operations (¶m) or - * when they're on the stack (VT_LOCAL). - */ - int is_register_param = - (TCCIR_DECODE_VREG_TYPE(sv->vr) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 >= 0); - - sv->pr0_reg = interval->allocation.r0 & PREG_REG_NONE; - sv->pr0_spilled = (interval->allocation.r0 & PREG_SPILLED) != 0; - sv->pr1_reg = interval->allocation.r1 & PREG_REG_NONE; - sv->pr1_spilled = (interval->allocation.r1 & PREG_SPILLED) != 0; - sv->c.i = interval->allocation.offset; - - /* Determine if we should preserve VT_LVAL: - * - If old_r was VT_LOCAL|VT_LVAL (local variable on stack), and now - * it's allocated to a register, we should NOT preserve VT_LVAL because - * the value is already in the register, no load needed. - * - If old_r has VT_LVAL but (old_r & VT_VALMASK) < VT_CONST, it means - * the vreg holds a pointer that needs dereferencing - preserve VT_LVAL. - * - Register parameters: do NOT preserve VT_LVAL when allocated to a register. - * VT_LVAL on parameters is only needed for stack params (VT_LOCAL) or for - * address-of operations. - * - If old_r does NOT have VT_LVAL, this is an address-of operation - * (we want the address, not the value). Do NOT add VT_LVAL. 
*/ - int preserve_flags = old_r & VT_PARAM; /* Always preserve VT_PARAM */ - if ((old_r & VT_LVAL) && old_v < VT_CONST && old_v != VT_LOCAL && old_v != VT_LLOCAL && !is_register_param) - { - /* The vreg holds a pointer that needs dereferencing. - * Note: VT_LOCAL/VT_LLOCAL use VT_LVAL to mean "load from stack slot". - * When such a local/param is promoted to a register, we must NOT - * preserve VT_LVAL, otherwise we turn a plain value into a pointer - * dereference (double-indirection bugs). - */ - preserve_flags |= VT_LVAL; - } - - if ((interval->allocation.r0 & PREG_SPILLED) || interval->allocation.offset != 0) - { - /* Spilled to stack - treat as local. - * For computed values (old_r was 0 or a register), add VT_LVAL to load the value. - * For address-of expressions (old_r == VT_LOCAL without VT_LVAL), don't add VT_LVAL. - * If original had VT_LVAL (pointer dereference), preserve it. - * - * DOUBLE INDIRECTION CASE: If old_r has VT_LVAL AND the original was NOT - * already a local variable (VT_LOCAL), then the code wants to DEREFERENCE - * the value held in this vreg. If that value is spilled: - * - Spill slot contains a POINTER value (e.g., result of ADD on address) - * - Need to: (1) load pointer from spill, (2) dereference it - * Use VT_LLOCAL to encode this double-indirection requirement. - * - * But if old_v == VT_LOCAL, the VT_LVAL means "load/store from/to this stack slot" - * which is standard local variable access - do NOT use VT_LLOCAL. - * - * ADDRESS-OF CASE: If old_v == VT_LOCAL and old_r does NOT have VT_LVAL, - * this is an address-of operation (&var). We want the ADDRESS of the spill - * slot, not its contents. Do NOT add VT_LVAL in this case. - * - * COMPUTED VALUE CASE: If old_v was a register (computed value that got - * spilled), we ALWAYS need VT_LVAL to load the value from the spill slot. 
*/ - int need_lval; - if (old_v == VT_LOCAL || old_v == VT_LLOCAL) - { - /* Local variable: preserve VT_LVAL to distinguish load vs address-of */ - need_lval = (old_r & VT_LVAL); - } - else - { - /* Computed value (was in register): always need VT_LVAL to load from spill */ - need_lval = VT_LVAL; - } - int base_kind = VT_LOCAL; - if ((old_r & VT_LVAL) && old_v != VT_LOCAL && old_v != VT_LLOCAL) - { - /* The original use wants to dereference the value in this vreg. - * Since the value is spilled, we need double indirection: - * load pointer from spill slot, then dereference it. - * Note: We exclude VT_LOCAL/VT_LLOCAL because their VT_LVAL means - * "access this stack slot" not "dereference pointer in vreg". */ - base_kind = VT_LLOCAL; - } - /* Only preserve VT_PARAM for stack-passed parameters (incoming_reg0 < 0). - * Register-passed parameters that are spilled to local stack should NOT - * have VT_PARAM set, because VT_PARAM causes load_to_dest to add - * offset_to_args (for accessing caller's argument area), but spilled - * register params live in the callee's local stack area (negative FP offset). */ - int spilled_param_flag = 0; - if ((old_r & VT_PARAM) && interval->incoming_reg0 < 0) - { - spilled_param_flag = VT_PARAM; - } - sv->r = base_kind | need_lval | spilled_param_flag; - } - else if (interval->allocation.r0 != PREG_NONE) - { - /* In a register - set r to the register number, preserving VT_LVAL only for pointer derefs */ - sv->r = interval->allocation.r0 | preserve_flags; - } - } - else if ((sv->vr == -1 || sv->vr == 0 || TCCIR_DECODE_VREG_TYPE(sv->vr) == 0) && - (sv->r == -1 || sv->r == PREG_REG_NONE || (old_v >= VT_CONST))) - { - /* No valid vreg and either invalid .r or a constant - preserve important flags. - * This handles global symbol references (VT_CONST | VT_SYM) and plain constants. 
*/ - int flags = sv->r & (VT_LVAL | VT_SYM); - sv->r = VT_CONST | flags; - } - else if (sv->vr == -1 && old_r == 0 && sv->sym) - { - /* Special case: old_r=0 but has a symbol - this is a function symbol reference - * that wasn't marked as VT_CONST. Preserve the symbol. */ - sv->r = VT_CONST | VT_SYM; - } -} - -void tcc_ir_fill_registers_ir(TCCIRState *ir, IROperand *op) -{ - const int old_is_local = op->is_local; - const int old_is_llocal = op->is_llocal; - const int old_is_const = op->is_const; - const int old_is_lval = op->is_lval; - const int old_is_param = op->is_param; - - const int vreg = irop_get_vreg(*op); - - /* VT_LOCAL/VT_LLOCAL operands can mean either: - * - a concrete stack slot (vr == -1), e.g. VLA save slots, or - * - a temp local for type-punning casts (vr <= -2, VR_TEMP_LOCAL), or - * - a logical local tracked as a vreg by the IR (vr > 0). - * - * For concrete stack slots and temp locals, do not rewrite them into - * registers here; doing so can create uninitialized register reads - * at runtime. */ - if ((old_is_local || old_is_llocal) && vreg < 0) - { - op->pr0_reg = PREG_REG_NONE; - op->pr0_spilled = 0; - op->pr1_reg = PREG_REG_NONE; - op->pr1_spilled = 0; - return; - } - - if (tcc_ir_vreg_is_valid(ir, vreg)) - { - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, vreg); - int32_t old_stackoff = 0; - if (op->btype != IROP_BTYPE_STRUCT && irop_get_tag(*op) == IROP_TAG_STACKOFF) - old_stackoff = op->u.imm32; - - /* Stack-passed parameters: if not allocated to a register, treat them as - * residing in the incoming argument area (VT_PARAM) rather than forcing a - * separate local spill slot. 
*/ - if (TCCIR_DECODE_VREG_TYPE(vreg) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 < 0 && - interval->allocation.r0 == PREG_NONE && interval->allocation.offset == 0) - { - op->pr0_reg = PREG_REG_NONE; - op->pr0_spilled = 0; - op->pr1_reg = PREG_REG_NONE; - op->pr1_spilled = 0; - /* For STRUCT types, preserve ctype_idx in the split encoding */ - if (op->btype == IROP_BTYPE_STRUCT) - { - op->u.s.aux_data = interval->original_offset; - } - else - { - op->u.imm32 = interval->original_offset; - } - op->tag = IROP_TAG_STACKOFF; - - int need_lval = old_is_lval; - /* old_v < VT_CONST && old_v != VT_LOCAL && old_v != VT_LLOCAL → reg kind operand */ - if (!old_is_const && !old_is_local && !old_is_llocal && interval->is_lvalue) - need_lval = 1; - - op->is_local = 1; - op->is_llocal = 0; - op->is_const = 0; - op->is_lval = need_lval; - op->is_param = 1; - return; - } - - /* Register-passed parameters: if allocated to a register (not spilled), - * clear VT_LVAL. The value is already in the register, no dereference needed. 
*/ - int is_register_param = - (TCCIR_DECODE_VREG_TYPE(vreg) == TCCIR_VREG_TYPE_PARAM && interval && interval->incoming_reg0 >= 0); - - op->pr0_reg = interval->allocation.r0 & PREG_REG_NONE; - op->pr0_spilled = (interval->allocation.r0 & PREG_SPILLED) != 0; - op->pr1_reg = interval->allocation.r1 & PREG_REG_NONE; - op->pr1_spilled = (interval->allocation.r1 & PREG_SPILLED) != 0; - /* For STRUCT types, preserve ctype_idx in the split encoding */ - if (op->btype == IROP_BTYPE_STRUCT) - { - op->u.s.aux_data = interval->allocation.offset; - } - else - { - if ((old_is_local || old_is_llocal) && !old_is_param && irop_get_tag(*op) == IROP_TAG_STACKOFF) - { - int32_t delta = old_stackoff - interval->original_offset; - op->u.imm32 = interval->allocation.offset + delta; - } - else - { - op->u.imm32 = interval->allocation.offset; - } - } - - /* Determine if we should preserve is_lval: - * - If was local|lval and now in register, do NOT preserve is_lval - * - If was lval with reg-kind operand (pointer deref), preserve is_lval - * - Register parameters: do NOT preserve is_lval when in register */ - int preserve_param = old_is_param; - int preserve_lval = 0; - if (old_is_lval && !old_is_const && !old_is_local && !old_is_llocal && !is_register_param) - { - preserve_lval = 1; - } - - if ((interval->allocation.r0 & PREG_SPILLED) || interval->allocation.offset != 0) - { - /* Spilled to stack */ - int need_lval; - if (old_is_local || old_is_llocal) - { - need_lval = old_is_lval; - } - else - { - /* Computed value (was in register): always need lval to load from spill */ - need_lval = 1; - } - - int use_llocal = 0; - if (old_is_lval && !old_is_local && !old_is_llocal) - { - /* Double indirection: spilled pointer that needs dereferencing */ - use_llocal = 1; - } - - /* Only preserve is_param for stack-passed parameters (incoming_reg0 < 0). - * Register-passed parameters spilled to local stack should NOT have is_param. 
*/ - int spilled_param = 0; - if (old_is_param && interval->incoming_reg0 < 0) - { - spilled_param = 1; - } - - op->is_local = 1; - op->is_llocal = use_llocal; - op->is_const = 0; - op->is_lval = need_lval; - op->is_param = spilled_param; - op->tag = IROP_TAG_STACKOFF; - } - else if (interval->allocation.r0 != PREG_NONE) - { - /* In a register */ - op->is_local = 0; - op->is_llocal = 0; - op->is_const = 0; - op->is_lval = preserve_lval; - op->is_param = preserve_param; - op->tag = IROP_TAG_VREG; - } - } - /* No valid vreg: constants, symbols, etc. - IROperand already has the right encoding - * from the pool. Nothing to do for register allocation. */ -} - -/* ============================================================================ - * Parameter Register Allocation - * ============================================================================ */ - -void tcc_ir_register_allocation_params(TCCIRState *ir) -{ - /* For leaf functions: parameters can stay in registers r0-r3, UNLESS - * the linear scan allocator already spilled them due to register pressure. - * For non-leaf functions: parameters arrive in registers but must be - * stored to stack since r0-r3 are caller-saved. - * In both cases, we need to track which register each parameter arrives in. - */ - int argno = 0; // current register number (r0-r3) - for (int vreg = 0; vreg < ir->next_parameter; ++vreg) - { - const int encoded_vreg = (TCCIR_VREG_TYPE_PARAM << 28) | vreg; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, encoded_vreg); - /* is_double for soft-float (LS_REG_TYPE_DOUBLE_SOFT) or is_llong for 64-bit - */ - int is_64bit = interval && (interval->is_double || interval->is_llong || interval->is_complex); - - /* If the ABI incoming registers were already set (e.g., by the - * parameter handling in tcc_ir_add_function_parameters), respect them - * and only advance argno for subsequent parameters. 
- */ - if (interval && (interval->incoming_reg0 >= 0 || interval->incoming_reg1 >= 0)) - { - argno += is_64bit ? 2 : 1; - continue; - } - - /* AAPCS: 64-bit values must be aligned to even register pairs */ - if (is_64bit && (argno & 1)) - { - argno++; /* skip odd register to align to even */ - } - - if (is_64bit) - { - /* 64-bit value (double or long long) takes r0+r1 or r2+r3 */ - if (argno <= 2) - { - /* Parameter arrives in registers */ - interval->incoming_reg0 = argno; - interval->incoming_reg1 = argno + 1; - /* NOTE: For leaf functions, the linear scanner has already assigned registers. - * Don't overwrite interval->allocation here - it would clobber the correct allocation - * with argno (parameter index), which is NOT the same as the physical register number. - * The prolog will use incoming_reg0/1 to know which registers the parameter arrives in. */ - } - else - { - /* Spilled to caller's stack frame - parameter passed on stack */ - interval->incoming_reg0 = -1; - interval->incoming_reg1 = -1; - /* Record where the parameter arrives on the caller's stack frame. - * Use original_offset if already set by tcc_ir_set_original_offset - * (from the ABI layout), otherwise compute from argno. - * The ABI-derived offset is more accurate for complex cases like - * split structs (REG_STACK) where argno doesn't account for - * stack words that don't have PARAM vregs. - */ - if (interval->original_offset == 0) - interval->original_offset = (argno - 4) * 4; - /* See 64-bit case above: do not overwrite allocator spill slots with - * caller-stack offsets. 
- */ - interval->allocation.r0 = PREG_NONE; - interval->allocation.r1 = PREG_NONE; - interval->allocation.offset = 0; - } - argno += 2; - } - else - { - if (argno <= 3) - { - interval->incoming_reg0 = argno; - interval->incoming_reg1 = -1; - } - else - { - /* Spilled to caller's stack frame - parameter passed on stack */ - interval->incoming_reg0 = -1; - interval->incoming_reg1 = -1; - /* Record where the parameter arrives on the caller's stack frame. - * Use original_offset if already set by tcc_ir_set_original_offset - * (from the ABI layout), otherwise compute from argno. - */ - if (interval->original_offset == 0) - interval->original_offset = (argno - 4) * 4; - /* See 64-bit case above: do not overwrite allocator spill slots with - * caller-stack offsets. - */ - interval->allocation.r0 = PREG_NONE; - interval->allocation.r1 = PREG_NONE; - interval->allocation.offset = 0; - } - argno++; - } - } -} - -void tcc_ir_mark_return_value_incoming_regs(TCCIRState *ir) -{ - if (!ir) - return; - - /* Scan all instructions to find FUNCCALLVAL that produce return values */ - for (int i = 0; i < ir->next_instruction_index; ++i) - { - IRQuadCompact *q = &ir->compact_instructions[i]; - if (q->op != TCCIR_OP_FUNCCALLVAL) - continue; - - /* dest is the vreg that receives the return value */ - const IROperand dest = tcc_ir_op_get_dest(ir, q); - if (dest.vr < 0 || !tcc_ir_vreg_is_valid(ir, dest.vr)) - continue; - - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, dest.vr); - if (!interval) - continue; - - /* Mark that this vreg arrives in r0 (or r0+r1 for 64-bit returns) */ - interval->incoming_reg0 = 0; /* r0 */ - if (interval->is_llong || interval->is_double || interval->is_complex) - interval->incoming_reg1 = 1; /* r1 */ - else - interval->incoming_reg1 = -1; - } -} - -void tcc_ir_avoid_spilling_stack_passed_params(TCCIRState *ir) -{ - if (!ir) - return; - - /* Compute which PARAM vregs are stack-passed under AAPCS. 
- * We intentionally do this before patching IRLiveInterval allocations, - * operating on the linear-scan table so we can also shrink `loc`/frame size. - */ - const int param_count = ir->next_parameter; - if (param_count <= 0) - return; - - uint8_t *is_stack_passed = tcc_mallocz((size_t)param_count); - int argno = 0; - for (int vreg = 0; vreg < param_count; ++vreg) - { - const int encoded_vreg = (TCCIR_VREG_TYPE_PARAM << 28) | vreg; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, encoded_vreg); - if (!interval) - continue; - - const int is_64bit = interval->is_double || interval->is_llong; - if (is_64bit && (argno & 1)) - argno++; /* align 64-bit to even reg pair */ - - const int in_regs = is_64bit ? (argno <= 2) : (argno <= 3); - if (!in_regs) - is_stack_passed[vreg] = 1; - - argno += is_64bit ? 2 : 1; - } - - /* Rewrite linear-scan results: stack-passed params already have an incoming - * memory home (caller arg area), so if the allocator spilled them, drop the - * local spill slot. Also force address-taken stack params to remain in - * memory (we can use the incoming slot as their addressable home). - */ - for (int i = 0; i < ir->ls.next_interval_index; ++i) - { - LSLiveInterval *ls = &ir->ls.intervals[i]; - if (TCCIR_DECODE_VREG_TYPE((int)ls->vreg) != TCCIR_VREG_TYPE_PARAM) - continue; - const int pidx = TCCIR_DECODE_VREG_POSITION((int)ls->vreg); - if (pidx < 0 || pidx >= param_count) - continue; - if (!is_stack_passed[pidx]) - continue; - - /* Stack-passed params live in the caller's argument area. If linear-scan - * assigned them a register (without spilling), the prolog won't load them - * into that register, causing incorrect code. Always reset r0/r1 to force - * them to use the incoming stack location via VT_PARAM path. 
*/ - ls->r0 = PREG_NONE; - ls->r1 = PREG_NONE; - ls->stack_location = 0; - } - - tcc_free(is_stack_passed); -} - -/* ============================================================================ - * Code Generation Helpers - * ============================================================================ */ - -IROperand tcc_ir_codegen_dest_get(TCCIRState *ir, const IRQuadCompact *q) -{ - if (!irop_config[q->op].has_dest) - { - IROperand empty = {0}; - return empty; - } - return ir->iroperand_pool[q->operand_base + 0]; -} - -IROperand tcc_ir_codegen_src1_get(TCCIRState *ir, const IRQuadCompact *q) -{ - int off = irop_config[q->op].has_dest; - if (!irop_config[q->op].has_src1) - { - IROperand empty = {0}; - return empty; - } - return ir->iroperand_pool[q->operand_base + off]; -} - -IROperand tcc_ir_codegen_src2_get(TCCIRState *ir, const IRQuadCompact *q) -{ - int off = irop_config[q->op].has_dest + irop_config[q->op].has_src1; - if (!irop_config[q->op].has_src2) - { - IROperand empty = {0}; - return empty; - } - return ir->iroperand_pool[q->operand_base + off]; -} - -void tcc_ir_codegen_dest_set(TCCIRState *ir, const IRQuadCompact *q, IROperand irop) -{ - if (!irop_config[q->op].has_dest) - return; - ir->iroperand_pool[q->operand_base + 0] = irop; -} - -int tcc_ir_codegen_reg_get(TCCIRState *ir, int vreg) -{ - if (!ir || !tcc_ir_vreg_is_valid(ir, vreg)) - return PREG_NONE; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, vreg); - if (!interval) - return PREG_NONE; - return interval->allocation.r0; -} - -void tcc_ir_codegen_reg_set(TCCIRState *ir, int vreg, int preg) -{ - if (!ir || !tcc_ir_vreg_is_valid(ir, vreg)) - return; - IRLiveInterval *interval = tcc_ir_vreg_live_interval(ir, vreg); - if (interval) - interval->allocation.r0 = preg; -} - -void tcc_ir_codegen_params_setup(TCCIRState *ir) -{ - tcc_ir_register_allocation_params(ir); -} - -void tcc_ir_codegen_cmp_jmp_set(TCCIRState *ir) -{ - if (ir == NULL) - return; - /* Guard against invalid vtop - can 
happen with empty structs */ - extern SValue _vstack[]; - if (vtop < _vstack + 1) /* vstack is defined as (_vstack + 1) */ - return; - int v = vtop->r & VT_VALMASK; - if (v == VT_CMP) - { - SValue src, dest; - int jtrue = vtop->jtrue; - int jfalse = vtop->jfalse; - svalue_init(&src); - svalue_init(&dest); - dest.vr = tcc_ir_get_vreg_temp(ir); - dest.type.t = VT_INT; - dest.pr0_reg = PREG_REG_NONE; - dest.pr0_spilled = 0; - dest.pr1_reg = PREG_REG_NONE; - dest.pr1_spilled = 0; - - if (jtrue >= 0 || jfalse >= 0) - { - /* We have pending jump chains - need to merge them with the comparison */ - SValue jump_dest; - svalue_init(&jump_dest); - jump_dest.vr = -1; - jump_dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - - /* Generate SETIF for the comparison part */ - src.vr = -1; - src.r = VT_CONST; - src.c.i = vtop->cmp_op; - tcc_ir_put(ir, TCCIR_OP_SETIF, &src, NULL, &dest); - - /* Jump to end */ - jump_dest.c.i = -1; /* will be patched */ - int end_jump = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &jump_dest); - - /* Patch jtrue chain to here - set dest = 1 */ - if (jtrue >= 0) - { - tcc_ir_backpatch_to_here(ir, jtrue); - src.r = VT_CONST; - src.c.i = 1; - src.pr0_reg = PREG_REG_NONE; - src.pr0_spilled = 0; - src.pr1_reg = PREG_REG_NONE; - src.pr1_spilled = 0; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src, NULL, &dest); - if (jfalse >= 0) - { - /* Jump over the jfalse handler */ - jump_dest.c.i = -1; /* will be patched */ - int skip_jump = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &jump_dest); - /* Patch jfalse chain to here - set dest = 0 */ - tcc_ir_backpatch_to_here(ir, jfalse); - src.r = VT_CONST; - src.c.i = 0; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src, NULL, &dest); - /* Patch skip_jump to end */ - tcc_ir_set_dest_jump_target(ir, skip_jump, ir->next_instruction_index); - } - } - else if (jfalse >= 0) - { - tcc_ir_backpatch_to_here(ir, jfalse); - src.r = VT_CONST; - src.c.i = 0; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src, NULL, &dest); - } - - /* 
Patch end_jump to here */ - tcc_ir_set_dest_jump_target(ir, end_jump, ir->next_instruction_index); - tcc_ir_codegen_bb_start(ir); - } - else - { - /* Simple case - just SETIF */ - src.vr = -1; - src.r = VT_CONST; - src.c.i = vtop->cmp_op; - tcc_ir_put(ir, TCCIR_OP_SETIF, &src, NULL, &dest); - } - - vtop->vr = dest.vr; - vtop->r = 0; - } - else if ((v & ~1) == VT_JMP) - { - SValue dest, src1; - SValue jump_dest; - int t; - svalue_init(&src1); - svalue_init(&dest); - svalue_init(&jump_dest); - dest.vr = tcc_ir_get_vreg_temp(ir); - dest.type.t = VT_INT; - src1.vr = -1; - src1.r = VT_CONST; - t = v & 1; - src1.c.i = t; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src1, NULL, &dest); - - /* Default path: result already set to `t`. Skip the alternate assignment. - If the jump chain is taken, execution lands at the alternate assignment - which flips the result to `t ^ 1`. */ - jump_dest.vr = -1; - jump_dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - jump_dest.c.i = -1; /* patched to end */ - int end_jump = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &jump_dest); - - tcc_ir_backpatch_to_here(ir, vtop->c.i); - src1.c.i = t ^ 1; - tcc_ir_put(ir, TCCIR_OP_ASSIGN, &src1, NULL, &dest); - IROperand end_dest = tcc_ir_op_get_dest(ir, &ir->compact_instructions[end_jump]); - end_dest.u.imm32 = ir->next_instruction_index; - tcc_ir_op_set_dest(ir, &ir->compact_instructions[end_jump], end_dest); - vtop->vr = dest.vr; - vtop->r = 0; - } -} - -void tcc_ir_codegen_backpatch(TCCIRState *ir, int jump_idx, int target_address) -{ - tcc_ir_backpatch(ir, jump_idx, target_address); -} - -void tcc_ir_codegen_backpatch_here(TCCIRState *ir, int jump_idx) -{ - tcc_ir_backpatch_to_here(ir, jump_idx); -} - -void tcc_ir_codegen_backpatch_first(TCCIRState *ir, int jump_idx, int target_address) -{ - tcc_ir_backpatch_first(ir, jump_idx, target_address); -} - -int tcc_ir_codegen_jump_append(TCCIRState *ir, int chain, int jump) -{ - return tcc_ir_gjmp_append(ir, chain, jump); -} - -int 
tcc_ir_codegen_test_gen(TCCIRState *ir, int invert, int test) -{ - int v; - v = vtop->r & VT_VALMASK; - if (v == VT_CMP) - { - SValue src, dest; - int jtrue = vtop->jtrue; - int jfalse = vtop->jfalse; - - svalue_init(&src); - svalue_init(&dest); - src.vr = -1; - src.r = VT_CONST; - /* Use cmp_op and invert if needed. In TCC, comparison tokens are designed - * so that XORing with 1 inverts them (e.g., TOK_EQ ^ 1 = TOK_NE) */ - int cond = vtop->cmp_op ^ invert; - /* Validate condition is a valid comparison token */ - src.c.i = cond; - dest.vr = -1; - dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - dest.c.i = test; - test = tcc_ir_put(ir, TCCIR_OP_JUMPIF, &src, NULL, &dest); - - /* Handle pending jump chains - merge with the appropriate chain */ - if (invert) - { - /* inv=1: we want to jump when condition is false */ - /* Merge any existing "jump-on-false" chain with the new jump. - * Patch the opposite chain (jump-on-true) to fall through here. */ - if (jfalse >= 0) - { - tcc_ir_backpatch_first(ir, jfalse, test); - test = jfalse; - } - if (jtrue >= 0) - { - tcc_ir_backpatch_to_here(ir, jtrue); - } - } - else - { - /* inv=0: we want to jump when condition is true */ - /* Merge any existing "jump-on-true" chain with the new jump. - * Patch the opposite chain (jump-on-false) to fall through here. 
*/ - if (jtrue >= 0) - { - tcc_ir_backpatch_first(ir, jtrue, test); - test = jtrue; - } - if (jfalse >= 0) - { - tcc_ir_backpatch_to_here(ir, jfalse); - } - } - } - else if (v == VT_JMP || v == VT_JMPI) - { - if ((v & 1) == invert) - { - if (vtop->c.i == -1) - { - vtop->c.i = test; - } - else - { - if (test != -1) - { - tcc_ir_backpatch_first(ir, vtop->c.i, test); - } - test = vtop->c.i; - } - } - else - { - SValue dest; - svalue_init(&dest); - dest.vr = -1; - dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - dest.c.i = test; - test = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &dest); - tcc_ir_backpatch_to_here(ir, vtop->c.i); - } - } - else - { - if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) - { - if ((vtop->c.i != 0) != invert) - { - SValue dest; - svalue_init(&dest); - dest.vr = -1; - dest.r = VT_CONST; /* Mark as constant so jump target is stored in u.imm32 */ - dest.c.i = test; - test = tcc_ir_put(ir, TCCIR_OP_JUMP, NULL, NULL, &dest); - /* Unconditional jump for a compile-time constant condition: - * code after this point is unreachable. Must mirror gjmp_acs() - * which calls CODE_OFF() so that data/code suppression works - * correctly for dead branches (e.g. if(0) { ... }). - * CODE_OFF_BIT = 0x20000000 (defined in tccgen.c). */ - if (!nocode_wanted) - nocode_wanted |= 0x20000000; - } - } - else - { - /* If we're testing a memory lvalue (e.g. tabl[i]), load the value first. - * Otherwise we end up testing the address, which is almost always non-zero - * and can lead to invalid indirect calls. 
- */ - tcc_ir_put(ir, TCCIR_OP_TEST_ZERO, &vtop[0], NULL, NULL); - vtop->r = VT_CMP; - vtop->cmp_op = TOK_NE; - vtop->jtrue = -1; /* -1 = no chain */ - vtop->jfalse = -1; /* -1 = no chain */ - return tcc_ir_codegen_test_gen(ir, invert, test); - } - } - --vtop; - return test; -} - -void tcc_ir_codegen_bb_start(TCCIRState *ir) -{ - if (ir) - ir->basic_block_start = 1; -} - -/* ============================================================================ - * Return Value Handling - * ============================================================================ */ - -void tcc_ir_codegen_drop_return(TCCIRState *ir) -{ - if (ir->next_instruction_index == 0) - { - return; - } - IRQuadCompact *last_instr = &ir->compact_instructions[ir->next_instruction_index - 1]; - - if (last_instr->op == TCCIR_OP_FUNCCALLVAL) - { - /* Only drop return values that are assigned to temporaries. - * If coalescing redirected the dest to a VAR, the value IS used - * and should not be dropped. */ - IROperand dest = tcc_ir_op_get_dest(ir, last_instr); - if (TCCIR_DECODE_VREG_TYPE(dest.vr) == TCCIR_VREG_TYPE_TEMP) - { - if (tcc_ir_vreg_is_valid(ir, dest.vr)) - { - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest.vr); - interval->start = INTERVAL_NOT_STARTED; - interval->end = 0; - } - irop_set_vreg(&dest, -1); - dest.vr = -1; - tcc_ir_op_set_dest(ir, last_instr, dest); - } - } -} - -/* ============================================================================ - * Inline Assembly Code Generation - * ============================================================================ */ - -#ifdef CONFIG_TCC_ASM - -static void tcc_ir_codegen_inline_asm_by_id(TCCIRState *ir, int id) -{ - if (!ir) - return; - if (id < 0 || id >= ir->inline_asm_count) - tcc_error("IR: invalid inline asm id"); - - TCCIRInlineAsm *ia = &ir->inline_asms[id]; - if (!ia->asm_str) - tcc_error("IR: inline asm payload missing"); - - const int nb_operands = ia->nb_operands; - const int nb_labels = ia->nb_labels; - if 
(nb_operands < 0 || nb_operands > MAX_ASM_OPERANDS || nb_operands + nb_labels > MAX_ASM_OPERANDS) - tcc_error("IR: invalid asm operand count"); - - ASMOperand ops[MAX_ASM_OPERANDS]; - SValue vals[MAX_ASM_OPERANDS]; - memset(ops, 0, sizeof(ops)); - memset(vals, 0, sizeof(vals)); - - memcpy(ops, ia->operands, sizeof(ASMOperand) * (nb_operands + nb_labels)); - for (int i = 0; i < nb_operands; ++i) - { - vals[i] = ia->values[i]; - tcc_ir_fill_registers(ir, &vals[i]); - ops[i].vt = &vals[i]; - } - for (int i = nb_operands; i < nb_operands + nb_labels; ++i) - ops[i].vt = NULL; - - uint8_t clobber_regs[NB_ASM_REGS]; - memcpy(clobber_regs, ia->clobber_regs, sizeof(clobber_regs)); - - /* Compute reserved_regs: physical registers of vregs that are live at this - * INLINE_ASM instruction but are NOT asm operands. The constraint solver - * must avoid these registers when picking registers for "r" constraints, - * otherwise the operand load will clobber the live value. - * - * Unlike clobber_regs, reserved_regs only affect constraint allocation — - * they do NOT trigger save/restore in asm_gen_code prolog/epilog. 
*/ - uint8_t reserved_regs[NB_ASM_REGS]; - memset(reserved_regs, 0, sizeof(reserved_regs)); - { - int asm_instr_idx = ir->codegen_instruction_idx; - struct - { - IRLiveInterval *intervals; - int count; - } groups[3] = { - {ir->variables_live_intervals, ir->variables_live_intervals_size}, - {ir->temporary_variables_live_intervals, ir->temporary_variables_live_intervals_size}, - {ir->parameters_live_intervals, ir->parameters_live_intervals_size}, - }; - - for (int g = 0; g < 3; g++) - { - for (int j = 0; j < groups[g].count; j++) - { - IRLiveInterval *interval = &groups[g].intervals[j]; - if (interval->start == INTERVAL_NOT_STARTED) - continue; - if ((int)interval->start > asm_instr_idx || (int)interval->end < asm_instr_idx) - continue; - - int r0 = interval->allocation.r0; - if (r0 & PREG_SPILLED) - continue; - int phys_reg = r0 & PREG_REG_NONE; - if (phys_reg == PREG_REG_NONE) - continue; - if (phys_reg < NB_ASM_REGS) - reserved_regs[phys_reg] = 1; - - int r1 = interval->allocation.r1; - if (!(r1 & PREG_SPILLED)) - { - int phys_reg1 = r1 & PREG_REG_NONE; - if (phys_reg1 != PREG_REG_NONE && phys_reg1 < NB_ASM_REGS) - reserved_regs[phys_reg1] = 1; - } - } - } - } - - tcc_asm_emit_inline(ops, nb_operands, ia->nb_outputs, nb_labels, clobber_regs, reserved_regs, ia->asm_str, - ia->asm_len, ia->must_subst); -} - -static void tcc_ir_codegen_inline_asm_ir(TCCIRState *ir, IROperand dest_irop) -{ - if (!ir) - return; - const int id = (int)irop_get_imm64_ex(ir, dest_irop); - tcc_ir_codegen_inline_asm_by_id(ir, id); -} -#endif - -/* ============================================================================ - * Jump Backpatching - * ============================================================================ */ - -static void tcc_ir_codegen_backpatch_jumps(TCCIRState *ir, uint32_t *ir_to_code_mapping) -{ - IRQuadCompact *q; - for (int i = 0; i < ir->next_instruction_index; i++) - { - q = &ir->compact_instructions[i]; - if (q->op == TCCIR_OP_JUMP || q->op == TCCIR_OP_JUMPIF) - 
{ - IROperand dest = tcc_ir_op_get_dest(ir, q); - int target_ir = irop_is_none(dest) ? -1 : (int)dest.u.imm32; - /* Skip unpatched jumps (target is -1 or truly out of range) - * Note: target_ir == ir->next_instruction_index is valid (epilogue) */ - if (target_ir < 0 || target_ir > ir->next_instruction_index) - continue; - const int instruction_address = ir_to_code_mapping[i]; - const int target_address = ir_to_code_mapping[target_ir]; - tcc_gen_machine_backpatch_jump(instruction_address, target_address); - } - } - - /* Backpatch switch table entries. - * Table entries are 32-bit signed PC-relative offsets with Thumb bit. - * The reference point is table_start, which is the PC value when - * the 16-bit ADD Rt, PC instruction at ind+10 reads PC (= ind+10+4 = ind+14 = table_start). - * Formula: table[i] = (target_addr | 1) - table_start - * This must happen after all code is generated so forward targets are mapped. */ - for (int t = 0; t < ir->num_switch_tables; t++) - { - TCCIRSwitchTable *table = &ir->switch_tables[t]; - int table_start = table->table_code_addr; - if (table_start <= 0) - continue; /* Table not emitted (e.g. 
dead code) */ - int ref_point = table_start; /* PC value at the 16-bit ADD Rt, PC instruction (at ind+10, PC=ind+14=table_start) */ - for (int j = 0; j < table->num_entries; j++) - { - int target_ir = table->targets[j]; - int entry_addr = table_start + j * 4; /* 4 bytes per entry */ - int target_addr; - if (target_ir >= 0 && target_ir < (int)ir->ir_to_code_mapping_size) - target_addr = ir_to_code_mapping[target_ir]; - else - target_addr = ir_to_code_mapping[ir->next_instruction_index]; /* epilogue */ - int32_t offset = (int32_t)((target_addr | 1) - ref_point); - write32le(cur_text_section->data + entry_addr, (uint32_t)offset); - } - } -} - -/* ============================================================================ - * Phase-3 scratch conflict fixup - * ============================================================================ - * - * After the dry run has identified which instructions would push a register - * to the stack (no free scratch register available), this function tries to - * move the vreg currently occupying that register to a free callee-saved - * register. This eliminates the push/pop overhead for those instructions. - * - * Parameters: - * ir - current function IR state - * r - physical register that would be pushed at instruction insn_i - * insn_i - the instruction index where the push was noted - * - * Returns the new physical register on success, -1 if no reassignment could - * be made (e.g. all callee-saved registers are already occupied over the - * vreg's live range, or the interval is complex / 64-bit / float). - */ -static int try_reassign_scratch_conflict(TCCIRState *ir, int r, int insn_i) -{ - LSLiveIntervalState *ls = &ir->ls; - - /* Callee-saved registers R4-R11 (bits 4..11 = 0x0FF0), minus reserved - * special-purpose registers: - * R7 = R_FP (= 7): always reserved as frame pointer by the ARM backend. - * arm-thumb-gen.c: "Always reserve R7 (FP) and never allocate it as a - * general register." 
The linear-scan allocator never assigns vregs to R7, - * so it never appears in live_regs_by_instruction. We must exclude it - * here as well, otherwise we would clobber the frame pointer. - * R10 = static_chain_reg (= 10): reserved when function uses a static chain. - */ - const uint32_t ALL_CALLEE_SAVED = 0x0FF0u; - const uint32_t ARM_FP_REG = 7u; /* R_FP = R7, defined in arm-thumb-opcodes.h */ - uint32_t reserved = (1u << ARM_FP_REG); /* always exclude frame pointer */ - if (ir->has_static_chain) - reserved |= (1u << (uint32_t)architecture_config.static_chain_reg); - const uint32_t CALLEE_SAVED = ALL_CALLEE_SAVED & ~reserved; - - /* Find the LSLiveInterval holding r at instruction insn_i. */ - LSLiveInterval *ls_iv = NULL; - for (int k = 0; k < ls->next_interval_index; k++) - { - LSLiveInterval *iv = &ls->intervals[k]; - /* Only handle plain integer register allocations. */ - if (iv->reg_type != LS_REG_TYPE_INT) - continue; - if (iv->addrtaken || iv->stack_location != 0) - continue; - /* Skip 64-bit pairs — they need two adjacent registers. */ - if (iv->r1 >= 0 && iv->r1 < 16) - continue; - if (iv->r0 != r) - continue; - if ((int)iv->start > insn_i || (int)iv->end < insn_i) - continue; - ls_iv = iv; - break; - } - if (!ls_iv) - return -1; - - /* Get the IRLiveInterval for the same vreg to check for float/double/llong. */ - IRLiveInterval *ir_iv = tcc_ir_get_live_interval(ir, (int)ls_iv->vreg); - if (!ir_iv) - return -1; - /* Skip floating-point and 64-bit intervals. */ - if (ir_iv->is_float || ir_iv->is_double || ir_iv->is_llong || ir_iv->is_complex || ir_iv->use_vfp) - return -1; - /* Skip ABI-pinned intervals: function parameters and call return values have - * incoming_reg0 >= 0, meaning the hardware places the value in a specific - * register dictated by the calling convention. Changing the allocation would - * cause the codegen to look in the wrong register after a call/entry. 
*/ - if (ir_iv->incoming_reg0 >= 0) - return -1; - - /* Compute the union of live register masks across [ls_iv->start .. ls_iv->end]. - * Any register set in this union is occupied by some other live vreg and - * cannot be used as the reassignment target. */ - uint32_t blocked = 0; - if (ls->live_regs_by_instruction) - { - for (int j = (int)ls_iv->start; j <= (int)ls_iv->end && j < ls->live_regs_by_instruction_size; j++) - blocked |= ls->live_regs_by_instruction[j]; - } - blocked |= (1u << r); /* keep r itself blocked so we don't choose it */ - - uint32_t avail = CALLEE_SAVED & ~blocked; - if (!avail) - return -1; - - int new_r = (int)__builtin_ctz(avail); /* lowest-numbered free callee-saved */ - - /* --- Apply the reassignment --- */ - - /* 1. Update the IRLiveInterval (read by tcc_ir_fill_registers_ir). */ - ir_iv->allocation.r0 = (uint16_t)new_r; - - /* 2. Update the LSLiveInterval (read by tcc_ls_build_live_regs_by_instruction - * and tcc_ls_find_free_scratch_reg). */ - ls_iv->r0 = (int16_t)new_r; - - /* 3. Patch live_regs_by_instruction for the interval's full range. */ - if (ls->live_regs_by_instruction) - { - for (int j = (int)ls_iv->start; j <= (int)ls_iv->end && j < ls->live_regs_by_instruction_size; j++) - { - ls->live_regs_by_instruction[j] &= ~(1u << r); - ls->live_regs_by_instruction[j] |= (1u << new_r); - } - } - - /* 4. Mark new_r as dirty so the prologue will save/restore it. */ - ls->dirty_registers |= (1ull << new_r); - - return new_r; -} - -/* ============================================================================ - * Helper: fill a single operand from register allocation results. - * Only called at old-path dispatch sites (MOP path fills via machine_op_from_ir). 
- * ============================================================================ */ -static void ir_fill_op(TCCIRState *ir, IROperand *op) -{ - if (irop_get_tag(*op) != IROP_TAG_NONE) - tcc_ir_fill_registers_ir(ir, op); -} - -/* ============================================================================ - * Main Code Generation Loop - * ============================================================================ */ - -void tcc_ir_codegen_generate(TCCIRState *ir) -{ - IRQuadCompact *cq; - int drop_return_value = 0; - -#ifdef TCC_REGALLOC_DEBUG - int _dbg_trace_all = 0; - { - extern const char *funcname; - fprintf(stderr, "[RA-FUNC] %s (insts=%d)\n", funcname ? funcname : "?", ir->next_instruction_index); - /* Enable full instruction trace for the target function */ - if (funcname && ir->next_instruction_index == 295) - { - const char *_target = "tcc_gen_machine_func_call_op"; - const char *_fn = funcname; - int _match = 1; - while (*_target && *_fn) - { - if (*_target++ != *_fn++) - { - _match = 0; - break; - } - } - if (_match && *_target == 0 && *_fn == 0) - _dbg_trace_all = 1; - } - } -#endif - -#ifdef TCC_REGALLOC_DEBUG - /* Print vreg statistics for size optimization analysis */ - { - int local_count = ir->next_local_variable; - int temp_count = ir->next_temporary_variable; - int param_count = ir->next_parameter; - int total_vregs = local_count + temp_count + param_count; - if (total_vregs > 1000) /* Only print for large functions */ - fprintf(stderr, "[VREG STATS] locals=%d temps=%d params=%d total=%d (max_encoded=%d)\n", local_count, temp_count, - param_count, total_vregs, - (local_count > temp_count ? local_count : temp_count) > param_count - ? (local_count > temp_count ? local_count : temp_count) - : param_count); - } -#endif - - /* `&&label` stores label positions as IR indices BEFORE DCE/compaction. - * Build a mapping for original indices, not just the compacted array indices. 
- */ - int max_orig_index = -1; - for (int i = 0; i < ir->next_instruction_index; i++) - { - if (ir->compact_instructions[i].orig_index > max_orig_index) - max_orig_index = ir->compact_instructions[i].orig_index; - } - if (max_orig_index < 0) - max_orig_index = 0; - - /* +1 to include epilogue when needed. - * Keep this mapping available after codegen (e.g. for &&label). */ - if (ir->ir_to_code_mapping) - { - tcc_free(ir->ir_to_code_mapping); - ir->ir_to_code_mapping = NULL; - ir->ir_to_code_mapping_size = 0; - } - ir->ir_to_code_mapping_size = ir->next_instruction_index + 1; - ir->ir_to_code_mapping = tcc_mallocz(sizeof(uint32_t) * ir->ir_to_code_mapping_size); - uint32_t *ir_to_code_mapping = ir->ir_to_code_mapping; - - if (ir->orig_ir_to_code_mapping) - { - tcc_free(ir->orig_ir_to_code_mapping); - ir->orig_ir_to_code_mapping = NULL; - ir->orig_ir_to_code_mapping_size = 0; - } - /* +1 extra slot for a synthetic epilogue mapping. - * Use 0xFFFFFFFF sentinel to distinguish "unmapped" from offset 0. */ - ir->orig_ir_to_code_mapping_size = max_orig_index + 2; - ir->orig_ir_to_code_mapping = tcc_malloc(sizeof(uint32_t) * ir->orig_ir_to_code_mapping_size); - uint32_t *orig_ir_to_code_mapping = ir->orig_ir_to_code_mapping; - memset(orig_ir_to_code_mapping, 0xFF, sizeof(uint32_t) * ir->orig_ir_to_code_mapping_size); - /* Track addresses of return jumps for later backpatching to epilogue */ - int *return_jump_addrs = tcc_malloc(sizeof(int) * ir->next_instruction_index); - int num_return_jumps = 0; - - /* Clear spill cache at function start */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - - /* Some peephole optimizations (LOAD/ASSIGN -> RETURNVALUE in R0, and skipping - * RETURNVALUE moves) are only valid when RETURNVALUE is reached by straight-line - * fallthrough from the immediately preceding instruction. 
- * - * If RETURNVALUE is a jump target (a control-flow merge), those peepholes can - * become incorrect: the preceding instruction might not execute on all paths, - * leaving the return value in a non-return register. - * - * Track which IR instruction indices are jump targets to guard these peepholes. - */ - uint8_t *has_incoming_jump = tcc_mallocz(ir->next_instruction_index ? ir->next_instruction_index : 1); - for (int i = 0; i < ir->next_instruction_index; ++i) - { - IRQuadCompact *p = &ir->compact_instructions[i]; - if (p->op == TCCIR_OP_JUMP || p->op == TCCIR_OP_JUMPIF) - { - /* Read jump target from IROperand pool */ - IROperand dest_irop = tcc_ir_op_get_dest(ir, p); - int target = (int)dest_irop.u.imm32; - if (target >= 0 && target < ir->next_instruction_index) - has_incoming_jump[target] = 1; - } - } - - /* Reserve outgoing call stack args area at the very bottom of the frame. - * This ensures prepared-call stack args are at call-time SP. - */ - if (ir->call_outgoing_size > 0) - { - loc -= ir->call_outgoing_size; - ir->call_outgoing_base = loc; - } - - int stack_size = (-loc + 7) & ~7; // align to 8 bytes - - /* ============================================================================ - * DRY RUN PASS: Analyze scratch register needs before emitting prologue - * ============================================================================ - * This discovers what scratch registers will be needed during code generation, - * allowing us to include them in the prologue (avoiding push/pop in loops). - */ - int original_leaffunc = ir->leaffunc; - uint32_t extra_prologue_regs = 0; - - /* If this function has a static chain (nested function), reserve R10 - * as callee-saved so the parent's static chain is preserved. - * R10 is the static chain register per architecture_config.static_chain_reg. */ - if (ir->has_static_chain) - { - extra_prologue_regs |= (1 << architecture_config.static_chain_reg); - } - - /* Phase-3 per-instruction scratch constraint recording. 
- * Allocated once per function; indexed by instruction index. - * dry_insn_scratch[i] = number of mach_alloc_scratch() calls at instruction i. - * dry_insn_saves[i] = bitmask of registers that would be PUSH'd at instruction i. - * Both arrays are declared before #if so they are visible in both passes. */ - int *dry_insn_scratch = tcc_mallocz(ir->next_instruction_index * sizeof(int)); - uint16_t *dry_insn_saves = tcc_mallocz(ir->next_instruction_index * sizeof(uint16_t)); - -#if 1 /* DRY_RUN_ENABLED */ - - /* Initialize dry-run state and branch optimization */ - tcc_gen_machine_dry_run_init(); - tcc_gen_machine_branch_opt_init(); - tcc_gen_machine_dry_run_start(); - - /* Reset scratch state for clean dry-run */ - tcc_gen_machine_reset_scratch_state(); - tcc_ir_spill_cache_clear(&ir->spill_cache); - - /* Save state that will be modified during dry run */ - int saved_ind = ind; - int saved_codegen_idx = ir->codegen_instruction_idx; - int saved_loc = loc; - int saved_call_outgoing_base = ir->call_outgoing_base; - - /* Run through all instructions without emitting. - * We call the actual codegen functions, but ot() is a no-op during dry-run. - * This ensures we exercise the exact same code paths for scratch allocation. 
*/ - for (int i = 0; i < ir->next_instruction_index; i++) - { - ir->codegen_instruction_idx = i; - cq = &ir->compact_instructions[i]; - - /* Debug tracking: update current op for ot_check failure reporting */ - g_debug_current_op = (int)cq->op; - - /* Record address mapping for branch optimizer analysis */ - ir_to_code_mapping[i] = ind; - - /* Skip marker ops */ - if (cq->op == TCCIR_OP_ASM_INPUT || cq->op == TCCIR_OP_ASM_OUTPUT || cq->op == TCCIR_OP_NOP || - cq->op == TCCIR_OP_INLINE_ASM) - continue; - - /* Get operand copies from iroperand_pool */ - IROperand src1_ir = tcc_ir_op_get_src1(ir, cq); - IROperand src2_ir = tcc_ir_op_get_src2(ir, cq); - IROperand dest_ir = tcc_ir_op_get_dest(ir, cq); - - /* Operands are filled lazily: machine_op_from_ir fills via ir_fill_op for - * MOP-path operands; old-path dispatch sites call ir_fill_op explicitly. */ - - /* Mop path: use MachineOperand-based dispatch for simple 32-bit ops; - * the mach_* helpers in arm-thumb-gen.c handle all materialization. 
*/ - bool use_mop_dp = false; - bool use_mop_assign = false; - bool use_mop_setif = false; - bool use_mop_bool = false; - bool use_mop_load = false; - bool use_mop_store = false; - bool use_mop_load_indexed = false; - bool use_mop_store_indexed = false; - bool use_mop_load_postinc = false; - bool use_mop_store_postinc = false; - bool use_mop_ijump = false; - bool use_mop_funcparam = false; - bool use_mop_returnvalue = false; - bool use_mop_muldiv = false; - bool use_mop_fp = false; - bool use_mop_vla = false; - bool use_mop_func_call = false; - switch (cq->op) - { - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_AND: - case TCCIR_OP_OR: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_dp = true; - break; - case TCCIR_OP_ASSIGN: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_assign = true; - break; - case TCCIR_OP_SETIF: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_setif = true; - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_bool = true; - break; - case TCCIR_OP_LOAD: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_load = true; - break; - case TCCIR_OP_STORE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store = true; - break; - case TCCIR_OP_LOAD_INDEXED: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_indexed = true; - break; - case TCCIR_OP_STORE_INDEXED: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_indexed = true; - break; - case TCCIR_OP_LOAD_POSTINC: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_postinc = true; - break; - 
case TCCIR_OP_STORE_POSTINC: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_postinc = true; - break; - case TCCIR_OP_IJUMP: - if (!ir->has_static_chain) - use_mop_ijump = true; - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - use_mop_funcparam = true; - break; - case TCCIR_OP_RETURNVALUE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_returnvalue = true; - break; - case TCCIR_OP_MUL: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_TEST_ZERO: - if (!irop_needs_pair(src1_ir) && !irop_is_64bit(src1_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: - case TCCIR_OP_CVT_FTOI: - if (!src1_ir.is_complex && !dest_ir.is_complex && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && - !irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_fp = true; - break; - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (!ir->has_static_chain) - use_mop_vla = true; - break; - case TCCIR_OP_FUNCCALLVAL: - case TCCIR_OP_FUNCCALLVOID: - if (!irop_needs_pair(dest_ir) && !dest_ir.is_complex && !ir->has_static_chain) - use_mop_func_call = true; - break; - default: - break; - } - - /* Call the actual codegen function - ot() will be a no-op in dry-run mode, - * but scratch allocation inside these functions will still be recorded */ - switch (cq->op) - { - case TCCIR_OP_LOAD: - { - bool load_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - load_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load && !load_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - - /* Sub-component access on register pairs (e.g., __imag__ on _Complex float). - * When a STACKOFF operand with a component offset gets rewritten to VREG by - * fill_registers_ir, the byte-offset delta is preserved in u.imm32: - * u.imm32 == 0 → first element (pr0_reg, e.g. real part) - * u.imm32 > 0 → second element (pr1_reg, e.g. imaginary part) - * This ONLY applies to LOAD sources — DP/ASSIGN operands must not be - * rewritten because a 64-bit interval allocated as a register pair - * can also have pr1_reg set with a non-zero u.imm32 (delta from - * fill_registers_ir), which is not a sub-component access. 
*/ - if (mop_src.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src.u.reg.r1 = -1; - mop_src.needs_deref = false; - } - - if (mop_dest.kind == MACH_OP_REG && !mop_dest.needs_deref && mop_dest.u.reg.r0 != (int)PREG_REG_NONE) - { - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_mop(mop_src, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - break; - } - case TCCIR_OP_STORE: - { - if (use_mop_store) - { - MachineOperand mop_dest_s = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src_s = machine_op_from_ir(ir, &src1_ir); - /* Sub-component fixup for STORE value — same logic as LOAD source. */ - if (mop_src_s.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src_s.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src_s.u.reg.r1 = -1; - mop_src_s.needs_deref = false; - } - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_mop(mop_dest_s, mop_src_s, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_store_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_LOAD_INDEXED: - { - bool load_indexed_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - load_indexed_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load_indexed && !load_indexed_before_ret) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_base = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_indexed_mop(mop_dest, mop_base, mop_index, mop_scale, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand base_op = src1_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &index_op); - tcc_gen_machine_load_indexed_op(dest_ir, base_op, index_op, scale_op); - } - break; - } - case TCCIR_OP_STORE_INDEXED: - { - if (use_mop_store_indexed) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_base = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_indexed_mop(mop_base, mop_index, mop_scale, mop_value, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand base_op = dest_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &index_op); - 
ir_fill_op(ir, &src1_ir); - tcc_gen_machine_store_indexed_op(base_op, index_op, scale_op, src1_ir); - } - break; - } - case TCCIR_OP_LOAD_POSTINC: - { - if (use_mop_load_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_ptr = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_postinc_mop(mop_dest, mop_ptr, mop_offset, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand ptr_op = src1_ir; - IROperand offset_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &ptr_op); - tcc_gen_machine_load_postinc_op(dest_ir, ptr_op, offset_op); - } - break; - } - case TCCIR_OP_STORE_POSTINC: - { - if (use_mop_store_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_ptr = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_postinc_mop(mop_ptr, mop_value, mop_offset, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - IROperand ptr_op = dest_ir; - IROperand value_op = src1_ir; - IROperand offset_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &ptr_op); - ir_fill_op(ir, &value_op); - tcc_gen_machine_store_postinc_op(ptr_op, value_op, offset_op); - } - break; - } - case TCCIR_OP_LEA: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_lea_op(dest_ir, src1_ir, cq->op); - break; - case TCCIR_OP_ASSIGN: - { - /* Skip MOP path when next instruction is RETURNVALUE targeting same vreg, - * because the real-run 
applies a peephole (dest→R0) that doesn't exist in - * the dry-run — the resulting dry/real scratch mismatch would corrupt the - * Phase-3 fixup. The has_incoming_jump guard mirrors the real-run peephole - * condition so both passes make the same MOP/legacy decision. */ - bool assign_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - assign_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_assign && !assign_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_assign_mop(mop_src, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - TCC_MACH_DBG( - "[DBG-ASSIGN] i=%d dest btype=%d pr0=%d pr1=%d is64=%d needs_pair=%d src btype=%d pr0=%d pr1=%d is64=%d\n", - i, irop_get_btype(dest_ir), dest_ir.pr0_reg, dest_ir.pr1_reg, irop_is_64bit(dest_ir), - irop_needs_pair(dest_ir), irop_get_btype(src1_ir), src1_ir.pr0_reg, src1_ir.pr1_reg, - irop_is_64bit(src1_ir)); - tcc_gen_machine_assign_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_RETURNVALUE: - if (use_mop_returnvalue) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_return_value_mop(mop_src, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_return_value_op(src1_ir, cq->op); - } - break; - case TCCIR_OP_RETURNVOID: - /* No scratch allocation 
needed */ - break; - case TCCIR_OP_JUMP: - /* Record branch for optimization analysis (ot() is no-op during dry-run) */ - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_jump_op(cq->op, dest_ir, i); - break; - case TCCIR_OP_JUMPIF: - /* Record branch for optimization analysis (ot() is no-op during dry-run) */ - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_conditional_jump_op(src1_ir, cq->op, dest_ir, i); - break; - case TCCIR_OP_MUL: - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - case TCCIR_OP_TEST_ZERO: - if (use_mop_muldiv) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_muldiv_mop(mop_src1, mop_src2, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_MLA: - case TCCIR_OP_UMULL: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - break; - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_OR: - case TCCIR_OP_AND: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (use_mop_dp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_data_processing_mop(mop_src1, mop_src2, mop_dest, cq->op); - 
dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_IJUMP: - if (use_mop_ijump) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_indirect_jump_mop(mop_src, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_indirect_jump_op(src1_ir); - } - break; - case TCCIR_OP_SWITCH_TABLE: - { - /* Dry-run: compute exact table size so branch offsets are accurate. - * Layout: ADD.W(4) + LDR.W(4) + ADD.W(4) + BX(2) = 14 bytes preamble - * + 4 bytes per table entry (32-bit signed PC-relative offsets). */ - int table_id = (int)irop_get_imm64_ex(ir, src2_ir); - TCCIRSwitchTable *table = &ir->switch_tables[table_id]; - int table_data_size = table->num_entries * 4; /* 4 bytes per entry */ - ind += 14; /* preamble instructions */ - ind += table_data_size; /* Jump table entries */ - break; - } - case TCCIR_OP_SETIF: - if (use_mop_setif) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_setif_mop(mop_src, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_setif_op(dest_ir, src1_ir, cq->op); - } - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (use_mop_bool) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, 
&src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_bool_mop(mop_src1, mop_src2, mop_dest, cq->op); - dry_insn_scratch[i] = tcc_gen_machine_insn_scratch_count(); - dry_insn_saves[i] = tcc_gen_machine_insn_scratch_saves_mask(); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_bool_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_FUNCCALLVOID: - case TCCIR_OP_FUNCCALLVAL: - if (use_mop_func_call) - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_func_call_mop(src1_ir, src2_ir, mop_dest, 0, ir, i); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_func_call_op(src1_ir, src2_ir, dest_ir, 0, ir, i); - } - if (ir->has_static_chain) - tcc_gen_machine_restore_chain(); - break; - case TCCIR_OP_SET_CHAIN: - /* Static chain setup: move FP to static chain register */ - tcc_gen_machine_set_chain(); - break; - case TCCIR_OP_INIT_CHAIN_SLOT: - /* Store parent FP into chain slot for nested function trampoline */ - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_init_chain_slot(src1_ir); - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - if (use_mop_funcparam) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - /* No scratch tracking: FUNCPARAM does not allocate scratch registers */ - tcc_gen_machine_func_parameter_mop(mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_func_parameter_op(src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: 
- case TCCIR_OP_CVT_FTOI: - if (use_mop_fp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_fp_mop(mop_src1, mop_src2, mop_dest, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_fp_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (use_mop_vla) - { - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - tcc_gen_machine_vla_mop(mop_dest, mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_vla_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_TRAP: - tcc_gen_machine_trap_op(); - break; - default: - /* Unknown op - skip */ - break; - } - - /* Clean up scratch register state */ - tcc_gen_machine_end_instruction(); - } - - /* End dry-run and analyze results */ - tcc_gen_machine_dry_run_end(); - - /* Analyze branch offsets and select optimal encodings */ - tcc_gen_machine_branch_opt_analyze(ir_to_code_mapping, ir->next_instruction_index); - - /* Check if LR was pushed during dry run in a leaf function */ - if (original_leaffunc && tcc_gen_machine_dry_run_get_lr_push_count() > 0) - { - /* LR was pushed in loop - save at prologue instead */ - extra_prologue_regs |= (1 << 14); /* R_LR */ - /* NOTE: We don't modify ir->leaffunc here because optimizations may depend on it. - * The extra_prologue_regs will ensure LR is pushed in the prologue, making it - * available as scratch without push/pop in loops, which is the main goal. 
*/ - } - - /* Restore state for real code generation */ - ind = saved_ind; - loc = saved_loc; - ir->call_outgoing_base = saved_call_outgoing_base; - ir->codegen_instruction_idx = saved_codegen_idx; - - /* Phase-3 scratch conflict fixup. - * For each mop instruction where the dry run needed to PUSH a register - * (because no caller-saved scratch was free), try to move the blocking vreg - * to a free callee-saved register. This eliminates the push/pop at that - * instruction at the cost of one extra callee-saved register in the prologue. - */ - { - int any_fixup = 0; - for (int i = 0; i < ir->next_instruction_index; i++) - { - uint16_t saves = dry_insn_saves[i]; - if (!saves) - continue; - while (saves) - { - int r = (int)__builtin_ctz(saves); - saves = (uint16_t)(saves & (saves - 1u)); - int new_r = try_reassign_scratch_conflict(ir, r, i); - if (new_r >= 0) - { - /* Clear the recorded dry-run scratch count for this instruction so - * the debug consistency check accepts the improved real-emit count. */ - dry_insn_scratch[i] = 0; - any_fixup = 1; - } - } - } - if (any_fixup) - { - /* Invalidate the liveness cache so real-emit sees the new assignments. 
*/ - tcc_ls_reset_scratch_cache(&ir->ls); - } - } - - /* Reset scratch state for real pass */ - tcc_gen_machine_reset_scratch_state(); - - /* Clear caches for fresh start - dry-run may have recorded entries - * but the actual instructions were never emitted */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - tcc_ir_opt_fp_cache_clear(ir); -#endif /* DRY_RUN_DISABLED */ - - /* ============================================================================ - * REAL CODE GENERATION PASS - * ============================================================================ - */ - - // generate prolog (with extra registers if needed) - (void)original_leaffunc; /* May be unused when dry-run is disabled */ - if (!ir->naked) - tcc_gen_machine_prolog(ir->leaffunc, ir->ls.dirty_registers, stack_size, extra_prologue_regs); - - /* Emit DWARF prologue_end AFTER machine prolog so the debugger knows - * where the prologue ends and sets breakpoints at the correct address. - * Previously this was emitted in tccgen.c before any machine code existed, - * causing breakpoints to land far from the actual prolog. */ - if (!ir->naked) - tcc_debug_prolog_epilog(tcc_state, 0); - - for (int i = 0; i < ir->next_instruction_index; i++) - { - drop_return_value = 0; - cq = &ir->compact_instructions[i]; - - /* Default: no extra scratch constraints for this instruction. 
*/ - ir->codegen_materialize_scratch_flags = 0; - - /* Track current instruction for scratch register allocation */ - ir->codegen_instruction_idx = i; - - /* Debug tracking: let ot_check print the current IR op on failure */ - g_debug_current_op = (int)cq->op; - - ir_to_code_mapping[i] = ind; - - if (cq->orig_index >= 0 && cq->orig_index < ir->orig_ir_to_code_mapping_size) - orig_ir_to_code_mapping[cq->orig_index] = ind; - - // emit debug line info for this IR instruction AFTER recording ind - tcc_debug_line_num(tcc_state, cq->line_num); - - /* Get operand copies from iroperand_pool (compact representation) */ - IROperand src1_ir = tcc_ir_op_get_src1(ir, cq); - IROperand src2_ir = tcc_ir_op_get_src2(ir, cq); - IROperand dest_ir = tcc_ir_op_get_dest(ir, cq); - - /* Peephole for LOAD/ASSIGN/LOAD_INDEXED followed by RETURNVALUE: - * Update the live interval to use R0 BEFORE register allocation. - * This ensures the load result goes directly to the return register. - */ - if (cq->op == TCCIR_OP_LOAD || cq->op == TCCIR_OP_ASSIGN || cq->op == TCCIR_OP_LOAD_INDEXED) - { - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand next_src1 = tcc_ir_op_get_src1(ir, ir_next); - int next_vr = irop_get_vreg(next_src1); - int dest_vr = irop_get_vreg(dest_ir); - if (next_vr == dest_vr && next_vr >= 0) - { - IRLiveInterval *li = tcc_ir_get_live_interval(ir, dest_vr); - if (li && li->allocation.r0 != REG_IRET) - { -#ifdef TCC_REGALLOC_DEBUG - fprintf(stderr, "[RA-PEEPHOLE] i=%d op=%d dest_vr=0x%x old_r0=%d -> R0 (RETURNVALUE next)\n", i, cq->op, - dest_vr, li->allocation.r0); -#endif - li->allocation.r0 = REG_IRET; - li->allocation.offset = 0; - if (li->is_llong || li->is_double) - li->allocation.r1 = REG_IRE2; - } - } - } - } - - /* Operands are filled lazily: machine_op_from_ir fills via ir_fill_op for - * MOP-path operands; old-path dispatch sites call ir_fill_op explicitly. */ - - /* Mop path: use MachineOperand-based dispatch for simple 32-bit ops; - * the mach_* helpers in arm-thumb-gen.c handle all materialization. 
*/ - bool use_mop_dp = false; - bool use_mop_assign = false; - bool use_mop_setif = false; - bool use_mop_bool = false; - bool use_mop_load = false; - bool use_mop_store = false; - bool use_mop_load_indexed = false; - bool use_mop_store_indexed = false; - bool use_mop_load_postinc = false; - bool use_mop_store_postinc = false; - bool use_mop_ijump = false; - bool use_mop_funcparam = false; - bool use_mop_returnvalue = false; - bool use_mop_muldiv = false; - bool use_mop_fp = false; - bool use_mop_vla = false; - bool use_mop_func_call = false; - switch (cq->op) - { - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_AND: - case TCCIR_OP_OR: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_dp = true; - break; - case TCCIR_OP_ASSIGN: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_assign = true; - break; - case TCCIR_OP_SETIF: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_setif = true; - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_bool = true; - break; - case TCCIR_OP_LOAD: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_load = true; - break; - case TCCIR_OP_STORE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store = true; - break; - case TCCIR_OP_LOAD_INDEXED: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_indexed = true; - break; - case TCCIR_OP_STORE_INDEXED: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_indexed = true; - break; - case TCCIR_OP_LOAD_POSTINC: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_load_postinc = true; - break; - 
case TCCIR_OP_STORE_POSTINC: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_store_postinc = true; - break; - case TCCIR_OP_IJUMP: - if (!ir->has_static_chain) - use_mop_ijump = true; - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - use_mop_funcparam = true; - break; - case TCCIR_OP_RETURNVALUE: - if (!irop_needs_pair(src1_ir) && !ir->has_static_chain) - use_mop_returnvalue = true; - break; - case TCCIR_OP_MUL: - if (!irop_needs_pair(dest_ir) && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - if (!irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_TEST_ZERO: - if (!irop_needs_pair(src1_ir) && !irop_is_64bit(src1_ir) && !ir->has_static_chain) - use_mop_muldiv = true; - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: - case TCCIR_OP_CVT_FTOI: - if (!src1_ir.is_complex && !dest_ir.is_complex && !irop_needs_pair(src1_ir) && !irop_needs_pair(src2_ir) && - !irop_needs_pair(dest_ir) && !ir->has_static_chain) - use_mop_fp = true; - break; - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (!ir->has_static_chain) - use_mop_vla = true; - break; - case TCCIR_OP_FUNCCALLVAL: - case TCCIR_OP_FUNCCALLVOID: - if (!irop_needs_pair(dest_ir) && !dest_ir.is_complex && !ir->has_static_chain) - use_mop_func_call = true; - break; - default: - break; - } - -#ifdef TCC_REGALLOC_DEBUG - /* Trace reads register fields; fill is now lazy so create filled local copies. 
*/ - IROperand trc_s1 = src1_ir, trc_s2 = src2_ir, trc_d = dest_ir; - ir_fill_op(ir, &trc_s1); - ir_fill_op(ir, &trc_s2); - ir_fill_op(ir, &trc_d); - /* Full instruction trace for target function */ - if (_dbg_trace_all) - { - IROperand raw_s1 = tcc_ir_op_get_src1(ir, cq); - IROperand raw_s2 = tcc_ir_op_get_src2(ir, cq); - IROperand raw_d = tcc_ir_op_get_dest(ir, cq); - fprintf(stderr, - "[RA-TRACE] i=%d op=%d s1_vr=0x%x s1_pr0=%d s2_vr=0x%x s2_pr0=%d d_vr=0x%x d_pr0=%d s1_tag=%d d_tag=%d\n", - i, cq->op, irop_get_vreg(raw_s1), trc_s1.pr0_reg, irop_get_vreg(raw_s2), trc_s2.pr0_reg, - irop_get_vreg(raw_d), trc_d.pr0_reg, irop_get_tag(trc_s1), irop_get_tag(trc_d)); - } - - /* Diagnostic: for LOAD instructions, log ALL source vreg details */ - if (cq->op == TCCIR_OP_LOAD) - { - IROperand raw_src1 = tcc_ir_op_get_src1(ir, cq); - int raw_tag = irop_get_tag(raw_src1); - if (raw_tag == IROP_TAG_VREG || raw_tag == 2 /* IROP_TAG_VREG_LVAL */) - { - int src_vreg = irop_get_vreg(raw_src1); - if (src_vreg > 0) - { - IRLiveInterval *dbg_li = tcc_ir_get_live_interval(ir, src_vreg); - if (dbg_li) - fprintf( - stderr, - "[RA-LOAD] i=%d src_vreg=0x%x alloc.r0=%d pr0_reg=%d dest_pr0=%d tag=%d lval=%d local=%d spill=%d\n", i, - src_vreg, dbg_li->allocation.r0, trc_s1.pr0_reg, trc_d.pr0_reg, irop_get_tag(trc_s1), trc_s1.is_lval, - trc_s1.is_local, trc_s1.pr0_spilled); - } - } - } - /* Also log AND/OR/ADD operations that might show the register mismatch */ - if (cq->op == TCCIR_OP_AND || cq->op == TCCIR_OP_OR) - { - IROperand raw_dest = tcc_ir_op_get_dest(ir, cq); - IROperand raw_src1 = tcc_ir_op_get_src1(ir, cq); - fprintf( - stderr, - "[RA-ALU] i=%d op=%d src1_pr0=%d src2_pr0=%d dest_pr0=%d src1_tag=%d dest_tag=%d src1_vr=0x%x dest_vr=0x%x\n", - i, cq->op, trc_s1.pr0_reg, trc_s2.pr0_reg, trc_d.pr0_reg, irop_get_tag(trc_s1), irop_get_tag(trc_d), - irop_get_vreg(raw_src1), irop_get_vreg(raw_dest)); - } - /* Log ASSIGN operations */ - if (cq->op == TCCIR_OP_ASSIGN) - { - IROperand 
raw_dest = tcc_ir_op_get_dest(ir, cq); - IROperand raw_src1 = tcc_ir_op_get_src1(ir, cq); - fprintf(stderr, "[RA-ASSIGN] i=%d src1_pr0=%d dest_pr0=%d src1_tag=%d dest_tag=%d src1_vr=0x%x dest_vr=0x%x\n", i, - trc_s1.pr0_reg, trc_d.pr0_reg, irop_get_tag(trc_s1), irop_get_tag(trc_d), irop_get_vreg(raw_src1), - irop_get_vreg(raw_dest)); - } -#endif - - switch (cq->op) - { - case TCCIR_OP_MUL: - case TCCIR_OP_DIV: - case TCCIR_OP_UDIV: - case TCCIR_OP_IMOD: - case TCCIR_OP_UMOD: - case TCCIR_OP_TEST_ZERO: - if (use_mop_muldiv) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_muldiv_mop(mop_src1, mop_src2, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_MLA: - case TCCIR_OP_UMULL: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - break; - case TCCIR_OP_ADD: - case TCCIR_OP_SUB: - case TCCIR_OP_CMP: - case TCCIR_OP_SHL: - case TCCIR_OP_SHR: - case TCCIR_OP_SAR: - case TCCIR_OP_OR: - case TCCIR_OP_AND: - case TCCIR_OP_XOR: - case TCCIR_OP_ADC_GEN: - case TCCIR_OP_ADC_USE: - if (use_mop_dp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); 
- tcc_gen_machine_data_processing_mop(mop_src1, mop_src2, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - /* Phase-3 consistency check: dry-run and real-emit scratch counts must agree. - * A mismatch is expected (and acceptable) for instructions where the scratch - * conflict fixup was applied (dry_insn_saves != 0 means fixup was attempted). */ - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_data_processing_op(src1_ir, src2_ir, dest_ir, cq->op); - } - break; - case TCCIR_OP_FADD: - case TCCIR_OP_FSUB: - case TCCIR_OP_FMUL: - case TCCIR_OP_FDIV: - case TCCIR_OP_FNEG: - case TCCIR_OP_FCMP: - case TCCIR_OP_CVT_FTOF: - case TCCIR_OP_CVT_ITOF: - case TCCIR_OP_CVT_FTOI: - if (use_mop_fp) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_fp_mop(mop_src1, mop_src2, mop_dest, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_fp_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_LOAD: - { - bool load_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - load_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load && !load_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - - /* Sub-component fixup for LOAD sources — see dry-run comment above. */ - if (mop_src.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src.u.reg.r1 = -1; - mop_src.needs_deref = false; - } - - if (mop_dest.kind == MACH_OP_REG && !mop_dest.needs_deref && mop_dest.u.reg.r0 != (int)PREG_REG_NONE) - { - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_mop(mop_src, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, - dry_insn_scratch[i], real_scratch); - } -#endif - } - else - { - /* Dest not a simple register: fall back to old path. */ - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - } - else - { - /* Old path with RETURNVALUE peephole */ - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - int ir_next_src1_vr = -1; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE) - { - IROperand next_src1_irop = tcc_ir_op_get_src1(ir, ir_next); - ir_next_src1_vr = irop_get_vreg(next_src1_irop); - } - const int dest_vreg = irop_get_vreg(dest_ir); - int is_64bit_load = irop_is_64bit(dest_ir); - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && ir_next_src1_vr == dest_vreg && !has_incoming_jump[i + 1]) - { - dest_ir.pr0_reg = REG_IRET; /* R0 */ - dest_ir.pr0_spilled = 0; - if (is_64bit_load) - { - dest_ir.pr1_reg = REG_IRE2; /* R1 */ - dest_ir.pr1_spilled = 0; - } - /* Also update the interval allocation so that RETURNVALUE's src1 gets the same registers */ - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest_vreg); - if (interval) - { - interval->allocation.r0 = REG_IRET; - if (is_64bit_load) - interval->allocation.r1 = REG_IRE2; - } - } - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_load_op(dest_ir, src1_ir); - } - break; - } - case TCCIR_OP_STORE: - { - if (use_mop_store) - { - MachineOperand mop_dest_s = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src_s = machine_op_from_ir(ir, &src1_ir); - /* Sub-component fixup for STORE value — same logic as LOAD source. 
*/ - if (mop_src_s.kind == MACH_OP_REG && !src1_ir.is_lval && src1_ir.pr1_reg != (int)PREG_REG_NONE && - src1_ir.u.imm32 != 0) - { - mop_src_s.u.reg.r0 = (int)src1_ir.pr1_reg; - mop_src_s.u.reg.r1 = -1; - mop_src_s.needs_deref = false; - } - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_mop(mop_dest_s, mop_src_s, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_store_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_LOAD_INDEXED: - { - /* LOAD_INDEXED: dest = *(base + (index << scale)) */ - bool load_indexed_before_ret = false; - { - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, ir_next); - load_indexed_before_ret = (irop_get_vreg(nq_src1) == irop_get_vreg(dest_ir)); - } - } - if (use_mop_load_indexed && !load_indexed_before_ret) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_base = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_indexed_mop(mop_dest, mop_base, mop_index, mop_scale, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, 
dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - /* Old path with RETURNVALUE peephole — load directly into R0 if next is RETURNVALUE */ - IROperand base_op = src1_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - const int dest_vreg = irop_get_vreg(dest_ir); - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && load_indexed_before_ret && !has_incoming_jump[i + 1]) - { - dest_ir.pr0_reg = REG_IRET; - dest_ir.pr0_spilled = 0; - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, dest_vreg); - if (interval) - interval->allocation.r0 = REG_IRET; - } - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &index_op); - tcc_gen_machine_load_indexed_op(dest_ir, base_op, index_op, scale_op); - } - break; - } - case TCCIR_OP_STORE_INDEXED: - { - /* STORE_INDEXED: *(base + (index << scale)) = value */ - if (use_mop_store_indexed) - { - IROperand scale_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_base = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_index = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_scale = machine_op_from_ir(ir, &scale_raw); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_indexed_mop(mop_base, mop_index, mop_scale, mop_value, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - IROperand base_op = dest_ir; - IROperand value_op = src1_ir; - IROperand index_op = src2_ir; - IROperand scale_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &base_op); - ir_fill_op(ir, &value_op); - ir_fill_op(ir, 
&index_op); - tcc_gen_machine_store_indexed_op(base_op, index_op, scale_op, value_op); - } - break; - } - case TCCIR_OP_LOAD_POSTINC: - { - /* LOAD_POSTINC: dest = *ptr; ptr += offset */ - if (use_mop_load_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_ptr = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_load_postinc_mop(mop_dest, mop_ptr, mop_offset, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - IROperand ptr_op = src1_ir; - IROperand offset_op = tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &ptr_op); - tcc_gen_machine_load_postinc_op(dest_ir, ptr_op, offset_op); - } - break; - } - case TCCIR_OP_STORE_POSTINC: - { - /* STORE_POSTINC: *ptr = value; ptr += offset */ - if (use_mop_store_postinc) - { - IROperand offset_raw = tcc_ir_op_get_scale(ir, cq); - MachineOperand mop_ptr = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_value = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_offset = machine_op_from_ir(ir, &offset_raw); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_store_postinc_mop(mop_ptr, mop_value, mop_offset, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - IROperand ptr_op = dest_ir; - IROperand value_op = src1_ir; - IROperand offset_op = 
tcc_ir_op_get_scale(ir, cq); - ir_fill_op(ir, &ptr_op); - ir_fill_op(ir, &value_op); - tcc_gen_machine_store_postinc_op(ptr_op, value_op, offset_op); - } - break; - } - case TCCIR_OP_RETURNVALUE: - { - if (use_mop_returnvalue) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_return_value_mop(mop_src, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - /* Peephole: if previous instruction was LOAD/ASSIGN that already loaded to R0, - * skip the return value copy. */ - const IRQuadCompact *ir_prev = (i > 0) ? &ir->compact_instructions[i - 1] : NULL; - int skip_copy = 0; - if (!has_incoming_jump[i] && ir_prev && (ir_prev->op == TCCIR_OP_LOAD || ir_prev->op == TCCIR_OP_ASSIGN)) - { - IROperand prev_dest_irop = tcc_ir_op_get_dest(ir, ir_prev); - const int prev_dest_vreg = irop_get_vreg(prev_dest_irop); - const int src1_vreg = irop_get_vreg(src1_ir); - if (prev_dest_vreg == src1_vreg) - { - IRLiveInterval *prev_interval = tcc_ir_get_live_interval(ir, prev_dest_vreg); - if (prev_interval && prev_interval->allocation.r0 == REG_IRET) - skip_copy = 1; - } - } - if (!skip_copy) - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_return_value_op(src1_ir, cq->op); - } - } - } - case TCCIR_OP_RETURNVOID: - /* Emit jump to epilogue (will be backpatched later) */ - /* if return is last instruction, then jump is not needed */ - if (i != ir->next_instruction_index - 1) - { - return_jump_addrs[num_return_jumps++] = ind; - /* Return jumps target the epilogue (-1 indicates no IR target) */ - tcc_gen_machine_jump_op(cq->op, dest_ir, i); - } - break; - case TCCIR_OP_ASSIGN: - { - /* Peephole: if next instruction is RETURNVALUE using this ASSIGN's dest, - * 
assign directly to R0 to avoid an extra move */ - const IRQuadCompact *ir_next = (i + 1 < ir->next_instruction_index) ? &ir->compact_instructions[i + 1] : NULL; - int ir_next_src1_vr = -1; - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE) - { - IROperand next_src1_irop = tcc_ir_op_get_src1(ir, ir_next); - ir_next_src1_vr = irop_get_vreg(next_src1_irop); - } - const int assign_dest_vreg = irop_get_vreg(dest_ir); - if (ir_next && ir_next->op == TCCIR_OP_RETURNVALUE && ir_next_src1_vr == assign_dest_vreg && - !has_incoming_jump[i + 1]) - { - dest_ir.pr0_reg = REG_IRET; /* R0 */ - dest_ir.pr0_spilled = 0; - if (irop_is_64bit(dest_ir)) - { - dest_ir.pr1_reg = REG_IRE2; /* R1 */ - dest_ir.pr1_spilled = 0; - } - /* Update the interval allocation so RETURNVALUE sees the change */ - IRLiveInterval *interval = tcc_ir_get_live_interval(ir, assign_dest_vreg); - if (interval) - { - interval->allocation.r0 = REG_IRET; - if (irop_is_64bit(dest_ir)) - interval->allocation.r1 = REG_IRE2; - } - } - /* Same assign_before_ret guard as the dry-run: keep both passes consistent. */ - bool assign_before_ret = false; - { - const IRQuadCompact *nq = (i + 1 < ir->next_instruction_index) ? 
&ir->compact_instructions[i + 1] : NULL; - if (nq && nq->op == TCCIR_OP_RETURNVALUE && !has_incoming_jump[i + 1]) - { - IROperand nq_src1 = tcc_ir_op_get_src1(ir, nq); - assign_before_ret = (irop_get_vreg(nq_src1) == assign_dest_vreg); - } - } - if (use_mop_assign && !assign_before_ret) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_assign_mop(mop_src, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_assign_op(dest_ir, src1_ir, cq->op); - } - break; - } - case TCCIR_OP_LEA: - /* Load Effective Address: compute address of src1 into dest */ - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_lea_op(dest_ir, src1_ir, cq->op); - break; - case TCCIR_OP_FUNCPARAMVAL: - case TCCIR_OP_FUNCPARAMVOID: - { - if (use_mop_funcparam) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - /* No scratch tracking: FUNCPARAM does not allocate scratch registers */ - tcc_gen_machine_func_parameter_mop(mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_func_parameter_op(src1_ir, src2_ir, cq->op); - } - break; - } - case TCCIR_OP_JUMP: - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_jump_op(cq->op, dest_ir, i); - /* Update mapping to actual instruction address (may have shifted due to literal pool) */ - ir_to_code_mapping[i] = ind - (tcc_gen_machine_branch_opt_get_encoding(i) == 16 ? 
2 : 4); - /* Clear spill cache at branch - value may come from different path */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - case TCCIR_OP_JUMPIF: - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_conditional_jump_op(src1_ir, cq->op, dest_ir, i); - /* Update mapping to actual instruction address (may have shifted due to literal pool) */ - ir_to_code_mapping[i] = ind - (tcc_gen_machine_branch_opt_get_encoding(i) == 16 ? 2 : 4); - /* Clear spill cache at conditional branch - target may have different values */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - case TCCIR_OP_IJUMP: - if (use_mop_ijump) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_indirect_jump_mop(mop_src, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_indirect_jump_op(src1_ir); - } - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - case TCCIR_OP_SWITCH_TABLE: - { - int table_id = (int)irop_get_imm64_ex(ir, src2_ir); - TCCIRSwitchTable *table = &ir->switch_tables[table_id]; - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_switch_table_op(src1_ir, table, ir, i); - tcc_ir_spill_cache_clear(&ir->spill_cache); - break; - } - case TCCIR_OP_SETIF: - if (use_mop_setif) - { - MachineOperand mop_src = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_setif_mop(mop_src, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d 
op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_setif_op(dest_ir, src1_ir, cq->op); - } - break; - case TCCIR_OP_BOOL_OR: - case TCCIR_OP_BOOL_AND: - if (use_mop_bool) - { - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_insn_scratch_reset(); - tcc_gen_machine_bool_mop(mop_src1, mop_src2, mop_dest, cq->op); -#ifdef TCC_LS_DEBUG - { - int real_scratch = tcc_gen_machine_insn_scratch_count(); - if (real_scratch != dry_insn_scratch[i] && dry_insn_saves[i] == 0) - fprintf(stderr, "[insn-scratch] i=%d op=%d dry=%d real=%d MISMATCH\n", i, (int)cq->op, dry_insn_scratch[i], - real_scratch); - } -#endif - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_bool_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - - case TCCIR_OP_VLA_ALLOC: - case TCCIR_OP_VLA_SP_SAVE: - case TCCIR_OP_VLA_SP_RESTORE: - if (use_mop_vla) - { - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - MachineOperand mop_src1 = machine_op_from_ir(ir, &src1_ir); - MachineOperand mop_src2 = machine_op_from_ir(ir, &src2_ir); - tcc_gen_machine_vla_mop(mop_dest, mop_src1, mop_src2, cq->op); - } - else - { - ir_fill_op(ir, &dest_ir); - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - tcc_gen_machine_vla_op(dest_ir, src1_ir, src2_ir, cq->op); - } - break; - case TCCIR_OP_FUNCCALLVOID: - drop_return_value = 1; - /* fall through */ - case TCCIR_OP_FUNCCALLVAL: - { - if (use_mop_func_call) - { - ir_fill_op(ir, &src1_ir); - ir_fill_op(ir, &src2_ir); - MachineOperand mop_dest = machine_op_from_ir(ir, &dest_ir); - tcc_gen_machine_func_call_mop(src1_ir, src2_ir, mop_dest, drop_return_value, ir, i); - } - else - { - ir_fill_op(ir, &src1_ir); - 
ir_fill_op(ir, &src2_ir); - ir_fill_op(ir, &dest_ir); - tcc_gen_machine_func_call_op(src1_ir, src2_ir, dest_ir, drop_return_value, ir, i); - } - /* Clear spill cache after function call - callee may have modified memory */ - tcc_ir_spill_cache_clear(&ir->spill_cache); - /* Restore R10 after call: trampoline calls for nested functions clobber R10. - * Re-load from the chain save slot at [FP, #-4] to keep R10 correct. */ - if (ir->has_static_chain) - tcc_gen_machine_restore_chain(); - break; - } - case TCCIR_OP_NOP: - /* No operation - skip silently */ - break; - case TCCIR_OP_TRAP: - /* Generate trap instruction */ - tcc_gen_machine_trap_op(); - break; - case TCCIR_OP_SET_CHAIN: - /* Static chain setup: move FP to static chain register */ - tcc_gen_machine_set_chain(); - break; - case TCCIR_OP_INIT_CHAIN_SLOT: - /* Store parent FP into chain slot for nested function trampoline */ - ir_fill_op(ir, &src1_ir); - tcc_gen_machine_init_chain_slot(src1_ir); - break; - case TCCIR_OP_ASM_INPUT: - case TCCIR_OP_ASM_OUTPUT: - /* Marker ops only: regalloc/liveness uses them, codegen emits nothing. */ - break; - case TCCIR_OP_INLINE_ASM: - { -#ifdef CONFIG_TCC_ASM - ir_fill_op(ir, &src1_ir); - tcc_ir_codegen_inline_asm_ir(ir, src1_ir); - /* Inline asm may clobber registers/memory: treat as a full barrier. */ - tcc_ir_spill_cache_clear(&ir->spill_cache); -#else - tcc_error("inline asm not supported"); -#endif - break; - } - default: - { - printf("Unsupported operation in tcc_generate_code: %s\n", tcc_ir_get_op_name(cq->op)); - if (ir->ir_to_code_mapping) - { - tcc_free(ir->ir_to_code_mapping); - ir->ir_to_code_mapping = NULL; - ir->ir_to_code_mapping_size = 0; - } - tcc_free(return_jump_addrs); - exit(1); - } - }; - - /* Clean up scratch register state at end of each IR instruction. - * This restores any pushed scratch registers and resets the global exclude mask. 
*/ - tcc_gen_machine_end_instruction(); - } - - ir_to_code_mapping[ir->next_instruction_index] = ind; - orig_ir_to_code_mapping[ir->orig_ir_to_code_mapping_size - 1] = ind; - - /* Fill gaps for removed original indices: map them to the next reachable - * emitted code address (or epilogue). This keeps &&label stable even if the - * instruction at the exact original index was optimized away. */ - { - uint32_t last = orig_ir_to_code_mapping[ir->orig_ir_to_code_mapping_size - 1]; - for (int k = ir->orig_ir_to_code_mapping_size - 2; k >= 0; --k) - { - if (orig_ir_to_code_mapping[k] == 0xFFFFFFFFu) - orig_ir_to_code_mapping[k] = last; - else - last = orig_ir_to_code_mapping[k]; - } - } - - if (!ir->naked) - tcc_gen_machine_epilog(ir->leaffunc); - tcc_ir_codegen_backpatch_jumps(ir, ir_to_code_mapping); - - /* Backpatch return jumps to point to epilogue */ - int epilogue_addr = ir_to_code_mapping[ir->next_instruction_index]; - for (int i = 0; i < num_return_jumps; i++) - { - tcc_gen_machine_backpatch_jump(return_jump_addrs[i], epilogue_addr); - } - - tcc_free(return_jump_addrs); - tcc_free(dry_insn_saves); - tcc_free(dry_insn_scratch); - tcc_free(has_incoming_jump); -} - -/* ============================================================================ - * Legacy API Wrappers - * ============================================================================ */ - -/* Note: tcc_ir_generate_code legacy wrapper remains in tccir.c */ diff --git a/ir/core.c b/ir/core.c index ab45d5ec..de082370 100644 --- a/ir/core.c +++ b/ir/core.c @@ -243,6 +243,17 @@ void tcc_ir_free(TCCIRState *ir) ir->switch_tables_capacity = 0; } + /* Free switch value tables (SWITCH_LOAD lookup data) */ + if (ir->switch_value_tables) + { + for (int i = 0; i < ir->num_switch_value_tables; i++) + tcc_free(ir->switch_value_tables[i].values); + tcc_free(ir->switch_value_tables); + ir->switch_value_tables = NULL; + ir->num_switch_value_tables = 0; + ir->switch_value_tables_capacity = 0; + } + /* Free nested_funcs 
array (note: NestedFunc structs themselves are owned by TCCState) */ if (ir->nested_funcs) { @@ -351,6 +362,13 @@ int tcc_ir_put(TCCIRState *ir, TccIrOp op, SValue *src1, SValue *src2, SValue *d memset(cq, 0, sizeof(IRQuadCompact)); cq->op = (uint8_t)op; cq->orig_index = pos; + if (pos > ir->max_orig_index) + ir->max_orig_index = pos; + if (ir->next_insn_is_jump_target) + { + cq->is_jump_target = 1; + ir->next_insn_is_jump_target = 0; + } cq->operand_base = ir->iroperand_pool_count; /* Handle destination operand */ @@ -403,18 +421,25 @@ int tcc_ir_put(TCCIRState *ir, TccIrOp op, SValue *src1, SValue *src2, SValue *d dest->type = src1->type; } - if (tcc_ir_type_is_float(dest->type.t)) - { - tcc_ir_vreg_type_set_fp(ir, dest->vr, 1, tcc_ir_type_is_double(dest->type.t)); - } - else if ((dest->type.t & VT_BTYPE) == VT_LLONG) - { - tcc_ir_vreg_type_set_64bit(ir, dest->vr); - } - /* Phase 3: Set complex flag for complex types */ - if (dest->type.t & VT_COMPLEX) - { - tcc_ir_vreg_type_set_complex(ir, dest->vr); + /* For STORE ops the dest vreg holds a 32-bit address; dest->type + * describes the stored value, not the pointer. Don't promote the + * address vreg to float/64-bit/complex. 
*/ + int dest_is_store = (op == TCCIR_OP_STORE || op == TCCIR_OP_STORE_INDEXED || + op == TCCIR_OP_STORE_POSTINC); + if (!dest_is_store) { + if (tcc_ir_type_is_float(dest->type.t)) + { + tcc_ir_vreg_type_set_fp(ir, dest->vr, 1, tcc_ir_type_is_double(dest->type.t)); + } + else if ((dest->type.t & VT_BTYPE) == VT_LLONG) + { + tcc_ir_vreg_type_set_64bit(ir, dest->vr); + } + /* Phase 3: Set complex flag for complex types */ + if (dest->type.t & VT_COMPLEX) + { + tcc_ir_vreg_type_set_complex(ir, dest->vr); + } } dest_interval = tcc_ir_vreg_live_interval(ir, dest->vr); int new_is_lvalue; @@ -629,7 +654,6 @@ static void tcc_ir_params_add_hidden_sret(TCCIRState *ir, CType *func_type) loc = (loc - PTR_SIZE) & -PTR_SIZE; func_vc = loc; - tcc_state->need_frame_pointer = 1; /* Consume a PARAM vreg for the hidden sret pointer */ int sret_param_vr = tcc_ir_get_vreg_param(ir); @@ -720,8 +744,8 @@ void tcc_ir_params_process_single(TCCIRState *ir, Sym *sym, int arg_index, TCCAb TCCAbiArgLoc loc_info = tcc_abi_classify_argument(call_layout, arg_index, &desc); tcc_ir_params_update_tracking(ir, loc_info, call_layout); - if (loc_info.kind == TCC_ABI_LOC_STACK || loc_info.kind == TCC_ABI_LOC_REG_STACK) - tcc_state->need_frame_pointer = 1; + /* With the pre-reserved outgoing call area, stack args no longer require + * a frame pointer — SP stays fixed across calls. 
*/ if ((type->t & VT_BTYPE) == VT_STRUCT || (type->t & VT_COMPLEX)) { @@ -971,6 +995,10 @@ void tcc_ir_params_process_scalar(TCCIRState *ir, Sym *sym, CType *type, TCCAbiA { int flags = 0, addr = 0; int variadic = (sym->f.func_type == FUNC_ELLIPSIS); + CType pushed_type = *type; + + if (sym->a.param_volatile) + pushed_type.t |= VT_VOLATILE; if (loc_info->kind == TCC_ABI_LOC_REG) { @@ -996,7 +1024,7 @@ void tcc_ir_params_process_scalar(TCCIRState *ir, Sym *sym, CType *type, TCCAbiA int v = sym->v & ~SYM_FIELD; if (!v) v = anon_sym++; - sym_push(v, type, flags, addr); + sym_push(v, &pushed_type, flags, addr); } int tcc_ir_local_add(TCCIRState *ir, Sym *sym, int stack_offset) @@ -1111,6 +1139,8 @@ TccIrOp tcc_irop_from_token(int token) return TCCIR_OP_MUL; case TOK_UMULL: return TCCIR_OP_UMULL; + case TOK_SMULL: + return TCCIR_OP_SMULL; case TOK_SHL: return TCCIR_OP_SHL; case TOK_SAR: @@ -1168,12 +1198,17 @@ void tcc_ir_gen_i(TCCIRState *ir, int op) svalue_init(&dest); dest.vr = tcc_ir_get_vreg_temp(ir); dest.r = 0; - /* Most integer ops preserve the operand type, but UMULL produces a 64-bit result. */ + /* Most integer ops preserve the operand type, but UMULL/SMULL produce a 64-bit result. 
*/ if (ir_op == TCCIR_OP_UMULL) { dest.type.t = VT_LLONG | VT_UNSIGNED; tcc_ir_set_llong_type(ir, dest.vr); } + else if (ir_op == TCCIR_OP_SMULL) + { + dest.type.t = VT_LLONG; + tcc_ir_set_llong_type(ir, dest.vr); + } else { dest.type.t = vtop[-1].type.t; @@ -1181,7 +1216,7 @@ void tcc_ir_gen_i(TCCIRState *ir, int op) tcc_ir_put(ir, ir_op, &vtop[-1], &vtop[0], &dest); vtop[-1].vr = dest.vr; vtop[-1].r = 0; - vtop[-1].type = dest.type; /* Update type - critical for UMULL which produces 64-bit from 32-bit inputs */ + vtop[-1].type = dest.type; /* Update type - critical for UMULL/SMULL which produce 64-bit from 32-bit inputs */ --vtop; } @@ -1629,6 +1664,15 @@ void tcc_ir_backpatch(TCCIRState *ir, int t, int target_address) const int pool_off = ir->compact_instructions[t].operand_base; ir->iroperand_pool[pool_off] = cur; + /* Mark the target instruction as a jump target. + * If it already exists, set the flag directly. + * If it is the next-to-be-created slot (tcc_ir_backpatch_to_here pattern), + * set a pending flag that tcc_ir_put picks up on creation. 
*/ + if (target_address >= 0 && target_address < ir->next_instruction_index) + ir->compact_instructions[target_address].is_jump_target = 1; + else if (target_address == ir->next_instruction_index) + ir->next_insn_is_jump_target = 1; + /* Chain ends when next is -1 (sentinel), out of range, or already patched */ if (next < 0 || next >= ir->next_instruction_index || next == target_address) break; @@ -1943,6 +1987,7 @@ const IRRegistersConfig irop_config[] = { [TCCIR_OP_MUL] = {1, 1, 1}, [TCCIR_OP_MLA] = {1, 1, 1}, /* MLA has accumulator as extra operand at pool[operand_base+3] */ [TCCIR_OP_UMULL] = {1, 1, 1}, + [TCCIR_OP_SMULL] = {1, 1, 1}, [TCCIR_OP_DIV] = {1, 1, 1}, [TCCIR_OP_UMOD] = {1, 1, 1}, [TCCIR_OP_IMOD] = {1, 1, 1}, @@ -1976,6 +2021,8 @@ const IRRegistersConfig irop_config[] = { [TCCIR_OP_LOAD_POSTINC] = {1, 1, 0}, /* dest = *ptr; ptr += offset */ [TCCIR_OP_STORE_POSTINC] = {1, 1, 0}, /* *ptr = src; ptr += offset */ [TCCIR_OP_TEST_ZERO] = {0, 1, 0}, + [TCCIR_OP_UBFX] = {1, 1, 1}, /* dest = (src1 >> lsb) & ((1<