From 734f6e53ea79675a00d0f0ab099f57f560f738ce Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 14:42:53 +0200 Subject: [PATCH 1/7] PDF stage 2.1: text transforms & the placed-text emission Apply the full text transform chain and introduce a renderer-agnostic placed-text emission, the foundation for stage 2's positioning work. Glyph advances are deliberately left to stage 2.2. - pdf_geometry.hpp: a 2-D affine `Matrix` (compose, point-apply, translation/scaling factories), PDF row-vector convention. - GraphicsState: the CTM now concatenates on `cm` (it was overwritten); the text matrix `Tm` and line matrix `Tlm` are tracked as `Matrix` values with `BT`/`Td`/`TD`/`T*` and the line-move half of `'`/`"`; `text_placement_matrix()` folds in horizontal scaling and rise, keeping font size separate so the run-vs-glyph mapping stays a renderer choice. - pdf_page_text (`extract_text`): emit one `TextElement` per show operation, positioned by the placement matrix, carrying font/size/ spacing/codes/Unicode. Lenient font lookup (unknown ref -> warn). - html/pdf_file.cpp: map each `TextElement` to a positioned span with a CSS `transform` (PDF user space -> page box in CSS px, glyphs upright); route through `Logger`; drop the debug `std::cout` and the `"hi"` marker. Tests: pdf_geometry.cpp (Matrix compose/apply) and pdf_page_text.cpp (Td/Tm/cm/Tz/Ts, TJ concatenation, T*/'/" line moves), both inline. 
Co-Authored-By: Claude Opus 4.8 --- CMakeLists.txt | 1 + src/odr/internal/html/pdf_file.cpp | 104 +++++------- src/odr/internal/pdf/AGENTS.md | 169 ++++++++++++++------ src/odr/internal/pdf/pdf_geometry.hpp | 41 +++++ src/odr/internal/pdf/pdf_graphics_state.cpp | 62 +++++-- src/odr/internal/pdf/pdf_graphics_state.hpp | 35 ++-- src/odr/internal/pdf/pdf_page_text.cpp | 102 ++++++++++++ src/odr/internal/pdf/pdf_page_text.hpp | 48 ++++++ test/CMakeLists.txt | 2 + test/src/internal/pdf/pdf_geometry.cpp | 52 ++++++ test/src/internal/pdf/pdf_page_text.cpp | 109 +++++++++++++ 11 files changed, 586 insertions(+), 139 deletions(-) create mode 100644 src/odr/internal/pdf/pdf_geometry.hpp create mode 100644 src/odr/internal/pdf/pdf_page_text.cpp create mode 100644 src/odr/internal/pdf/pdf_page_text.hpp create mode 100644 test/src/internal/pdf/pdf_geometry.cpp create mode 100644 test/src/internal/pdf/pdf_page_text.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c3faa544..589fe26f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/pdf/pdf_graphics_state.cpp" "src/odr/internal/pdf/pdf_object.cpp" "src/odr/internal/pdf/pdf_object_parser.cpp" + "src/odr/internal/pdf/pdf_page_text.cpp" "src/odr/internal/svm/svm_file.cpp" "src/odr/internal/svm/svm_format.cpp" diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index b7a18e34..24371271 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -12,12 +12,8 @@ #include #include #include -#include -#include -#include - -#include -#include +#include +#include namespace odr::internal::html { @@ -93,14 +89,21 @@ class HtmlServiceImpl final : public HtmlService { out.write_body_begin(); + // CSS uses 96px to the inch, PDF user space 72 units to the inch. 
+ static constexpr double pt_to_px = 96.0 / 72.0; + for (pdf::Page *page : pages) { const pdf::Array &page_box = page->media_box.as_array(); + const double box_x0 = page_box[0].as_real(); + const double box_y0 = page_box[1].as_real(); + const double width = page_box[2].as_real() - box_x0; + const double height = page_box[3].as_real() - box_y0; out.write_element_begin( "div", HtmlElementOptions().set_style([&](std::ostream &o) { o << "position:relative;"; - o << "width:" << page_box[2].as_real() / 72.0 << "in;"; - o << "height:" << page_box[3].as_real() / 72.0 << "in;"; + o << "width:" << width / 72.0 << "in;"; + o << "height:" << height / 72.0 << "in;"; })); std::string stream; @@ -110,66 +113,31 @@ class HtmlServiceImpl final : public HtmlService { stream += '\n'; } - std::istringstream ss(stream); - pdf::GraphicsOperatorParser parser2(ss); - pdf::GraphicsState state; - while (!ss.eof()) { - pdf::GraphicsOperator op = parser2.read_operator(); - state.execute(op); - - if (op.type == pdf::GraphicsOperatorType::text_next_line) { - double leading = state.current().text.leading; - double size = state.current().text.size; - - state.current().text.offset[1] -= size + leading; - } else if (op.type == pdf::GraphicsOperatorType::show_text) { - const std::string &font_ref = state.current().text.font; - double size = state.current().text.size; - - std::array offset = state.current().text.offset; - - pdf::Font *font = page->resources->font.at(font_ref); - - const std::string &glyphs = op.arguments[0].as_string(); - std::string unicode = font->to_unicode(glyphs); - - if (unicode.find("Colored Line") != std::string::npos) { - std::cout << "hi" << '\n'; - } - - out.write_element_begin( - "span", HtmlElementOptions().set_style([&](std::ostream &o) { - o << "position:absolute;"; - o << "left:" << offset[0] / 72.0 << "in;"; - o << "bottom:" << offset[1] / 72.0 << "in;"; - o << "font-size:" << size << "pt;"; - })); - out.write_raw(escape_text(unicode)); - 
out.write_element_end("span"); - } else if (op.type == - pdf::GraphicsOperatorType::show_text_manual_spacing) { - const std::string &font_ref = state.current().text.font; - pdf::Font *font = page->resources->font.at(font_ref); - double size = state.current().text.size; - - std::cout << font->object << '\n'; - - for (const auto &element : op.arguments[0].as_array()) { - if (element.is_real()) { - std::cout << "spacing: " << element.as_real() << '\n'; - } else if (element.is_string()) { - const std::string &glyphs = element.as_string(); - std::string unicode = font->to_unicode(glyphs); - std::cout << "show text manual spacing: font=" << font - << ", size=" << size << ", text=" << unicode << '\n'; - } - } - } else if (op.type == pdf::GraphicsOperatorType::show_text_next_line) { - std::cout << "TODO show_text_next_line" << '\n'; - } else if (op.type == - pdf::GraphicsOperatorType::show_text_next_line_set_spacing) { - std::cout << "TODO show_text_next_line_set_spacing" << '\n'; - } + // Map PDF user space (origin at the MediaBox corner, y up) to the page + // box in CSS pixels (origin top-left, y down). `flip_glyph` un-mirrors + // the glyphs so text stays upright after the page flip. 
+ const pdf::Matrix flip_glyph{1, 0, 0, -1, 0, 0}; + const pdf::Matrix to_box = pdf::Matrix::translation(-box_x0, -box_y0) * + pdf::Matrix{1, 0, 0, -1, 0, height}; + + for (const pdf::TextElement &text : + pdf::extract_text(stream, *page->resources, *m_logger)) { + const pdf::Matrix m = flip_glyph * text.transform * to_box; + + out.write_element_begin( + "span", HtmlElementOptions().set_style([&](std::ostream &o) { + o << "position:absolute;left:0;top:0;"; + o << "transform-origin:0 0;"; + // TODO baseline sits at the box top until font ascent metrics + // land + o << "transform:matrix(" << m.a << "," << m.b << "," << m.c << "," + << m.d << "," << m.e * pt_to_px << "," << m.f * pt_to_px + << ");"; + o << "font-size:" << text.size * pt_to_px << "px;"; + o << "white-space:pre;"; + })); + out.write_raw(escape_text(text.text)); + out.write_element_end("span"); } out.write_element_end("div"); diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 0dc19044..98b0d428 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -13,9 +13,10 @@ tables, cross-reference streams, object streams, hybrid files, with a forward-scan recovery path for broken cross-references), build the page tree with fonts and annotations, tokenize page content streams into graphics operators, and emit a **proof-of-concept HTML rendering**: absolutely positioned -text spans per `Tj`, pages sized from `MediaBox`. Encrypted files are decrypted -(RC4, AES-128, AES-256). No graphics, no images, no font files. Experimental and -not production-quality — the HTML path still contains debug `std::cout` output. +text spans, one per show operation, placed by the full text transform (CTM × +text matrix, stage 2.1), pages sized from `MediaBox`. Encrypted files are +decrypted (RC4, AES-128, AES-256). No glyph advances yet (stage 2.2), no +graphics, no images, no font files. Experimental and not production-quality. 
--- @@ -107,13 +108,28 @@ not production-quality — the HTML path still contains debug `std::cout` output deferred half of part B) or the embedded font program (stage 3) land. - **Content streams**: the full graphics-operator vocabulary is tokenized; `GraphicsState` executes a subset (state stack `q`/`Q`, matrices `cm`/`Tm`, - line parameters, text state `Tc`/`Tw`/`Tz`/`TL`/`Tf`/`Tr`/`Ts`, text - positioning `Td`/`TD`, grey/RGB/CMYK colors, glyph metrics `d0`/`d1`). Unknown + line parameters, text state `Tc`/`Tw`/`Tz`/`TL`/`Tf`/`Tr`/`Ts`, glyph metrics + `d0`/`d1`, grey/RGB/CMYK colors). The CTM **concatenates** on `cm` (ISO 32000-1 + 8.4.4); the text matrix `Tm` and text line matrix `Tlm` are tracked as 2-D + affine `Matrix` values (`pdf_geometry.hpp`), with `BT` resetting them, `Td`/`TD` + /`T*` (and the line-move half of `'`/`"`) advancing `Tlm` → `Tm`. Unknown operators are logged to stderr and skipped. +- **Text layout** (`pdf_page_text`, stage 2.1): `extract_text` runs the operator + parser + `GraphicsState` over a page's content and emits a renderer-agnostic + `TextElement` per show operation (`Tj`/`TJ`/`'`/`"`) — its text-space → user- + space transform (CTM × `Tm`, with horizontal scaling and rise folded in, font + size kept separate), the resolved font, size, spacing parameters, raw codes, + and the CMap-translated Unicode. Font lookup is lenient (unknown ref → warn, + raw codes). **Glyph advances are not yet applied** (stage 2.2): each show op + yields one element at its starting origin, `TJ`'s numeric adjustments are + dropped, and `Tc`/`Tw`/`Tz`-driven spacing is carried but not consumed. - **HTML**: one `document.html` view; each page is a `div` sized from `MediaBox` - (points → inches), each `Tj` becomes an absolutely positioned `span` at the - text-state offset with `font-size` from `Tf` and the CMap-translated text. - `TJ`/`'`/`"` are recognized but only printed to stdout, not rendered. + (points → inches). 
Each `TextElement` becomes an absolutely positioned `span` + carrying a CSS `transform` matrix (the placement transform mapped from PDF user + space — y-up, MediaBox origin — into the page box in CSS pixels, the glyphs + un-mirrored so text stays upright), `font-size` from the text state, and the + Unicode text. Precise baseline placement (needs font ascent metrics) is + deferred; the baseline currently sits at the span's box top. ## Module layout @@ -133,9 +149,11 @@ not production-quality — the HTML path still contains debug `std::cout` output | `pdf_encoding.{hpp,cpp}` | Simple-font `/Encoding` → Unicode: `BaseEncoding` tables, `/Differences` overlay (`Encoding`), glyph-name → Unicode via AGL + `uniXXXX`/`uXXXXXX` (stage 1.2) | | `pdf_cid.{hpp,cpp}` | Composite-font predefined `/Encoding` → Unicode: the `Uni*-UCS2/UTF16/UTF32` CMaps decoded directly (no data tables), stage 1.3 part B; legacy CJK CMaps deferred (see `tools/pdf/generate_cid_data.py`) | | `pdf_encoding_data.{hpp,cpp}` | **Generated** (`tools/pdf/generate_encoding_data.py`): base-encoding tables + the Adobe Glyph List as a name-sorted array | +| `pdf_geometry.hpp` | `Matrix`: 2-D affine transform (PDF row-vector convention) — compose, point-apply, translation/scaling factories (stage 2.1) | | `pdf_graphics_operator.hpp` | `GraphicsOperatorType` enum (full operator set) + `GraphicsOperator` (type + `Object` arguments) | | `pdf_graphics_operator_parser.{hpp,cpp}` | Content-stream tokenizer: arguments then operator name | -| `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset | +| `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset; CTM/`Tm`/`Tlm` as `Matrix`, `text_placement_matrix()` for the text rendering transform sans font size | +| `pdf_page_text.{hpp,cpp}` | `extract_text`: run the content stream through `GraphicsState`, emit a 
`TextElement` (placed transform + font/size/spacing + codes + Unicode) per show operation (stage 2.1) | | `pdf_file.{hpp,cpp}` | `abstract::PdfFile` wrapper; probes encryption at construction and implements `password_encrypted()`/`decrypt()`, carrying the authenticated `Decryptor` (not the password) so rendering needs no re-derivation | Consumers outside the module: `open_strategy.cpp` (detection / engine @@ -170,11 +188,14 @@ selection) and `html/pdf_file.cpp` (`create_pdf_service`). 5. **Decode content.** Per page (depth-first), the `Contents` streams are read, decoded through their `/Filter` chain (`read_decoded_stream`), concatenated with a newline between streams. -6. **Execute and emit.** `GraphicsOperatorParser` tokenizes; `GraphicsState` - updates the state stack. `T*` advances the text offset by `size + leading`; - `Tj` emits a positioned `span` using `state.text.offset` and the `Tf` size, - glyphs translated through the font's CMap. The text and transform matrices - are tracked but **not applied** to positioning. +6. **Lay out and emit.** `extract_text` runs `GraphicsOperatorParser` + + `GraphicsState` over the content and returns a `TextElement` per show + operation, each placed by `text_placement_matrix()` (CTM × `Tm`, with + horizontal scaling and rise folded in), its glyphs translated through the + font's CMap. The HTML layer maps each element to a positioned `span` with a + CSS `transform` (PDF user space → the page box in CSS pixels) and `font-size` + from the text state. Glyph advances are **not yet applied** (stage 2.2), so + shows without an explicit move overlap. --- @@ -221,12 +242,12 @@ as absent (7.5.8.3). A structural throw in the cross-reference layer is not fatal, though: it is caught once and the file is forward-scanned to rebuild the table (*Cross-reference recovery* above) before giving up. 
-**Debug output still in place.** `html/pdf_file.cpp`, `pdf_graphics_state.cpp`, -`pdf_graphics_operator_parser.cpp` and `pdf_cmap_parser.cpp` print diagnostics -(and one leftover `"hi"` breakpoint marker) to stdout/stderr instead of -`Logger`. Proof-of-concept residue; should move to `Logger` or be removed. -`DocumentParser` itself takes an optional `Logger &` (default `Logger::null()`) -and routes its warnings through it — new diagnostics should do the same. +**Debug output still in place.** `pdf_graphics_state.cpp` (dash pattern, stroke/ +other color) and `pdf_graphics_operator_parser.cpp` still print diagnostics to +stdout/stderr instead of `Logger`. The text path is now clean: `html/pdf_file.cpp` +and `pdf_page_text.cpp` route through `Logger` and the leftover `"hi"` marker is +gone (stage 2.1). `DocumentParser` and `extract_text` take a `Logger &` (default +`Logger::null()`) — new diagnostics should do the same. **Rendering is deferred to the browser; display and text are decoupled.** We emit no rasterized output: glyphs render via the embedded font (`@font-face`, stage 3) @@ -302,9 +323,17 @@ such PDFs look right, their text just isn't selectable until the tables land. `translate_predefined_cmap` over the predefined Unicode CMaps — `UCS2`/`UTF16` (incl. a surrogate pair) and `UTF32` decoding, a `-V` writing-mode variant, and the `nullopt` for `Identity-H` and the legacy CJK CMaps (stage 1.3 part B). +- `test/src/internal/pdf/pdf_geometry.cpp` — **assertion-based**, no fixtures: + `Matrix` point-apply (identity/translation/scaling), the ordered + (row-vector) composition, and compose-then-apply ≡ sequential apply (stage 2.1). 
+- `test/src/internal/pdf/pdf_page_text.cpp` — **assertion-based**, inline content + streams through `extract_text` (empty resources, so codes pass through as + `text`): `Td` translation, `Tm` scaling, `cm` CTM concatenation under `Tm`, + horizontal scaling and rise in the transform, `TJ` string concatenation, and + the `T*`/`'`/`"` line moves with their leading and spacing (stage 2.1). No assertion-based coverage of the tokenizer (escapes, references, hex strings) -or the HTML output. +or the HTML output itself (the span emission / CSS transform mapping). --- @@ -358,30 +387,80 @@ fixture needs them yet: ## Stage 2 — text positioning & metrics -Independent of Unicode work; fixes layout even with today's partial CMaps. - -- Apply the full transform: text matrix × CTM (both tracked in `GraphicsState` - but never applied), text rise, horizontal scaling. -- **Glyph advances**: `/Widths` + `/MissingWidth` (simple), `/W` + `/DW` (CID), - char/word spacing, the numeric adjustments in `TJ` — so `TJ`, `'`, `"` finally - render and `Tj` runs land correctly. -- **Form XObjects** (`Do` on a `/Form`): recursive content-stream execution with - scoped `/Resources` and the form matrix. Many producers put most page content - inside forms, and tiling patterns (stage 4) and annotation appearances - (stage 5) run on the same machinery — a structural prerequisite. -- **Text render modes** (`Tr`): mode 3 (invisible text, OCR-over-scan) must stay - selectable but unpainted; stroke/clip modes (1–2, 4–7) need graceful - degradation. -- **Space inference**: PDFs routinely encode no spaces; insert them from - glyph-gap heuristics (as pdf2htmlEX does) so copy/paste and search work. -- Layout side of bidi (RTL run ordering) and vertical writing (Identity-V/CJK). -- HTML mapping decision: per-run spans with CSS `transform` (cheap, breaks on - heavy kerning) vs. per-glyph positioning (exact, verbose) — likely per-run - with a kerning threshold that splits runs, like pdf2htmlEX. 
+- **Extraction refinements** (was stage 1.5, rides on the run plumbing above): - mark a run "no Unicode" when the code → Unicode chain yields nothing, so stage 3 - can re-encode it; honour `/ActualText` (tagged PDFs, ligatures) as an extraction - override of the whole chain. +Independent of Unicode work; fixes layout even with today's partial CMaps. Split +into sub-stages (mirroring stage 1's slicing), each its own PR. + +**Architecture decision (2026-06): a renderer-agnostic placed-text emission.** +The content executor produces a per-page list of **placed text items**, each +carrying its text-space → user-space transform (CTM × text matrix, with +horizontal scaling / rise folded in; font size kept separate), the resolved font, size, the +text-state spacing parameters, the raw character codes, and a Unicode +representation (which may lack inferred spaces). The HTML layer consumes that +list and decides how to map it — per-run spans with a CSS `transform` vs. +per-glyph positioning. **The core never commits to either**; this pushes the +run-vs-glyph question all the way down to rendering. (The earlier framing of an +up-front "HTML mapping decision" is dissolved into this.) + +### 2.1 — transforms & the placed-text emission — **in progress** + +The geometry foundation plus the emission contract, *without* glyph advances: +- A 2-D affine `Matrix` (`pdf_geometry.hpp`): compose, point-apply, + translation/scaling factories. +- Apply the full transform chain in `GraphicsState`: the CTM now *concatenates* + on `cm` (it was overwritten); the text matrix `Tm` and text line matrix `Tlm` + are tracked properly (`BT` resets them, `Tm` sets both, `Td`/`TD`/`T*` and the + line-move half of `'`/`"` update `Tlm` → `Tm`); text rise and horizontal + scaling fold into the text rendering matrix.
+- A `TextElement` emission (`pdf_page_text.{hpp,cpp}`): `extract_text(content, + resources, logger)` runs the operator parser + state and yields one placed item + per show operation (`Tj`/`TJ`/`'`/`"`), positioned by the text rendering + matrix. Font lookup is lenient (unknown ref → warn, codes pass through). +- The HTML layer maps each `TextElement` to an absolutely-positioned span with a + CSS `transform` (full matrix, incl. rotation/scaling), font size from the text + state, the page y-axis flipped once per page. The text-path debug `std::cout` + (incl. the `"hi"` marker) is removed. + +**Deliberately out of scope here (→ 2.2):** glyph advances (`/Widths`, +`/W`/`/DW`) and the *application* of char/word spacing and the `TJ` numeric +adjustments, so consecutive shows on a line without an explicit move still +overlap, and `TJ` renders its strings concatenated at one origin. Precise +baseline placement (needs font ascent metrics) is likewise deferred. Whether 2.2 +folds into this PR or branches off is an open call once 2.1 lands. + +### 2.2 — glyph advances & metrics + +Parse `/Widths` + `/MissingWidth` (simple) and `/W` + `/DW` (CID); apply char/word +spacing, horizontal scaling and the `TJ` numeric adjustments to advance the text +matrix per glyph — so `TJ`, `'`, `"` land correctly, `Tj` runs space correctly, +and the emission can be subdivided per glyph (which makes the renderer's +per-glyph option exercisable). + +### 2.3 — Form XObjects + +`Do` on a `/Form`: recursive content-stream execution with scoped `/Resources` +and the form matrix; extend `Resources` parsing to the XObject table. Many +producers put most page content inside forms, and tiling patterns (stage 4) and +annotation appearances (stage 5) run on the same machinery — a structural +prerequisite. 
+ +### 2.4 — text render modes & extraction refinements + +**Text render modes** (`Tr`): mode 3 (invisible text, OCR-over-scan) must stay +selectable but unpainted; stroke/clip modes (1–2, 4–7) need graceful +degradation. Plus **extraction refinements** (was stage 1.5, rides on the run +plumbing): mark a run "no Unicode" when the code → Unicode chain yields nothing, +so stage 3 can re-encode it; honour `/ActualText` (tagged PDFs, ligatures) as an +extraction override of the whole chain. + +### 2.5 — space inference + +PDFs routinely encode no spaces; insert them from glyph-gap heuristics (as +pdf2htmlEX does) so copy/paste and search work. + +### 2.6 — bidi & vertical writing (deferral candidate) + +Layout side of bidi (RTL run ordering) and vertical writing (Identity-V/CJK). No +corpus fixture needs it yet — likely pushed out until one does. ## Stage 3 — fonts in HTML diff --git a/src/odr/internal/pdf/pdf_geometry.hpp b/src/odr/internal/pdf/pdf_geometry.hpp new file mode 100644 index 00000000..ca6863e9 --- /dev/null +++ b/src/odr/internal/pdf/pdf_geometry.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include <array> + +namespace odr::internal::pdf { + +/// 2-D affine transform in PDF's convention: the matrix `[a b c d e f]` denotes +/// +/// | a b 0 | +/// | c d 0 | +/// | e f 1 | +/// +/// and points are row vectors multiplied on the left: `[x y 1] * M`. +/// Composition follows the same order, so `a * b` is "apply `a`, then `b`" (ISO +/// 32000-1 8.3.4). +struct Matrix { + double a{1}, b{0}, c{0}, d{1}, e{0}, f{0}; + + Matrix() = default; + Matrix(double a, double b, double c, double d, double e, double f) + : a{a}, b{b}, c{c}, d{d}, e{e}, f{f} {} + + static Matrix translation(double x, double y) { return {1, 0, 0, 1, x, y}; } + static Matrix scaling(double x, double y) { return {x, 0, 0, y, 0, 0}; } + + /// `*this` applied first, then `rhs`.
+ [[nodiscard]] Matrix operator*(const Matrix &rhs) const { + return { + a * rhs.a + b * rhs.c, a * rhs.b + b * rhs.d, + c * rhs.a + d * rhs.c, c * rhs.b + d * rhs.d, + e * rhs.a + f * rhs.c + rhs.e, e * rhs.b + f * rhs.d + rhs.f, + }; + } + + /// Map the point `(x, y)` through the transform. + [[nodiscard]] std::array<double, 2> apply(double x, double y) const { + return {a * x + c * y + e, b * x + d * y + f}; + } +}; + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 5c4c75ae..60010c47 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -20,6 +20,12 @@ ColorSpace color_space_name_to_enum(const std::string &name) { return util::map::lookup_default(mapping, name, ColorSpace::unknown); } +Matrix matrix_from_args(const GraphicsOperator &op) { + return {op.arguments.at(0).as_real(), op.arguments.at(1).as_real(), + op.arguments.at(2).as_real(), op.arguments.at(3).as_real(), + op.arguments.at(4).as_real(), op.arguments.at(5).as_real()}; +} + } // namespace GraphicsState::GraphicsState() { stack.emplace_back(); } @@ -30,6 +36,20 @@ const GraphicsState::State &GraphicsState::current() const { return stack.back(); } +Matrix GraphicsState::text_placement_matrix() const { + const Text &text = current().text; + // text rendering matrix without the font size (ISO 32000-1 9.4.4): the font + // size scales x and y, horizontal scaling scales x only, rise offsets y.
+ Matrix params{text.horizontal_scaling / 100.0, 0, 0, 1, 0, text.rise}; + return params * text.matrix * current().general.transform_matrix; +} + +void GraphicsState::next_line(double tx, double ty) { + Text &text = current().text; + text.line_matrix = Matrix::translation(tx, ty) * text.line_matrix; + text.matrix = text.line_matrix; +} + void GraphicsState::execute(const GraphicsOperator &op) { switch (op.type) { case GraphicsOperatorType::save_state: @@ -40,9 +60,9 @@ void GraphicsState::execute(const GraphicsOperator &op) { break; case GraphicsOperatorType::set_matrix: - for (int i = 0; i < 6; ++i) { - current().general.transform_matrix.at(i) = op.arguments.at(i).as_real(); - } + // `cm` concatenates: CTM = matrix * CTM (ISO 32000-1 8.4.4). + current().general.transform_matrix = + matrix_from_args(op) * current().general.transform_matrix; break; case GraphicsOperatorType::set_line_width: @@ -123,21 +143,33 @@ void GraphicsState::execute(const GraphicsOperator &op) { current().text.rise = op.arguments.at(0).as_real(); break; - case GraphicsOperatorType::text_next_line_relative: - for (int i = 0; i < 2; ++i) { - current().text.offset.at(i) += op.arguments.at(i).as_real(); - } + case GraphicsOperatorType::begin_text: + // BT initializes both the text matrix and the text line matrix to identity. 
+ current().text.matrix = Matrix(); + current().text.line_matrix = Matrix(); break; - case GraphicsOperatorType::text_next_line_relative_leading: + case GraphicsOperatorType::text_next_line_relative: // Td + next_line(op.arguments.at(0).as_real(), op.arguments.at(1).as_real()); + break; + case GraphicsOperatorType::text_next_line_relative_leading: // TD current().text.leading = -op.arguments.at(1).as_real(); - for (int i = 0; i < 2; ++i) { - current().text.offset.at(i) += op.arguments.at(i).as_real(); - } + next_line(op.arguments.at(0).as_real(), op.arguments.at(1).as_real()); break; - case GraphicsOperatorType::set_text_matrix: - for (int i = 0; i < 6; ++i) { - current().text.transform_matrix.at(i) = op.arguments.at(i).as_real(); - } + case GraphicsOperatorType::set_text_matrix: // Tm + current().text.matrix = matrix_from_args(op); + current().text.line_matrix = current().text.matrix; + break; + case GraphicsOperatorType::text_next_line: // T* + next_line(0, -current().text.leading); + break; + case GraphicsOperatorType::show_text_next_line: // ' : T* then show + next_line(0, -current().text.leading); + break; + case GraphicsOperatorType::show_text_next_line_set_spacing: + // " : aw ac string -> set word/char spacing, T*, then show + current().text.word_spacing = op.arguments.at(0).as_real(); + current().text.char_spacing = op.arguments.at(1).as_real(); + next_line(0, -current().text.leading); break; case GraphicsOperatorType::set_stroke_color_space: diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index b65079f2..6020a66f 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include #include #include @@ -25,7 +27,7 @@ struct GraphicsState { std::string color_rendering_intent; double flatness_tolerance{}; std::string graphics_state_parameters; - std::array transform_matrix{1, 0, 0, 1, 0, 0}; + Matrix transform_matrix; 
// CTM }; struct Path { @@ -34,16 +36,16 @@ struct GraphicsState { }; struct Text { - double char_spacing{0}; - double word_spacing{0}; - double horizontal_scaling{1}; - double leading{0}; - std::string font; - double size{}; - int rendering_mode{0}; - double rise{0}; - std::array offset{0, 0}; - std::array transform_matrix{1, 0, 0, 1, 0, 0}; + double char_spacing{0}; // Tc + double word_spacing{0}; // Tw + double horizontal_scaling{100}; // Tz, in percent (100 = normal) + double leading{0}; // TL + std::string font; // Tf resource name + double size{}; // Tf size + int rendering_mode{0}; // Tr + double rise{0}; // Ts + Matrix matrix; // Tm + Matrix line_matrix; // Tlm std::array glyph_width{}; std::array glyph_bounding_box{}; }; @@ -72,6 +74,17 @@ struct GraphicsState { [[nodiscard]] const State &current() const; void execute(const GraphicsOperator &); + + /// Text rendering matrix *excluding* the font size: maps text space (1 unit = + /// 1 em at the current font size) to user space, with horizontal scaling and + /// rise folded in. The font size is applied separately (as the rendered + /// font-size), which keeps the run-vs-glyph mapping decision in the renderer. + [[nodiscard]] Matrix text_placement_matrix() const; + +private: + /// Move to the start of a new text line: `Tlm = translate(tx, ty) * Tlm` and + /// `Tm = Tlm` (the shared mechanic behind `Td`, `TD`, `T*`, `'`, `"`).
+ void next_line(double tx, double ty); }; } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_page_text.cpp b/src/odr/internal/pdf/pdf_page_text.cpp new file mode 100644 index 00000000..313fb4c0 --- /dev/null +++ b/src/odr/internal/pdf/pdf_page_text.cpp @@ -0,0 +1,102 @@ +#include + +#include + +#include +#include +#include +#include + +#include +#include + +namespace odr::internal::pdf { + +namespace { + +Font *lookup_font(const Resources &resources, const std::string &name, + const Logger &logger, std::set &warned) { + if (const auto it = resources.font.find(name); it != resources.font.end()) { + return it->second; + } + if (warned.insert(name).second) { + ODR_WARNING(logger, "pdf: unknown font resource '" + name + + "', emitting raw codes"); + } + return nullptr; +} + +void emit(std::vector &out, const GraphicsState &state, + std::string codes, Font *font) { + const GraphicsState::Text &text = state.current().text; + + TextElement element; + element.transform = state.text_placement_matrix(); + element.font = font; + element.size = text.size; + element.char_spacing = text.char_spacing; + element.word_spacing = text.word_spacing; + element.horizontal_scaling = text.horizontal_scaling; + element.rise = text.rise; + element.rendering_mode = text.rendering_mode; + element.text = font != nullptr ? font->to_unicode(codes) : codes; + element.codes = std::move(codes); + + out.push_back(std::move(element)); +} + +/// Concatenate the string elements of a `TJ` array, dropping the numeric +/// position adjustments (their application is stage 2.2). 
+std::string join_array_strings(const Array &array) { + std::string codes; + for (const Object &element : array) { + if (element.is_string()) { + codes += element.as_string(); + } + } + return codes; +} + +} // namespace + +std::vector extract_text(const std::string &content, + const Resources &resources, + const Logger &logger) { + std::vector result; + std::set warned; + + std::istringstream ss(content); + GraphicsOperatorParser parser(ss); + GraphicsState state; + + while (!ss.eof()) { + const GraphicsOperator op = parser.read_operator(); + state.execute(op); + + switch (op.type) { + case GraphicsOperatorType::show_text: + case GraphicsOperatorType::show_text_next_line: { // Tj, ' + Font *font = + lookup_font(resources, state.current().text.font, logger, warned); + emit(result, state, op.arguments.at(0).as_string(), font); + } break; + case GraphicsOperatorType::show_text_manual_spacing: { // TJ + Font *font = + lookup_font(resources, state.current().text.font, logger, warned); + emit(result, state, join_array_strings(op.arguments.at(0).as_array()), + font); + } break; + case GraphicsOperatorType::show_text_next_line_set_spacing: { // " + Font *font = + lookup_font(resources, state.current().text.font, logger, warned); + emit(result, state, op.arguments.at(2).as_string(), font); + } break; + default: + break; + } + } + + return result; +} + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_page_text.hpp b/src/odr/internal/pdf/pdf_page_text.hpp new file mode 100644 index 00000000..bbacbe2a --- /dev/null +++ b/src/odr/internal/pdf/pdf_page_text.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include + +#include +#include + +namespace odr { +class Logger; +} + +namespace odr::internal::pdf { + +struct Resources; +struct Font; + +/// One show-text operation laid out in user space. The transform places the +/// text origin and orientation; the font size is kept separate so the renderer +/// can choose per-run or per-glyph mapping. 
Spacing parameters and the raw +/// codes are carried for the (deferred) advance application. +struct TextElement { + /// Text-space -> user-space, font size *not* applied (see + /// `GraphicsState::text_placement_matrix`). + Matrix transform; + /// Resolved font, or `nullptr` when the `/Font` resource name was unknown. + Font *font{nullptr}; + double size{0}; // Tf size + double char_spacing{0}; // Tc + double word_spacing{0}; // Tw + double horizontal_scaling{100}; // Tz, percent + double rise{0}; // Ts + int rendering_mode{0}; // Tr + /// Raw character codes shown (for `TJ`, the array's strings concatenated). + std::string codes; + /// Unicode representation of `codes`; may lack spaces the producer cannot + /// infer (space inference is stage 2.5). + std::string text; +}; + +/// Execute a page's (decoded, concatenated) content stream and collect the text +/// it shows as placed elements. Non-text operators update the graphics state +/// but produce no output. Glyph advances are not yet applied (stage 2.2), so +/// each show operation yields a single element at its starting origin. 
+std::vector extract_text(const std::string &content, + const Resources &resources, + const Logger &logger); + +} // namespace odr::internal::pdf diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0ac3f996..87298a35 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,7 +47,9 @@ add_executable(odr_test "src/internal/pdf/pdf_file_object.cpp" "src/internal/pdf/pdf_file_parser.cpp" "src/internal/pdf/pdf_filter.cpp" + "src/internal/pdf/pdf_geometry.cpp" "src/internal/pdf/pdf_object_parser.cpp" + "src/internal/pdf/pdf_page_text.cpp" "src/internal/pdf/pdf_test_file_builder.cpp" "src/internal/svm/svm_test.cpp" diff --git a/test/src/internal/pdf/pdf_geometry.cpp b/test/src/internal/pdf/pdf_geometry.cpp new file mode 100644 index 00000000..38dec04d --- /dev/null +++ b/test/src/internal/pdf/pdf_geometry.cpp @@ -0,0 +1,52 @@ +#include + +#include + +using namespace odr::internal::pdf; + +TEST(PdfGeometry, identity_apply) { + const auto p = Matrix().apply(3, 4); + EXPECT_DOUBLE_EQ(p[0], 3); + EXPECT_DOUBLE_EQ(p[1], 4); +} + +TEST(PdfGeometry, translation_apply) { + const auto p = Matrix::translation(10, -5).apply(3, 4); + EXPECT_DOUBLE_EQ(p[0], 13); + EXPECT_DOUBLE_EQ(p[1], -1); +} + +TEST(PdfGeometry, scaling_apply) { + const auto p = Matrix::scaling(2, 3).apply(3, 4); + EXPECT_DOUBLE_EQ(p[0], 6); + EXPECT_DOUBLE_EQ(p[1], 12); +} + +// `a * b` applies `a` first, then `b` (row-vector convention), so order +// matters. +TEST(PdfGeometry, compose_is_ordered) { + // scale, then translate + const auto p = + (Matrix::scaling(2, 2) * Matrix::translation(1, 1)).apply(3, 4); + EXPECT_DOUBLE_EQ(p[0], 7); // (3,4) -> (6,8) -> (7,9) + EXPECT_DOUBLE_EQ(p[1], 9); + + // translate, then scale -> different result + const auto q = + (Matrix::translation(1, 1) * Matrix::scaling(2, 2)).apply(3, 4); + EXPECT_DOUBLE_EQ(q[0], 8); // (3,4) -> (4,5) -> (8,10) + EXPECT_DOUBLE_EQ(q[1], 10); +} + +// Composing then applying equals applying each factor in sequence. 
+TEST(PdfGeometry, compose_matches_sequential_apply) { + const Matrix a{1, 2, 3, 4, 5, 6}; + const Matrix b{2, 0, 1, 3, -1, 4}; + + const auto direct = (a * b).apply(7, 8); + const auto step = a.apply(7, 8); + const auto seq = b.apply(step[0], step[1]); + + EXPECT_DOUBLE_EQ(direct[0], seq[0]); + EXPECT_DOUBLE_EQ(direct[1], seq[1]); +} diff --git a/test/src/internal/pdf/pdf_page_text.cpp b/test/src/internal/pdf/pdf_page_text.cpp new file mode 100644 index 00000000..efe4f33f --- /dev/null +++ b/test/src/internal/pdf/pdf_page_text.cpp @@ -0,0 +1,109 @@ +#include + +#include + +#include + +#include +#include + +#include + +using namespace odr::internal::pdf; +using odr::Logger; + +namespace { + +// Run a content stream with no font resources: fonts resolve to null, so the +// emitted `text` is the raw codes and we can assert positioning in isolation. +std::vector run(const std::string &content) { + Resources resources; + return extract_text(content, resources, Logger::null()); +} + +} // namespace + +// `Td` places the origin via the text line matrix; the font size is carried +// separately, not folded into the transform. +TEST(PdfPageText, td_translation) { + const auto texts = run("BT /F1 12 Tf 100 700 Td (Hi) Tj ET"); + ASSERT_EQ(texts.size(), 1); + EXPECT_DOUBLE_EQ(texts[0].transform.e, 100); + EXPECT_DOUBLE_EQ(texts[0].transform.f, 700); + EXPECT_DOUBLE_EQ(texts[0].transform.a, 1); + EXPECT_DOUBLE_EQ(texts[0].transform.d, 1); + EXPECT_DOUBLE_EQ(texts[0].size, 12); + EXPECT_EQ(texts[0].codes, "Hi"); + EXPECT_EQ(texts[0].text, "Hi"); // no font -> raw codes pass through +} + +// `Tm` sets the text matrix outright, scaling and all. 
+TEST(PdfPageText, tm_scaling) { + const auto texts = run("BT /F1 10 Tf 2 0 0 2 50 60 Tm (X) Tj ET"); + ASSERT_EQ(texts.size(), 1); + EXPECT_DOUBLE_EQ(texts[0].transform.a, 2); + EXPECT_DOUBLE_EQ(texts[0].transform.d, 2); + EXPECT_DOUBLE_EQ(texts[0].transform.e, 50); + EXPECT_DOUBLE_EQ(texts[0].transform.f, 60); +} + +// `cm` concatenates into the CTM, which composes under the text matrix. +TEST(PdfPageText, ctm_concatenates) { + const auto texts = run("2 0 0 2 0 0 cm BT /F1 10 Tf 10 20 Td (Y) Tj ET"); + ASSERT_EQ(texts.size(), 1); + EXPECT_DOUBLE_EQ(texts[0].transform.a, 2); + EXPECT_DOUBLE_EQ(texts[0].transform.d, 2); + EXPECT_DOUBLE_EQ(texts[0].transform.e, 20); // 10 * 2 + EXPECT_DOUBLE_EQ(texts[0].transform.f, 40); // 20 * 2 +} + +// Horizontal scaling (`Tz`, percent) scales x only, in the transform. +TEST(PdfPageText, horizontal_scaling) { + const auto texts = run("BT /F1 10 Tf 50 Tz 0 0 Td (Z) Tj ET"); + ASSERT_EQ(texts.size(), 1); + EXPECT_DOUBLE_EQ(texts[0].transform.a, 0.5); + EXPECT_DOUBLE_EQ(texts[0].transform.d, 1); + EXPECT_DOUBLE_EQ(texts[0].horizontal_scaling, 50); +} + +// Text rise (`Ts`) offsets the origin in y, unscaled by the font size. +TEST(PdfPageText, text_rise) { + const auto texts = run("BT /F1 10 Tf 0 0 Td 5 Ts (R) Tj ET"); + ASSERT_EQ(texts.size(), 1); + EXPECT_DOUBLE_EQ(texts[0].transform.f, 5); + EXPECT_DOUBLE_EQ(texts[0].rise, 5); +} + +// `TJ` concatenates its strings; the numeric adjustments are dropped +// (stage 2.2). +TEST(PdfPageText, tj_concatenates_strings) { + const auto texts = run("BT /F1 10 Tf 0 0 Td [(Ab) -120 (cd)] TJ ET"); + ASSERT_EQ(texts.size(), 1); + EXPECT_EQ(texts[0].codes, "Abcd"); +} + +// `T*` moves down one line by the leading set with `TL`. 
+TEST(PdfPageText, next_line_uses_leading) { + const auto texts = run("BT /F1 10 Tf 14 TL 0 800 Td (a) Tj T* (b) Tj ET"); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[0].transform.f, 800); + EXPECT_DOUBLE_EQ(texts[1].transform.f, 786); // 800 - 14 +} + +// `'` does the line move and then shows. +TEST(PdfPageText, show_next_line) { + const auto texts = run("BT /F1 10 Tf 10 TL 0 500 Td (a) Tj (b) ' ET"); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[1].transform.f, 490); // 500 - 10 + EXPECT_EQ(texts[1].codes, "b"); +} + +// `"` sets word/char spacing, does the line move, then shows the third operand. +TEST(PdfPageText, show_next_line_set_spacing) { + const auto texts = run("BT /F1 10 Tf 12 TL 0 400 Td (a) Tj 1 2 (b) \" ET"); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[1].transform.f, 388); // 400 - 12 + EXPECT_EQ(texts[1].codes, "b"); + EXPECT_DOUBLE_EQ(texts[1].word_spacing, 1); + EXPECT_DOUBLE_EQ(texts[1].char_spacing, 2); +} From 12bee6b5827c627f17a470407414847000a8ffa6 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 21:03:52 +0200 Subject: [PATCH 2/7] refine --- src/odr/internal/html/pdf_file.cpp | 12 ++--- src/odr/internal/pdf/pdf_geometry.hpp | 41 -------------- src/odr/internal/pdf/pdf_graphics_state.cpp | 16 +++--- src/odr/internal/pdf/pdf_graphics_state.hpp | 26 ++++----- src/odr/internal/pdf/pdf_page_text.hpp | 4 +- src/odr/internal/util/math_util.hpp | 53 +++++++++++++++++++ test/CMakeLists.txt | 2 +- .../math_util_test.cpp} | 28 +++++----- 8 files changed, 98 insertions(+), 84 deletions(-) delete mode 100644 src/odr/internal/pdf/pdf_geometry.hpp create mode 100644 src/odr/internal/util/math_util.hpp rename test/src/internal/{pdf/pdf_geometry.cpp => util/math_util_test.cpp} (55%) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 24371271..d366803f 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -12,7 +12,6 @@ #include 
#include #include -#include #include namespace odr::internal::html { @@ -74,7 +73,7 @@ class HtmlServiceImpl final : public HtmlService { const auto &pdf_file = dynamic_cast(*m_pdf_file.impl()); pdf::DocumentParser parser = pdf_file.create_parser(*m_logger); - std::unique_ptr document = parser.parse_document(); + const std::unique_ptr document = parser.parse_document(); const std::vector pages = document->collect_pages(); @@ -116,13 +115,14 @@ class HtmlServiceImpl final : public HtmlService { // Map PDF user space (origin at the MediaBox corner, y up) to the page // box in CSS pixels (origin top-left, y down). `flip_glyph` un-mirrors // the glyphs so text stays upright after the page flip. - const pdf::Matrix flip_glyph{1, 0, 0, -1, 0, 0}; - const pdf::Matrix to_box = pdf::Matrix::translation(-box_x0, -box_y0) * - pdf::Matrix{1, 0, 0, -1, 0, height}; + const util::math::Transform2D flip_glyph{1, 0, 0, -1, 0, 0}; + const util::math::Transform2D to_box = + util::math::Transform2D::translation(-box_x0, -box_y0) * + util::math::Transform2D{1, 0, 0, -1, 0, height}; for (const pdf::TextElement &text : pdf::extract_text(stream, *page->resources, *m_logger)) { - const pdf::Matrix m = flip_glyph * text.transform * to_box; + const util::math::Transform2D m = flip_glyph * text.transform * to_box; out.write_element_begin( "span", HtmlElementOptions().set_style([&](std::ostream &o) { diff --git a/src/odr/internal/pdf/pdf_geometry.hpp b/src/odr/internal/pdf/pdf_geometry.hpp deleted file mode 100644 index ca6863e9..00000000 --- a/src/odr/internal/pdf/pdf_geometry.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -#include - -namespace odr::internal::pdf { - -/// 2-D affine transform in PDF's convention: the matrix `[a b c d e f]` denotes -/// -/// | a b 0 | -/// | c d 0 | -/// | e f 1 | -/// -/// and points are row vectors multiplied on the left: `[x y 1] * M`. -/// Composition follows the same order, so `a * b` is "apply `a`, then `b`" (ISO -/// 32000-1 8.3.4). 
-struct Matrix { - double a{1}, b{0}, c{0}, d{1}, e{0}, f{0}; - - Matrix() = default; - Matrix(double a, double b, double c, double d, double e, double f) - : a{a}, b{b}, c{c}, d{d}, e{e}, f{f} {} - - static Matrix translation(double x, double y) { return {1, 0, 0, 1, x, y}; } - static Matrix scaling(double x, double y) { return {x, 0, 0, y, 0, 0}; } - - /// `*this` applied first, then `rhs`. - [[nodiscard]] Matrix operator*(const Matrix &rhs) const { - return { - a * rhs.a + b * rhs.c, a * rhs.b + b * rhs.d, - c * rhs.a + d * rhs.c, c * rhs.b + d * rhs.d, - e * rhs.a + f * rhs.c + rhs.e, e * rhs.b + f * rhs.d + rhs.f, - }; - } - - /// Map the point `(x, y)` through the transform. - [[nodiscard]] std::array apply(double x, double y) const { - return {a * x + c * y + e, b * x + d * y + f}; - } -}; - -} // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 60010c47..6d261097 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -20,7 +20,7 @@ ColorSpace color_space_name_to_enum(const std::string &name) { return util::map::lookup_default(mapping, name, ColorSpace::unknown); } -Matrix matrix_from_args(const GraphicsOperator &op) { +util::math::Transform2D matrix_from_args(const GraphicsOperator &op) { return {op.arguments.at(0).as_real(), op.arguments.at(1).as_real(), op.arguments.at(2).as_real(), op.arguments.at(3).as_real(), op.arguments.at(4).as_real(), op.arguments.at(5).as_real()}; @@ -36,17 +36,19 @@ const GraphicsState::State &GraphicsState::current() const { return stack.back(); } -Matrix GraphicsState::text_placement_matrix() const { +util::math::Transform2D GraphicsState::text_placement_matrix() const { const Text &text = current().text; // text rendering matrix without the font size (ISO 32000-1 9.4.4): the font // size scales x and y, horizontal scaling scales x only, rise offsets y. 
- Matrix params{text.horizontal_scaling / 100.0, 0, 0, 1, 0, text.rise}; + util::math::Transform2D params{ + text.horizontal_scaling / 100.0, 0, 0, 1, 0, text.rise}; return params * text.matrix * current().general.transform_matrix; } -void GraphicsState::next_line(double tx, double ty) { +void GraphicsState::next_line(const double tx, const double ty) { Text &text = current().text; - text.line_matrix = Matrix::translation(tx, ty) * text.line_matrix; + text.line_matrix = + util::math::Transform2D::translation(tx, ty) * text.line_matrix; text.matrix = text.line_matrix; } @@ -145,8 +147,8 @@ void GraphicsState::execute(const GraphicsOperator &op) { case GraphicsOperatorType::begin_text: // BT initializes both the text matrix and the text line matrix to identity. - current().text.matrix = Matrix(); - current().text.line_matrix = Matrix(); + current().text.matrix = util::math::Transform2D(); + current().text.line_matrix = util::math::Transform2D(); break; case GraphicsOperatorType::text_next_line_relative: // Td next_line(op.arguments.at(0).as_real(), op.arguments.at(1).as_real()); diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index 6020a66f..242b0921 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -27,7 +27,7 @@ struct GraphicsState { std::string color_rendering_intent; double flatness_tolerance{}; std::string graphics_state_parameters; - Matrix transform_matrix; // CTM + util::math::Transform2D transform_matrix; // CTM }; struct Path { @@ -36,16 +36,16 @@ struct GraphicsState { }; struct Text { - double char_spacing{0}; // Tc - double word_spacing{0}; // Tw - double horizontal_scaling{100}; // Tz, in percent (100 = normal) - double leading{0}; // TL - std::string font; // Tf resource name - double size{}; // Tf size - int rendering_mode{0}; // Tr - double rise{0}; // Ts - Matrix 
matrix; // Tm - Matrix line_matrix; // Tlm + double char_spacing{0}; // Tc + double word_spacing{0}; // Tw + double horizontal_scaling{100}; // Tz, in percent (100 = normal) + double leading{0}; // TL + std::string font; // Tf resource name + double size{}; // Tf size + int rendering_mode{0}; // Tr + double rise{0}; // Ts + util::math::Transform2D matrix; // Tm + util::math::Transform2D line_matrix; // Tlm std::array glyph_width{}; std::array glyph_bounding_box{}; }; @@ -79,7 +79,7 @@ struct GraphicsState { /// 1 em at the current font size) to user space, with horizontal scaling and /// rise folded in. The font size is applied separately (as the rendered /// font-size), which keeps the run-vs-glyph mapping decision in the renderer. - [[nodiscard]] Matrix text_placement_matrix() const; + [[nodiscard]] util::math::Transform2D text_placement_matrix() const; private: /// Move to the start of a new text line: `Tlm = translate(tx, ty) * Tlm` and diff --git a/src/odr/internal/pdf/pdf_page_text.hpp b/src/odr/internal/pdf/pdf_page_text.hpp index bbacbe2a..eff56332 100644 --- a/src/odr/internal/pdf/pdf_page_text.hpp +++ b/src/odr/internal/pdf/pdf_page_text.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -21,7 +21,7 @@ struct Font; struct TextElement { /// Text-space -> user-space, font size *not* applied (see /// `GraphicsState::text_placement_matrix`). - Matrix transform; + util::math::Transform2D transform; /// Resolved font, or `nullptr` when the `/Font` resource name was unknown. 
Font *font{nullptr}; double size{0}; // Tf size diff --git a/src/odr/internal/util/math_util.hpp b/src/odr/internal/util/math_util.hpp new file mode 100644 index 00000000..5b7fcc93 --- /dev/null +++ b/src/odr/internal/util/math_util.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include + +namespace odr::internal::util::math { + +/// 2-D affine transform using PDF's row-vector convention: `[a b c d e f]` +/// denotes +/// +/// | a b 0 | +/// | c d 0 | +/// | e f 1 | +/// +/// Points are row vectors multiplied on the left: `[x y 1] * M`. Composition +/// follows the same order: `a * b` means "apply `a`, then `b`" (ISO 32000-1 +/// 8.3.4). +struct Transform2D { + double a{1}; + double b{0}; + double c{0}; + double d{1}; + double e{0}; + double f{0}; + + Transform2D() = default; + Transform2D(const double a_, const double b_, const double c_, + const double d_, const double e_, const double f_) + : a{a_}, b{b_}, c{c_}, d{d_}, e{e_}, f{f_} {} + + static Transform2D translation(const double x, const double y) { + return {1, 0, 0, 1, x, y}; + } + static Transform2D scaling(const double x, const double y) { + return {x, 0, 0, y, 0, 0}; + } + + /// `*this` applied first, then `rhs`. + [[nodiscard]] Transform2D operator*(const Transform2D &rhs) const { + return { + a * rhs.a + b * rhs.c, a * rhs.b + b * rhs.d, + c * rhs.a + d * rhs.c, c * rhs.b + d * rhs.d, + e * rhs.a + f * rhs.c + rhs.e, e * rhs.b + f * rhs.d + rhs.f, + }; + } + + /// Map the point `(x, y)` through the transform. 
+ [[nodiscard]] std::array apply(const double x, + const double y) const { + return {a * x + c * y + e, b * x + d * y + f}; + } +}; + +} // namespace odr::internal::util::math diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 87298a35..17ba1af6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -47,7 +47,7 @@ add_executable(odr_test "src/internal/pdf/pdf_file_object.cpp" "src/internal/pdf/pdf_file_parser.cpp" "src/internal/pdf/pdf_filter.cpp" - "src/internal/pdf/pdf_geometry.cpp" + "src/internal/util/math_util_test.cpp" "src/internal/pdf/pdf_object_parser.cpp" "src/internal/pdf/pdf_page_text.cpp" "src/internal/pdf/pdf_test_file_builder.cpp" diff --git a/test/src/internal/pdf/pdf_geometry.cpp b/test/src/internal/util/math_util_test.cpp similarity index 55% rename from test/src/internal/pdf/pdf_geometry.cpp rename to test/src/internal/util/math_util_test.cpp index 38dec04d..8b02ecbc 100644 --- a/test/src/internal/pdf/pdf_geometry.cpp +++ b/test/src/internal/util/math_util_test.cpp @@ -1,47 +1,47 @@ -#include +#include #include -using namespace odr::internal::pdf; +using namespace odr::internal::util::math; -TEST(PdfGeometry, identity_apply) { - const auto p = Matrix().apply(3, 4); +TEST(Transform2D, identity_apply) { + const auto p = Transform2D().apply(3, 4); EXPECT_DOUBLE_EQ(p[0], 3); EXPECT_DOUBLE_EQ(p[1], 4); } -TEST(PdfGeometry, translation_apply) { - const auto p = Matrix::translation(10, -5).apply(3, 4); +TEST(Transform2D, translation_apply) { + const auto p = Transform2D::translation(10, -5).apply(3, 4); EXPECT_DOUBLE_EQ(p[0], 13); EXPECT_DOUBLE_EQ(p[1], -1); } -TEST(PdfGeometry, scaling_apply) { - const auto p = Matrix::scaling(2, 3).apply(3, 4); +TEST(Transform2D, scaling_apply) { + const auto p = Transform2D::scaling(2, 3).apply(3, 4); EXPECT_DOUBLE_EQ(p[0], 6); EXPECT_DOUBLE_EQ(p[1], 12); } // `a * b` applies `a` first, then `b` (row-vector convention), so order // matters. 
-TEST(PdfGeometry, compose_is_ordered) { +TEST(Transform2D, compose_is_ordered) { // scale, then translate const auto p = - (Matrix::scaling(2, 2) * Matrix::translation(1, 1)).apply(3, 4); + (Transform2D::scaling(2, 2) * Transform2D::translation(1, 1)).apply(3, 4); EXPECT_DOUBLE_EQ(p[0], 7); // (3,4) -> (6,8) -> (7,9) EXPECT_DOUBLE_EQ(p[1], 9); // translate, then scale -> different result const auto q = - (Matrix::translation(1, 1) * Matrix::scaling(2, 2)).apply(3, 4); + (Transform2D::translation(1, 1) * Transform2D::scaling(2, 2)).apply(3, 4); EXPECT_DOUBLE_EQ(q[0], 8); // (3,4) -> (4,5) -> (8,10) EXPECT_DOUBLE_EQ(q[1], 10); } // Composing then applying equals applying each factor in sequence. -TEST(PdfGeometry, compose_matches_sequential_apply) { - const Matrix a{1, 2, 3, 4, 5, 6}; - const Matrix b{2, 0, 1, 3, -1, 4}; +TEST(Transform2D, compose_matches_sequential_apply) { + const Transform2D a{1, 2, 3, 4, 5, 6}; + const Transform2D b{2, 0, 1, 3, -1, 4}; const auto direct = (a * b).apply(7, 8); const auto step = a.apply(7, 8); From e889169c52291c73bd4feb5fecf27655f3c73fb4 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 21:22:50 +0200 Subject: [PATCH 3/7] few more minor --- src/odr/internal/html/pdf_file.cpp | 5 ++-- src/odr/internal/pdf/pdf_graphics_state.cpp | 7 +++--- src/odr/internal/pdf/pdf_graphics_state.hpp | 11 +++++---- src/odr/internal/pdf/pdf_page_text.cpp | 15 +++++++----- src/odr/internal/util/math_util.hpp | 26 ++++++++++++++------- 5 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index d366803f..d96d180a 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -90,6 +90,7 @@ class HtmlServiceImpl final : public HtmlService { // CSS uses 96px to the inch, PDF user space 72 units to the inch. 
static constexpr double pt_to_px = 96.0 / 72.0; + static constexpr double to_in = 1 / 72.0; for (pdf::Page *page : pages) { const pdf::Array &page_box = page->media_box.as_array(); @@ -101,8 +102,8 @@ class HtmlServiceImpl final : public HtmlService { out.write_element_begin( "div", HtmlElementOptions().set_style([&](std::ostream &o) { o << "position:relative;"; - o << "width:" << width / 72.0 << "in;"; - o << "height:" << height / 72.0 << "in;"; + o << "width:" << width * to_in << "in;"; + o << "height:" << height * to_in << "in;"; })); std::string stream; diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 6d261097..7c415029 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -36,12 +36,13 @@ const GraphicsState::State &GraphicsState::current() const { return stack.back(); } -util::math::Transform2D GraphicsState::text_placement_matrix() const { +util::math::Transform2D GraphicsState::text_placement_transform() const { const Text &text = current().text; // text rendering matrix without the font size (ISO 32000-1 9.4.4): the font // size scales x and y, horizontal scaling scales x only, rise offsets y. 
- util::math::Transform2D params{ - text.horizontal_scaling / 100.0, 0, 0, 1, 0, text.rise}; + const util::math::Transform2D params = + util::math::Transform2D::translation_scaling( + 0, text.rise, text.horizontal_scaling / 100.0, 1); return params * text.matrix * current().general.transform_matrix; } diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index 242b0921..98c03a75 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -75,11 +75,12 @@ struct GraphicsState { void execute(const GraphicsOperator &); - /// Text rendering matrix *excluding* the font size: maps text space (1 unit = - /// 1 em at the current font size) to user space, with horizontal scaling and - /// rise folded in. The font size is applied separately (as the rendered - /// font-size), which keeps the run-vs-glyph mapping decision in the renderer. - [[nodiscard]] util::math::Transform2D text_placement_matrix() const; + /// Text rendering transform *excluding* the font size: maps text space (1 + /// unit = 1 em at the current font size) to user space, with horizontal + /// scaling and rise folded in. The font size is applied separately (as the + /// rendered font-size), which keeps the run-vs-glyph mapping decision in the + /// renderer. 
+ [[nodiscard]] util::math::Transform2D text_placement_transform() const; private: /// Move to the start of a new text line: `Tlm = translate(tx, ty) * Tlm` and diff --git a/src/odr/internal/pdf/pdf_page_text.cpp b/src/odr/internal/pdf/pdf_page_text.cpp index 313fb4c0..ee2ac2e5 100644 --- a/src/odr/internal/pdf/pdf_page_text.cpp +++ b/src/odr/internal/pdf/pdf_page_text.cpp @@ -11,7 +11,6 @@ #include namespace odr::internal::pdf { - namespace { Font *lookup_font(const Resources &resources, const std::string &name, @@ -31,7 +30,7 @@ void emit(std::vector &out, const GraphicsState &state, const GraphicsState::Text &text = state.current().text; TextElement element; - element.transform = state.text_placement_matrix(); + element.transform = state.text_placement_transform(); element.font = font; element.size = text.size; element.char_spacing = text.char_spacing; @@ -59,9 +58,13 @@ std::string join_array_strings(const Array &array) { } // namespace -std::vector extract_text(const std::string &content, - const Resources &resources, - const Logger &logger) { +} // namespace odr::internal::pdf + +namespace odr::internal { + +std::vector pdf::extract_text(const std::string &content, + const Resources &resources, + const Logger &logger) { std::vector result; std::set warned; @@ -99,4 +102,4 @@ std::vector extract_text(const std::string &content, return result; } -} // namespace odr::internal::pdf +} // namespace odr::internal diff --git a/src/odr/internal/util/math_util.hpp b/src/odr/internal/util/math_util.hpp index 5b7fcc93..e5fa57b5 100644 --- a/src/odr/internal/util/math_util.hpp +++ b/src/odr/internal/util/math_util.hpp @@ -22,20 +22,30 @@ struct Transform2D { double e{0}; double f{0}; - Transform2D() = default; - Transform2D(const double a_, const double b_, const double c_, - const double d_, const double e_, const double f_) + constexpr Transform2D() noexcept = default; + constexpr Transform2D(const double a_, const double b_, const double c_, + const double d_, const 
double e_, + const double f_) noexcept : a{a_}, b{b_}, c{c_}, d{d_}, e{e_}, f{f_} {} - static Transform2D translation(const double x, const double y) { + constexpr static Transform2D translation(const double x, + const double y) noexcept { return {1, 0, 0, 1, x, y}; } - static Transform2D scaling(const double x, const double y) { + constexpr static Transform2D scaling(const double x, + const double y) noexcept { return {x, 0, 0, y, 0, 0}; } + constexpr static Transform2D translation_scaling(const double x, + const double y, + const double sx, + const double sy) noexcept { + return {sx, 0, 0, sy, x, y}; + } /// `*this` applied first, then `rhs`. - [[nodiscard]] Transform2D operator*(const Transform2D &rhs) const { + [[nodiscard]] constexpr Transform2D + operator*(const Transform2D &rhs) const noexcept { return { a * rhs.a + b * rhs.c, a * rhs.b + b * rhs.d, c * rhs.a + d * rhs.c, c * rhs.b + d * rhs.d, @@ -44,8 +54,8 @@ struct Transform2D { } /// Map the point `(x, y)` through the transform. - [[nodiscard]] std::array apply(const double x, - const double y) const { + [[nodiscard]] constexpr std::array + apply(const double x, const double y) const noexcept { return {a * x + c * y + e, b * x + d * y + f}; } }; From 110fa1f05d1074ec6dbc65b20e0d55937a0a73db Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 21:37:44 +0200 Subject: [PATCH 4/7] =?UTF-8?q?AGENTS.md:=20update=20Matrix=20=E2=86=92=20?= =?UTF-8?q?Transform2D=20and=20pdf=5Fgeometry.hpp=20=E2=86=92=20util/math?= =?UTF-8?q?=5Futil.hpp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- src/odr/internal/pdf/AGENTS.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 98b0d428..fd55e21c 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -111,7 +111,7 @@ graphics, no images, no font files. 
Experimental and not production-quality. line parameters, text state `Tc`/`Tw`/`Tz`/`TL`/`Tf`/`Tr`/`Ts`, glyph metrics `d0`/`d1`, grey/RGB/CMYK colors). The CTM **concatenates** on `cm` (ISO 32000-1 8.4.4); the text matrix `Tm` and text line matrix `Tlm` are tracked as 2-D - affine `Matrix` values (`pdf_geometry.hpp`), with `BT` resetting them, `Td`/`TD` + affine `Transform2D` values (`util/math_util.hpp`), with `BT` resetting them, `Td`/`TD` /`T*` (and the line-move half of `'`/`"`) advancing `Tlm` → `Tm`. Unknown operators are logged to stderr and skipped. - **Text layout** (`pdf_page_text`, stage 2.1): `extract_text` runs the operator @@ -149,10 +149,10 @@ graphics, no images, no font files. Experimental and not production-quality. | `pdf_encoding.{hpp,cpp}` | Simple-font `/Encoding` → Unicode: `BaseEncoding` tables, `/Differences` overlay (`Encoding`), glyph-name → Unicode via AGL + `uniXXXX`/`uXXXXXX` (stage 1.2) | | `pdf_cid.{hpp,cpp}` | Composite-font predefined `/Encoding` → Unicode: the `Uni*-UCS2/UTF16/UTF32` CMaps decoded directly (no data tables), stage 1.3 part B; legacy CJK CMaps deferred (see `tools/pdf/generate_cid_data.py`) | | `pdf_encoding_data.{hpp,cpp}` | **Generated** (`tools/pdf/generate_encoding_data.py`): base-encoding tables + the Adobe Glyph List as a name-sorted array | -| `pdf_geometry.hpp` | `Matrix`: 2-D affine transform (PDF row-vector convention) — compose, point-apply, translation/scaling factories (stage 2.1) | +| `util/math_util.hpp` | `util::math::Transform2D`: 2-D affine transform (PDF row-vector convention) — compose, point-apply, translation/scaling factories (stage 2.1) | | `pdf_graphics_operator.hpp` | `GraphicsOperatorType` enum (full operator set) + `GraphicsOperator` (type + `Object` arguments) | | `pdf_graphics_operator_parser.{hpp,cpp}` | Content-stream tokenizer: arguments then operator name | -| `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the 
modelled subset; CTM/`Tm`/`Tlm` as `Matrix`, `text_placement_matrix()` for the text rendering transform sans font size | +| `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset; CTM/`Tm`/`Tlm` as `Transform2D`, `text_placement_transform()` for the text rendering transform sans font size | | `pdf_page_text.{hpp,cpp}` | `extract_text`: run the content stream through `GraphicsState`, emit a `TextElement` (placed transform + font/size/spacing + codes + Unicode) per show operation (stage 2.1) | | `pdf_file.{hpp,cpp}` | `abstract::PdfFile` wrapper; probes encryption at construction and implements `password_encrypted()`/`decrypt()`, carrying the authenticated `Decryptor` (not the password) so rendering needs no re-derivation | @@ -323,8 +323,8 @@ such PDFs look right, their text just isn't selectable until the tables land. `translate_predefined_cmap` over the predefined Unicode CMaps — `UCS2`/`UTF16` (incl. a surrogate pair) and `UTF32` decoding, a `-V` writing-mode variant, and the `nullopt` for `Identity-H` and the legacy CJK CMaps (stage 1.3 part B). -- `test/src/internal/pdf/pdf_geometry.cpp` — **assertion-based**, no fixtures: - `Matrix` point-apply (identity/translation/scaling), the ordered +- `test/src/internal/util/math_util_test.cpp` — **assertion-based**, no fixtures: + `Transform2D` point-apply (identity/translation/scaling), the ordered (row-vector) composition, and compose-then-apply ≡ sequential apply (stage 2.1). - `test/src/internal/pdf/pdf_page_text.cpp` — **assertion-based**, inline content streams through `extract_text` (empty resources, so codes pass through as @@ -404,7 +404,7 @@ up-front "HTML mapping decision" is dissolved into this.) 
### 2.1 — transforms & the placed-text emission — **in progress** The geometry foundation plus the emission contract, *without* glyph advances: -- A 2-D affine `Matrix` (`pdf_geometry.hpp`): compose, point-apply, +- A 2-D affine `Transform2D` (`util/math_util.hpp`): compose, point-apply, translation/scaling factories. - Apply the full transform chain in `GraphicsState`: the CTM now *concatenates* on `cm` (it was overwritten); the text matrix `Tm` and text line matrix `Tlm` From f461de306bfd36fc6f77aee2711f723188c4ff8f Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 22:50:46 +0200 Subject: [PATCH 5/7] update all refs; optimize pdf size a bit --- src/odr/internal/html/pdf_file.cpp | 56 ++++++++++++++++++-------- test/data/reference-output/odr-private | 2 +- test/data/reference-output/odr-public | 2 +- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index d96d180a..3e33acfd 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace odr::internal::html { namespace { @@ -84,6 +86,13 @@ class HtmlServiceImpl final : public HtmlService { out.write_header_title("odr"); out.write_header_viewport( "width=device-width,initial-scale=1.0,user-scalable=yes"); + // Constant per-page and per-glyph styling lives in classes so it is not + // repeated inline on every one of the (potentially millions of) spans. 
+ out.write_header_style_begin(); + out.out() << ".p{position:relative}"; + out.out() << ".t{position:absolute;left:0;top:0;transform-origin:0 0;" + "white-space:pre}"; + out.write_header_style_end(); out.write_header_end(); out.write_body_begin(); @@ -100,10 +109,10 @@ class HtmlServiceImpl final : public HtmlService { const double height = page_box[3].as_real() - box_y0; out.write_element_begin( - "div", HtmlElementOptions().set_style([&](std::ostream &o) { - o << "position:relative;"; - o << "width:" << width * to_in << "in;"; - o << "height:" << height * to_in << "in;"; + "div", + HtmlElementOptions().set_class("p").set_style([&](std::ostream &o) { + o << "width:" << width * pt_to_in << "in;"; + o << "height:" << height * pt_to_in << "in;"; })); std::string stream; @@ -116,26 +125,41 @@ class HtmlServiceImpl final : public HtmlService { // Map PDF user space (origin at the MediaBox corner, y up) to the page // box in CSS pixels (origin top-left, y down). `flip_glyph` un-mirrors // the glyphs so text stays upright after the page flip. - const util::math::Transform2D flip_glyph{1, 0, 0, -1, 0, 0}; + constexpr util::math::Transform2D flip_glyph = + util::math::Transform2D::scaling(1, -1); const util::math::Transform2D to_box = util::math::Transform2D::translation(-box_x0, -box_y0) * - util::math::Transform2D{1, 0, 0, -1, 0, height}; + util::math::Transform2D::translation_scaling(0, height, 1, -1); + + // Round CSS coordinates to 0.01px; sub-pixel precision beyond that is + // invisible and the extra digits add up over millions of spans. 
+ const auto round2 = [](const double v) { + return std::round(v * 100.0) / 100.0; + }; for (const pdf::TextElement &text : pdf::extract_text(stream, *page->resources, *m_logger)) { const util::math::Transform2D m = flip_glyph * text.transform * to_box; out.write_element_begin( - "span", HtmlElementOptions().set_style([&](std::ostream &o) { - o << "position:absolute;left:0;top:0;"; - o << "transform-origin:0 0;"; - // TODO baseline sits at the box top until font ascent metrics - // land - o << "transform:matrix(" << m.a << "," << m.b << "," << m.c << "," - << m.d << "," << m.e * pt_to_px << "," << m.f * pt_to_px - << ");"; - o << "font-size:" << text.size * pt_to_px << "px;"; - o << "white-space:pre;"; + "span", + HtmlElementOptions().set_class("t").set_style([&](std::ostream &o) { + // TODO baseline sits at the box top until font ascent + // metrics land + if (m.b == 0 && m.c == 0 && m.a == m.d && m.a > 0) { + // Upright uniform scale: fold the scale into the + // font size and place the origin with left/top, so + // the (otherwise near-universal) matrix is dropped. 
+ o << "left:" << round2(m.e * pt_to_px) << "px;"; + o << "top:" << round2(m.f * pt_to_px) << "px;"; + o << "font-size:" << round2(m.a * text.size * pt_to_px) + << "px;"; + } else { + o << "transform:matrix(" << m.a << "," << m.b << "," << m.c + << "," << m.d << "," << round2(m.e * pt_to_px) << "," + << round2(m.f * pt_to_px) << ");"; + o << "font-size:" << round2(text.size * pt_to_px) << "px;"; + } })); out.write_raw(escape_text(text.text)); out.write_element_end("span"); diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index 83dfd8f5..9c1e64e4 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit 83dfd8f556e33db8db37822351f75305d60e8cb9 +Subproject commit 9c1e64e43623027cbdf1392fb6e8fe7022f2ad70 diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index c5d35204..ceb9001b 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit c5d35204cd3ae501f79c88f26ac80293cefd64c4 +Subproject commit ceb9001b1506b3fa218bd722ce0e6105d321a218 From 619d957e1000e8cb21c485ec4e8293b98ccf0237 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 21:44:52 +0200 Subject: [PATCH 6/7] fix From 57c05e9687330ae4d577c58705c1c9923a4959ab Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 23:04:01 +0200 Subject: [PATCH 7/7] fix --- src/odr/internal/html/pdf_file.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 3e33acfd..6c6910f4 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -99,7 +99,7 @@ class HtmlServiceImpl final : public HtmlService { // CSS uses 96px to the inch, PDF user space 72 units to the inch. 
static constexpr double pt_to_px = 96.0 / 72.0; - static constexpr double to_in = 1 / 72.0; + static constexpr double pt_to_in = 1 / 72.0; for (pdf::Page *page : pages) { const pdf::Array &page_box = page->media_box.as_array();