From 61061bba20befaf6c73360805b0abf6b45d5c70e Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 14:59:55 +0200 Subject: [PATCH 1/6] PDF stage 2.2: glyph advances & metrics Parse font glyph widths and advance the text matrix per glyph, on top of 2.1's placed-text emission, so segments, TJ kerning and lines land in the right place. - Font metrics (pdf_document_parser, Font): /FirstChar + /Widths + /FontDescriptor /MissingWidth (simple), /W + /DW (descendant CIDFont, both `c [w...]` and `c_first c_last w` forms). Font::advance_width(code) returns the advance in text-space units with the MissingWidth/DW fallbacks; code_byte_width() is 1 (simple) / 2 (composite). - Advance application (extract_text, GraphicsState::advance_text): emit one TextElement per shown segment (one Tj/'/", or one string of a TJ array); after each, advance Tm by sum(width*Tfs + Tc [+ Tw for single-byte 0x20]) * Th, and translate Tm by -n/1000*Tfs*Th for a TJ number. The element carries its total advance; per-glyph placement stays re-derivable from font->advance_width, keeping the run-vs-glyph choice in the renderer. Out of scope (later): intra-segment glyph shaping (stage 3), AFM widths for non-embedded standard-14 fonts (stage 3), vertical writing advances (2.6). Tests: composite /W+/DW and simple /Widths+/MissingWidth parsing asserted through advance_width; extract_text advance coverage (simple widths, TJ adjustment, char/word spacing, composite /DW, advance_width fallbacks). 
Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/AGENTS.md | 115 ++++++++++------- src/odr/internal/pdf/pdf_document.cpp | 14 +++ src/odr/internal/pdf/pdf_document_element.hpp | 22 ++++ src/odr/internal/pdf/pdf_document_parser.cpp | 98 ++++++++++++++- src/odr/internal/pdf/pdf_graphics_state.cpp | 5 + src/odr/internal/pdf/pdf_graphics_state.hpp | 4 + src/odr/internal/pdf/pdf_page_text.cpp | 60 ++++++--- src/odr/internal/pdf/pdf_page_text.hpp | 16 ++- test/src/internal/pdf/pdf_document_parser.cpp | 50 +++++++- test/src/internal/pdf/pdf_page_text.cpp | 117 ++++++++++++++++-- 10 files changed, 425 insertions(+), 76 deletions(-) diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index fd55e21c..d6afd38f 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -13,9 +13,9 @@ tables, cross-reference streams, object streams, hybrid files, with a forward-scan recovery path for broken cross-references), build the page tree with fonts and annotations, tokenize page content streams into graphics operators, and emit a **proof-of-concept HTML rendering**: absolutely positioned -text spans, one per show operation, placed by the full text transform (CTM × -text matrix, stage 2.1), pages sized from `MediaBox`. Encrypted files are -decrypted (RC4, AES-128, AES-256). No glyph advances yet (stage 2.2), no +text spans, one per shown segment, placed by the full text transform (CTM × text +matrix) and advanced by the parsed glyph widths (stages 2.1–2.2), pages sized +from `MediaBox`. Encrypted files are decrypted (RC4, AES-128, AES-256). No graphics, no images, no font files. Experimental and not production-quality. --- @@ -106,6 +106,13 @@ graphics, no images, no font files. Experimental and not production-quality. other case (`Identity-H/V`, or the legacy CJK code→CID CMaps) yields "no Unicode" (not byte-garbage) until the legacy CID → Unicode tables (the deferred half of part B) or the embedded font program (stage 3) land. 
+- **Glyph metrics** (stage 2.2): a font's advance widths are parsed — + `/FirstChar` + `/Widths` + `/FontDescriptor` `/MissingWidth` for simple fonts, + `/W` + `/DW` (the descendant CIDFont, both `c [w…]` and `c_first c_last w` + forms) for composite fonts. `Font::advance_width(code)` returns the advance in + text-space units (glyph-space / 1000), falling back to `/MissingWidth` or `/DW`. + Codes are interpreted as CIDs for composite fonts (identity); + AFM widths for the non-embedded standard-14 fonts are stage 3. - **Content streams**: the full graphics-operator vocabulary is tokenized; `GraphicsState` executes a subset (state stack `q`/`Q`, matrices `cm`/`Tm`, line parameters, text state `Tc`/`Tw`/`Tz`/`TL`/`Tf`/`Tr`/`Ts`, glyph metrics @@ -114,15 +121,21 @@ graphics, no images, no font files. Experimental and not production-quality. affine `Transform2D` values (`util/math_util.hpp`), with `BT` resetting them, `Td`/`TD` /`T*` (and the line-move half of `'`/`"`) advancing `Tlm` → `Tm`. Unknown operators are logged to stderr and skipped. -- **Text layout** (`pdf_page_text`, stage 2.1): `extract_text` runs the operator - parser + `GraphicsState` over a page's content and emits a renderer-agnostic - `TextElement` per show operation (`Tj`/`TJ`/`'`/`"`) — its text-space → user- - space transform (CTM × `Tm`, with horizontal scaling and rise folded in, font - size kept separate), the resolved font, size, spacing parameters, raw codes, - and the CMap-translated Unicode. Font lookup is lenient (unknown ref → warn, - raw codes). **Glyph advances are not yet applied** (stage 2.2): each show op - yields one element at its starting origin, `TJ`'s numeric adjustments are - dropped, and `Tc`/`Tw`/`Tz`-driven spacing is carried but not consumed. 
+- **Text layout** (`pdf_page_text`, stages 2.1–2.2): `extract_text` runs the + operator parser + `GraphicsState` over a page's content and emits a + renderer-agnostic `TextElement` per shown *segment* (one `Tj`/`'`/`"`, or one + string of a `TJ` array) — its text-space → user-space transform (CTM × `Tm`, + with horizontal scaling and rise folded in, font size kept separate), the + resolved font, size, spacing parameters, raw codes, the CMap-translated + Unicode, and the segment's total advance. Font lookup is lenient (unknown ref → + warn, raw codes). **Glyph advances are applied** (stage 2.2): after each segment + the text matrix `Tm` advances by the glyph widths × font size plus char/word + spacing (× horizontal scaling), and a `TJ` number translates `Tm` by + `−n/1000 × Tfs × Th` — so segments, `TJ` kerning and lines land in the right + place. A renderer wanting per-glyph placement re-derives per-code advances from + `font->advance_width`. Still deferred: intra-segment glyph shaping (the browser + lays a segment out in a fallback font until stage 3) and vertical writing-mode + advances (stage 2.6). - **HTML**: one `document.html` view; each page is a `div` sized from `MediaBox` (points → inches). Each `TextElement` becomes an absolutely positioned `span` carrying a CSS `transform` matrix (the placement transform mapped from PDF user @@ -143,7 +156,7 @@ graphics, no images, no font files. Experimental and not production-quality. 
| `pdf_document_parser.{hpp,cpp}` | `parse_document()`: xref/trailer chain → catalog → page tree; lazy object reads with cache; (deep) reference resolution | | `pdf_encryption.{hpp,cpp}` | Standard security handler: `Authenticator` (parse `/Encrypt`, authenticate password → `Decryptor`) and `Decryptor` (decrypt strings/streams; RC4, AES-128, AES-256), plus a `standard_security` namespace of pure key/password algorithms for known-answer tests | | `pdf_document.hpp` | `Document`: arena of `Element`s + `catalog` pointer | -| `pdf_document_element.hpp` | Element structs: `Catalog`, `Pages`, `Page`, `Annotation`, `Resources`, `Font` (incl. the `composite`/`cid_registry`/`cid_ordering` Type0 facts and `to_unicode`) | +| `pdf_document_element.hpp` | Element structs: `Catalog`, `Pages`, `Page`, `Annotation`, `Resources`, `Font` (incl. the `composite`/`cid_registry`/`cid_ordering` Type0 facts, the `/Widths`-`/W`/`/DW` glyph metrics + `advance_width`, and `to_unicode`) | | `pdf_cmap.{hpp,cpp}` | `CMap`: 1-byte glyph → UTF-16 `bfchar` map + string translation | | `pdf_cmap_parser.{hpp,cpp}` | `ToUnicode` CMap stream parser (`begincodespacerange`/`beginbfchar`/`beginbfrange`; only `bfchar` applied) | | `pdf_encoding.{hpp,cpp}` | Simple-font `/Encoding` → Unicode: `BaseEncoding` tables, `/Differences` overlay (`Encoding`), glyph-name → Unicode via AGL + `uniXXXX`/`uXXXXXX` (stage 1.2) | @@ -152,8 +165,8 @@ graphics, no images, no font files. Experimental and not production-quality. 
| `util/math_util.hpp` | `util::math::Transform2D`: 2-D affine transform (PDF row-vector convention) — compose, point-apply, translation/scaling factories (stage 2.1) | | `pdf_graphics_operator.hpp` | `GraphicsOperatorType` enum (full operator set) + `GraphicsOperator` (type + `Object` arguments) | | `pdf_graphics_operator_parser.{hpp,cpp}` | Content-stream tokenizer: arguments then operator name | -| `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset; CTM/`Tm`/`Tlm` as `Transform2D`, `text_placement_matrix()` for the text rendering transform sans font size | -| `pdf_page_text.{hpp,cpp}` | `extract_text`: run the content stream through `GraphicsState`, emit a `TextElement` (placed transform + font/size/spacing + codes + Unicode) per show operation (stage 2.1) | +| `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset; CTM/`Tm`/`Tlm` as `Transform2D`, `text_placement_transform()` for the text rendering transform sans font size, `advance_text()` for the post-glyph `Tm` advance | +| `pdf_page_text.{hpp,cpp}` | `extract_text`: run the content stream through `GraphicsState`, emit a `TextElement` (placed transform + font/size/spacing + codes + Unicode + advance) per shown segment, advancing `Tm` by the glyph widths and `TJ` adjustments (stages 2.1–2.2) | | `pdf_file.{hpp,cpp}` | `abstract::PdfFile` wrapper; probes encryption at construction and implements `password_encrypted()`/`decrypt()`, carrying the authenticated `Decryptor` (not the password) so rendering needs no re-derivation | Consumers outside the module: `open_strategy.cpp` (detection / engine @@ -189,13 +202,15 @@ selection) and `html/pdf_file.cpp` (`create_pdf_service`). decoded through their `/Filter` chain (`read_decoded_stream`), concatenated with a newline between streams. 6. 
**Lay out and emit.** `extract_text` runs `GraphicsOperatorParser` + - `GraphicsState` over the content and returns a `TextElement` per show - operation, each placed by `text_placement_matrix()` (CTM × `Tm`, with - horizontal scaling and rise folded in), its glyphs translated through the - font's CMap. The HTML layer maps each element to a positioned `span` with a - CSS `transform` (PDF user space → the page box in CSS pixels) and `font-size` - from the text state. Glyph advances are **not yet applied** (stage 2.2), so - shows without an explicit move overlap. + `GraphicsState` over the content and returns a `TextElement` per shown + segment, each placed by `text_placement_transform()` (CTM × `Tm`, with horizontal + scaling and rise folded in), its glyphs translated through the font's CMap. + After each segment `Tm` is advanced by the glyph widths (`advance_width`) plus + char/word spacing, and `TJ` numbers translate `Tm` directly, so segments and + lines land correctly. The HTML layer maps each element to a positioned `span` + with a CSS `transform` (PDF user space → the page box in CSS pixels) and + `font-size` from the text state. Intra-segment glyph shaping is the browser's + until the embedded font lands (stage 3). --- @@ -294,8 +309,10 @@ such PDFs look right, their text just isn't selectable until the tables land. stream), plus composite-font coverage (a Type0 font over an `Identity-H` descendant `CIDFontType2`: `composite`/`/CIDSystemInfo` recorded, 2-byte `/ToUnicode` extraction, the no-`/ToUnicode` "no Unicode" fallback, and a - predefined `Uni*-UCS2-H` `/Encoding` extracting without a `/ToUnicode`). - End-to-end: the classic fixture + predefined `Uni*-UCS2-H` `/Encoding` extracting without a `/ToUnicode`), plus + glyph-metric coverage (the composite `/W`+`/DW` and a simple + `/FirstChar`/`/Widths`/`/MissingWidth` font, asserted through `advance_width`, + stage 2.2). 
End-to-end: the classic fixture `odr-public/pdf/style-various-1.pdf`, plus decryption of `odr-public/pdf/Casio_WVA-M650-7AJF.pdf` (RC4, empty password) and `odr-private/pdf/encrypted_fontfile3_opentype.pdf` (AES-256; skipped when the @@ -327,10 +344,13 @@ such PDFs look right, their text just isn't selectable until the tables land. `Transform2D` point-apply (identity/translation/scaling), the ordered (row-vector) composition, and compose-then-apply ≡ sequential apply (stage 2.1). - `test/src/internal/pdf/pdf_page_text.cpp` — **assertion-based**, inline content - streams through `extract_text` (empty resources, so codes pass through as - `text`): `Td` translation, `Tm` scaling, `cm` CTM concatenation under `Tm`, - horizontal scaling and rise in the transform, `TJ` string concatenation, and - the `T*`/`'`/`"` line moves with their leading and spacing (stage 2.1). + streams through `extract_text`: `Td` translation, `Tm` scaling, `cm` CTM + concatenation under `Tm`, horizontal scaling and rise in the transform, and the + `T*`/`'`/`"` line moves with their leading and spacing (stage 2.1); plus + glyph-advance coverage with hand-built `Font`s — simple `/Widths` advancing a + following show, `TJ` emitting per string with the numeric adjustment applied, + char spacing, word spacing on the single-byte space, the composite 2-byte `/DW` + advance, and the `advance_width` fallbacks (stage 2.2). No assertion-based coverage of the tokenizer (escapes, references, hex strings) or the HTML output itself (the span emission / CSS transform mapping). @@ -401,7 +421,7 @@ per-glyph positioning. **The core never commits to either**; this pushes the run-vs-glyph question all the way down to rendering. (The earlier framing of an up-front "HTML mapping decision" is dissolved into this.) 
-### 2.1 — transforms & the placed-text emission — **in progress** +### 2.1 — transforms & the placed-text emission — **done** The geometry foundation plus the emission contract, *without* glyph advances: - A 2-D affine `Transform2D` (`util/math_util.hpp`): compose, point-apply, @@ -420,20 +440,29 @@ The geometry foundation plus the emission contract, *without* glyph advances: state, the page y-axis flipped once per page. The text-path debug `std::cout` (incl. the `"hi"` marker) is removed. -**Deliberately out of scope here (→ 2.2):** glyph advances (`/Widths`, -`/W`/`/DW`) and the *application* of char/word spacing and the `TJ` numeric -adjustments, so consecutive shows on a line without an explicit move still -overlap, and `TJ` renders its strings concatenated at one origin. Precise -baseline placement (needs font ascent metrics) is likewise deferred. Whether 2.2 -folds into this PR or branches off is an open call once 2.1 lands. - -### 2.2 — glyph advances & metrics - -Parse `/Widths` + `/MissingWidth` (simple) and `/W` + `/DW` (CID); apply char/word -spacing, horizontal scaling and the `TJ` numeric adjustments to advance the text -matrix per glyph — so `TJ`, `'`, `"` land correctly, `Tj` runs space correctly, -and the emission can be subdivided per glyph (which makes the renderer's -per-glyph option exercisable). +**Deliberately out of scope here (→ 2.2):** glyph advances. (Precise baseline +placement — needs font ascent metrics — stays deferred past 2.2 too.) + +### 2.2 — glyph advances & metrics — **done** + +Glyph metrics and the per-glyph text-matrix advance, on top of 2.1's emission: +- **Width parsing** (`pdf_document_parser`, `Font`): `/FirstChar` + `/Widths` + + `/FontDescriptor` `/MissingWidth` (simple), `/W` + `/DW` (the descendant + CIDFont, both `c [w…]` and `c_first c_last w` forms). `Font::advance_width(code)` + returns the advance in text-space units, with the `/MissingWidth` / `/DW` + fallbacks; `code_byte_width()` is 1 (simple) / 2 (composite). 
+- **Advance application** (`extract_text`, `GraphicsState::advance_text`): a + `TextElement` is now emitted per shown *segment* (one `Tj`/`'`/`"`, or one + string of a `TJ` array); after each segment `Tm` advances by Σ(width × Tfs + Tc + [+ Tw for single-byte 0x20]) × Th, and a `TJ` number translates `Tm` by + `−n/1000 × Tfs × Th`. So `TJ`/`'`/`"` land correctly and `Tj` segments space + correctly. The element carries its total advance; per-glyph placement is + re-derivable from `font->advance_width`, keeping the run-vs-glyph choice in the + renderer. + +Out of scope (later): intra-segment glyph shaping (browser fallback until the +embedded font, stage 3), AFM widths for non-embedded standard-14 fonts (stage 3), +vertical writing-mode advances (stage 2.6). ### 2.3 — Form XObjects diff --git a/src/odr/internal/pdf/pdf_document.cpp b/src/odr/internal/pdf/pdf_document.cpp index 1376975e..18e507a0 100644 --- a/src/odr/internal/pdf/pdf_document.cpp +++ b/src/odr/internal/pdf/pdf_document.cpp @@ -30,6 +30,20 @@ std::vector Document::collect_pages() const { return pages; } +double Font::advance_width(const std::uint32_t code) const { + if (composite) { + if (const auto it = cid_widths.find(code); it != cid_widths.end()) { + return it->second / 1000.0; + } + return cid_default_width / 1000.0; + } + const long index = static_cast(code) - first_char; + if (index >= 0 && index < static_cast(widths.size())) { + return widths[static_cast(index)] / 1000.0; + } + return missing_width / 1000.0; +} + std::string Font::to_unicode(const std::string &codes) const { if (!cmap.empty()) { return cmap.translate_string(codes); diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp index 00ac8462..be7580f1 100644 --- a/src/odr/internal/pdf/pdf_document_element.hpp +++ b/src/odr/internal/pdf/pdf_document_element.hpp @@ -97,6 +97,28 @@ struct Font final : Element { /// the predefined Unicode-CMap extraction path. 
std::string cid_encoding_name; + /// Simple-font glyph metrics (ISO 32000-1 9.2.4): `/Widths`, in glyph space + /// (1/1000 text-space units), indexed by `code - first_char`; codes outside + /// the range fall back to `missing_width` (`/MissingWidth`). + int first_char{0}; + std::vector widths; + double missing_width{0}; + + /// Composite-font glyph metrics (9.7.4.3): the `/DW` default width and the + /// `/W` map CID -> width, both glyph-space units. Codes are interpreted as + /// CIDs (identity), the common `Identity-H/V` case. + double cid_default_width{1000}; + std::unordered_map cid_widths; + + /// Bytes per character code: 2 for composite (Type0) fonts (the + /// `Identity-H/V` and common CID case), 1 for simple fonts. + [[nodiscard]] int code_byte_width() const { return composite ? 2 : 1; } + + /// Glyph advance width of a single character code, in text-space units + /// (glyph-space / 1000; multiply by the font size for user space). Falls back + /// to `/MissingWidth` (simple) or `/DW` (composite) for absent codes. + [[nodiscard]] double advance_width(std::uint32_t code) const; + /// Translate a string of character codes to Unicode: the `ToUnicode` CMap /// when present (authoritative), else, for a composite font, "no Unicode", /// else the simple-font `/Encoding`, else identity bytes. diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index efa85d66..6b9a4026 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -177,6 +177,77 @@ std::optional parse_encoding(DocumentParser &parser, return encoding; } +/// Parse a CIDFont's `/W` array into the `cid -> width` map. Entries are either +/// `c [w1 w2 ...]` (CIDs c, c+1, ... get the listed widths) or `c_first c_last +/// w` (the whole CID range gets `w`) — ISO 32000-1 9.7.4.3. 
+void parse_cid_widths(const Array &w, Font &font) { + // guard against a pathological `c_first c_last` range exhausting memory + static constexpr std::uint32_t max_range = 70000; + + std::size_t i = 0; + while (i < w.size()) { + if (!w[i].is_integer()) { + ++i; + continue; + } + const auto first = static_cast(w[i].as_integer()); + if (i + 1 < w.size() && w[i + 1].is_array()) { + const Array &list = w[i + 1].as_array(); + for (std::size_t j = 0; j < list.size(); ++j) { + if (list[j].is_real()) { + font.cid_widths[first + static_cast(j)] = + list[j].as_real(); + } + } + i += 2; + } else if (i + 2 < w.size() && w[i + 1].is_integer() && + w[i + 2].is_real()) { + const auto last = static_cast(w[i + 1].as_integer()); + const double width = w[i + 2].as_real(); + if (last >= first && last - first < max_range) { + for (std::uint32_t c = first; c <= last; ++c) { + font.cid_widths[c] = width; + } + } + i += 3; + } else { + ++i; + } + } +} + +/// Parse a simple font's `/FirstChar`, `/Widths` and `/FontDescriptor` +/// `/MissingWidth` glyph metrics (ISO 32000-1 9.2.4). +void parse_simple_font_widths(DocumentParser &parser, + const Dictionary &dictionary, Font &font) { + if (dictionary.has_key("FirstChar")) { + const Object first = parser.resolve_object_copy(dictionary["FirstChar"]); + if (first.is_integer()) { + font.first_char = static_cast(first.as_integer()); + } + } + if (dictionary.has_key("Widths")) { + const Object widths = parser.resolve_object_copy(dictionary["Widths"]); + if (widths.is_array()) { + for (const Object &width : widths.as_array()) { + font.widths.push_back(width.is_real() ? 
width.as_real() : 0.0); + } + } + } + if (dictionary.has_key("FontDescriptor")) { + const Object descriptor = + parser.resolve_object_copy(dictionary["FontDescriptor"]); + if (descriptor.is_dictionary() && + descriptor.as_dictionary().has_key("MissingWidth")) { + const Object missing = parser.resolve_object_copy( + descriptor.as_dictionary()["MissingWidth"]); + if (missing.is_real()) { + font.missing_width = missing.as_real(); + } + } + } +} + /// Parse a composite (Type0) font's descendant CIDFont (`/DescendantFonts` is a /// one-element array of the CIDFont): records the `/CIDSystemInfo` /// `/Registry`/`/Ordering` used to pick a predefined CID -> Unicode table. @@ -211,6 +282,20 @@ void parse_composite_font(DocumentParser &parser, const Dictionary &dictionary, return; } const Dictionary &cid_font_dictionary = cid_font.as_dictionary(); + + if (cid_font_dictionary.has_key("DW")) { + const Object dw = parser.resolve_object_copy(cid_font_dictionary["DW"]); + if (dw.is_real()) { + font.cid_default_width = dw.as_real(); + } + } + if (cid_font_dictionary.has_key("W")) { + const Object w = parser.resolve_object_copy(cid_font_dictionary["W"]); + if (w.is_array()) { + parse_cid_widths(w.as_array(), font); + } + } + if (!cid_font_dictionary.has_key("CIDSystemInfo")) { return; } @@ -258,11 +343,14 @@ Font *parse_font(DocumentParser &parser, const ObjectReference &reference, // simple-font glyph-name encoding, so it must not go through // `parse_encoding`. Extraction relies on `/ToUnicode` (parsed above). parse_composite_font(parser, dictionary, *font); - } else if (dictionary.has_key("Encoding")) { - // Simple-font `/Encoding`: a base-encoding name, or a dictionary with - // `/BaseEncoding` + `/Differences`. The text-extraction fallback for fonts - // without a `ToUnicode` CMap. 
- font->encoding = parse_encoding(parser, dictionary["Encoding"]); + } else { + parse_simple_font_widths(parser, dictionary, *font); + if (dictionary.has_key("Encoding")) { + // Simple-font `/Encoding`: a base-encoding name, or a dictionary with + // `/BaseEncoding` + `/Differences`. The text-extraction fallback for + // fonts without a `ToUnicode` CMap. + font->encoding = parse_encoding(parser, dictionary["Encoding"]); + } } return font; diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 7c415029..42e274fb 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -53,6 +53,11 @@ void GraphicsState::next_line(const double tx, const double ty) { text.matrix = text.line_matrix; } +void GraphicsState::advance_text(double tx, double ty) { + Text &text = current().text; + text.matrix = Matrix::translation(tx, ty) * text.matrix; +} + void GraphicsState::execute(const GraphicsOperator &op) { switch (op.type) { case GraphicsOperatorType::save_state: diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index 98c03a75..382022af 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -82,6 +82,10 @@ struct GraphicsState { /// renderer. [[nodiscard]] util::math::Transform2D text_placement_transform() const; + /// Advance the text matrix `Tm` by `(tx, ty)` in text space after showing + /// glyphs (the text line matrix `Tlm` is unaffected). + void advance_text(double tx, double ty); + private: /// Move to the start of a new text line: `Tlm = translate(tx, ty) * Tlm` and /// `Tm = Tlm` (the shared mechanic behind `Td`, `TD`, `T*`, `'`, `"`). 
diff --git a/src/odr/internal/pdf/pdf_page_text.cpp b/src/odr/internal/pdf/pdf_page_text.cpp index ee2ac2e5..9edbecfe 100644 --- a/src/odr/internal/pdf/pdf_page_text.cpp +++ b/src/odr/internal/pdf/pdf_page_text.cpp @@ -25,9 +25,35 @@ Font *lookup_font(const Resources &resources, const std::string &name, return nullptr; } -void emit(std::vector &out, const GraphicsState &state, +/// Total advance of a shown string, in text-space units: the per-code glyph +/// width times the font size, plus char spacing (every code) and word spacing +/// (single-byte code 0x20 only), the whole scaled by horizontal scaling +/// (ISO 32000-1 9.4.4). 0 when the font is unknown. +double segment_advance(const GraphicsState::Text &text, const Font *font, + const std::string &codes) { + if (font == nullptr) { + return 0.0; + } + const int width = font->code_byte_width(); + double tx = 0.0; + for (std::size_t i = 0; i + width <= codes.size(); i += width) { + std::uint32_t code = 0; + for (int k = 0; k < width; ++k) { + code = (code << 8) | static_cast(codes[i + k]); + } + tx += font->advance_width(code) * text.size + text.char_spacing; + if (!font->composite && code == ' ') { + tx += text.word_spacing; + } + } + return tx * (text.horizontal_scaling / 100.0); +} + +/// Emit one placed segment and advance the text matrix by its width. +void show(std::vector &out, GraphicsState &state, std::string codes, Font *font) { const GraphicsState::Text &text = state.current().text; + const double advance = segment_advance(text, font, codes); TextElement element; element.transform = state.text_placement_transform(); @@ -40,20 +66,10 @@ void emit(std::vector &out, const GraphicsState &state, element.rendering_mode = text.rendering_mode; element.text = font != nullptr ? 
font->to_unicode(codes) : codes; element.codes = std::move(codes); + element.width = advance; out.push_back(std::move(element)); -} - -/// Concatenate the string elements of a `TJ` array, dropping the numeric -/// position adjustments (their application is stage 2.2). -std::string join_array_strings(const Array &array) { - std::string codes; - for (const Object &element : array) { - if (element.is_string()) { - codes += element.as_string(); - } - } - return codes; + state.advance_text(advance, 0); } } // namespace @@ -81,18 +97,28 @@ std::vector pdf::extract_text(const std::string &content, case GraphicsOperatorType::show_text_next_line: { // Tj, ' Font *font = lookup_font(resources, state.current().text.font, logger, warned); - emit(result, state, op.arguments.at(0).as_string(), font); + show(result, state, op.arguments.at(0).as_string(), font); } break; case GraphicsOperatorType::show_text_manual_spacing: { // TJ Font *font = lookup_font(resources, state.current().text.font, logger, warned); - emit(result, state, join_array_strings(op.arguments.at(0).as_array()), - font); + const GraphicsState::Text &text = state.current().text; + for (const Object &item : op.arguments.at(0).as_array()) { + if (item.is_string()) { + show(result, state, item.as_string(), font); + } else if (item.is_real()) { + // a number translates the next glyph left by adj/1000 text-space + // units, scaled by the font size and horizontal scaling (9.4.3). 
+ const double adjust = -item.as_real() / 1000.0 * text.size * + (text.horizontal_scaling / 100.0); + state.advance_text(adjust, 0); + } + } } break; case GraphicsOperatorType::show_text_next_line_set_spacing: { // " Font *font = lookup_font(resources, state.current().text.font, logger, warned); - emit(result, state, op.arguments.at(2).as_string(), font); + show(result, state, op.arguments.at(2).as_string(), font); } break; default: break; diff --git a/src/odr/internal/pdf/pdf_page_text.hpp b/src/odr/internal/pdf/pdf_page_text.hpp index eff56332..27994818 100644 --- a/src/odr/internal/pdf/pdf_page_text.hpp +++ b/src/odr/internal/pdf/pdf_page_text.hpp @@ -30,17 +30,27 @@ struct TextElement { double horizontal_scaling{100}; // Tz, percent double rise{0}; // Ts int rendering_mode{0}; // Tr - /// Raw character codes shown (for `TJ`, the array's strings concatenated). + /// Raw character codes shown by this segment (one `Tj`, or one string of a + /// `TJ` array). std::string codes; /// Unicode representation of `codes`; may lack spaces the producer cannot /// infer (space inference is stage 2.5). std::string text; + /// Total advance of this segment, in text-space units (the displacement + /// applied to the text matrix after it — already scaled by the font size and + /// including char/word spacing and horizontal scaling). 0 when the font is + /// unknown. A renderer wanting per-glyph placement can re-derive per-code + /// advances from `font->advance_width` over `codes`. + double width{0}; }; /// Execute a page's (decoded, concatenated) content stream and collect the text /// it shows as placed elements. Non-text operators update the graphics state -/// but produce no output. Glyph advances are not yet applied (stage 2.2), so -/// each show operation yields a single element at its starting origin. +/// but produce no output. 
Each shown segment (one `Tj`/`'`/`"`, or one string +/// of a `TJ` array) yields one element at its origin; the text matrix is +/// advanced by the glyph widths (`font->advance_width`) plus char/word spacing +/// and the `TJ` numeric adjustments, so segments and lines land in the right +/// place. std::vector extract_text(const std::string &content, const Resources &resources, const Logger &logger); diff --git a/test/src/internal/pdf/pdf_document_parser.cpp b/test/src/internal/pdf/pdf_document_parser.cpp index db1f44db..5f20d492 100644 --- a/test/src/internal/pdf/pdf_document_parser.cpp +++ b/test/src/internal/pdf/pdf_document_parser.cpp @@ -230,7 +230,8 @@ composite_font_mini_pdf(const bool with_to_unicode, (with_to_unicode ? " /ToUnicode 7 0 R >>" : " >>")) .object("<< /Type /Font /Subtype /CIDFontType2 /BaseFont /AAAAAA+X " "/CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) " - "/Supplement 0 >> /CIDToGIDMap /Identity >>"); + "/Supplement 0 >> /CIDToGIDMap /Identity " + "/DW 1000 /W [0 [500 600]] >>"); if (with_to_unicode) { builder.stream_object("", "1 begincodespacerange\n<0000> \n" "endcodespacerange\n1 beginbfchar\n" @@ -239,6 +240,24 @@ composite_font_mini_pdf(const bool with_to_unicode, return builder.trailer("/Root 1 0 R").build_classic(); } +/// A mini-PDF whose single page references one simple TrueType font `F1` with +/// `/FirstChar` 65, `/Widths` for `A`/`B`, and a `/FontDescriptor` carrying +/// `/MissingWidth`. 
+std::string simple_font_mini_pdf() { + PdfFileBuilder builder; + builder.object("<< /Type /Catalog /Pages 2 0 R >>") + .object("<< /Type /Pages /Kids [3 0 R] /Count 1 >>") + .object("<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] " + "/Resources << /Font << /F1 5 0 R >> >> /Contents 4 0 R >>") + .stream_object("", "BT ET") + .object("<< /Type /Font /Subtype /TrueType /BaseFont /Helvetica " + "/FirstChar 65 /LastChar 66 /Widths [500 600] " + "/FontDescriptor 6 0 R >>") + .object("<< /Type /FontDescriptor /FontName /Helvetica " + "/MissingWidth 250 >>"); + return builder.trailer("/Root 1 0 R").build_classic(); +} + const Font *first_page_font(const Document &document, const std::string &name) { const auto *page = dynamic_cast(document.catalog->pages->kids.front()); @@ -297,6 +316,35 @@ TEST(DocumentParser, composite_font_predefined_unicode_cmap) { "A\xe4\xb8\xad"); } +// A composite font's `/W` array and `/DW` default drive CID advance widths. +TEST(DocumentParser, composite_font_cid_widths) { + const std::string pdf = composite_font_mini_pdf(true); + DocumentParser parser(std::make_unique(pdf)); + const std::unique_ptr document = parser.parse_document(); + + const Font *font = first_page_font(*document, "F0"); + ASSERT_NE(font, nullptr); + EXPECT_DOUBLE_EQ(font->cid_default_width, 1000); + EXPECT_DOUBLE_EQ(font->advance_width(0), 0.5); // W [0 [500 600]] + EXPECT_DOUBLE_EQ(font->advance_width(1), 0.6); + EXPECT_DOUBLE_EQ(font->advance_width(2), 1.0); // /DW default +} + +// A simple font's `/FirstChar`, `/Widths` and `/MissingWidth` drive advances. 
+TEST(DocumentParser, simple_font_widths) { + const std::string pdf = simple_font_mini_pdf(); + DocumentParser parser(std::make_unique(pdf)); + const std::unique_ptr document = parser.parse_document(); + + const Font *font = first_page_font(*document, "F1"); + ASSERT_NE(font, nullptr); + EXPECT_FALSE(font->composite); + EXPECT_EQ(font->first_char, 65); + EXPECT_DOUBLE_EQ(font->advance_width(65), 0.5); // 'A' + EXPECT_DOUBLE_EQ(font->advance_width(66), 0.6); // 'B' + EXPECT_DOUBLE_EQ(font->advance_width(67), 0.25); // 'C' -> /MissingWidth +} + // Recovery: a valid file with garbage prepended (the real fixture // `order-EK52VKL0.pdf` is an HTTP response saved as `.pdf`) has every xref // offset and the `startxref` shifted, so the chain walk fails. A forward scan diff --git a/test/src/internal/pdf/pdf_page_text.cpp b/test/src/internal/pdf/pdf_page_text.cpp index efe4f33f..67e18323 100644 --- a/test/src/internal/pdf/pdf_page_text.cpp +++ b/test/src/internal/pdf/pdf_page_text.cpp @@ -14,11 +14,25 @@ using odr::Logger; namespace { +std::vector run(const std::string &content, + const Resources &resources) { + return extract_text(content, resources, Logger::null()); +} + // Run a content stream with no font resources: fonts resolve to null, so the -// emitted `text` is the raw codes and we can assert positioning in isolation. +// emitted `text` is the raw codes and advances are zero — lets us assert +// transform positioning in isolation. std::vector run(const std::string &content) { Resources resources; - return extract_text(content, resources, Logger::null()); + return run(content, resources); +} + +// A simple font `widths` (glyph space, 1/1000 em) starting at `first_char`. 
+Font simple_font(int first_char, std::vector widths) { + Font font; + font.first_char = first_char; + font.widths = std::move(widths); + return font; } } // namespace @@ -74,12 +88,101 @@ TEST(PdfPageText, text_rise) { EXPECT_DOUBLE_EQ(texts[0].rise, 5); } -// `TJ` concatenates its strings; the numeric adjustments are dropped -// (stage 2.2). -TEST(PdfPageText, tj_concatenates_strings) { +// `TJ` emits one element per string; with no font the strings stay put (zero +// advance) but the numeric adjustments still move the origin. +TEST(PdfPageText, tj_emits_per_string_with_adjustments) { + // adjustment -120 -> +120/1000 * 10 = +1.2 between the two strings const auto texts = run("BT /F1 10 Tf 0 0 Td [(Ab) -120 (cd)] TJ ET"); - ASSERT_EQ(texts.size(), 1); - EXPECT_EQ(texts[0].codes, "Abcd"); + ASSERT_EQ(texts.size(), 2); + EXPECT_EQ(texts[0].codes, "Ab"); + EXPECT_DOUBLE_EQ(texts[0].transform.e, 0); + EXPECT_EQ(texts[1].codes, "cd"); + EXPECT_DOUBLE_EQ(texts[1].transform.e, 1.2); +} + +// Simple-font `/Widths` advance the text matrix, so a following show lands past +// the previous one. +TEST(PdfPageText, simple_font_widths_advance) { + Font font = simple_font('A', {500, 600, 700}); // A=0.5, B=0.6, C=0.7 em + Resources res; + res.font["F1"] = &font; + + const auto texts = run("BT /F1 10 Tf 0 0 Td (AB) Tj (C) Tj ET", res); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[0].transform.e, 0); + EXPECT_DOUBLE_EQ(texts[0].width, 11); // 5 + 6 + EXPECT_DOUBLE_EQ(texts[1].transform.e, 11); + EXPECT_DOUBLE_EQ(texts[1].width, 7); +} + +// A `TJ` adjustment combines with the glyph width to place the next string. 
+TEST(PdfPageText, tj_adjustment_after_width) { + Font font = simple_font('A', {500, 600}); // A=0.5, B=0.6 em + Resources res; + res.font["F1"] = &font; + + // (A): width 5; -100 -> +1.0; (B) lands at 6.0 + const auto texts = run("BT /F1 10 Tf 0 0 Td [(A) -100 (B)] TJ ET", res); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[0].transform.e, 0); + EXPECT_DOUBLE_EQ(texts[1].transform.e, 6); +} + +// Char spacing adds to every glyph's advance. +TEST(PdfPageText, char_spacing_advance) { + Font font = simple_font('A', {500, 500}); + Resources res; + res.font["F1"] = &font; + + // Tc=2: each of A,B advances 5 + 2 = 7 -> 14 total + const auto texts = run("BT /F1 10 Tf 2 Tc 0 0 Td (AB) Tj (x) Tj ET", res); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[0].width, 14); + EXPECT_DOUBLE_EQ(texts[1].transform.e, 14); +} + +// Word spacing adds to the single-byte space (0x20) only. +TEST(PdfPageText, word_spacing_applies_to_space) { + Font font = simple_font(32, {250}); // space = 0.25 em + Resources res; + res.font["F1"] = &font; + + // Tw=5: the space advances 2.5 + 5 = 7.5 + const auto texts = run("BT /F1 10 Tf 5 Tw 0 0 Td ( ) Tj (x) Tj ET", res); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[0].width, 7.5); + EXPECT_DOUBLE_EQ(texts[1].transform.e, 7.5); +} + +// Composite (Type0) fonts use 2-byte codes and the `/DW` default width. +TEST(PdfPageText, composite_default_width_advance) { + Font font; + font.composite = true; + font.cid_default_width = 1000; // 1.0 em + Resources res; + res.font["F1"] = &font; + + // <0001> and <0002> are one 2-byte code each; size 10 -> advance 10 apiece + const auto texts = run("BT /F1 10 Tf 0 0 Td <0001> Tj <0002> Tj ET", res); + ASSERT_EQ(texts.size(), 2); + EXPECT_DOUBLE_EQ(texts[0].width, 10); + EXPECT_DOUBLE_EQ(texts[1].transform.e, 10); +} + +// `Font::advance_width` falls back to `/MissingWidth` (simple) and `/DW` +// (composite) for absent codes. 
+TEST(PdfPageText, advance_width_fallbacks) { + Font simple = simple_font('A', {500}); + simple.missing_width = 250; + EXPECT_DOUBLE_EQ(simple.advance_width('A'), 0.5); + EXPECT_DOUBLE_EQ(simple.advance_width('B'), 0.25); // out of range + + Font composite; + composite.composite = true; + composite.cid_default_width = 1000; + composite.cid_widths[1] = 2000; + EXPECT_DOUBLE_EQ(composite.advance_width(1), 2.0); + EXPECT_DOUBLE_EQ(composite.advance_width(2), 1.0); // default } // `T*` moves down one line by the leading set with `TL`. From 50dba922a4d981268f58871b0815286f11293336 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 21:44:52 +0200 Subject: [PATCH 2/6] fix --- src/odr/internal/pdf/pdf_graphics_state.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 42e274fb..fe55964b 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -55,7 +55,7 @@ void GraphicsState::next_line(const double tx, const double ty) { void GraphicsState::advance_text(double tx, double ty) { Text &text = current().text; - text.matrix = Matrix::translation(tx, ty) * text.matrix; + text.matrix = util::math::Transform2D::translation(tx, ty) * text.matrix; } void GraphicsState::execute(const GraphicsOperator &op) { From 256b988f1b61fd7336af56f004889ca35a23b00b Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 15 Jun 2026 21:44:05 +0200 Subject: [PATCH 3/6] minor --- src/odr/internal/html/pdf_file.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 6c6910f4..3d3d0006 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -109,8 +109,13 @@ class HtmlServiceImpl final : public HtmlService { const double height = page_box[3].as_real() - box_y0; out.write_element_begin( +<<<<<<< HEAD "div", 
HtmlElementOptions().set_class("p").set_style([&](std::ostream &o) { +======= + "div", HtmlElementOptions().set_style([&](std::ostream &o) { + o << "position:relative;"; +>>>>>>> 09d848cc (minor) o << "width:" << width * pt_to_in << "in;"; o << "height:" << height * pt_to_in << "in;"; })); From 7567d49d120eb08ac73e3a60e568a63b164590aa Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 16 Jun 2026 00:32:31 +0200 Subject: [PATCH 4/6] fix --- src/odr/internal/html/pdf_file.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 3d3d0006..6c6910f4 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -109,13 +109,8 @@ class HtmlServiceImpl final : public HtmlService { const double height = page_box[3].as_real() - box_y0; out.write_element_begin( -<<<<<<< HEAD "div", HtmlElementOptions().set_class("p").set_style([&](std::ostream &o) { -======= - "div", HtmlElementOptions().set_style([&](std::ostream &o) { - o << "position:relative;"; ->>>>>>> 09d848cc (minor) o << "width:" << width * pt_to_in << "in;"; o << "height:" << height * pt_to_in << "in;"; })); From ce22ef4e182574b32ae04f21373302329ac49be3 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 16 Jun 2026 07:50:59 +0200 Subject: [PATCH 5/6] minor --- src/odr/internal/pdf/pdf_graphics_state.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index fe55964b..f92691b5 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -53,7 +53,7 @@ void GraphicsState::next_line(const double tx, const double ty) { text.matrix = text.line_matrix; } -void GraphicsState::advance_text(double tx, double ty) { +void GraphicsState::advance_text(const double tx, const double ty) { Text &text = current().text; text.matrix = 
util::math::Transform2D::translation(tx, ty) * text.matrix; } From f0455ed1f8b1f2d4af16a55fa367d810b0ddd559 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 16 Jun 2026 08:12:45 +0200 Subject: [PATCH 6/6] PDF: emit per-code glyph advances on TextElement segment_advances now returns each code's advance alongside the total and takes a Font reference (called only when a font is present). TextElement carries the per-code advances vector so renderers need no re-derivation from font->advance_width. Co-Authored-By: Claude Opus 4.8 --- src/odr/internal/pdf/AGENTS.md | 21 ++++++------ src/odr/internal/pdf/pdf_page_text.cpp | 45 +++++++++++++++++--------- src/odr/internal/pdf/pdf_page_text.hpp | 7 ++-- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index d6afd38f..680aeffe 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -127,12 +127,13 @@ graphics, no images, no font files. Experimental and not production-quality. string of a `TJ` array) — its text-space → user-space transform (CTM × `Tm`, with horizontal scaling and rise folded in, font size kept separate), the resolved font, size, spacing parameters, raw codes, the CMap-translated - Unicode, and the segment's total advance. Font lookup is lenient (unknown ref → - warn, raw codes). **Glyph advances are applied** (stage 2.2): after each segment - the text matrix `Tm` advances by the glyph widths × font size plus char/word - spacing (× horizontal scaling), and a `TJ` number translates `Tm` by - `−n/1000 × Tfs × Th` — so segments, `TJ` kerning and lines land in the right - place. A renderer wanting per-glyph placement re-derives per-code advances from + Unicode, and the segment's per-code advances plus their total. Font lookup is + lenient (unknown ref → warn, raw codes). 
**Glyph advances are applied** (stage + 2.2): after each segment the text matrix `Tm` advances by the glyph widths × + font size plus char/word spacing (× horizontal scaling), and a `TJ` number + translates `Tm` by `−n/1000 × Tfs × Th` — so segments, `TJ` kerning and lines + land in the right place. The element carries the per-code advances directly, so + a renderer wanting per-glyph placement need not re-derive them from `font->advance_width`. Still deferred: intra-segment glyph shaping (the browser lays a segment out in a fallback font until stage 3) and vertical writing-mode advances (stage 2.6). @@ -166,7 +167,7 @@ graphics, no images, no font files. Experimental and not production-quality. | `pdf_graphics_operator.hpp` | `GraphicsOperatorType` enum (full operator set) + `GraphicsOperator` (type + `Object` arguments) | | `pdf_graphics_operator_parser.{hpp,cpp}` | Content-stream tokenizer: arguments then operator name | | `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset; CTM/`Tm`/`Tlm` as `Transform2D`, `text_placement_matrix()` for the text rendering transform sans font size, `advance_text()` for the post-glyph `Tm` advance | -| `pdf_page_text.{hpp,cpp}` | `extract_text`: run the content stream through `GraphicsState`, emit a `TextElement` (placed transform + font/size/spacing + codes + Unicode + advance) per shown segment, advancing `Tm` by the glyph widths and `TJ` adjustments (stages 2.1–2.2) | +| `pdf_page_text.{hpp,cpp}` | `extract_text`: run the content stream through `GraphicsState`, emit a `TextElement` (placed transform + font/size/spacing + codes + Unicode + per-code advances + total advance) per shown segment, advancing `Tm` by the glyph widths and `TJ` adjustments (stages 2.1–2.2) | | `pdf_file.{hpp,cpp}` | `abstract::PdfFile` wrapper; probes encryption at construction and implements `password_encrypted()`/`decrypt()`, carrying the authenticated `Decryptor` (not the 
password) so rendering needs no re-derivation | Consumers outside the module: `open_strategy.cpp` (detection / engine @@ -456,9 +457,9 @@ Glyph metrics and the per-glyph text-matrix advance, on top of 2.1's emission: string of a `TJ` array); after each segment `Tm` advances by Σ(width × Tfs + Tc [+ Tw for single-byte 0x20]) × Th, and a `TJ` number translates `Tm` by `−n/1000 × Tfs × Th`. So `TJ`/`'`/`"` land correctly and `Tj` segments space - correctly. The element carries its total advance; per-glyph placement is - re-derivable from `font->advance_width`, keeping the run-vs-glyph choice in the - renderer. + correctly. The element carries both its per-code advances and their total, so + per-glyph placement needs no re-derivation from `font->advance_width`, keeping + the run-vs-glyph choice in the renderer. Out of scope (later): intra-segment glyph shaping (browser fallback until the embedded font, stage 3), AFM widths for non-embedded standard-14 fonts (stage 3), diff --git a/src/odr/internal/pdf/pdf_page_text.cpp b/src/odr/internal/pdf/pdf_page_text.cpp index 9edbecfe..61ab678d 100644 --- a/src/odr/internal/pdf/pdf_page_text.cpp +++ b/src/odr/internal/pdf/pdf_page_text.cpp @@ -25,35 +25,43 @@ Font *lookup_font(const Resources &resources, const std::string &name, return nullptr; } -/// Total advance of a shown string, in text-space units: the per-code glyph -/// width times the font size, plus char spacing (every code) and word spacing -/// (single-byte code 0x20 only), the whole scaled by horizontal scaling -/// (ISO 32000-1 9.4.4). 0 when the font is unknown. -double segment_advance(const GraphicsState::Text &text, const Font *font, - const std::string &codes) { - if (font == nullptr) { - return 0.0; - } - const int width = font->code_byte_width(); - double tx = 0.0; +/// Per-code advances of a shown string and their total, in text-space units. +struct SegmentAdvances { + /// Advance of each character code, in code order. Sums to `total`. 
+ std::vector advances; + /// Total advance applied to the text matrix after the segment. + double total{0}; +}; + +/// Advance of each code in a shown string and their total, in text-space units: +/// the per-code glyph width times the font size, plus char spacing (every code) +/// and word spacing (single-byte code 0x20 only), each scaled by horizontal +/// scaling (ISO 32000-1 9.4.4). +SegmentAdvances segment_advances(const GraphicsState::Text &text, + const Font &font, const std::string &codes) { + const int width = font.code_byte_width(); + const double scaling = text.horizontal_scaling / 100.0; + SegmentAdvances result; for (std::size_t i = 0; i + width <= codes.size(); i += width) { std::uint32_t code = 0; for (int k = 0; k < width; ++k) { code = (code << 8) | static_cast(codes[i + k]); } - tx += font->advance_width(code) * text.size + text.char_spacing; - if (!font->composite && code == ' ') { + double tx = font.advance_width(code) * text.size + text.char_spacing; + if (!font.composite && code == ' ') { tx += text.word_spacing; } + tx *= scaling; + result.advances.push_back(tx); + result.total += tx; } - return tx * (text.horizontal_scaling / 100.0); + return result; } /// Emit one placed segment and advance the text matrix by its width. void show(std::vector &out, GraphicsState &state, std::string codes, Font *font) { const GraphicsState::Text &text = state.current().text; - const double advance = segment_advance(text, font, codes); TextElement element; element.transform = state.text_placement_transform(); @@ -66,8 +74,13 @@ void show(std::vector &out, GraphicsState &state, element.rendering_mode = text.rendering_mode; element.text = font != nullptr ? 
font->to_unicode(codes) : codes; element.codes = std::move(codes); - element.width = advance; + if (font != nullptr) { + auto [advances, total] = segment_advances(text, *font, element.codes); + element.width = total; + element.advances = std::move(advances); + } + const double advance = element.width; out.push_back(std::move(element)); state.advance_text(advance, 0); } diff --git a/src/odr/internal/pdf/pdf_page_text.hpp b/src/odr/internal/pdf/pdf_page_text.hpp index 27994818..42198d14 100644 --- a/src/odr/internal/pdf/pdf_page_text.hpp +++ b/src/odr/internal/pdf/pdf_page_text.hpp @@ -39,9 +39,12 @@ struct TextElement { /// Total advance of this segment, in text-space units (the displacement /// applied to the text matrix after it — already scaled by the font size and /// including char/word spacing and horizontal scaling). 0 when the font is - /// unknown. A renderer wanting per-glyph placement can re-derive per-code - /// advances from `font->advance_width` over `codes`. + /// unknown. Equal to the sum of `advances`. double width{0}; + /// Per-character-code advance, in code order and text-space units, summing to + /// `width`. Empty when the font is unknown. Lets a renderer place glyphs + /// individually without re-deriving widths from `font->advance_width`. + std::vector advances; }; /// Execute a page's (decoded, concatenated) content stream and collect the text