diff --git a/CMakeLists.txt b/CMakeLists.txt index c3faa544..589fe26f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ set(ODR_SOURCE_FILES "src/odr/internal/pdf/pdf_graphics_state.cpp" "src/odr/internal/pdf/pdf_object.cpp" "src/odr/internal/pdf/pdf_object_parser.cpp" + "src/odr/internal/pdf/pdf_page_text.cpp" "src/odr/internal/svm/svm_file.cpp" "src/odr/internal/svm/svm_format.cpp" diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index b7a18e34..6c6910f4 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -12,12 +12,9 @@ #include #include #include -#include -#include -#include +#include -#include -#include +#include namespace odr::internal::html { @@ -78,7 +75,7 @@ class HtmlServiceImpl final : public HtmlService { const auto &pdf_file = dynamic_cast(*m_pdf_file.impl()); pdf::DocumentParser parser = pdf_file.create_parser(*m_logger); - std::unique_ptr document = parser.parse_document(); + const std::unique_ptr document = parser.parse_document(); const std::vector pages = document->collect_pages(); @@ -89,18 +86,33 @@ class HtmlServiceImpl final : public HtmlService { out.write_header_title("odr"); out.write_header_viewport( "width=device-width,initial-scale=1.0,user-scalable=yes"); + // Constant per-page and per-glyph styling lives in classes so it is not + // repeated inline on every one of the (potentially millions of) spans. + out.write_header_style_begin(); + out.out() << ".p{position:relative}"; + out.out() << ".t{position:absolute;left:0;top:0;transform-origin:0 0;" + "white-space:pre}"; + out.write_header_style_end(); out.write_header_end(); out.write_body_begin(); + // CSS uses 96px to the inch, PDF user space 72 units to the inch. 
+ static constexpr double pt_to_px = 96.0 / 72.0; + static constexpr double pt_to_in = 1 / 72.0; + for (pdf::Page *page : pages) { const pdf::Array &page_box = page->media_box.as_array(); + const double box_x0 = page_box[0].as_real(); + const double box_y0 = page_box[1].as_real(); + const double width = page_box[2].as_real() - box_x0; + const double height = page_box[3].as_real() - box_y0; out.write_element_begin( - "div", HtmlElementOptions().set_style([&](std::ostream &o) { - o << "position:relative;"; - o << "width:" << page_box[2].as_real() / 72.0 << "in;"; - o << "height:" << page_box[3].as_real() / 72.0 << "in;"; + "div", + HtmlElementOptions().set_class("p").set_style([&](std::ostream &o) { + o << "width:" << width * pt_to_in << "in;"; + o << "height:" << height * pt_to_in << "in;"; })); std::string stream; @@ -110,66 +122,47 @@ class HtmlServiceImpl final : public HtmlService { stream += '\n'; } - std::istringstream ss(stream); - pdf::GraphicsOperatorParser parser2(ss); - pdf::GraphicsState state; - while (!ss.eof()) { - pdf::GraphicsOperator op = parser2.read_operator(); - state.execute(op); - - if (op.type == pdf::GraphicsOperatorType::text_next_line) { - double leading = state.current().text.leading; - double size = state.current().text.size; - - state.current().text.offset[1] -= size + leading; - } else if (op.type == pdf::GraphicsOperatorType::show_text) { - const std::string &font_ref = state.current().text.font; - double size = state.current().text.size; - - std::array offset = state.current().text.offset; - - pdf::Font *font = page->resources->font.at(font_ref); - - const std::string &glyphs = op.arguments[0].as_string(); - std::string unicode = font->to_unicode(glyphs); - - if (unicode.find("Colored Line") != std::string::npos) { - std::cout << "hi" << '\n'; - } - - out.write_element_begin( - "span", HtmlElementOptions().set_style([&](std::ostream &o) { - o << "position:absolute;"; - o << "left:" << offset[0] / 72.0 << "in;"; - o << "bottom:" << 
offset[1] / 72.0 << "in;"; - o << "font-size:" << size << "pt;"; - })); - out.write_raw(escape_text(unicode)); - out.write_element_end("span"); - } else if (op.type == - pdf::GraphicsOperatorType::show_text_manual_spacing) { - const std::string &font_ref = state.current().text.font; - pdf::Font *font = page->resources->font.at(font_ref); - double size = state.current().text.size; - - std::cout << font->object << '\n'; - - for (const auto &element : op.arguments[0].as_array()) { - if (element.is_real()) { - std::cout << "spacing: " << element.as_real() << '\n'; - } else if (element.is_string()) { - const std::string &glyphs = element.as_string(); - std::string unicode = font->to_unicode(glyphs); - std::cout << "show text manual spacing: font=" << font - << ", size=" << size << ", text=" << unicode << '\n'; - } - } - } else if (op.type == pdf::GraphicsOperatorType::show_text_next_line) { - std::cout << "TODO show_text_next_line" << '\n'; - } else if (op.type == - pdf::GraphicsOperatorType::show_text_next_line_set_spacing) { - std::cout << "TODO show_text_next_line_set_spacing" << '\n'; - } + // Map PDF user space (origin at the MediaBox corner, y up) to the page + // box (origin top-left, y down): flip y, then shift by the box height. + // `flip_glyph` un-mirrors the glyphs so text stays upright afterwards. + constexpr util::math::Transform2D flip_glyph = + util::math::Transform2D::scaling(1, -1); + const util::math::Transform2D to_box = + util::math::Transform2D::translation(-box_x0, -box_y0) * + util::math::Transform2D::translation_scaling(0, height, 1, -1); + + // Round CSS coordinates to 0.01px; sub-pixel precision beyond that is + // invisible and the extra digits add up over millions of spans. 
+ const auto round2 = [](const double v) { + return std::round(v * 100.0) / 100.0; + }; + + for (const pdf::TextElement &text : + pdf::extract_text(stream, *page->resources, *m_logger)) { + const util::math::Transform2D m = flip_glyph * text.transform * to_box; + + out.write_element_begin( + "span", + HtmlElementOptions().set_class("t").set_style([&](std::ostream &o) { + // TODO baseline sits at the box top until font ascent + // metrics land + if (m.b == 0 && m.c == 0 && m.a == m.d && m.a > 0) { + // Upright positive uniform scale: fold it into the font + // size and use left/top, dropping the near-universal + // matrix. a == d < 0 (half-turn) must keep the matrix. + o << "left:" << round2(m.e * pt_to_px) << "px;"; + o << "top:" << round2(m.f * pt_to_px) << "px;"; + o << "font-size:" << round2(m.a * text.size * pt_to_px) + << "px;"; + } else { + o << "transform:matrix(" << m.a << "," << m.b << "," << m.c + << "," << m.d << "," << round2(m.e * pt_to_px) << "," + << round2(m.f * pt_to_px) << ");"; + o << "font-size:" << round2(text.size * pt_to_px) << "px;"; + } + })); + out.write_raw(escape_text(text.text)); + out.write_element_end("span"); } out.write_element_end("div"); diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 0dc19044..fd55e21c 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -13,9 +13,10 @@ tables, cross-reference streams, object streams, hybrid files, with a forward-scan recovery path for broken cross-references), build the page tree with fonts and annotations, tokenize page content streams into graphics operators, and emit a **proof-of-concept HTML rendering**: absolutely positioned -text spans per `Tj`, pages sized from `MediaBox`. Encrypted files are decrypted -(RC4, AES-128, AES-256). No graphics, no images, no font files. Experimental and -not production-quality — the HTML path still contains debug `std::cout` output. 
+text spans, one per show operation, placed by the full text transform (CTM × +text matrix, stage 2.1), pages sized from `MediaBox`. Encrypted files are +decrypted (RC4, AES-128, AES-256). No glyph advances yet (stage 2.2), no +graphics, no images, no font files. Experimental and not production-quality. --- @@ -107,13 +108,28 @@ not production-quality — the HTML path still contains debug `std::cout` output deferred half of part B) or the embedded font program (stage 3) land. - **Content streams**: the full graphics-operator vocabulary is tokenized; `GraphicsState` executes a subset (state stack `q`/`Q`, matrices `cm`/`Tm`, - line parameters, text state `Tc`/`Tw`/`Tz`/`TL`/`Tf`/`Tr`/`Ts`, text - positioning `Td`/`TD`, grey/RGB/CMYK colors, glyph metrics `d0`/`d1`). Unknown + line parameters, text state `Tc`/`Tw`/`Tz`/`TL`/`Tf`/`Tr`/`Ts`, glyph metrics + `d0`/`d1`, grey/RGB/CMYK colors). The CTM **concatenates** on `cm` (ISO 32000-1 + 8.4.4); the text matrix `Tm` and text line matrix `Tlm` are tracked as 2-D + affine `Transform2D` values (`util/math_util.hpp`), with `BT` resetting them, `Td`/`TD` + /`T*` (and the line-move half of `'`/`"`) advancing `Tlm` → `Tm`. Unknown operators are logged to stderr and skipped. +- **Text layout** (`pdf_page_text`, stage 2.1): `extract_text` runs the operator + parser + `GraphicsState` over a page's content and emits a renderer-agnostic + `TextElement` per show operation (`Tj`/`TJ`/`'`/`"`) — its text-space → user- + space transform (CTM × `Tm`, with horizontal scaling and rise folded in, font + size kept separate), the resolved font, size, spacing parameters, raw codes, + and the CMap-translated Unicode. Font lookup is lenient (unknown ref → warn, + raw codes). **Glyph advances are not yet applied** (stage 2.2): each show op + yields one element at its starting origin, `TJ`'s numeric adjustments are + dropped, and `Tc`/`Tw`/`Tz`-driven spacing is carried but not consumed. 
- **HTML**: one `document.html` view; each page is a `div` sized from `MediaBox` - (points → inches), each `Tj` becomes an absolutely positioned `span` at the - text-state offset with `font-size` from `Tf` and the CMap-translated text. - `TJ`/`'`/`"` are recognized but only printed to stdout, not rendered. + (points → inches). Each `TextElement` becomes an absolutely positioned `span` + carrying a CSS `transform` matrix (the placement transform mapped from PDF user + space — y-up, MediaBox origin — into the page box in CSS pixels, the glyphs + un-mirrored so text stays upright), `font-size` from the text state, and the + Unicode text. Precise baseline placement (needs font ascent metrics) is + deferred; the baseline currently sits at the span's box top. ## Module layout @@ -133,9 +149,11 @@ not production-quality — the HTML path still contains debug `std::cout` output | `pdf_encoding.{hpp,cpp}` | Simple-font `/Encoding` → Unicode: `BaseEncoding` tables, `/Differences` overlay (`Encoding`), glyph-name → Unicode via AGL + `uniXXXX`/`uXXXXXX` (stage 1.2) | | `pdf_cid.{hpp,cpp}` | Composite-font predefined `/Encoding` → Unicode: the `Uni*-UCS2/UTF16/UTF32` CMaps decoded directly (no data tables), stage 1.3 part B; legacy CJK CMaps deferred (see `tools/pdf/generate_cid_data.py`) | | `pdf_encoding_data.{hpp,cpp}` | **Generated** (`tools/pdf/generate_encoding_data.py`): base-encoding tables + the Adobe Glyph List as a name-sorted array | +| `util/math_util.hpp` | `util::math::Transform2D`: 2-D affine transform (PDF row-vector convention) — compose, point-apply, translation/scaling factories (stage 2.1) | | `pdf_graphics_operator.hpp` | `GraphicsOperatorType` enum (full operator set) + `GraphicsOperator` (type + `Object` arguments) | | `pdf_graphics_operator_parser.{hpp,cpp}` | Content-stream tokenizer: arguments then operator name | -| `pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset | +| 
`pdf_graphics_state.{hpp,cpp}` | `GraphicsState`: stack of `State` (general/path/text/color), `execute(op)` for the modelled subset; CTM/`Tm`/`Tlm` as `Transform2D`, `text_placement_transform()` for the text rendering transform sans font size | +| `pdf_page_text.{hpp,cpp}` | `extract_text`: run the content stream through `GraphicsState`, emit a `TextElement` (placed transform + font/size/spacing + codes + Unicode) per show operation (stage 2.1) | | `pdf_file.{hpp,cpp}` | `abstract::PdfFile` wrapper; probes encryption at construction and implements `password_encrypted()`/`decrypt()`, carrying the authenticated `Decryptor` (not the password) so rendering needs no re-derivation | Consumers outside the module: `open_strategy.cpp` (detection / engine @@ -170,11 +188,14 @@ selection) and `html/pdf_file.cpp` (`create_pdf_service`). 5. **Decode content.** Per page (depth-first), the `Contents` streams are read, decoded through their `/Filter` chain (`read_decoded_stream`), concatenated with a newline between streams. -6. **Execute and emit.** `GraphicsOperatorParser` tokenizes; `GraphicsState` - updates the state stack. `T*` advances the text offset by `size + leading`; - `Tj` emits a positioned `span` using `state.text.offset` and the `Tf` size, - glyphs translated through the font's CMap. The text and transform matrices - are tracked but **not applied** to positioning. +6. **Lay out and emit.** `extract_text` runs `GraphicsOperatorParser` + + `GraphicsState` over the content and returns a `TextElement` per show + operation, each placed by `text_placement_transform()` (CTM × `Tm`, with + horizontal scaling and rise folded in), its glyphs translated through the + font's CMap. The HTML layer maps each element to a positioned `span` with a + CSS `transform` (PDF user space → the page box in CSS pixels) and `font-size` + from the text state. Glyph advances are **not yet applied** (stage 2.2), so + shows without an explicit move overlap. 
--- @@ -221,12 +242,12 @@ as absent (7.5.8.3). A structural throw in the cross-reference layer is not fatal, though: it is caught once and the file is forward-scanned to rebuild the table (*Cross-reference recovery* above) before giving up. -**Debug output still in place.** `html/pdf_file.cpp`, `pdf_graphics_state.cpp`, -`pdf_graphics_operator_parser.cpp` and `pdf_cmap_parser.cpp` print diagnostics -(and one leftover `"hi"` breakpoint marker) to stdout/stderr instead of -`Logger`. Proof-of-concept residue; should move to `Logger` or be removed. -`DocumentParser` itself takes an optional `Logger &` (default `Logger::null()`) -and routes its warnings through it — new diagnostics should do the same. +**Debug output still in place.** `pdf_graphics_state.cpp` (dash pattern, +stroke/other color) and `pdf_graphics_operator_parser.cpp` still print diagnostics to +stdout/stderr instead of `Logger`. The text path is now clean: `html/pdf_file.cpp` +and `pdf_page_text.cpp` route through `Logger` and the leftover `"hi"` marker is +gone (stage 2.1). `DocumentParser` (optional, default `Logger::null()`) and +`extract_text` (required `const Logger &`) take a logger — new diagnostics should too. **Rendering is deferred to the browser; display and text are decoupled.** We emit no rasterized output: glyphs render via the embedded font (`@font-face`, stage 3) @@ -302,9 +323,17 @@ such PDFs look right, their text just isn't selectable until the tables land. `translate_predefined_cmap` over the predefined Unicode CMaps — `UCS2`/`UTF16` (incl. a surrogate pair) and `UTF32` decoding, a `-V` writing-mode variant, and the `nullopt` for `Identity-H` and the legacy CJK CMaps (stage 1.3 part B). +- `test/src/internal/util/math_util_test.cpp` — **assertion-based**, no fixtures: + `Transform2D` point-apply (identity/translation/scaling), the ordered + (row-vector) composition, and compose-then-apply ≡ sequential apply (stage 2.1). 
+- `test/src/internal/pdf/pdf_page_text.cpp` — **assertion-based**, inline content + streams through `extract_text` (empty resources, so codes pass through as + `text`): `Td` translation, `Tm` scaling, `cm` CTM concatenation under `Tm`, + horizontal scaling and rise in the transform, `TJ` string concatenation, and + the `T*`/`'`/`"` line moves with their leading and spacing (stage 2.1). No assertion-based coverage of the tokenizer (escapes, references, hex strings) -or the HTML output. +or the HTML output itself (the span emission / CSS transform mapping). --- @@ -358,30 +387,80 @@ fixture needs them yet: ## Stage 2 — text positioning & metrics -Independent of Unicode work; fixes layout even with today's partial CMaps. - -- Apply the full transform: text matrix × CTM (both tracked in `GraphicsState` - but never applied), text rise, horizontal scaling. -- **Glyph advances**: `/Widths` + `/MissingWidth` (simple), `/W` + `/DW` (CID), - char/word spacing, the numeric adjustments in `TJ` — so `TJ`, `'`, `"` finally - render and `Tj` runs land correctly. -- **Form XObjects** (`Do` on a `/Form`): recursive content-stream execution with - scoped `/Resources` and the form matrix. Many producers put most page content - inside forms, and tiling patterns (stage 4) and annotation appearances - (stage 5) run on the same machinery — a structural prerequisite. -- **Text render modes** (`Tr`): mode 3 (invisible text, OCR-over-scan) must stay - selectable but unpainted; stroke/clip modes (1–2, 4–7) need graceful - degradation. -- **Space inference**: PDFs routinely encode no spaces; insert them from - glyph-gap heuristics (as pdf2htmlEX does) so copy/paste and search work. -- Layout side of bidi (RTL run ordering) and vertical writing (Identity-V/CJK). -- HTML mapping decision: per-run spans with CSS `transform` (cheap, breaks on - heavy kerning) vs. per-glyph positioning (exact, verbose) — likely per-run - with a kerning threshold that splits runs, like pdf2htmlEX. 
-- **Extraction refinements** (was stage 1.5, rides on the run plumbing above): - mark a run "no Unicode" when the code → Unicode chain yields nothing, so stage 3 - can re-encode it; honour `/ActualText` (tagged PDFs, ligatures) as an extraction - override of the whole chain. +Independent of Unicode work; fixes layout even with today's partial CMaps. Split +into sub-stages (mirroring stage 1's slicing), each its own PR. + +**Architecture decision (2026-06): a renderer-agnostic placed-text emission.** +The content executor produces a per-page list of **placed text items**, each +carrying its text-space → user-space transform (CTM × text matrix, with +horizontal scaling / rise folded in; font size separate), the resolved font, +size, the text-state spacing parameters, the raw character codes, and a Unicode +representation (which may lack inferred spaces). The HTML layer consumes that +list and decides how to map it — per-run spans with a CSS `transform` vs. +per-glyph positioning. **The core never commits to either**; this pushes the +run-vs-glyph question all the way down to rendering. (The earlier framing of an +up-front "HTML mapping decision" is dissolved into this.) + +### 2.1 — transforms & the placed-text emission — **in progress** + +The geometry foundation plus the emission contract, *without* glyph advances: +- A 2-D affine `Transform2D` (`util/math_util.hpp`): compose, point-apply, + translation/scaling factories. +- Apply the full transform chain in `GraphicsState`: the CTM now *concatenates* + on `cm` (it was overwritten); the text matrix `Tm` and text line matrix `Tlm` + are tracked properly (`BT` resets them, `Tm` sets both, `Td`/`TD`/`T*` and the + line-move half of `'`/`"` update `Tlm` → `Tm`); text rise and horizontal + scaling fold into the text rendering matrix. 
+- A `TextElement` emission (`pdf_page_text.{hpp,cpp}`): `extract_text(content, + resources, logger)` runs the operator parser + state and yields one placed item + per show operation (`Tj`/`TJ`/`'`/`"`), positioned by the text rendering + matrix. Font lookup is lenient (unknown ref → warn, codes pass through). +- The HTML layer maps each `TextElement` to an absolutely-positioned span with a + CSS `transform` (full matrix, incl. rotation/scaling), font size from the text + state, the page y-axis flipped once per page. The text-path debug `std::cout` + (incl. the `"hi"` marker) is removed. + +**Deliberately out of scope here (→ 2.2):** glyph advances (`/Widths`, +`/W`/`/DW`) and the *application* of char/word spacing and the `TJ` numeric +adjustments, so consecutive shows on a line without an explicit move still +overlap, and `TJ` renders its strings concatenated at one origin. Precise +baseline placement (needs font ascent metrics) is likewise deferred. Whether 2.2 +folds into this PR or branches off is an open call once 2.1 lands. + +### 2.2 — glyph advances & metrics + +Parse `/Widths` + `/MissingWidth` (simple) and `/W` + `/DW` (CID); apply char/word +spacing, horizontal scaling and the `TJ` numeric adjustments to advance the text +matrix per glyph — so `TJ`, `'`, `"` land correctly, `Tj` runs space correctly, +and the emission can be subdivided per glyph (which makes the renderer's +per-glyph option exercisable). + +### 2.3 — Form XObjects + +`Do` on a `/Form`: recursive content-stream execution with scoped `/Resources` +and the form matrix; extend `Resources` parsing to the XObject table. Many +producers put most page content inside forms, and tiling patterns (stage 4) and +annotation appearances (stage 5) run on the same machinery — a structural +prerequisite. 
+ +### 2.4 — text render modes & extraction refinements + +**Text render modes** (`Tr`): mode 3 (invisible text, OCR-over-scan) must stay +selectable but unpainted; stroke/clip modes (1–2, 4–7) need graceful +degradation. Plus **extraction refinements** (was stage 1.5, rides on the run +plumbing): mark a run "no Unicode" when the code → Unicode chain yields nothing, +so stage 3 can re-encode it; honour `/ActualText` (tagged PDFs, ligatures) as an +extraction override of the whole chain. + +### 2.5 — space inference + +PDFs routinely encode no spaces; insert them from glyph-gap heuristics (as +pdf2htmlEX does) so copy/paste and search work. + +### 2.6 — bidi & vertical writing (deferral candidate) + +Layout side of bidi (RTL run ordering) and vertical writing (Identity-V/CJK). No +corpus fixture needs it yet — likely pushed out until one does. ## Stage 3 — fonts in HTML diff --git a/src/odr/internal/pdf/pdf_graphics_state.cpp b/src/odr/internal/pdf/pdf_graphics_state.cpp index 5c4c75ae..7c415029 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.cpp +++ b/src/odr/internal/pdf/pdf_graphics_state.cpp @@ -20,6 +20,12 @@ ColorSpace color_space_name_to_enum(const std::string &name) { return util::map::lookup_default(mapping, name, ColorSpace::unknown); } +util::math::Transform2D matrix_from_args(const GraphicsOperator &op) { + return {op.arguments.at(0).as_real(), op.arguments.at(1).as_real(), + op.arguments.at(2).as_real(), op.arguments.at(3).as_real(), + op.arguments.at(4).as_real(), op.arguments.at(5).as_real()}; +} + } // namespace GraphicsState::GraphicsState() { stack.emplace_back(); } @@ -30,6 +36,23 @@ const GraphicsState::State &GraphicsState::current() const { return stack.back(); } +util::math::Transform2D GraphicsState::text_placement_transform() const { + const Text &text = current().text; + // text rendering matrix without the font size (ISO 32000-1 9.4.4): the font + // size scales x and y, horizontal scaling scales x only, rise offsets y. 
+ const util::math::Transform2D params = + util::math::Transform2D::translation_scaling( + 0, text.rise, text.horizontal_scaling / 100.0, 1); + return params * text.matrix * current().general.transform_matrix; +} + +void GraphicsState::next_line(const double tx, const double ty) { + Text &text = current().text; + text.line_matrix = + util::math::Transform2D::translation(tx, ty) * text.line_matrix; + text.matrix = text.line_matrix; +} + void GraphicsState::execute(const GraphicsOperator &op) { switch (op.type) { case GraphicsOperatorType::save_state: @@ -40,9 +63,9 @@ void GraphicsState::execute(const GraphicsOperator &op) { break; case GraphicsOperatorType::set_matrix: - for (int i = 0; i < 6; ++i) { - current().general.transform_matrix.at(i) = op.arguments.at(i).as_real(); - } + // `cm` concatenates: CTM = matrix * CTM (ISO 32000-1 8.4.4). + current().general.transform_matrix = + matrix_from_args(op) * current().general.transform_matrix; break; case GraphicsOperatorType::set_line_width: @@ -123,21 +146,33 @@ void GraphicsState::execute(const GraphicsOperator &op) { current().text.rise = op.arguments.at(0).as_real(); break; - case GraphicsOperatorType::text_next_line_relative: - for (int i = 0; i < 2; ++i) { - current().text.offset.at(i) += op.arguments.at(i).as_real(); - } + case GraphicsOperatorType::begin_text: + // BT initializes both the text matrix and the text line matrix to identity. 
+ current().text.matrix = util::math::Transform2D(); + current().text.line_matrix = util::math::Transform2D(); break; - case GraphicsOperatorType::text_next_line_relative_leading: + case GraphicsOperatorType::text_next_line_relative: // Td + next_line(op.arguments.at(0).as_real(), op.arguments.at(1).as_real()); + break; + case GraphicsOperatorType::text_next_line_relative_leading: // TD current().text.leading = -op.arguments.at(1).as_real(); - for (int i = 0; i < 2; ++i) { - current().text.offset.at(i) += op.arguments.at(i).as_real(); - } + next_line(op.arguments.at(0).as_real(), op.arguments.at(1).as_real()); break; - case GraphicsOperatorType::set_text_matrix: - for (int i = 0; i < 6; ++i) { - current().text.transform_matrix.at(i) = op.arguments.at(i).as_real(); - } + case GraphicsOperatorType::set_text_matrix: // Tm + current().text.matrix = matrix_from_args(op); + current().text.line_matrix = current().text.matrix; + break; + case GraphicsOperatorType::text_next_line: // T* + next_line(0, -current().text.leading); + break; + case GraphicsOperatorType::show_text_next_line: // ' : T* then show + next_line(0, -current().text.leading); + break; + case GraphicsOperatorType::show_text_next_line_set_spacing: + // " : aw ac string -> set word/char spacing, T*, then show + current().text.word_spacing = op.arguments.at(0).as_real(); + current().text.char_spacing = op.arguments.at(1).as_real(); + next_line(0, -current().text.leading); break; case GraphicsOperatorType::set_stroke_color_space: diff --git a/src/odr/internal/pdf/pdf_graphics_state.hpp b/src/odr/internal/pdf/pdf_graphics_state.hpp index b65079f2..98c03a75 100644 --- a/src/odr/internal/pdf/pdf_graphics_state.hpp +++ b/src/odr/internal/pdf/pdf_graphics_state.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include #include #include @@ -25,7 +27,7 @@ struct GraphicsState { std::string color_rendering_intent; double flatness_tolerance{}; std::string graphics_state_parameters; - std::array transform_matrix{1, 0, 0, 1, 
0, 0}; + util::math::Transform2D transform_matrix; // CTM }; struct Path { @@ -34,16 +36,16 @@ struct GraphicsState { }; struct Text { - double char_spacing{0}; - double word_spacing{0}; - double horizontal_scaling{1}; - double leading{0}; - std::string font; - double size{}; - int rendering_mode{0}; - double rise{0}; - std::array offset{0, 0}; - std::array transform_matrix{1, 0, 0, 1, 0, 0}; + double char_spacing{0}; // Tc + double word_spacing{0}; // Tw + double horizontal_scaling{100}; // Tz, in percent (100 = normal) + double leading{0}; // TL + std::string font; // Tf resource name + double size{}; // Tf size + int rendering_mode{0}; // Tr + double rise{0}; // Ts + util::math::Transform2D matrix; // Tm + util::math::Transform2D line_matrix; // Tlm std::array glyph_width{}; std::array glyph_bounding_box{}; }; @@ -72,6 +74,18 @@ struct GraphicsState { [[nodiscard]] const State ¤t() const; void execute(const GraphicsOperator &); + + /// Text rendering transform *excluding* the font size: maps text space (1 + /// unit = 1 em at the current font size) to user space, with horizontal + /// scaling and rise folded in. The font size is applied separately (as the + /// rendered font-size), which keeps the run-vs-glyph mapping decision in the + /// renderer. + [[nodiscard]] util::math::Transform2D text_placement_transform() const; + +private: + /// Move to the start of a new text line: `Tlm = translate(tx, ty) * Tlm` and + /// `Tm = Tlm` (the shared mechanic behind `Td`, `TD`, `T*`, `'`, `"`). 
+ void next_line(double tx, double ty); }; } // namespace odr::internal::pdf diff --git a/src/odr/internal/pdf/pdf_page_text.cpp b/src/odr/internal/pdf/pdf_page_text.cpp new file mode 100644 index 00000000..ee2ac2e5 --- /dev/null +++ b/src/odr/internal/pdf/pdf_page_text.cpp @@ -0,0 +1,105 @@ +#include + +#include + +#include +#include +#include +#include + +#include +#include + +namespace odr::internal::pdf { +namespace { + +Font *lookup_font(const Resources &resources, const std::string &name, + const Logger &logger, std::set &warned) { + if (const auto it = resources.font.find(name); it != resources.font.end()) { + return it->second; + } + if (warned.insert(name).second) { + ODR_WARNING(logger, "pdf: unknown font resource '" + name + + "', emitting raw codes"); + } + return nullptr; +} + +void emit(std::vector &out, const GraphicsState &state, + std::string codes, Font *font) { + const GraphicsState::Text &text = state.current().text; + + TextElement element; + element.transform = state.text_placement_transform(); + element.font = font; + element.size = text.size; + element.char_spacing = text.char_spacing; + element.word_spacing = text.word_spacing; + element.horizontal_scaling = text.horizontal_scaling; + element.rise = text.rise; + element.rendering_mode = text.rendering_mode; + element.text = font != nullptr ? font->to_unicode(codes) : codes; + element.codes = std::move(codes); + + out.push_back(std::move(element)); +} + +/// Concatenate the string elements of a `TJ` array, dropping the numeric +/// position adjustments (their application is stage 2.2). 
+std::string join_array_strings(const Array &array) { + std::string codes; + for (const Object &element : array) { + if (element.is_string()) { + codes += element.as_string(); + } + } + return codes; +} + +} // namespace + +} // namespace odr::internal::pdf + +namespace odr::internal { + +std::vector pdf::extract_text(const std::string &content, + const Resources &resources, + const Logger &logger) { + std::vector result; + std::set warned; + + std::istringstream ss(content); + GraphicsOperatorParser parser(ss); + GraphicsState state; + + while (!ss.eof()) { + const GraphicsOperator op = parser.read_operator(); + state.execute(op); + + switch (op.type) { + case GraphicsOperatorType::show_text: + case GraphicsOperatorType::show_text_next_line: { // Tj, ' + Font *font = + lookup_font(resources, state.current().text.font, logger, warned); + emit(result, state, op.arguments.at(0).as_string(), font); + } break; + case GraphicsOperatorType::show_text_manual_spacing: { // TJ + Font *font = + lookup_font(resources, state.current().text.font, logger, warned); + emit(result, state, join_array_strings(op.arguments.at(0).as_array()), + font); + } break; + case GraphicsOperatorType::show_text_next_line_set_spacing: { // " + Font *font = + lookup_font(resources, state.current().text.font, logger, warned); + emit(result, state, op.arguments.at(2).as_string(), font); + } break; + default: + break; + } + } + + return result; +} + +} // namespace odr::internal diff --git a/src/odr/internal/pdf/pdf_page_text.hpp b/src/odr/internal/pdf/pdf_page_text.hpp new file mode 100644 index 00000000..eff56332 --- /dev/null +++ b/src/odr/internal/pdf/pdf_page_text.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include + +#include +#include + +namespace odr { +class Logger; +} + +namespace odr::internal::pdf { + +struct Resources; +struct Font; + +/// One show-text operation laid out in user space. 
The transform places the +/// text origin and orientation; the font size is kept separate so the renderer +/// can choose per-run or per-glyph mapping. Spacing parameters and the raw +/// codes are carried for the (deferred) advance application. +struct TextElement { + /// Text-space -> user-space, font size *not* applied (see + /// `GraphicsState::text_placement_transform`). + util::math::Transform2D transform; + /// Resolved font, or `nullptr` when the `/Font` resource name was unknown. + Font *font{nullptr}; + double size{0}; // Tf size + double char_spacing{0}; // Tc + double word_spacing{0}; // Tw + double horizontal_scaling{100}; // Tz, percent + double rise{0}; // Ts + int rendering_mode{0}; // Tr + /// Raw character codes shown (for `TJ`, the array's strings concatenated). + std::string codes; + /// Unicode representation of `codes`; may lack spaces the producer cannot + /// infer (space inference is stage 2.5). + std::string text; +}; + +/// Execute a page's (decoded, concatenated) content stream and collect the text +/// it shows as placed elements. Non-text operators update the graphics state +/// but produce no output. Glyph advances are not yet applied (stage 2.2), so +/// each show operation yields a single element at its starting origin. +std::vector extract_text(const std::string &content, + const Resources &resources, + const Logger &logger); + +} // namespace odr::internal::pdf diff --git a/src/odr/internal/util/math_util.hpp b/src/odr/internal/util/math_util.hpp new file mode 100644 index 00000000..e5fa57b5 --- /dev/null +++ b/src/odr/internal/util/math_util.hpp @@ -0,0 +1,63 @@ +#pragma once + +#include + +namespace odr::internal::util::math { + +/// 2-D affine transform using PDF's row-vector convention: `[a b c d e f]` +/// denotes +/// +/// | a b 0 | +/// | c d 0 | +/// | e f 1 | +/// +/// Points are row vectors multiplied on the left: `[x y 1] * M`. 
Composition
+/// follows the same order: `a * b` means "apply `a`, then `b`" (ISO 32000-1
+/// 8.3.4).
+struct Transform2D {
+  double a{1};
+  double b{0};
+  double c{0};
+  double d{1};
+  double e{0};
+  double f{0};
+
+  constexpr Transform2D() noexcept = default;
+  constexpr Transform2D(const double a_, const double b_, const double c_,
+                        const double d_, const double e_,
+                        const double f_) noexcept
+      : a{a_}, b{b_}, c{c_}, d{d_}, e{e_}, f{f_} {}
+
+  constexpr static Transform2D translation(const double x,
+                                           const double y) noexcept {
+    return {1, 0, 0, 1, x, y};
+  }
+  constexpr static Transform2D scaling(const double x,
+                                       const double y) noexcept {
+    return {x, 0, 0, y, 0, 0};
+  }
+  constexpr static Transform2D translation_scaling(const double x,
+                                                   const double y,
+                                                   const double sx,
+                                                   const double sy) noexcept {
+    return {sx, 0, 0, sy, x, y};
+  }
+
+  /// `*this` applied first, then `rhs`.
+  [[nodiscard]] constexpr Transform2D
+  operator*(const Transform2D &rhs) const noexcept {
+    return {
+        a * rhs.a + b * rhs.c,         a * rhs.b + b * rhs.d,
+        c * rhs.a + d * rhs.c,         c * rhs.b + d * rhs.d,
+        e * rhs.a + f * rhs.c + rhs.e, e * rhs.b + f * rhs.d + rhs.f,
+    };
+  }
+
+  /// Map the point `(x, y)` through the transform.
+  [[nodiscard]] constexpr std::array<double, 2>
+  apply(const double x, const double y) const noexcept {
+    return {a * x + c * y + e, b * x + d * y + f};
+  }
+};
+
+} // namespace odr::internal::util::math
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 0ac3f996..17ba1af6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -47,7 +47,9 @@ add_executable(odr_test
         "src/internal/pdf/pdf_file_object.cpp"
         "src/internal/pdf/pdf_file_parser.cpp"
         "src/internal/pdf/pdf_filter.cpp"
+        "src/internal/util/math_util_test.cpp"
         "src/internal/pdf/pdf_object_parser.cpp"
+        "src/internal/pdf/pdf_page_text.cpp"
         "src/internal/pdf/pdf_test_file_builder.cpp"
 
         "src/internal/svm/svm_test.cpp"
diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private
index 83dfd8f5..9c1e64e4 160000
--- a/test/data/reference-output/odr-private
+++ b/test/data/reference-output/odr-private
@@ -1 +1 @@
-Subproject commit 83dfd8f556e33db8db37822351f75305d60e8cb9
+Subproject commit 9c1e64e43623027cbdf1392fb6e8fe7022f2ad70
diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public
index c5d35204..ceb9001b 160000
--- a/test/data/reference-output/odr-public
+++ b/test/data/reference-output/odr-public
@@ -1 +1 @@
-Subproject commit c5d35204cd3ae501f79c88f26ac80293cefd64c4
+Subproject commit ceb9001b1506b3fa218bd722ce0e6105d321a218
diff --git a/test/src/internal/pdf/pdf_page_text.cpp b/test/src/internal/pdf/pdf_page_text.cpp
new file mode 100644
index 00000000..efe4f33f
--- /dev/null
+++ b/test/src/internal/pdf/pdf_page_text.cpp
@@ -0,0 +1,109 @@
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+#include
+
+using namespace odr::internal::pdf;
+using odr::Logger;
+
+namespace {
+
+// Run a content stream with no font resources: fonts resolve to null, so the
+// emitted `text` is the raw codes and we can assert positioning in isolation.
+std::vector<TextElement> run(const std::string &content) {
+  Resources resources;
+  return extract_text(content, resources, Logger::null());
+}
+
+} // namespace
+
+// `Td` places the origin via the text line matrix; the font size is carried
+// separately, not folded into the transform.
+TEST(PdfPageText, td_translation) {
+  const auto texts = run("BT /F1 12 Tf 100 700 Td (Hi) Tj ET");
+  ASSERT_EQ(texts.size(), 1u);
+  EXPECT_DOUBLE_EQ(texts[0].transform.e, 100);
+  EXPECT_DOUBLE_EQ(texts[0].transform.f, 700);
+  EXPECT_DOUBLE_EQ(texts[0].transform.a, 1);
+  EXPECT_DOUBLE_EQ(texts[0].transform.d, 1);
+  EXPECT_DOUBLE_EQ(texts[0].size, 12);
+  EXPECT_EQ(texts[0].codes, "Hi");
+  EXPECT_EQ(texts[0].text, "Hi"); // no font -> raw codes pass through
+}
+
+// `Tm` sets the text matrix outright, scaling and all.
+TEST(PdfPageText, tm_scaling) {
+  const auto texts = run("BT /F1 10 Tf 2 0 0 2 50 60 Tm (X) Tj ET");
+  ASSERT_EQ(texts.size(), 1u);
+  EXPECT_DOUBLE_EQ(texts[0].transform.a, 2);
+  EXPECT_DOUBLE_EQ(texts[0].transform.d, 2);
+  EXPECT_DOUBLE_EQ(texts[0].transform.e, 50);
+  EXPECT_DOUBLE_EQ(texts[0].transform.f, 60);
+}
+
+// `cm` concatenates into the CTM, which composes under the text matrix.
+TEST(PdfPageText, ctm_concatenates) {
+  const auto texts = run("2 0 0 2 0 0 cm BT /F1 10 Tf 10 20 Td (Y) Tj ET");
+  ASSERT_EQ(texts.size(), 1u);
+  EXPECT_DOUBLE_EQ(texts[0].transform.a, 2);
+  EXPECT_DOUBLE_EQ(texts[0].transform.d, 2);
+  EXPECT_DOUBLE_EQ(texts[0].transform.e, 20); // 10 * 2
+  EXPECT_DOUBLE_EQ(texts[0].transform.f, 40); // 20 * 2
+}
+
+// Horizontal scaling (`Tz`, percent) scales x only, in the transform.
+TEST(PdfPageText, horizontal_scaling) {
+  const auto texts = run("BT /F1 10 Tf 50 Tz 0 0 Td (Z) Tj ET");
+  ASSERT_EQ(texts.size(), 1u);
+  EXPECT_DOUBLE_EQ(texts[0].transform.a, 0.5);
+  EXPECT_DOUBLE_EQ(texts[0].transform.d, 1);
+  EXPECT_DOUBLE_EQ(texts[0].horizontal_scaling, 50);
+}
+
+// Text rise (`Ts`) offsets the origin in y, unscaled by the font size.
+TEST(PdfPageText, text_rise) {
+  const auto texts = run("BT /F1 10 Tf 0 0 Td 5 Ts (R) Tj ET");
+  ASSERT_EQ(texts.size(), 1u);
+  EXPECT_DOUBLE_EQ(texts[0].transform.f, 5);
+  EXPECT_DOUBLE_EQ(texts[0].rise, 5);
+}
+
+// `TJ` concatenates its strings; the numeric adjustments are dropped
+// (stage 2.2).
+TEST(PdfPageText, tj_concatenates_strings) {
+  const auto texts = run("BT /F1 10 Tf 0 0 Td [(Ab) -120 (cd)] TJ ET");
+  ASSERT_EQ(texts.size(), 1u);
+  EXPECT_EQ(texts[0].codes, "Abcd");
+}
+
+// `T*` moves down one line by the leading set with `TL`.
+TEST(PdfPageText, next_line_uses_leading) {
+  const auto texts = run("BT /F1 10 Tf 14 TL 0 800 Td (a) Tj T* (b) Tj ET");
+  ASSERT_EQ(texts.size(), 2u);
+  EXPECT_DOUBLE_EQ(texts[0].transform.f, 800);
+  EXPECT_DOUBLE_EQ(texts[1].transform.f, 786); // 800 - 14
+}
+
+// `'` does the line move and then shows.
+TEST(PdfPageText, show_next_line) {
+  const auto texts = run("BT /F1 10 Tf 10 TL 0 500 Td (a) Tj (b) ' ET");
+  ASSERT_EQ(texts.size(), 2u);
+  EXPECT_DOUBLE_EQ(texts[1].transform.f, 490); // 500 - 10
+  EXPECT_EQ(texts[1].codes, "b");
+}
+
+// `"` sets word/char spacing, does the line move, then shows the third operand.
+TEST(PdfPageText, show_next_line_set_spacing) {
+  const auto texts = run("BT /F1 10 Tf 12 TL 0 400 Td (a) Tj 1 2 (b) \" ET");
+  ASSERT_EQ(texts.size(), 2u);
+  EXPECT_DOUBLE_EQ(texts[1].transform.f, 388); // 400 - 12
+  EXPECT_EQ(texts[1].codes, "b");
+  EXPECT_DOUBLE_EQ(texts[1].word_spacing, 1);
+  EXPECT_DOUBLE_EQ(texts[1].char_spacing, 2);
+}
diff --git a/test/src/internal/util/math_util_test.cpp b/test/src/internal/util/math_util_test.cpp
new file mode 100644
index 00000000..8b02ecbc
--- /dev/null
+++ b/test/src/internal/util/math_util_test.cpp
@@ -0,0 +1,52 @@
+#include <odr/internal/util/math_util.hpp>
+
+#include <gtest/gtest.h>
+
+using namespace odr::internal::util::math;
+
+TEST(Transform2D, identity_apply) {
+  const auto p = Transform2D().apply(3, 4);
+  EXPECT_DOUBLE_EQ(p[0], 3);
+  EXPECT_DOUBLE_EQ(p[1], 4);
+}
+
+TEST(Transform2D, translation_apply) {
+  const auto p = Transform2D::translation(10, -5).apply(3, 4);
+  EXPECT_DOUBLE_EQ(p[0], 13);
+  EXPECT_DOUBLE_EQ(p[1], -1);
+}
+
+TEST(Transform2D, scaling_apply) {
+  const auto p = Transform2D::scaling(2, 3).apply(3, 4);
+  EXPECT_DOUBLE_EQ(p[0], 6);
+  EXPECT_DOUBLE_EQ(p[1], 12);
+}
+
+// `a * b` applies `a` first, then `b` (row-vector convention), so order
+// matters.
+TEST(Transform2D, compose_is_ordered) {
+  // scale, then translate
+  const auto p =
+      (Transform2D::scaling(2, 2) * Transform2D::translation(1, 1)).apply(3, 4);
+  EXPECT_DOUBLE_EQ(p[0], 7); // (3,4) -> (6,8) -> (7,9)
+  EXPECT_DOUBLE_EQ(p[1], 9);
+
+  // translate, then scale -> different result
+  const auto q =
+      (Transform2D::translation(1, 1) * Transform2D::scaling(2, 2)).apply(3, 4);
+  EXPECT_DOUBLE_EQ(q[0], 8); // (3,4) -> (4,5) -> (8,10)
+  EXPECT_DOUBLE_EQ(q[1], 10);
+}
+
+// Composing then applying equals applying each factor in sequence.
+TEST(Transform2D, compose_matches_sequential_apply) {
+  const Transform2D first{1, 2, 3, 4, 5, 6};
+  const Transform2D second{2, 0, 1, 3, -1, 4};
+
+  // Walk the point through one factor at a time, then through the product.
+  const auto [mid_x, mid_y] = first.apply(7, 8);
+  const auto [chained_x, chained_y] = second.apply(mid_x, mid_y);
+  const auto [composed_x, composed_y] = (first * second).apply(7, 8);
+  EXPECT_DOUBLE_EQ(composed_x, chained_x);
+  EXPECT_DOUBLE_EQ(composed_y, chained_y);
+}