opendocument-app · andiwand · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/.gitignore b/.gitignore
@@ -60,3 +60,6 @@ __pycache__/
 CMakeUserPresets.json
 
 offline/
+
+## PDF CJK CMap input data (fetched by tools/pdf/generate_cid_data.py)
+tools/pdf/cmap-resources/
diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md
@@ -93,8 +93,16 @@ not production-quality — the HTML path still contains debug `std::cout` output
   bytes). When a simple font carries no `ToUnicode` CMap, `Font::to_unicode`
   falls back to its `/Encoding` — a base encoding (Standard/WinAnsi/MacRoman)
   overlaid with `/Differences`, each code → glyph name → Unicode via the Adobe
-  Glyph List (incl. the `uniXXXX`/`uXXXXXX` forms), stage 1.2. Predefined CJK
-  CMaps and embedded-font fallbacks are still stage 1.3–1.4.
+  Glyph List (incl. the `uniXXXX`/`uXXXXXX` forms), stage 1.2. **Composite
+  (Type0) fonts** (stage 1.3, part A) are recognized: the descendant CIDFont's
+  `/CIDSystemInfo` `/Registry`/`/Ordering` is recorded on the `Font`, and the
+  Type0 `/Encoding` (a code → CID CMap such as `Identity-H`) is kept out of the
+  simple-font encoding path. Extraction is driven by the `/ToUnicode` CMap (the
+  common case — every Type0 font in the corpus carries one); a composite font
+  *without* a `/ToUnicode` yields "no Unicode" (not byte-garbage) until the
+  predefined CID → Unicode tables (part B) or the embedded font program
+  (stage 1.4) land. Predefined CJK CMaps and embedded-font fallbacks are still
+  stage 1.3 (part B)–1.4.
 - **Content streams**: the full graphics-operator vocabulary is tokenized;
   `GraphicsState` executes a subset (state stack `q`/`Q`, matrices `cm`/`Tm`,
   line parameters, text state `Tc`/`Tw`/`Tz`/`TL`/`Tf`/`Tr`/`Ts`, text
@@ -117,7 +125,7 @@ not production-quality — the HTML path still contains debug `std::cout` output
 | `pdf_document_parser.{hpp,cpp}`        | `parse_document()`: xref/trailer chain → catalog → page tree; lazy object reads with cache; (deep) reference resolution |
 | `pdf_encryption.{hpp,cpp}`             | Standard security handler: `Authenticator` (parse `/Encrypt`, authenticate password → `Decryptor`) and `Decryptor` (decrypt strings/streams; RC4, AES-128, AES-256), plus a `standard_security` namespace of pure key/password algorithms for known-answer tests |
 | `pdf_document.hpp`                     | `Document`: arena of `Element`s + `catalog` pointer |
-| `pdf_document_element.hpp`             | Element structs: `Catalog`, `Pages`, `Page`, `Annotation`, `Resources`, `Font` |
+| `pdf_document_element.hpp`             | Element structs: `Catalog`, `Pages`, `Page`, `Annotation`, `Resources`, `Font` (incl. the `composite`/`cid_registry`/`cid_ordering` Type0 facts and `to_unicode`) |
 | `pdf_cmap.{hpp,cpp}`                   | `CMap`: 1-byte glyph → UTF-16 `bfchar` map + string translation |
 | `pdf_cmap_parser.{hpp,cpp}`            | `ToUnicode` CMap stream parser (`begincodespacerange`/`beginbfchar`/`beginbfrange`; only `bfchar` applied) |
 | `pdf_encoding.{hpp,cpp}`               | Simple-font `/Encoding` → Unicode: `BaseEncoding` tables, `/Differences` overlay (`Encoding`), glyph-name → Unicode via AGL + `uniXXXX`/`uXXXXXX` (stage 1.2) |
@@ -247,7 +255,10 @@ and routes its warnings through it — new diagnostics should do the same.
   US-Letter lenience), plus cross-reference-recovery coverage (inline broken
   mini-PDFs: garbage prepended, a bad `startxref`, no trailer at all → catalog
   scan, a duplicate id → last definition wins, a page tree living in an object
-  stream). End-to-end: the classic fixture
+  stream), plus composite-font coverage (an `Identity-H` Type0 font over a
+  descendant `CIDFontType2`: `composite`/`/CIDSystemInfo` recorded, 2-byte
+  `/ToUnicode` extraction, and the no-`/ToUnicode` "no Unicode" fallback).
+  End-to-end: the classic fixture
   `odr-public/pdf/style-various-1.pdf`, plus decryption of
   `odr-public/pdf/Casio_WVA-M650-7AJF.pdf` (RC4, empty password) and
   `odr-private/pdf/encrypted_fontfile3_opentype.pdf` (AES-256; skipped when the
@@ -303,8 +314,10 @@ per-code Unicode (or "unknown", which stage 3 handles). The stage is **too
 large for one change** — it bundles work of very different size and dependency,
 so it is split into the sub-stages below. They are independently useful and
 ordered by corpus frequency; each is its own branch/PR off this roadmap. Sub-
-stages 1.1 and 1.2 have landed; **1.3 (composite/CID fonts) is the next work**;
-1.4 is blocked on stage 3 and stays deferred until then.
+stages 1.1 and 1.2 have landed, as has **1.3 part A** (Type0 structure +
+`Identity-H/V` + `/ToUnicode`-driven extraction); **1.3 part B (predefined CJK
+CMaps + CID → Unicode tables) is the next work**; 1.4 is blocked on stage 3 and
+stays deferred until then.
 
 ### 1.1 — `ToUnicode` CMap: multi-byte codes, `bfrange`, multi-char targets — **done**
 
@@ -386,7 +399,28 @@ Remaining (1.2 deferrals):
 
 `Identity-H/V` plus the predefined CMaps (CJK); map CID → Unicode via the CID
 system info where defined. The predefined CMaps are large external data sets —
-the heaviest data chunk of the stage. Own branch.
+the heaviest data chunk of the stage. Split into two parts because the data
+weight is concentrated in part B, and the whole local corpus (every Type0 font
+is `Identity-H` + `/ToUnicode`) is covered by part A alone.
+
+**Part A — Type0 structure + `Identity-H/V` + `/ToUnicode` — done.** `parse_font`
+detects `/Subtype /Type0`, walks `/DescendantFonts[0]` to record the descendant
+CIDFont's `/CIDSystemInfo` `/Registry`/`/Ordering` on `Font` (`composite`,
+`cid_registry`, `cid_ordering`), and keeps the Type0 `/Encoding` (a code → CID
+CMap, not a glyph-name encoding) out of `parse_encoding` — so `Identity-H` no
+longer trips the "unsupported /Encoding name" warning. Extraction runs through
+the existing multi-byte `/ToUnicode` path (stage 1.1); `Font::to_unicode`
+returns "no Unicode" for a composite font lacking a `/ToUnicode` rather than
+mis-splitting its multi-byte codes through the single-byte identity fallback.
+Tests: `DocumentParser.composite_font_with_to_unicode` /
+`…_without_to_unicode_yields_no_unicode`.
+
+**Part B — predefined CJK CMaps + CID → Unicode — next.** Vendor Adobe's
+`cmap-resources` (the predefined code → CID CMaps) and the CID → Unicode mapping
+tables, generated to committed C++ like the AGL in 1.2 (`tools/pdf`). Select the
+table by the recorded `/Registry`+`/Ordering`. **Blocked on a CJK test fixture**
+— the corpus has none, so it could not be validated alongside part A. Own
+branch.
 
 ### 1.4 — embedded-font fallback — **deferred (needs stage 3)**
 
@@ -563,8 +597,11 @@ tree, little else.
 - **CMap coverage**: the `ToUnicode` CMap is fully handled (multi-byte codes,
   `bfchar`, both `bfrange` forms, multi-char targets — stage 1.1), and a simple
   font's `/Encoding` (base + `/Differences` → AGL) is the fallback when no
-  `ToUnicode` stream is present (stage 1.2). Still open: predefined CJK CMaps and
-  embedded-font reverse maps (stages 1.3–1.4); symbolic fonts with a built-in
-  encoding default to StandardEncoding until 1.4.
+  `ToUnicode` stream is present (stage 1.2). Composite (Type0) fonts are
+  recognized and extract through their `/ToUnicode` (stage 1.3 part A). Still
+  open: predefined CJK CMaps and the CID → Unicode tables for composite fonts
+  without a `/ToUnicode` (stage 1.3 part B), and embedded-font reverse maps
+  (stage 1.4); symbolic fonts with a built-in encoding default to
+  StandardEncoding until 1.4.
 - **Annotations** are collected but their content is not interpreted (stage 5).
 - Revisit the reference-by-lookahead parsing and `read_stream(-1)` fallback.
diff --git a/src/odr/internal/pdf/pdf_document.cpp b/src/odr/internal/pdf/pdf_document.cpp
@@ -33,6 +33,15 @@ std::string Font::to_unicode(const std::string &codes) const {
   if (!cmap.empty()) {
     return cmap.translate_string(codes);
   }
+  if (composite) {
+    // A composite (Type0) font with no `ToUnicode` CMap: code -> CID is known
+    // (identity for `Identity-H/V`) but CID -> Unicode needs either a
+    // predefined CID -> Unicode table (stage 1.3 part B) or the embedded font
+    // program (stage 1.4). Emit "no Unicode" rather than mis-splitting the
+    // multi-byte codes into byte-sized garbage through the identity fallback
+    // below. Stage 1.5 will mark these runs for re-encoding.
+    return {};
+  }
   if (encoding.has_value()) {
     return encoding->translate_string(codes);
   }

diff --git a/src/odr/internal/pdf/pdf_document_element.hpp b/src/odr/internal/pdf/pdf_document_element.hpp
@@ -80,8 +80,23 @@ struct Font final : Element {
   /// fallback used when no `ToUnicode` CMap is present.
   std::optional<Encoding> encoding;
 
+  /// True for composite (Type0) fonts (stage 1.3). Their character codes are
+  /// multi-byte and select CIDs via the Type0 `/Encoding` CMap; `/ToUnicode` is
+  /// the code -> Unicode path. Code -> CID via predefined CJK CMaps and the
+  /// CID -> Unicode tables are stage 1.3 (part B); embedded-font reverse maps
+  /// are stage 1.4.
+  bool composite{false};
+  /// The descendant CIDFont's `/CIDSystemInfo` `/Registry` and `/Ordering`
+  /// (e.g. `Adobe` / `Identity` or `Adobe` / `Japan1`). Recorded for the
+  /// predefined CID -> Unicode table selection of stage 1.3 (part B); empty for
+  /// simple fonts.
+  std::string cid_registry;
+  std::string cid_ordering;
+
   /// Translate a string of character codes to Unicode: the `ToUnicode` CMap
-  /// when present (authoritative), else the `/Encoding`, else identity bytes.
+  /// when present (authoritative), else, for a composite font, "no Unicode"
+  /// (stage 1.3 part B / 1.4 territory), else the simple-font `/Encoding`, else
+  /// identity bytes.
   [[nodiscard]] std::string to_unicode(const std::string &codes) const;
 };
 

diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp
@@ -177,6 +177,49 @@ std::optional<Encoding> parse_encoding(DocumentParser &parser,
   return encoding;
 }
 
+/// Mark `font` as composite (Type0) and record its first descendant CIDFont's
+/// `/CIDSystemInfo` `/Registry`/`/Ordering` (used to pick a predefined CID ->
+/// Unicode table in stage 1.3 part B). Best-effort: a missing or malformed
+/// `/DescendantFonts`, CIDFont or `/CIDSystemInfo` leaves them untouched.
+/// The Type0 `/Encoding` (code -> CID) is not read; part A uses `/ToUnicode`.
+void parse_composite_font(DocumentParser &parser, const Dictionary &dictionary,
+                          Font &font) {
+  font.composite = true;
+
+  if (!dictionary.has_key("DescendantFonts")) {
+    return;
+  }
+  const Object descendants =
+      parser.resolve_object_copy(dictionary["DescendantFonts"]);
+  if (!descendants.is_array() || descendants.as_array().size() == 0) {
+    return;
+  }
+
+  const Object cid_font = parser.resolve_object_copy(descendants.as_array()[0]);
+  if (!cid_font.is_dictionary()) {
+    return;
+  }
+  const Dictionary &cid_font_dictionary = cid_font.as_dictionary();
+  if (!cid_font_dictionary.has_key("CIDSystemInfo")) {
+    return;
+  }
+
+  const Object system_info =
+      parser.resolve_object_copy(cid_font_dictionary["CIDSystemInfo"]);
+  if (!system_info.is_dictionary()) {
+    return;
+  }
+  const Dictionary &system_info_dictionary = system_info.as_dictionary();
+  if (system_info_dictionary.has_key("Registry") &&
+      system_info_dictionary["Registry"].is_string()) {
+    font.cid_registry = system_info_dictionary["Registry"].as_string();
+  }
+  if (system_info_dictionary.has_key("Ordering") &&
+      system_info_dictionary["Ordering"].is_string()) {
+    font.cid_ordering = system_info_dictionary["Ordering"].as_string();
+  }
+}
+
 Font *parse_font(DocumentParser &parser, const ObjectReference &reference,
                  Document &document) {
   Font *font = document.create_element<Font>();
@@ -187,6 +230,10 @@ Font *parse_font(DocumentParser &parser, const ObjectReference &reference,
   font->object_reference = reference;
   font->object = Object(dictionary);
 
+  const bool is_type0 = dictionary.has_key("Subtype") &&
+                        dictionary["Subtype"].is_name() &&
+                        dictionary["Subtype"].as_name() == "Type0";
+
   if (dictionary.has_key("ToUnicode")) {
     std::string stream =
         parser.read_decoded_stream(dictionary["ToUnicode"].as_reference());
@@ -195,10 +242,16 @@ Font *parse_font(DocumentParser &parser, const ObjectReference &reference,
     font->cmap = cmap_parser.parse_cmap();
   }
 
-  // Simple-font `/Encoding`: a base-encoding name, or a dictionary
-  // with `/BaseEncoding` + `/Differences`. The text-extraction fallback for
-  // fonts without a `ToUnicode` CMap.
-  if (dictionary.has_key("Encoding")) {
+  if (is_type0) {
+    // Composite (Type0) font: the `/Encoding` is a code -> CID CMap, not a
+    // simple-font glyph-name encoding, so it must not go through
+    // `parse_encoding`. Extraction relies on `/ToUnicode` (parsed above) in
+    // stage 1.3 part A.
+    parse_composite_font(parser, dictionary, *font);
+  } else if (dictionary.has_key("Encoding")) {
+    // Simple-font `/Encoding`: a base-encoding name, or a dictionary with
+    // `/BaseEncoding` + `/Differences`. The text-extraction fallback for fonts
+    // without a `ToUnicode` CMap.
     font->encoding = parse_encoding(parser, dictionary["Encoding"]);
   }
 

diff --git a/test/src/internal/pdf/pdf_document_parser.cpp b/test/src/internal/pdf/pdf_document_parser.cpp
@@ -209,6 +209,74 @@ TEST(DocumentParser, inherited_page_attributes) {
   EXPECT_EQ(page6->rotate, 90);
 }
 
+namespace {
+
+/// A mini-PDF whose single page references one composite (Type0) font `F0`
+/// (`5 0 R`): `Identity-H` over a descendant `CIDFontType2` (`6 0 R`) with an
+/// `Adobe`/`Identity` `/CIDSystemInfo`. If `with_to_unicode`, adds `7 0 R`:
+/// a 2-byte `/ToUnicode` CMap mapping the code 0x0041 to `A`.
+std::string composite_font_mini_pdf(const bool with_to_unicode) {
+  PdfFileBuilder builder;
+  builder.object("<< /Type /Catalog /Pages 2 0 R >>")
+      .object("<< /Type /Pages /Kids [3 0 R] /Count 1 >>")
+      .object("<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
+              "/Resources << /Font << /F0 5 0 R >> >> /Contents 4 0 R >>")
+      .stream_object("", "BT ET")
+      .object(std::string("<< /Type /Font /Subtype /Type0 /BaseFont /AAAAAA+X "
+                          "/Encoding /Identity-H /DescendantFonts [6 0 R]") +
+              (with_to_unicode ? " /ToUnicode 7 0 R >>" : " >>"))
+      .object("<< /Type /Font /Subtype /CIDFontType2 /BaseFont /AAAAAA+X "
+              "/CIDSystemInfo << /Registry (Adobe) /Ordering (Identity) "
+              "/Supplement 0 >> /CIDToGIDMap /Identity >>");
+  if (with_to_unicode) {
+    builder.stream_object("", "1 begincodespacerange\n<0000> <FFFF>\n"
+                              "endcodespacerange\n1 beginbfchar\n"
+                              "<0041> <0041>\nendbfchar\n");
+  }
+  return builder.trailer("/Root 1 0 R").build_classic();
+}
+
+const Font *first_page_font(const Document &document, const std::string &name) {
+  const auto *page =
+      dynamic_cast<Page *>(document.catalog->pages->kids.front());
+  return page->resources->font.at(name); // assumes the first kid is a `Page`
+}
+
+} // namespace
+
+// A composite (Type0) font is recognized, its descendant CIDFont's
+// `/CIDSystemInfo` recorded, and its `/ToUnicode` CMap drives extraction over
+// 2-byte codes (stage 1.3 part A).
+TEST(DocumentParser, composite_font_with_to_unicode) {
+  const std::string pdf = composite_font_mini_pdf(true);
+  DocumentParser parser(std::make_unique<std::istringstream>(pdf));
+  const std::unique_ptr<Document> document = parser.parse_document();
+
+  const Font *font = first_page_font(*document, "F0");
+  ASSERT_NE(font, nullptr);
+  EXPECT_TRUE(font->composite);
+  EXPECT_EQ(font->cid_registry, "Adobe");
+  EXPECT_EQ(font->cid_ordering, "Identity");
+  // The 2-byte code 0x0041 maps to `A` via the `/ToUnicode` CMap.
+  EXPECT_EQ(font->to_unicode(std::string("\x00\x41", 2)), "A");
+}
+
+// A composite font without a `/ToUnicode` CMap cannot yet resolve CID ->
+// Unicode (predefined CJK tables are stage 1.3 part B; embedded reverse maps
+// stage 1.4): it is still recognized and its `/CIDSystemInfo` recorded, but
+// `to_unicode` yields "no Unicode" instead of identity-fallback byte garbage.
+TEST(DocumentParser, composite_font_without_to_unicode_yields_no_unicode) {
+  const std::string pdf = composite_font_mini_pdf(false);
+  DocumentParser parser(std::make_unique<std::istringstream>(pdf));
+  const std::unique_ptr<Document> document = parser.parse_document();
+
+  const Font *font = first_page_font(*document, "F0");
+  ASSERT_NE(font, nullptr);
+  EXPECT_TRUE(font->composite);
+  EXPECT_EQ(font->cid_registry, "Adobe");
+  EXPECT_TRUE(font->to_unicode(std::string("\x00\x41", 2)).empty());
+}
+
 // Recovery: a valid file with garbage prepended (the real fixture
 // `order-EK52VKL0.pdf` is an HTTP response saved as `.pdf`) has every xref
 // offset and the `startxref` shifted, so the chain walk fails. A forward scan