From f602f9d17419431ea46d294113cbfd873fc235cf Mon Sep 17 00:00:00 2001
From: Guohao Zhang <akidezhang@outlook.com>
Date: Wed, 3 Jun 2026 16:37:46 +0800
Subject: [PATCH] fix: harden long PDF page extraction

---
 openkb/indexer.py     | 74 ++++++++++++++++++++++++++++----
 tests/test_indexer.py | 98 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 164 insertions(+), 8 deletions(-)

diff --git a/openkb/indexer.py b/openkb/indexer.py
index 6ea9d73f..8405751e 100644
--- a/openkb/indexer.py
+++ b/openkb/indexer.py
@@ -6,6 +6,7 @@
 
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Any
 
 import os
 
@@ -26,6 +27,65 @@ class IndexResult:
     tree: dict
 
 
+def _normalize_page_content(raw_pages: Any) -> list[dict[str, Any]]:
+    """Normalize PageIndex/local PDF page content into OpenKB's JSON shape.
+
+    Accepts a list of strings or dicts; any other input yields ``[]``. Alias
+    keys are tried in order and a ``None`` value falls through to the next.
+    Returns ``{"page", "content", "images"}`` dicts; empty pages are dropped.
+    """
+    if not isinstance(raw_pages, list):
+        return []
+
+    def first_set(item: dict[str, Any], keys: tuple[str, ...], default: Any) -> Any:
+        # A key that is present but None must not shadow a later alias with data.
+        return next((item[k] for k in keys if item.get(k) is not None), default)
+
+    pages: list[dict[str, Any]] = []
+    for index, item in enumerate(raw_pages, start=1):
+        # A bare string is page text; its 1-based list position is the page number.
+        if isinstance(item, str):
+            item = {"content": item}
+        if not isinstance(item, dict):
+            continue
+
+        try:
+            page_number = int(first_set(item, ("page", "page_number", "page_num"), index))
+        except (TypeError, ValueError, OverflowError):
+            # OverflowError covers float("inf"); fall back to the list position.
+            page_number = index
+        if page_number < 1:
+            page_number = index
+
+        content = str(first_set(item, ("content", "markdown", "text"), "")).strip()
+
+        images = item.get("images", [])
+        if not isinstance(images, list):
+            images = []
+        images = [
+            image for image in images
+            if isinstance(image, dict) and isinstance(image.get("path"), str)
+        ]
+
+        # Keep only pages that carry text or at least one usable image.
+        if content or images:
+            pages.append({"page": page_number, "content": content, "images": images})
+
+    return pages
+
+
+def _get_pdf_page_count(pdf_path: Path) -> int:
+    """Delegate to ``openkb.converter.get_pdf_page_count``, importing it only when called."""
+    from openkb import converter
+    return converter.get_pdf_page_count(pdf_path)
+
+
+def _convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict[str, Any]]:
+    """Delegate to ``openkb.images.convert_pdf_to_pages``, importing it only when called."""
+    from openkb import images
+    return images.convert_pdf_to_pages(pdf_path, doc_name, images_dir)
+
+
 def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
     """Index a long PDF document using PageIndex and write wiki pages."""
     openkb_dir = kb_dir / ".openkb"
@@ -82,23 +142,23 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
     sources_dir.mkdir(parents=True, exist_ok=True)
     images_dir = sources_dir / "images" / pdf_path.stem
 
-    from openkb.images import convert_pdf_to_pages
-
-    all_pages: list = []
+    all_pages: list[dict[str, Any]] = []
     if pageindex_api_key:
         # Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
         # requires a page range, so pass "1-N".
-        from openkb.converter import get_pdf_page_count
-        page_count = get_pdf_page_count(pdf_path)
+        page_count = _get_pdf_page_count(pdf_path)
         try:
-            all_pages = col.get_page_content(doc_id, f"1-{page_count}")
+            all_pages = _normalize_page_content(col.get_page_content(doc_id, f"1-{page_count}"))
         except Exception as exc:
             logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc)
 
     if not all_pages:
         if pageindex_api_key:
             logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
-        all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
+        all_pages = _normalize_page_content(_convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir))
+
+    if not all_pages:
+        raise RuntimeError(f"No page content extracted for {pdf_path.name}")
 
     (sources_dir / f"{pdf_path.stem}.json").write_text(
         json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
diff --git a/tests/test_indexer.py b/tests/test_indexer.py
index bf81d2ab..78b34c97 100644
--- a/tests/test_indexer.py
+++ b/tests/test_indexer.py
@@ -4,7 +4,36 @@
 from unittest.mock import MagicMock, patch
 
 
-from openkb.indexer import IndexResult, index_long_document
+from openkb.indexer import IndexResult, _normalize_page_content, index_long_document
+
+
+class TestNormalizePageContent:
+    """Shape checks for ``_normalize_page_content``."""
+
+    def test_normalizes_pageindex_dicts(self):
+        """Alias keys are mapped, text is stripped, and non-list ``images`` become ``[]``."""
+        raw = [
+            {"page_number": "2", "markdown": "  Page two  ", "images": [{"path": "sources/images/doc/a.png"}]},
+            {"page_num": 3, "text": "Page three", "images": "bad"},
+        ]
+        expected = [
+            {"page": 2, "content": "Page two", "images": [{"path": "sources/images/doc/a.png"}]},
+            {"page": 3, "content": "Page three", "images": []},
+        ]
+        assert _normalize_page_content(raw) == expected
+
+    def test_normalizes_string_pages(self):
+        """Blank strings are dropped but still consume their page number."""
+        expected = [
+            {"page": 1, "content": "page one", "images": []},
+            {"page": 3, "content": "page three", "images": []},
+        ]
+        assert _normalize_page_content([" page one ", "", "page three"]) == expected
+
+    def test_rejects_unusable_shapes(self):
+        """Non-list input and entries without text or images yield an empty list."""
+        for raw in ({"page": 1}, [None, {}, {"content": ""}]):
+            assert _normalize_page_content(raw) == []
 
 
 class TestIndexLongDocument:
@@ -123,3 +152,70 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat
         assert ic.if_add_node_text is True
         assert ic.if_add_node_summary is True
         assert ic.if_add_doc_description is True
+
+    def test_cloud_page_content_is_normalized(self, kb_dir, sample_tree, tmp_path, monkeypatch):
+        """Mixed dict/str cloud pages reach the sources JSON and skip the local converter."""
+        fake_col = self._make_fake_collection("cloud-123", sample_tree)
+        fake_col.get_page_content.return_value = [
+            {"page_number": "1", "markdown": "Cloud page one."},
+            "Cloud page two.",
+        ]
+        fake_client = MagicMock()
+        fake_client.collection.return_value = fake_col
+
+        pdf_path = tmp_path / "sample.pdf"
+        pdf_path.write_bytes(b"%PDF-1.4 fake")
+        monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
+
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
+             patch("openkb.indexer._get_pdf_page_count", return_value=2), \
+             patch("openkb.indexer._convert_pdf_to_pages") as local_pages:
+            index_long_document(pdf_path, kb_dir)
+
+        local_pages.assert_not_called()
+        # Read the written JSON once and check both pages against the same text.
+        written = (kb_dir / "wiki" / "sources" / "sample.json").read_text(encoding="utf-8")
+        assert '"content": "Cloud page one."' in written
+        assert '"content": "Cloud page two."' in written
+
+    def test_invalid_cloud_page_content_falls_back_to_local(self, kb_dir, sample_tree, tmp_path, monkeypatch):
+        """A non-list cloud payload triggers the local converter, whose pages get written."""
+        fake_col = self._make_fake_collection("cloud-456", sample_tree)
+        fake_col.get_page_content.return_value = {"bad": "shape"}
+        fake_client = MagicMock()
+        fake_client.collection.return_value = fake_col
+
+        pdf_path = tmp_path / "sample.pdf"
+        pdf_path.write_bytes(b"%PDF-1.4 fake")
+        monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
+
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
+             patch("openkb.indexer._get_pdf_page_count", return_value=2), \
+             patch("openkb.indexer._convert_pdf_to_pages", return_value=self._fake_pages()) as local_pages:
+            index_long_document(pdf_path, kb_dir)
+
+        local_pages.assert_called_once()
+        # "Page one text." presumably comes from self._fake_pages() (defined outside this hunk).
+        written = (kb_dir / "wiki" / "sources" / "sample.json").read_text(encoding="utf-8")
+        assert "Page one text." in written
+
+    def test_empty_cloud_and_local_pages_fail(self, kb_dir, sample_tree, tmp_path, monkeypatch):
+        """Indexing must raise RuntimeError when extraction ends up empty (local converter returns [])."""
+        fake_client = MagicMock()
+        fake_client.collection.return_value = self._make_fake_collection("empty-123", sample_tree)
+
+        pdf_path = tmp_path / "sample.pdf"
+        pdf_path.write_bytes(b"%PDF-1.4 fake")
+        monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
+
+        raised = None
+        with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \
+             patch("openkb.indexer._get_pdf_page_count", return_value=2), \
+             patch("openkb.indexer._convert_pdf_to_pages", return_value=[]):
+            try:
+                index_long_document(pdf_path, kb_dir)
+            except RuntimeError as exc:
+                raised = exc
+        if raised is None:
+            raise AssertionError("expected RuntimeError")
+        assert "No page content extracted" in str(raised)