From f602f9d17419431ea46d294113cbfd873fc235cf Mon Sep 17 00:00:00 2001 From: Guohao Zhang Date: Wed, 3 Jun 2026 16:37:46 +0800 Subject: [PATCH] fix: harden long PDF page extraction --- openkb/indexer.py | 74 ++++++++++++++++++++++++++++---- tests/test_indexer.py | 98 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 164 insertions(+), 8 deletions(-) diff --git a/openkb/indexer.py b/openkb/indexer.py index 6ea9d73f..8405751e 100644 --- a/openkb/indexer.py +++ b/openkb/indexer.py @@ -6,6 +6,7 @@ from dataclasses import dataclass from pathlib import Path +from typing import Any import os @@ -26,6 +27,65 @@ class IndexResult: tree: dict +def _normalize_page_content(raw_pages: Any) -> list[dict[str, Any]]: + """Normalize PageIndex/local PDF page content into OpenKB's JSON shape.""" + if not isinstance(raw_pages, list): + return [] + + pages: list[dict[str, Any]] = [] + for index, item in enumerate(raw_pages, start=1): + if isinstance(item, str): + content = item.strip() + if content: + pages.append({"page": index, "content": content, "images": []}) + continue + + if not isinstance(item, dict): + continue + + raw_page = item.get("page", item.get("page_number", item.get("page_num", index))) + try: + page_number = int(raw_page) + except (TypeError, ValueError): + page_number = index + if page_number < 1: + page_number = index + + content = item.get("content", item.get("markdown", item.get("text", ""))) + if content is None: + content = "" + content = str(content).strip() + + images = item.get("images", []) + if not isinstance(images, list): + images = [] + normalized_images = [ + image for image in images + if isinstance(image, dict) and isinstance(image.get("path"), str) + ] + + if content or normalized_images: + pages.append({ + "page": page_number, + "content": content, + "images": normalized_images, + }) + + return pages + + +def _get_pdf_page_count(pdf_path: Path) -> int: + from openkb.converter import get_pdf_page_count + + return get_pdf_page_count(pdf_path) + + +def _convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict[str, Any]]: + from openkb.images import convert_pdf_to_pages + + return convert_pdf_to_pages(pdf_path, doc_name, images_dir) + + def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: """Index a long PDF document using PageIndex and write wiki pages.""" openkb_dir = kb_dir / ".openkb" @@ -82,23 +142,23 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult: sources_dir.mkdir(parents=True, exist_ok=True) images_dir = sources_dir / "images" / pdf_path.stem - from openkb.images import convert_pdf_to_pages - - all_pages: list = [] + all_pages: list[dict[str, Any]] = [] if pageindex_api_key: # Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content # requires a page range, so pass "1-N". - from openkb.converter import get_pdf_page_count - page_count = get_pdf_page_count(pdf_path) + page_count = _get_pdf_page_count(pdf_path) try: - all_pages = col.get_page_content(doc_id, f"1-{page_count}") + all_pages = _normalize_page_content(col.get_page_content(doc_id, f"1-{page_count}")) except Exception as exc: logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc) if not all_pages: if pageindex_api_key: logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name) - all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir) + all_pages = _normalize_page_content(_convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)) + + if not all_pages: + raise RuntimeError(f"No page content extracted for {pdf_path.name}") (sources_dir / f"{pdf_path.stem}.json").write_text( json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8", diff --git a/tests/test_indexer.py b/tests/test_indexer.py index bf81d2ab..78b34c97 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -4,7 +4,36 @@ from unittest.mock import MagicMock, patch -from openkb.indexer import IndexResult, index_long_document +from openkb.indexer import IndexResult, _normalize_page_content, index_long_document + + +class TestNormalizePageContent: + def test_normalizes_pageindex_dicts(self): + pages = _normalize_page_content([ + {"page_number": "2", "markdown": " Page two ", "images": [{"path": "sources/images/doc/a.png"}]}, + {"page_num": 3, "text": "Page three", "images": "bad"}, + ]) + + assert pages == [ + { + "page": 2, + "content": "Page two", + "images": [{"path": "sources/images/doc/a.png"}], + }, + {"page": 3, "content": "Page three", "images": []}, + ] + + def test_normalizes_string_pages(self): + pages = _normalize_page_content([" page one ", "", "page three"]) + + assert pages == [ + {"page": 1, "content": "page one", "images": []}, + {"page": 3, "content": "page three", "images": []}, + ] + + def test_rejects_unusable_shapes(self): + assert _normalize_page_content({"page": 1}) == [] + assert _normalize_page_content([None, {}, {"content": ""}]) == [] class TestIndexLongDocument: @@ -123,3 +152,70 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat assert ic.if_add_node_text is True assert ic.if_add_node_summary is True assert ic.if_add_doc_description is True + + def test_cloud_page_content_is_normalized(self, kb_dir, sample_tree, tmp_path, monkeypatch): + doc_id = "cloud-123" + fake_col = self._make_fake_collection(doc_id, sample_tree) + fake_col.get_page_content.return_value = [ + {"page_number": "1", "markdown": "Cloud page one."}, + "Cloud page two.", + ] + + fake_client = MagicMock() + fake_client.collection.return_value = fake_col + + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4 fake") + monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key") + + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.indexer._get_pdf_page_count", return_value=2), \ + patch("openkb.indexer._convert_pdf_to_pages") as local_pages: + index_long_document(pdf_path, kb_dir) + + local_pages.assert_not_called() + json_file = kb_dir / "wiki" / "sources" / "sample.json" + assert '"content": "Cloud page one."' in json_file.read_text(encoding="utf-8") + assert '"content": "Cloud page two."' in json_file.read_text(encoding="utf-8") + + def test_invalid_cloud_page_content_falls_back_to_local(self, kb_dir, sample_tree, tmp_path, monkeypatch): + doc_id = "cloud-456" + fake_col = self._make_fake_collection(doc_id, sample_tree) + fake_col.get_page_content.return_value = {"bad": "shape"} + + fake_client = MagicMock() + fake_client.collection.return_value = fake_col + + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4 fake") + monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key") + + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.indexer._get_pdf_page_count", return_value=2), \ + patch("openkb.indexer._convert_pdf_to_pages", return_value=self._fake_pages()) as local_pages: + index_long_document(pdf_path, kb_dir) + + local_pages.assert_called_once() + json_file = kb_dir / "wiki" / "sources" / "sample.json" + assert "Page one text." in json_file.read_text(encoding="utf-8") + + def test_empty_cloud_and_local_pages_fail(self, kb_dir, sample_tree, tmp_path, monkeypatch): + doc_id = "empty-123" + fake_col = self._make_fake_collection(doc_id, sample_tree) + + fake_client = MagicMock() + fake_client.collection.return_value = fake_col + + pdf_path = tmp_path / "sample.pdf" + pdf_path.write_bytes(b"%PDF-1.4 fake") + monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key") + + with patch("openkb.indexer.PageIndexClient", return_value=fake_client), \ + patch("openkb.indexer._get_pdf_page_count", return_value=2), \ + patch("openkb.indexer._convert_pdf_to_pages", return_value=[]): + try: + index_long_document(pdf_path, kb_dir) + except RuntimeError as exc: + assert "No page content extracted" in str(exc) + else: + raise AssertionError("expected RuntimeError")