Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 67 additions & 7 deletions openkb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import os

Expand All @@ -26,6 +27,65 @@ class IndexResult:
tree: dict


def _normalize_page_content(raw_pages: Any) -> list[dict[str, Any]]:
"""Normalize PageIndex/local PDF page content into OpenKB's JSON shape."""
if not isinstance(raw_pages, list):
return []

pages: list[dict[str, Any]] = []
for index, item in enumerate(raw_pages, start=1):
if isinstance(item, str):
content = item.strip()
if content:
pages.append({"page": index, "content": content, "images": []})
continue

if not isinstance(item, dict):
continue

raw_page = item.get("page", item.get("page_number", item.get("page_num", index)))
try:
page_number = int(raw_page)
except (TypeError, ValueError):
page_number = index
if page_number < 1:
page_number = index

content = item.get("content", item.get("markdown", item.get("text", "")))
if content is None:
content = ""
content = str(content).strip()

images = item.get("images", [])
if not isinstance(images, list):
images = []
normalized_images = [
image for image in images
if isinstance(image, dict) and isinstance(image.get("path"), str)
]

if content or normalized_images:
pages.append({
"page": page_number,
"content": content,
"images": normalized_images,
})

return pages


def _get_pdf_page_count(pdf_path: Path) -> int:
    """Delegate to ``openkb.converter.get_pdf_page_count`` for ``pdf_path``.

    The converter is imported at call time rather than at module import, and
    the result is returned unchanged.
    """
    from openkb.converter import get_pdf_page_count as count_pages

    page_total = count_pages(pdf_path)
    return page_total


def _convert_pdf_to_pages(pdf_path: Path, doc_name: str, images_dir: Path) -> list[dict[str, Any]]:
    """Delegate to ``openkb.images.convert_pdf_to_pages`` with the same arguments.

    The images module is imported at call time rather than at module import,
    and the result is returned unchanged.
    """
    from openkb.images import convert_pdf_to_pages as extract_pages

    local_pages = extract_pages(pdf_path, doc_name, images_dir)
    return local_pages


def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
"""Index a long PDF document using PageIndex and write wiki pages."""
openkb_dir = kb_dir / ".openkb"
Expand Down Expand Up @@ -82,23 +142,23 @@ def index_long_document(pdf_path: Path, kb_dir: Path) -> IndexResult:
sources_dir.mkdir(parents=True, exist_ok=True)
images_dir = sources_dir / "images" / pdf_path.stem

from openkb.images import convert_pdf_to_pages

all_pages: list = []
all_pages: list[dict[str, Any]] = []
if pageindex_api_key:
# Cloud mode: fetch OCR'd markdown from PageIndex. get_page_content
# requires a page range, so pass "1-N".
from openkb.converter import get_pdf_page_count
page_count = get_pdf_page_count(pdf_path)
page_count = _get_pdf_page_count(pdf_path)
try:
all_pages = col.get_page_content(doc_id, f"1-{page_count}")
all_pages = _normalize_page_content(col.get_page_content(doc_id, f"1-{page_count}"))
except Exception as exc:
logger.warning("Cloud get_page_content failed for %s: %s", pdf_path.name, exc)

if not all_pages:
if pageindex_api_key:
logger.warning("Cloud returned no pages for %s; falling back to local pymupdf", pdf_path.name)
all_pages = convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir)
all_pages = _normalize_page_content(_convert_pdf_to_pages(pdf_path, pdf_path.stem, images_dir))

if not all_pages:
raise RuntimeError(f"No page content extracted for {pdf_path.name}")

(sources_dir / f"{pdf_path.stem}.json").write_text(
json_mod.dumps(all_pages, ensure_ascii=False, indent=2), encoding="utf-8",
Expand Down
98 changes: 97 additions & 1 deletion tests/test_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,36 @@
from unittest.mock import MagicMock, patch


from openkb.indexer import IndexResult, index_long_document
from openkb.indexer import IndexResult, _normalize_page_content, index_long_document


class TestNormalizePageContent:
    """Unit tests for ``_normalize_page_content``."""

    def test_normalizes_pageindex_dicts(self):
        """Dict pages: alias keys are mapped, text is stripped, bad images dropped."""
        raw = [
            {
                "page_number": "2",
                "markdown": " Page two ",
                "images": [{"path": "sources/images/doc/a.png"}],
            },
            {"page_num": 3, "text": "Page three", "images": "bad"},
        ]
        expected = [
            {
                "page": 2,
                "content": "Page two",
                "images": [{"path": "sources/images/doc/a.png"}],
            },
            {"page": 3, "content": "Page three", "images": []},
        ]

        assert _normalize_page_content(raw) == expected

    def test_normalizes_string_pages(self):
        """String pages keep their 1-based position; blank strings are dropped."""
        raw = [" page one ", "", "page three"]
        expected = [
            {"page": 1, "content": "page one", "images": []},
            {"page": 3, "content": "page three", "images": []},
        ]

        assert _normalize_page_content(raw) == expected

    def test_rejects_unusable_shapes(self):
        """Non-list input and items without usable content both yield no pages."""
        not_a_list = {"page": 1}
        empty_items = [None, {}, {"content": ""}]

        assert _normalize_page_content(not_a_list) == []
        assert _normalize_page_content(empty_items) == []


class TestIndexLongDocument:
Expand Down Expand Up @@ -123,3 +152,70 @@ def test_localclient_called_with_index_config(self, kb_dir, sample_tree, tmp_pat
assert ic.if_add_node_text is True
assert ic.if_add_node_summary is True
assert ic.if_add_doc_description is True

def test_cloud_page_content_is_normalized(self, kb_dir, sample_tree, tmp_path, monkeypatch):
    """Mixed-shape cloud pages are written to the sources JSON without local fallback."""
    monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
    source_pdf = tmp_path / "sample.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    collection = self._make_fake_collection("cloud-123", sample_tree)
    # One dict page with alias keys and one bare-string page.
    collection.get_page_content.return_value = [
        {"page_number": "1", "markdown": "Cloud page one."},
        "Cloud page two.",
    ]
    client = MagicMock()
    client.collection.return_value = collection

    with patch("openkb.indexer.PageIndexClient", return_value=client):
        with patch("openkb.indexer._get_pdf_page_count", return_value=2):
            with patch("openkb.indexer._convert_pdf_to_pages") as local_pages:
                index_long_document(source_pdf, kb_dir)

    local_pages.assert_not_called()
    written = (kb_dir / "wiki" / "sources" / "sample.json").read_text(encoding="utf-8")
    assert '"content": "Cloud page one."' in written
    assert '"content": "Cloud page two."' in written

def test_invalid_cloud_page_content_falls_back_to_local(self, kb_dir, sample_tree, tmp_path, monkeypatch):
    """A non-list cloud payload triggers the local conversion path."""
    monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
    source_pdf = tmp_path / "sample.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    collection = self._make_fake_collection("cloud-456", sample_tree)
    # A dict instead of a list of pages.
    collection.get_page_content.return_value = {"bad": "shape"}
    client = MagicMock()
    client.collection.return_value = collection

    with patch("openkb.indexer.PageIndexClient", return_value=client):
        with patch("openkb.indexer._get_pdf_page_count", return_value=2):
            with patch(
                "openkb.indexer._convert_pdf_to_pages",
                return_value=self._fake_pages(),
            ) as local_pages:
                index_long_document(source_pdf, kb_dir)

    local_pages.assert_called_once()
    written = (kb_dir / "wiki" / "sources" / "sample.json").read_text(encoding="utf-8")
    assert "Page one text." in written

def test_empty_cloud_and_local_pages_fail(self, kb_dir, sample_tree, tmp_path, monkeypatch):
    """Indexing raises RuntimeError when neither cloud nor local yields pages."""
    monkeypatch.setenv("PAGEINDEX_API_KEY", "test-key")
    source_pdf = tmp_path / "sample.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 fake")

    collection = self._make_fake_collection("empty-123", sample_tree)
    client = MagicMock()
    client.collection.return_value = collection

    caught = None
    with patch("openkb.indexer.PageIndexClient", return_value=client):
        with patch("openkb.indexer._get_pdf_page_count", return_value=2):
            with patch("openkb.indexer._convert_pdf_to_pages", return_value=[]):
                try:
                    index_long_document(source_pdf, kb_dir)
                except RuntimeError as exc:
                    caught = exc

    if caught is None:
        raise AssertionError("expected RuntimeError")
    assert "No page content extracted" in str(caught)