Skip to content

Commit 395a2e4

Browse files
committed
fix: make docling pdf processor backward-compatible
1 parent c497240 commit 395a2e4

3 files changed

Lines changed: 109 additions & 5 deletions

File tree

src/raglight/document_processing/docling_pdf_processor.py

Lines changed: 38 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,32 @@
99
from .document_processor import DocumentProcessor
1010

1111

12+
def _set_pipeline_option_if_supported(
    pipeline_options: PdfPipelineOptions, option_name: str, value: bool
) -> None:
    """
    Safely set a Docling PdfPipelineOptions field when available.

    Older Docling versions may not expose newer enrichment flags, so an
    option that is unknown or rejected is logged and skipped instead of
    letting the error propagate to the caller.

    Args:
        pipeline_options: Options object to mutate in place.
        option_name: Name of the attribute to set on ``pipeline_options``.
        value: Value to assign to that attribute.

    Returns:
        None. The outcome is only observable through ``pipeline_options``
        and the log output.
    """
    # If the options class publishes a ``model_fields`` mapping (presumably
    # the Pydantic field registry -- confirm against the Docling version in
    # use), consult it first so unsupported options are skipped quietly.
    model_fields = getattr(type(pipeline_options), "model_fields", None)
    if isinstance(model_fields, dict) and option_name not in model_fields:
        logging.info(
            "Docling option '%s' is not available in this version, skipping.",
            option_name,
        )
        return

    try:
        setattr(pipeline_options, option_name, value)
    except (AttributeError, TypeError, ValueError) as e:
        # TypeError is handled alongside AttributeError/ValueError because an
        # object that forbids assignment may signal it with any of the three;
        # this helper is best-effort and must never abort the caller.
        logging.warning(
            "Unable to set Docling option '%s' (continuing without it): %s",
            option_name,
            e,
        )
36+
37+
1238
class DoclingPDFProcessor(DocumentProcessor):
1339
"""
1440
Advanced PDF processor using IBM's Docling for high-fidelity document parsing.
@@ -18,10 +44,18 @@ class DoclingPDFProcessor(DocumentProcessor):
1844

1945
def __init__(self):
2046
pipeline_options = PdfPipelineOptions()
21-
pipeline_options.do_table_structure = True
22-
pipeline_options.do_formula_enrichment = True
23-
pipeline_options.do_code_enrichment = True
24-
pipeline_options.do_chart_extraction = True
47+
_set_pipeline_option_if_supported(
48+
pipeline_options, option_name="do_table_structure", value=True
49+
)
50+
_set_pipeline_option_if_supported(
51+
pipeline_options, option_name="do_formula_enrichment", value=True
52+
)
53+
_set_pipeline_option_if_supported(
54+
pipeline_options, option_name="do_code_enrichment", value=True
55+
)
56+
_set_pipeline_option_if_supported(
57+
pipeline_options, option_name="do_chart_extraction", value=True
58+
)
2559

2660
self.converter = DocumentConverter(
2761
format_options={

src/raglight/document_processing/document_processor_factory.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import logging
12
from typing import Dict, Optional
23

34
from .document_processor import DocumentProcessor
@@ -13,6 +14,19 @@
1314
HAS_DOCLING = False
1415

1516

17+
def _build_default_pdf_processor() -> DocumentProcessor:
    """
    Build the processor registered for PDF files by default.

    Returns a ``DoclingPDFProcessor`` when ``HAS_DOCLING`` is truthy and its
    construction succeeds. In every other case (Docling flagged unavailable,
    or its constructor raising) a plain ``PDFProcessor`` is returned; a
    constructor failure is logged as a warning first.
    """
    if HAS_DOCLING:
        try:
            return DoclingPDFProcessor()
        except Exception as exc:
            # Any initialization failure is treated as "Docling unusable";
            # fall through to the basic processor below.
            logging.warning(
                "Docling initialization failed, falling back to PDFProcessor. Error: %s",
                exc,
            )

    return PDFProcessor()
28+
29+
1630
class DocumentProcessorFactory:
1731
"""
1832
A stateless factory that returns the appropriate DocumentProcessor
@@ -25,7 +39,7 @@ def __init__(
2539
# Default processors
2640
self._processors: Dict[str, DocumentProcessor] = {
2741
# PDF files
28-
"pdf": DoclingPDFProcessor() if HAS_DOCLING else PDFProcessor(),
42+
"pdf": _build_default_pdf_processor(),
2943
# Code files
3044
"py": CodeProcessor(),
3145
"js": CodeProcessor(),
Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,56 @@
1+
import unittest
2+
from unittest.mock import MagicMock, patch
3+
4+
from raglight.document_processing.document_processor_factory import (
5+
DocumentProcessorFactory,
6+
)
7+
from raglight.document_processing.pdf_processor import PDFProcessor
8+
9+
10+
class TestDocumentProcessorFactory(unittest.TestCase):
    """Checks which processor the factory registers for PDF files by default."""

    def test_uses_docling_processor_when_available(self):
        """With Docling flagged available, the Docling processor is used."""
        fake_processor = MagicMock(name="docling_processor")

        has_docling = patch(
            "raglight.document_processing.document_processor_factory.HAS_DOCLING",
            True,
        )
        # create=True lets the patch work even if the name was never imported.
        docling_cls = patch(
            "raglight.document_processing.document_processor_factory.DoclingPDFProcessor",
            return_value=fake_processor,
            create=True,
        )
        with has_docling, docling_cls as mock_cls:
            factory = DocumentProcessorFactory()

        resolved = factory.get_processor("sample.pdf")
        self.assertIs(resolved, fake_processor)
        mock_cls.assert_called_once()

    def test_falls_back_to_pdf_processor_when_docling_init_fails(self):
        """A Docling constructor that raises yields the basic PDFProcessor."""
        has_docling = patch(
            "raglight.document_processing.document_processor_factory.HAS_DOCLING",
            True,
        )
        failing_docling_cls = patch(
            "raglight.document_processing.document_processor_factory.DoclingPDFProcessor",
            side_effect=ValueError("unsupported docling field"),
            create=True,
        )
        with has_docling, failing_docling_cls:
            factory = DocumentProcessorFactory()

        resolved = factory.get_processor("sample.pdf")
        self.assertIsInstance(resolved, PDFProcessor)

    def test_falls_back_to_pdf_processor_when_docling_is_unavailable(self):
        """With Docling flagged unavailable, the basic PDFProcessor is used."""
        no_docling = patch(
            "raglight.document_processing.document_processor_factory.HAS_DOCLING", False
        )
        with no_docling:
            factory = DocumentProcessorFactory()

        resolved = factory.get_processor("sample.pdf")
        self.assertIsInstance(resolved, PDFProcessor)
53+
54+
55+
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)