Skip to content

Commit 4d94e38

Browse files
fix(pypdfium2): Fix OCR bounding box misalignment caused by mismatched rotation metadata (#2039)
* Fix OCR bounding box misalignment caused by rotation metadata
  Signed-off-by: AndrewTsai0406 <[email protected]>
* Add rotation-mismatch scanned pdf test case
  Signed-off-by: AndrewTsai0406 <[email protected]>
* add ground truth for ocr_test_rotation_mismatch.pdf
  Signed-off-by: AndrewTsai0406 <[email protected]>
* add ground truth for ocr_test_rotation_mismatch.pdf
  Signed-off-by: AndrewTsai0406 <[email protected]>
* Updated test GT and merged from main
  Signed-off-by: Christoph Auer <[email protected]>
* Fix OCR test by excluding mismatched rotation example
  Signed-off-by: Christoph Auer <[email protected]>
---------
Signed-off-by: AndrewTsai0406 <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
1 parent 9f4bc5b commit 4d94e38

File tree

4 files changed

+45
-3
lines changed

4 files changed

+45
-3
lines changed

docling/backend/pypdfium2_backend.py

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -254,16 +254,38 @@ def merge_group(group: List[TextCell]) -> TextCell:
254254
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
255255
AREA_THRESHOLD = 0 # 32 * 32
256256
page_size = self.get_size()
257+
rotation = self._ppage.get_rotation()
258+
257259
with pypdfium2_lock:
258260
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
259261
pos = obj.get_pos()
262+
if rotation == 90:
263+
pos = (
264+
pos[1],
265+
page_size.height - pos[2],
266+
pos[3],
267+
page_size.height - pos[0],
268+
)
269+
elif rotation == 180:
270+
pos = (
271+
page_size.width - pos[2],
272+
page_size.height - pos[3],
273+
page_size.width - pos[0],
274+
page_size.height - pos[1],
275+
)
276+
elif rotation == 270:
277+
pos = (
278+
page_size.width - pos[3],
279+
pos[0],
280+
page_size.width - pos[1],
281+
pos[2],
282+
)
283+
260284
cropbox = BoundingBox.from_tuple(
261285
pos, origin=CoordOrigin.BOTTOMLEFT
262286
).to_top_left_origin(page_height=page_size.height)
263-
264287
if cropbox.area() > AREA_THRESHOLD:
265288
cropbox = cropbox.scaled(scale=scale)
266-
267289
yield cropbox
268290

269291
def get_text_in_rect(self, bbox: BoundingBox) -> str:
420 KB
Binary file not shown.

tests/test_backend_pdfium.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99
)
1010
from docling.datamodel.base_models import InputFormat
1111
from docling.datamodel.document import InputDocument
12+
from docling.datamodel.pipeline_options import PdfPipelineOptions
13+
from docling.document_converter import DocumentConverter, PdfFormatOption
1214

1315

1416
@pytest.fixture
@@ -27,6 +29,23 @@ def _get_backend(pdf_doc):
2729
return doc_backend
2830

2931

32+
def test_get_text_from_rect_rotated():
33+
pdf_doc = Path("./tests/data_scanned/sample_with_rotation_mismatch.pdf")
34+
pipeline_options = PdfPipelineOptions()
35+
pipeline_options.do_ocr = True
36+
37+
doc_converter = DocumentConverter(
38+
format_options={
39+
InputFormat.PDF: PdfFormatOption(
40+
pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
41+
)
42+
}
43+
)
44+
conv_res = doc_converter.convert(pdf_doc)
45+
46+
assert "1972" in conv_res.document.export_to_markdown()
47+
48+
3049
def test_text_cell_counts():
3150
pdf_doc = Path("./tests/data/pdf/redp5110_sampled.pdf")
3251

tests/test_e2e_ocr_conversion.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,8 @@ def get_pdf_paths():
3131
directory = Path("./tests/data_scanned")
3232

3333
# List all PDF files in the directory and its subdirectories
34-
pdf_files = sorted(directory.rglob("*.pdf"))
34+
pdf_files = sorted(directory.rglob("ocr_test*.pdf"))
35+
3536
return pdf_files
3637

3738

0 commit comments

Comments
 (0)