Skip to content

Commit 0700af2

Browse files
authored
fix: Add missing features in ThreadedStandardPdfPipeline (#2252)
Add missing features in ThreadedStandardPdfPipeline.
Signed-off-by: Christoph Auer <[email protected]>
1 parent 2c91234 commit 0700af2

File tree

1 file changed

+85
-1
lines changed

1 file changed

+85
-1
lines changed

docling/pipeline/threaded_standard_pdf_pipeline.py

Lines changed: 85 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,10 +20,14 @@
2020
import logging
2121
import threading
2222
import time
23+
import warnings
2324
from collections import defaultdict, deque
2425
from dataclasses import dataclass, field
2526
from pathlib import Path
26-
from typing import Any, Iterable, List, Optional, Sequence, Tuple
27+
from typing import Any, Iterable, List, Optional, Sequence, Tuple, cast
28+
29+
import numpy as np
30+
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
2731

2832
from docling.backend.abstract_backend import AbstractDocumentBackend
2933
from docling.backend.pdf_backend import PdfDocumentBackend
@@ -541,6 +545,86 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
541545
elements=elements, headers=headers, body=body
542546
)
543547
conv_res.document = self.reading_order_model(conv_res)
548+
549+
# Generate page images in the output
550+
if self.pipeline_options.generate_page_images:
551+
for page in conv_res.pages:
552+
assert page.image is not None
553+
page_no = page.page_no + 1
554+
conv_res.document.pages[page_no].image = ImageRef.from_pil(
555+
page.image, dpi=int(72 * self.pipeline_options.images_scale)
556+
)
557+
558+
# Generate images of the requested element types
559+
with warnings.catch_warnings(): # deprecated generate_table_images
560+
warnings.filterwarnings("ignore", category=DeprecationWarning)
561+
if (
562+
self.pipeline_options.generate_picture_images
563+
or self.pipeline_options.generate_table_images
564+
):
565+
scale = self.pipeline_options.images_scale
566+
for element, _level in conv_res.document.iterate_items():
567+
if not isinstance(element, DocItem) or len(element.prov) == 0:
568+
continue
569+
if (
570+
isinstance(element, PictureItem)
571+
and self.pipeline_options.generate_picture_images
572+
) or (
573+
isinstance(element, TableItem)
574+
and self.pipeline_options.generate_table_images
575+
):
576+
page_ix = element.prov[0].page_no - 1
577+
page = next(
578+
(p for p in conv_res.pages if p.page_no == page_ix),
579+
cast("Page", None),
580+
)
581+
assert page is not None
582+
assert page.size is not None
583+
assert page.image is not None
584+
585+
crop_bbox = (
586+
element.prov[0]
587+
.bbox.scaled(scale=scale)
588+
.to_top_left_origin(
589+
page_height=page.size.height * scale
590+
)
591+
)
592+
593+
cropped_im = page.image.crop(crop_bbox.as_tuple())
594+
element.image = ImageRef.from_pil(
595+
cropped_im, dpi=int(72 * scale)
596+
)
597+
598+
# Aggregate confidence values for document:
599+
if len(conv_res.pages) > 0:
600+
with warnings.catch_warnings():
601+
warnings.filterwarnings(
602+
"ignore",
603+
category=RuntimeWarning,
604+
message="Mean of empty slice|All-NaN slice encountered",
605+
)
606+
conv_res.confidence.layout_score = float(
607+
np.nanmean(
608+
[c.layout_score for c in conv_res.confidence.pages.values()]
609+
)
610+
)
611+
conv_res.confidence.parse_score = float(
612+
np.nanquantile(
613+
[c.parse_score for c in conv_res.confidence.pages.values()],
614+
q=0.1, # parse score should relate to worst 10% of pages.
615+
)
616+
)
617+
conv_res.confidence.table_score = float(
618+
np.nanmean(
619+
[c.table_score for c in conv_res.confidence.pages.values()]
620+
)
621+
)
622+
conv_res.confidence.ocr_score = float(
623+
np.nanmean(
624+
[c.ocr_score for c in conv_res.confidence.pages.values()]
625+
)
626+
)
627+
544628
return conv_res
545629

546630
# ---------------------------------------------------------------- misc

0 commit comments

Comments
 (0)