Skip to content

Commit 8872e73

Browse files
feat: Fixed char ordering in text lines (#138)
Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Christoph Auer <[email protected]> Co-authored-by: Christoph Auer <[email protected]>
1 parent 6397287 commit 8872e73

File tree

234 files changed

+426951
-166239
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

234 files changed

+426951
-166239
lines changed

.github/workflows/wheels.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,11 @@ jobs:
269269
shell: pwsh
270270
run: |
271271
New-Item -Path 'C:\nasm' -ItemType Directory -Force
272-
Invoke-WebRequest -Uri 'https://fossies.org/windows/misc/nasm-2.16.03-win64.zip/nasm-2.16.03/nasm.exe' -OutFile 'C:\nasm\nasm.exe'
272+
Invoke-WebRequest -Uri 'https://www.nasm.us/pub/nasm/releasebuilds/2.16.03/win64/nasm-2.16.03-win64.zip' -OutFile 'C:\nasm-2.16.03-win64.zip'
273+
Expand-Archive -Path 'C:\nasm-2.16.03-win64.zip' -DestinationPath 'C:\nasm-temp' -Force
274+
Copy-Item -Path 'C:\nasm-temp\nasm-2.16.03\nasm.exe' -Destination 'C:\nasm\nasm.exe'
275+
Remove-Item -Path 'C:\nasm-2.16.03-win64.zip' -Force
276+
Remove-Item -Path 'C:\nasm-temp' -Recurse -Force
273277
nasm -v
274278
275279
- name: Build wheels

docling_parse/pdf_parser.py

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,12 @@ def _to_table_of_contents(self, toc: dict) -> List[PdfTableOfContents]:
115115
return result
116116

117117
def get_page(
118-
self, page_no: int, create_words: bool = True, create_textlines: bool = True
118+
self,
119+
page_no: int,
120+
*,
121+
create_words: bool = True,
122+
create_textlines: bool = True,
123+
enforce_same_font: bool = True,
119124
) -> SegmentedPdfPage:
120125
if page_no in self._pages.keys():
121126
return self._pages[page_no]
@@ -134,6 +139,7 @@ def get_page(
134139
page=page["original"],
135140
create_words=create_words,
136141
create_textlines=create_textlines,
142+
enforce_same_font=enforce_same_font,
137143
) # put on cache
138144
return self._pages[page_no]
139145

@@ -315,7 +321,12 @@ def _to_lines(self, data: dict) -> List[PdfLine]:
315321
return result
316322

317323
def _to_segmented_page(
318-
self, page: dict, create_words: bool, create_textlines: bool
324+
self,
325+
page: dict,
326+
*,
327+
create_words: bool,
328+
create_textlines: bool,
329+
enforce_same_font: bool = True,
319330
) -> SegmentedPdfPage:
320331

321332
char_cells = self._to_cells(page["cells"])
@@ -330,14 +341,22 @@ def _to_segmented_page(
330341
)
331342

332343
if create_words:
333-
self._create_word_cells(segmented_page)
344+
self._create_word_cells(segmented_page, enforce_same_font=enforce_same_font)
334345

335346
if create_textlines:
336-
self._create_textline_cells(segmented_page)
347+
self._create_textline_cells(
348+
segmented_page, enforce_same_font=enforce_same_font
349+
)
350+
337351
return segmented_page
338352

339353
def _create_word_cells(
340-
self, segmented_page: SegmentedPdfPage, _loglevel: str = "fatal"
354+
self,
355+
segmented_page: SegmentedPdfPage,
356+
*,
357+
space_width_factor_for_merge: float = 0.33,
358+
enforce_same_font: bool = True,
359+
_loglevel: str = "fatal",
341360
):
342361

343362
if len(segmented_page.word_cells) > 0:
@@ -355,7 +374,11 @@ def _create_word_cells(
355374

356375
sanitizer.set_char_cells(data=char_data)
357376

358-
data = sanitizer.create_word_cells(space_width_factor_for_merge=0.33)
377+
# data = sanitizer.create_word_cells(space_width_factor_for_merge=0.33)
378+
data = sanitizer.create_word_cells(
379+
space_width_factor_for_merge=space_width_factor_for_merge,
380+
enforce_same_font=enforce_same_font,
381+
)
359382

360383
segmented_page.word_cells = []
361384
for item in data:
@@ -365,9 +388,14 @@ def _create_word_cells(
365388
segmented_page.has_words = len(segmented_page.word_cells) > 0
366389

367390
def _create_textline_cells(
368-
self, segmented_page: SegmentedPdfPage, _loglevel: str = "fatal"
391+
self,
392+
segmented_page: SegmentedPdfPage,
393+
*,
394+
space_width_factor_for_merge: float = 1.0,
395+
space_width_factor_for_merge_with_space: float = 0.33,
396+
enforce_same_font: bool = True,
397+
_loglevel: str = "fatal",
369398
):
370-
371399
if len(segmented_page.textline_cells) > 0:
372400
return
373401

@@ -387,7 +415,12 @@ def _create_textline_cells(
387415

388416
sanitizer.set_char_cells(data=char_data)
389417

390-
data = sanitizer.create_line_cells()
418+
# data = sanitizer.create_line_cells()
419+
data = sanitizer.create_line_cells(
420+
space_width_factor_for_merge=space_width_factor_for_merge,
421+
space_width_factor_for_merge_with_space=space_width_factor_for_merge_with_space,
422+
enforce_same_font=enforce_same_font,
423+
)
391424

392425
segmented_page.textline_cells = []
393426
for item in data:

docling_parse/visualize.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,13 @@ def parse_args():
7676
help="Enable interactive mode (default: False)",
7777
)
7878

79+
# Add an optional boolean argument for enforcing same-font
80+
parser.add_argument(
81+
"--enforce-same-font",
82+
action="store_true",
83+
help="Enforce same font when creating word and text-line cells (default: False)",
84+
)
85+
7986
# Add an argument for the output directory, defaulting to "./tmp"
8087
parser.add_argument(
8188
"-o",
@@ -115,18 +122,21 @@ def parse_args():
115122
int(args.page),
116123
args.display_text,
117124
args.log_text,
125+
args.enforce_same_font,
118126
args.page_boundary,
119127
args.category,
120128
)
121129

122130

123131
def visualise_py(
132+
*,
124133
log_level: str,
125134
pdf_path: str,
126135
interactive: str,
127136
output_dir: Path,
128137
display_text: bool,
129138
log_text: bool,
139+
enforce_same_font: bool,
130140
page_boundary: str = "crop_box", # media_box
131141
category: str = "char", # "both", "sanitized", "original"
132142
page_num: int = -1,
@@ -142,7 +152,12 @@ def visualise_py(
142152
for page_no in page_nos:
143153
print(f"parsing {pdf_path} on page: {page_no}")
144154

145-
pdf_page: SegmentedPdfPage = pdf_doc.get_page(page_no=page_no)
155+
pdf_page: SegmentedPdfPage = pdf_doc.get_page(
156+
page_no=page_no,
157+
create_words=True,
158+
create_textlines=True,
159+
enforce_same_font=enforce_same_font,
160+
)
146161

147162
if os.path.exists(str(output_dir)):
148163
pdf_page.save_as_json(
@@ -234,6 +249,7 @@ def main():
234249
page_num,
235250
display_text,
236251
log_text,
252+
enforce_same_font,
237253
page_boundary,
238254
category,
239255
) = parse_args()
@@ -247,6 +263,7 @@ def main():
247263
output_dir=output_dir,
248264
display_text=display_text,
249265
log_text=log_text,
266+
enforce_same_font=enforce_same_font,
250267
page_boundary=page_boundary,
251268
category=category,
252269
page_num=page_num,

src/pybind/docling_sanitizer.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ namespace docling
249249
// do a deep copy
250250
word_cells = char_cells;
251251

252-
LOG_S(INFO) << "#-wordcells: " << word_cells.size();
252+
LOG_S(INFO) << "#-word cells: " << word_cells.size();
253253

254254
// remove all spaces
255255
auto itr = word_cells.begin();
@@ -265,7 +265,7 @@ namespace docling
265265
}
266266
}
267267

268-
LOG_S(INFO) << "#-wordcells: " << word_cells.size();
268+
LOG_S(INFO) << "#-word cells: " << word_cells.size();
269269

270270
// > space_width_factor_for_merge, so nothing gets merged with a space
271271
double space_width_factor_for_merge_with_space = 2.0*space_width_factor_for_merge;

src/v2/pdf_resources/page_cells.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ namespace pdflib
3232
itr_type end() { return cells.end(); }
3333

3434
itr_type erase(itr_type itr) { return cells.erase(itr); }
35+
itr_type erase(itr_type itr_0, itr_type itr_1) { return cells.erase(itr_0, itr_1); }
3536

3637
pdf_resource<PAGE_CELL>& at(std::size_t i) { return cells.at(i); }
3738

src/v2/pdf_resources/page_font.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1555,7 +1555,7 @@ namespace pdflib
15551555
if(cmap_initialized) // we found a `ToUnicode` before. No need to go deeper!
15561556
{
15571557
LOG_S(WARNING) << "We found a `ToUnicode` before. No need to go deeper!";
1558-
// return;
1558+
return;
15591559
}
15601560
//else
15611561

0 commit comments

Comments
 (0)