Skip to content

Commit 3f03709

Browse files
authored
fix: Improve numbered list detection for msword docs (#2100)
* Improve numbered list detection for msword docs

  This fixes the list detection in MSWord docs by properly tracking and counting the list entries. It fixes #2090.

* DCO Remediation Commit for Nikhil Verma <[email protected]>

  I, Nikhil Verma <[email protected]>, hereby add my Signed-off-by to this commit: 509da66

  Signed-off-by: Nikhil Verma <[email protected]>

---------

Signed-off-by: Nikhil Verma <[email protected]>
1 parent 94fcc46 commit 3f03709

File tree

3 files changed

+135
-25
lines changed

3 files changed

+135
-25
lines changed

docling/backend/msword_backend.py

Lines changed: 126 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ def __init__(
6767

6868
self.level = 0
6969
self.listIter = 0
70+
# Track list counters per numId and ilvl
71+
self.list_counters: dict[tuple[int, int], int] = {}
7072

7173
self.history: dict[str, Any] = {
7274
"names": [None],
@@ -315,6 +317,108 @@ def _get_numId_and_ilvl(
315317

316318
return None, None # If the paragraph is not part of a list
317319

320+
def _get_list_counter(self, numid: int, ilvl: int) -> int:
321+
"""Get and increment the counter for a specific numId and ilvl combination."""
322+
key = (numid, ilvl)
323+
if key not in self.list_counters:
324+
self.list_counters[key] = 0
325+
self.list_counters[key] += 1
326+
return self.list_counters[key]
327+
328+
def _reset_list_counters_for_new_sequence(self, numid: int):
329+
"""Reset counters when starting a new numbering sequence."""
330+
# Reset all counters for this numid
331+
keys_to_reset = [key for key in self.list_counters.keys() if key[0] == numid]
332+
for key in keys_to_reset:
333+
self.list_counters[key] = 0
334+
335+
def _is_numbered_list(self, docx_obj: DocxDocument, numId: int, ilvl: int) -> bool:
336+
"""Check if a list is numbered based on its numFmt value."""
337+
try:
338+
# Access the numbering part of the document
339+
if not hasattr(docx_obj, "part") or not hasattr(docx_obj.part, "package"):
340+
return False
341+
342+
numbering_part = None
343+
# Find the numbering part
344+
for part in docx_obj.part.package.parts:
345+
if "numbering" in part.partname:
346+
numbering_part = part
347+
break
348+
349+
if numbering_part is None:
350+
return False
351+
352+
# Parse the numbering XML
353+
numbering_root = numbering_part.element
354+
namespaces = {
355+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
356+
}
357+
358+
# Find the numbering definition with the given numId
359+
num_xpath = f".//w:num[@w:numId='{numId}']"
360+
num_element = numbering_root.find(num_xpath, namespaces=namespaces)
361+
362+
if num_element is None:
363+
return False
364+
365+
# Get the abstractNumId from the num element
366+
abstract_num_id_elem = num_element.find(
367+
".//w:abstractNumId", namespaces=namespaces
368+
)
369+
if abstract_num_id_elem is None:
370+
return False
371+
372+
abstract_num_id = abstract_num_id_elem.get(
373+
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
374+
)
375+
if abstract_num_id is None:
376+
return False
377+
378+
# Find the abstract numbering definition
379+
abstract_num_xpath = (
380+
f".//w:abstractNum[@w:abstractNumId='{abstract_num_id}']"
381+
)
382+
abstract_num_element = numbering_root.find(
383+
abstract_num_xpath, namespaces=namespaces
384+
)
385+
386+
if abstract_num_element is None:
387+
return False
388+
389+
# Find the level definition for the given ilvl
390+
lvl_xpath = f".//w:lvl[@w:ilvl='{ilvl}']"
391+
lvl_element = abstract_num_element.find(lvl_xpath, namespaces=namespaces)
392+
393+
if lvl_element is None:
394+
return False
395+
396+
# Get the numFmt element
397+
num_fmt_element = lvl_element.find(".//w:numFmt", namespaces=namespaces)
398+
if num_fmt_element is None:
399+
return False
400+
401+
num_fmt = num_fmt_element.get(
402+
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
403+
)
404+
405+
# Numbered formats include: decimal, lowerRoman, upperRoman, lowerLetter, upperLetter
406+
# Bullet formats include: bullet
407+
numbered_formats = {
408+
"decimal",
409+
"lowerRoman",
410+
"upperRoman",
411+
"lowerLetter",
412+
"upperLetter",
413+
"decimalZero",
414+
}
415+
416+
return num_fmt in numbered_formats
417+
418+
except Exception as e:
419+
_log.debug(f"Error determining if list is numbered: {e}")
420+
return False
421+
318422
def _get_heading_and_level(self, style_label: str) -> tuple[str, Optional[int]]:
319423
parts = self._split_text_and_number(style_label)
320424

@@ -713,8 +817,6 @@ def _handle_text_elements( # noqa: C901
713817
# Common styles for bullet and numbered lists.
714818
# "List Bullet", "List Number", "List Paragraph"
715819
# Identify whether list is a numbered list or not
716-
# is_numbered = "List Bullet" not in paragraph.style.name
717-
is_numbered = False
718820
p_style_id, p_level = self._get_label_and_level(paragraph)
719821
numid, ilevel = self._get_numId_and_ilvl(paragraph)
720822

@@ -727,6 +829,9 @@ def _handle_text_elements( # noqa: C901
727829
and ilevel is not None
728830
and p_style_id not in ["Title", "Heading"]
729831
):
832+
# Check if this is actually a numbered list by examining the numFmt
833+
is_numbered = self._is_numbered_list(docx_obj, numid, ilevel)
834+
730835
self._add_list_item(
731836
doc=doc,
732837
numid=numid,
@@ -983,15 +1088,19 @@ def _add_list_item(
9831088
if self._prev_numid() is None: # Open new list
9841089
self.level_at_new_list = level
9851090

1091+
# Reset counters for the new numbering sequence
1092+
self._reset_list_counters_for_new_sequence(numid)
1093+
9861094
self.parents[level] = doc.add_list_group(
9871095
name="list", parent=self.parents[level - 1]
9881096
)
9891097

9901098
# Set marker and enumerated arguments if this is an enumeration element.
991-
self.listIter += 1
9921099
if is_numbered:
993-
enum_marker = str(self.listIter) + "."
994-
is_numbered = True
1100+
counter = self._get_list_counter(numid, ilevel)
1101+
enum_marker = str(counter) + "."
1102+
else:
1103+
enum_marker = ""
9951104
self._add_formatted_list_item(
9961105
doc, elements, enum_marker, is_numbered, level
9971106
)
@@ -1005,16 +1114,16 @@ def _add_list_item(
10051114
self.level_at_new_list + prev_indent + 1,
10061115
self.level_at_new_list + ilevel + 1,
10071116
):
1008-
self.listIter = 0
10091117
self.parents[i] = doc.add_list_group(
10101118
name="list", parent=self.parents[i - 1]
10111119
)
10121120

10131121
# TODO: Set marker and enumerated arguments if this is an enumeration element.
1014-
self.listIter += 1
10151122
if is_numbered:
1016-
enum_marker = str(self.listIter) + "."
1017-
is_numbered = True
1123+
counter = self._get_list_counter(numid, ilevel)
1124+
enum_marker = str(counter) + "."
1125+
else:
1126+
enum_marker = ""
10181127
self._add_formatted_list_item(
10191128
doc,
10201129
elements,
@@ -1033,25 +1142,26 @@ def _add_list_item(
10331142
self.parents[k] = None
10341143

10351144
# TODO: Set marker and enumerated arguments if this is an enumeration element.
1036-
self.listIter += 1
10371145
if is_numbered:
1038-
enum_marker = str(self.listIter) + "."
1039-
is_numbered = True
1146+
counter = self._get_list_counter(numid, ilevel)
1147+
enum_marker = str(counter) + "."
1148+
else:
1149+
enum_marker = ""
10401150
self._add_formatted_list_item(
10411151
doc,
10421152
elements,
10431153
enum_marker,
10441154
is_numbered,
10451155
self.level_at_new_list + ilevel,
10461156
)
1047-
self.listIter = 0
10481157

10491158
elif self._prev_numid() == numid or prev_indent == ilevel:
10501159
# TODO: Set marker and enumerated arguments if this is an enumeration element.
1051-
self.listIter += 1
10521160
if is_numbered:
1053-
enum_marker = str(self.listIter) + "."
1054-
is_numbered = True
1161+
counter = self._get_list_counter(numid, ilevel)
1162+
enum_marker = str(counter) + "."
1163+
else:
1164+
enum_marker = ""
10551165
self._add_formatted_list_item(
10561166
doc, elements, enum_marker, is_numbered, level - 1
10571167
)

tests/data/groundtruth/docling_v2/unit_test_lists.docx.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ Paragraph 2.1.2
1212

1313
#### Test 2:
1414

15-
- List item a
16-
- List item b
17-
- List item c
15+
1. List item a
16+
2. List item b
17+
3. List item c
1818

1919
#### Test 3:
2020

tests/data/groundtruth/docling_v2/word_sample.docx.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ To get started with swimming, first lay down in a water and try not to drown:
1818

1919
Also, don’t forget:
2020

21-
- Wear sunglasses
22-
- Don’t forget to drink water
23-
- Use sun cream
21+
1. Wear sunglasses
22+
2. Don’t forget to drink water
23+
3. Use sun cream
2424

2525
Hmm, what else…
2626

@@ -40,6 +40,6 @@ Here are some interesting things a respectful duck could eat:
4040

4141
And let’s add another list in the end:
4242

43-
- Leaves
44-
- Berries
45-
- Grain
43+
1. Leaves
44+
2. Berries
45+
3. Grain

0 commit comments

Comments
 (0)