diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 40ab1fe3..10b4e4d2 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -4318,3 +4318,67 @@ def validate_misplaced_list_items(self): hyperlink=li.hyperlink, ) return self + + def _normalize_references(self) -> None: + """Normalize ref numbering by ordering node items as per iterate_items().""" + new_body = GroupItem(**self.body.model_dump(exclude={"children"})) + + item_lists: dict[str, list[NodeItem]] = { + "groups": [], + "texts": [], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + } + orig_ref_to_new_ref: dict[str, str] = {} + + # collect items in traversal order + for item, _ in self.iterate_items( + with_groups=True, + traverse_pictures=True, + included_content_layers={c for c in ContentLayer}, + ): + key = item.self_ref.split("/")[1] + is_body = key == "body" + new_cref = "#/body" if is_body else f"#/{key}/{len(item_lists[key])}" + # register cref mapping: + orig_ref_to_new_ref[item.self_ref] = new_cref + + if not is_body: + new_item = copy.deepcopy(item) + new_item.children = [] + + # put item in the right list + item_lists[key].append(new_item) + + # update item's self reference + new_item.self_ref = new_cref + + if item.parent: + # set item's parent + new_parent_cref = orig_ref_to_new_ref[item.parent.cref] + new_item.parent = RefItem(cref=new_parent_cref) + + # add item to parent's children + path_components = new_parent_cref.split("/") + num_components = len(path_components) + parent_node: NodeItem + if num_components == 3: + _, parent_key, parent_index_str = path_components + parent_index = int(parent_index_str) + parent_node = item_lists[parent_key][parent_index] + elif num_components == 2 and path_components[1] == "body": + parent_node = new_body + else: + raise RuntimeError(f"Unsupported ref format: {new_parent_cref}") + parent_node.children.append(RefItem(cref=new_cref)) + + # update document + self.groups = item_lists["groups"] # type: ignore + self.texts = item_lists["texts"] # type: ignore + self.pictures = item_lists["pictures"] # type: ignore + self.tables = item_lists["tables"] # type: ignore + self.key_value_items = item_lists["key_value_items"] # type: ignore + self.form_items = item_lists["form_items"] # type: ignore + self.body = new_body diff --git a/test/data/doc/dummy_doc.yaml b/test/data/doc/dummy_doc.yaml index 92a56c5d..ef0e6bba 100644 --- a/test/data/doc/dummy_doc.yaml +++ b/test/data/doc/dummy_doc.yaml @@ -98,7 +98,7 @@ texts: tables: # All tables... - - self_ref: "#/table/0" + - self_ref: "#/tables/0" label: "table" parent: $ref: "#/body" diff --git a/test/data/doc/misplaced_list_items.norm.out.yaml b/test/data/doc/misplaced_list_items.norm.out.yaml new file mode 100644 index 00000000..a0b76068 --- /dev/null +++ b/test/data/doc/misplaced_list_items.norm.out.yaml @@ -0,0 +1,84 @@ +body: + children: + - $ref: '#/groups/0' + - $ref: '#/texts/1' + - $ref: '#/groups/1' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: [] + content_layer: furniture + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: +- children: + - $ref: '#/texts/0' + content_layer: body + label: ordered_list + name: group + parent: + $ref: '#/body' + self_ref: '#/groups/0' +- children: + - $ref: '#/texts/2' + - $ref: '#/texts/3' + content_layer: body + label: list + name: group + parent: + $ref: '#/body' + self_ref: '#/groups/1' +key_value_items: [] +name: '' +pages: {} +pictures: [] +schema_name: DoclingDocument +tables: [] +texts: +- children: [] + content_layer: body + enumerated: true + label: list_item + marker: '1.' + orig: foo + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/0' + text: foo +- children: [] + content_layer: body + label: text + orig: bar + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/1' + text: bar +- children: [] + content_layer: body + enumerated: false + label: list_item + marker: '-' + orig: here + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/2' + text: here +- children: [] + content_layer: body + enumerated: false + label: list_item + marker: '-' + orig: there + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/3' + text: there +version: 1.4.0 diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index cbdb243b..7d9f4a14 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -1636,3 +1636,11 @@ def test_misplaced_list_items(): else: exp_doc = DoclingDocument.load_from_yaml(exp_file) assert doc == exp_doc + + doc._normalize_references() + exp_file = filename.parent / f"{filename.stem}.norm.out.yaml" + if GEN_TEST_DATA: + doc.save_as_yaml(exp_file) + else: + exp_doc = DoclingDocument.load_from_yaml(exp_file) + assert doc == exp_doc