Skip to content

Commit 532e645

Browse files
authored
text-splitters: Add keep_separator arg to HTMLSemanticPreservingSplitter (#31588)
### Description

Add a `keep_separator` argument to `HTMLSemanticPreservingSplitter` and pass its value to the `RecursiveCharacterTextSplitter` instance used under the hood.

### Issue

Documents returned by `HTMLSemanticPreservingSplitter.split_text(text)` place separators at the beginning of `page_content` by default. [See the third and fourth documents in the example output from the how-to guide](https://python.langchain.com/docs/how_to/split_html/#using-htmlsemanticpreservingsplitter):

```
[Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'),
 Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic'),
 Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content='. Below is a list: First item Second item Third item with bold text and a link Subsection 1.1: Details This subsection provides additional details'),
 Document(metadata={'Header 2': 'Section 1: Introduction'}, page_content=". Here's a table: Header 1 Header 2 Header 3 Row 1, Cell 1 Row 1, Cell 2 Row 1, Cell 3 Row 2, Cell 1 Row 2, Cell 2 Row 2, Cell 3"),
 Document(metadata={'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video: ![image:example_image_link.mp4](example_image_link.mp4) ![video:example_video_link.mp4](example_video_link.mp4)'),
 Document(metadata={'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block: <code:html> <div> <p>This is a paragraph inside a div.</p> </div> </code>'),
 Document(metadata={'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')]
```

### Dependencies

None

@ttrumper3
1 parent 52e57cd commit 532e645

File tree

2 files changed

+152
-1
lines changed

2 files changed

+152
-1
lines changed

libs/text-splitters/langchain_text_splitters/html.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010
Dict,
1111
Iterable,
1212
List,
13+
Literal,
1314
Optional,
1415
Sequence,
1516
Tuple,
1617
TypedDict,
18+
Union,
1719
cast,
1820
)
1921

@@ -535,6 +537,8 @@ class HTMLSemanticPreservingSplitter(BaseDocumentTransformer):
535537
preserve_parent_metadata (bool): Whether to pass through parent document
536538
metadata to split documents when calling
537539
``transform_documents/atransform_documents()``.
540+
keep_separator (Union[bool, Literal["start", "end"]]): Whether separators
541+
should be at the beginning of a chunk, at the end, or not at all.
538542
539543
Example:
540544
.. code-block:: python
@@ -584,6 +588,7 @@ def __init__(
584588
allowlist_tags: Optional[List[str]] = None,
585589
denylist_tags: Optional[List[str]] = None,
586590
preserve_parent_metadata: bool = False,
591+
keep_separator: Union[bool, Literal["start", "end"]] = True,
587592
):
588593
"""Initialize splitter."""
589594
try:
@@ -611,6 +616,7 @@ def __init__(
611616
self._external_metadata = external_metadata or {}
612617
self._allowlist_tags = allowlist_tags
613618
self._preserve_parent_metadata = preserve_parent_metadata
619+
self._keep_separator = keep_separator
614620
if allowlist_tags:
615621
self._allowlist_tags = list(
616622
set(allowlist_tags + [header[0] for header in headers_to_split_on])
@@ -625,12 +631,15 @@ def __init__(
625631
if separators:
626632
self._recursive_splitter = RecursiveCharacterTextSplitter(
627633
separators=separators,
634+
keep_separator=keep_separator,
628635
chunk_size=max_chunk_size,
629636
chunk_overlap=chunk_overlap,
630637
)
631638
else:
632639
self._recursive_splitter = RecursiveCharacterTextSplitter(
633-
chunk_size=max_chunk_size, chunk_overlap=chunk_overlap
640+
keep_separator=keep_separator,
641+
chunk_size=max_chunk_size,
642+
chunk_overlap=chunk_overlap,
634643
)
635644

636645
if self._stopword_removal:

libs/text-splitters/tests/unit_tests/test_text_splitters.py

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3375,6 +3375,148 @@ def test_html_splitter_with_media_preservation() -> None:
33753375
assert documents == expected
33763376

33773377

3378+
@pytest.mark.requires("bs4")
3379+
def test_html_splitter_keep_separator_true() -> None:
3380+
"""Test HTML splitting with keep_separator=True"""
3381+
html_content = """
3382+
<h1>Section 1</h1>
3383+
<p>This is some text. This is some other text.</p>
3384+
"""
3385+
splitter = HTMLSemanticPreservingSplitter(
3386+
headers_to_split_on=[("h1", "Header 1")],
3387+
max_chunk_size=10,
3388+
separators=[". "],
3389+
keep_separator=True,
3390+
)
3391+
documents = splitter.split_text(html_content)
3392+
3393+
expected = [
3394+
Document(
3395+
page_content="This is some text",
3396+
metadata={"Header 1": "Section 1"},
3397+
),
3398+
Document(
3399+
page_content=". This is some other text.",
3400+
metadata={"Header 1": "Section 1"},
3401+
),
3402+
]
3403+
3404+
assert documents == expected
3405+
3406+
3407+
@pytest.mark.requires("bs4")
3408+
def test_html_splitter_keep_separator_false() -> None:
3409+
"""Test HTML splitting with keep_separator=False"""
3410+
html_content = """
3411+
<h1>Section 1</h1>
3412+
<p>This is some text. This is some other text.</p>
3413+
"""
3414+
splitter = HTMLSemanticPreservingSplitter(
3415+
headers_to_split_on=[("h1", "Header 1")],
3416+
max_chunk_size=10,
3417+
separators=[". "],
3418+
keep_separator=False,
3419+
)
3420+
documents = splitter.split_text(html_content)
3421+
3422+
expected = [
3423+
Document(
3424+
page_content="This is some text",
3425+
metadata={"Header 1": "Section 1"},
3426+
),
3427+
Document(
3428+
page_content="This is some other text.",
3429+
metadata={"Header 1": "Section 1"},
3430+
),
3431+
]
3432+
3433+
assert documents == expected
3434+
3435+
3436+
@pytest.mark.requires("bs4")
3437+
def test_html_splitter_keep_separator_start() -> None:
3438+
"""Test HTML splitting with keep_separator="start" """
3439+
html_content = """
3440+
<h1>Section 1</h1>
3441+
<p>This is some text. This is some other text.</p>
3442+
"""
3443+
splitter = HTMLSemanticPreservingSplitter(
3444+
headers_to_split_on=[("h1", "Header 1")],
3445+
max_chunk_size=10,
3446+
separators=[". "],
3447+
keep_separator="start",
3448+
)
3449+
documents = splitter.split_text(html_content)
3450+
3451+
expected = [
3452+
Document(
3453+
page_content="This is some text",
3454+
metadata={"Header 1": "Section 1"},
3455+
),
3456+
Document(
3457+
page_content=". This is some other text.",
3458+
metadata={"Header 1": "Section 1"},
3459+
),
3460+
]
3461+
3462+
assert documents == expected
3463+
3464+
3465+
@pytest.mark.requires("bs4")
3466+
def test_html_splitter_keep_separator_end() -> None:
3467+
"""Test HTML splitting with keep_separator="end" """
3468+
html_content = """
3469+
<h1>Section 1</h1>
3470+
<p>This is some text. This is some other text.</p>
3471+
"""
3472+
splitter = HTMLSemanticPreservingSplitter(
3473+
headers_to_split_on=[("h1", "Header 1")],
3474+
max_chunk_size=10,
3475+
separators=[". "],
3476+
keep_separator="end",
3477+
)
3478+
documents = splitter.split_text(html_content)
3479+
3480+
expected = [
3481+
Document(
3482+
page_content="This is some text.",
3483+
metadata={"Header 1": "Section 1"},
3484+
),
3485+
Document(
3486+
page_content="This is some other text.",
3487+
metadata={"Header 1": "Section 1"},
3488+
),
3489+
]
3490+
3491+
assert documents == expected
3492+
3493+
3494+
@pytest.mark.requires("bs4")
3495+
def test_html_splitter_keep_separator_default() -> None:
3496+
"""Test HTML splitting with keep_separator not set"""
3497+
html_content = """
3498+
<h1>Section 1</h1>
3499+
<p>This is some text. This is some other text.</p>
3500+
"""
3501+
splitter = HTMLSemanticPreservingSplitter(
3502+
headers_to_split_on=[("h1", "Header 1")], max_chunk_size=10, separators=[". "]
3503+
)
3504+
documents = splitter.split_text(html_content)
3505+
3506+
expected = [
3507+
Document(
3508+
page_content="This is some text",
3509+
metadata={"Header 1": "Section 1"},
3510+
),
3511+
Document(
3512+
page_content=". This is some other text.",
3513+
metadata={"Header 1": "Section 1"},
3514+
),
3515+
]
3516+
3517+
assert documents == expected
3518+
3519+
33783520
def test_character_text_splitter_discard_regex_separator_on_merge() -> None:
33793521
"""Test that regex lookahead separator is not re-inserted when merging."""
33803522
text = "SCE191 First chunk. SCE103 Second chunk."

0 commit comments

Comments
 (0)