Skip to content

Add Unicode Normalization for Search Indexing #13384

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ Contributors
* Thomas Lamb -- linkcheck builder
* Thomas Waldmann -- apidoc module fixes
* Tim Hoffmann -- theme improvements
* Tokuhiro Matsuno -- search unicode normalization
* Vince Salvino -- JavaScript search improvements
* Will Maier -- directory HTML builder
* Zac Hatfield-Dodds -- doctest reporting improvements, intersphinx performance
Expand Down
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ Deprecated
Features added
--------------

* #13384: Add Unicode normalization option for search indexing.
This allows users to specify the type of Unicode normalization
(NFC, NFD, NFKC, NFKD) to apply during searches, improving the
accuracy and reliability of search results.
Patch by Tokuhiro Matsuno.

Bugs fixed
----------

Expand Down
20 changes: 20 additions & 0 deletions doc/usage/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2030,6 +2030,26 @@ and also make use of these options.

.. versionadded:: 1.0

.. confval:: html_search_unicode_normalization
:type: :code-py:`str`
:default: :code-py:`"NFKD"`

Specify the type of Unicode normalization applied to terms when the
search index is built (and to queries at search time). It can take one
of the following values:

* **None** -- Disables Unicode normalization.
* **"NFD"** -- Decomposes characters into their canonical decomposed form.
* **"NFC"** -- Composes characters into their canonical composed form.
* **"NFKD"** -- Decomposes characters into their compatibility decomposed form.
* **"NFKC"** -- Composes characters into their compatibility composed form.

This setting ensures that text is consistently normalized, improving the
accuracy and reliability of search results by handling different Unicode
representations of the same characters.

.. versionadded:: 8.3

.. confval:: html_search_language
:type: :code-py:`str`
:default: The value of **language**
Expand Down
5 changes: 5 additions & 0 deletions sphinx/builders/html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
lang,
self.config.html_search_options,
self.config.html_search_scorer,
self.config.html_search_unicode_normalization,
)
self.load_indexer(docnames)

Expand Down Expand Up @@ -544,6 +545,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
'has_source': self.config.html_copy_source,
'show_source': self.config.html_show_sourcelink,
'sourcelink_suffix': self.config.html_sourcelink_suffix,
'search_unicode_normalization': self.config.html_search_unicode_normalization,
'file_suffix': self.out_suffix,
'link_suffix': self.link_suffix,
'script_files': self._js_files,
Expand Down Expand Up @@ -1490,6 +1492,9 @@ def setup(app: Sphinx) -> ExtensionMetadata:
app.add_config_value(
'html_show_search_summary', True, 'html', types=frozenset({bool})
)
app.add_config_value(
'html_search_unicode_normalization', 'NFKD', 'html', types=frozenset({str})
)
app.add_config_value('html_show_sphinx', True, 'html', types=frozenset({bool}))
app.add_config_value('html_context', {}, 'html', types=frozenset({dict}))
app.add_config_value(
Expand Down
44 changes: 35 additions & 9 deletions sphinx/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os
import pickle
import re
import unicodedata
from importlib import import_module
from typing import TYPE_CHECKING

Expand All @@ -21,7 +22,7 @@

if TYPE_CHECKING:
from collections.abc import Callable, Iterable
from typing import Any, Protocol, TypeVar
from typing import Any, Literal, Protocol, TypeVar

from docutils.nodes import Node

Expand Down Expand Up @@ -275,7 +276,12 @@ class IndexBuilder:
}

def __init__(
self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str
self,
env: BuildEnvironment,
lang: str,
options: dict[str, str],
scoring: str,
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None = None,
) -> None:
self._domains = env.domains
self._env_version = env.version
Expand All @@ -301,6 +307,7 @@ def __init__(
self._objnames: dict[int, tuple[str, str, str]] = env._search_index_objnames
# add language-specific SearchLanguage instance
lang_class = languages.get(lang)
self._unicode_normalization = normalization

# fallback; try again with language-code
if lang_class is None and '_' in lang:
Expand Down Expand Up @@ -552,7 +559,11 @@ def _word_collector(self, doctree: nodes.document) -> WordStore:
split = self.lang.split
language = self.lang.lang
_feed_visit_nodes(
doctree, word_store=word_store, split=split, language=language
doctree,
word_store=word_store,
split=split,
language=language,
normalization=self._unicode_normalization,
)
return word_store

Expand Down Expand Up @@ -602,7 +613,14 @@ def _feed_visit_nodes(
word_store: WordStore,
split: Callable[[str], list[str]],
language: str,
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None,
) -> None:
def normalize(text: str) -> str:
    """Apply the configured Unicode normalization form to *text*.

    When *normalization* is ``None`` (normalization disabled), the
    text is returned unchanged.
    """
    return unicodedata.normalize(normalization, text) if normalization else text

if isinstance(node, nodes.comment):
return
elif isinstance(node, nodes.Element) and 'no-search' in node['classes']:
Expand All @@ -626,18 +644,26 @@ def _feed_visit_nodes(
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
word_store.words.extend(split(nodetext))
word_store.words.extend(split(normalize(nodetext)))
return
elif isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
keywords = [keyword.strip() for keyword in node['content'].split(',')]
keywords = [
normalize(keyword.strip()) for keyword in node['content'].split(',')
]
word_store.words.extend(keywords)
elif isinstance(node, nodes.Text):
word_store.words.extend(split(node.astext()))
word_store.words.extend(split(normalize(node.astext())))
elif isinstance(node, nodes.title):
title, is_main_title = node.astext(), len(word_store.titles) == 0
ids = node.parent['ids']
title_node_id = None if is_main_title else ids[0] if ids else None
word_store.titles.append((title, title_node_id))
word_store.title_words.extend(split(title))
word_store.titles.append((normalize(title), title_node_id))
word_store.title_words.extend(split(normalize(title)))
for child in node.children:
_feed_visit_nodes(child, word_store=word_store, split=split, language=language)
_feed_visit_nodes(
child,
word_store=word_store,
split=split,
language=language,
normalization=normalization,
)
1 change: 1 addition & 0 deletions sphinx/themes/basic/static/documentation_options.js.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ const DOCUMENTATION_OPTIONS = {
NAVIGATION_WITH_KEYS: {{ 'true' if theme_navigation_with_keys|tobool else 'false'}},
SHOW_SEARCH_SUMMARY: {{ 'true' if show_search_summary else 'false' }},
ENABLE_SEARCH_SHORTCUTS: {{ 'true' if theme_enable_search_shortcuts|tobool else 'false'}},
SEARCH_UNICODE_NORMALIZATION: {{ '"' + search_unicode_normalization + '"' if search_unicode_normalization else 'null' }},
};
8 changes: 8 additions & 0 deletions sphinx/themes/basic/static/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,15 @@ const Search = {
else Search.deferQuery(query);
},

_normalizeQuery: (query, form) => {
return query.normalize(form);
},
Comment on lines +279 to +281
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_normalizeQuery: (query, form) => {
return query.normalize(form);
},
_normalizeQuery: (query) => {
const form = DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION;
if (!form) return query;
return query.normalize(form);
},


_parseQuery: (query) => {
if (DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION) {
query = Search._normalizeQuery(query, DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION);
}

// stem the search terms and add them to the correct list
Comment on lines 283 to 288
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_parseQuery: (query) => {
if (DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION) {
query = Search._normalizeQuery(query, DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION);
}
// stem the search terms and add them to the correct list
_parseQuery: (query) => {
query = Search._normalizeQuery(query);
// stem the search terms and add them to the correct list

const stemmer = new Stemmer();
const searchTerms = new Set();
Expand Down
1 change: 1 addition & 0 deletions tests/js/fixtures/normalization/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions tests/js/roots/normalization/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Enable NFKC Unicode normalization for search indexing so that
# compatibility variants (e.g. full-width characters) are folded to
# their canonical half-width forms in the search index.
html_search_unicode_normalization = 'NFKC'
5 changes: 5 additions & 0 deletions tests/js/roots/normalization/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Sphinx
======

This is the main page of the ``normalization`` test project.

37 changes: 37 additions & 0 deletions tests/js/searchtools.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,44 @@ describe('Basic html theme search', function() {
]];
expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits);
});
});

describe('unicode normalization', function() {
  it('should find documents indexed with half-width characters using a full-width query', function() {
    DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION = 'NFKC';

    eval(loadFixture("normalization/searchindex.js"));

    // Use a genuinely *full-width* query ("Ｓｐｈｉｎｘ"): NFKC must fold
    // it to the half-width "sphinx" stored in the index. With plain
    // ASCII 'Sphinx' this test would pass even without normalization.
    [_searchQuery, searchterms, excluded, ..._remainingItems] = Search._parseQuery('Ｓｐｈｉｎｘ');

    terms = Search._index.terms;
    titleterms = Search._index.titleterms;

    hits = [[
      "index",
      "Sphinx",
      "",
      null,
      15,
      "index.rst",
      "text"],
    ];

    expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits);
  });

  it('should parse queries with half-width and full-width characters equivalently', function() {
    // The two spellings must differ *before* normalization and agree
    // after it — comparing two identical ASCII strings proves nothing.
    const halfWidthQuery = Search._normalizeQuery('Sphinx', 'NFKC');
    const fullWidthQuery = Search._normalizeQuery('Ｓｐｈｉｎｘ', 'NFKC');

    expect(halfWidthQuery).toEqual(fullWidthQuery);
  });

  afterEach(() => {
    // Reset DOCUMENTATION_OPTIONS so the normalization setting does not
    // leak into other test suites in this spec file.
    Object.keys(DOCUMENTATION_OPTIONS).forEach(key => {
      delete DOCUMENTATION_OPTIONS[key];
    });
  });
});

describe('aggregation of search results', function() {
Expand Down
2 changes: 2 additions & 0 deletions tests/roots/test-search/tocitem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ lorem ipsum
模块中 CAS service部分

可以Chinesetesttwo查看

Ｐｙｔｈｏｎ
13 changes: 13 additions & 0 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,3 +484,16 @@ def test_check_js_search_indexes(make_app, sphinx_test_tempdir, directory):
f'Search index fixture {existing_searchindex} does not match regenerated copy.'
)
assert fresh_searchindex.read_bytes() == existing_searchindex.read_bytes(), msg


@pytest.mark.sphinx(
    'html',
    testroot='search',
    confoverrides={'html_search_unicode_normalization': 'NFKC'},
    srcdir='search_normalize',
)
def test_search_index_unicode_normalize(app: SphinxTestApp) -> None:
    """Index terms must be NFKC-normalized before being stored.

    With NFKC enabled, a full-width word in the sources must be folded
    to its ASCII form (normalized, then lower-cased) in the index.
    """
    app.build(force_all=True)
    index = load_searchindex(app.outdir / 'searchindex.js')
    # Assert on the *full-width* spelling: plain lower-casing alone would
    # already remove ASCII 'Python' from the terms even with normalization
    # disabled, so checking 'Python' proves nothing. Only NFKC folds
    # 'ｐｙｔｈｏｎ' into 'python'.
    assert 'ｐｙｔｈｏｎ' not in index['terms']
    assert 'python' in index['terms']
Loading