Skip to content

Add Unicode Normalization for Search Indexing #13384

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ Contributors
* Thomas Lamb -- linkcheck builder
* Thomas Waldmann -- apidoc module fixes
* Tim Hoffmann -- theme improvements
* Tokuhiro Matsuno -- search unicode normalization
* Vince Salvino -- JavaScript search improvements
* Will Maier -- directory HTML builder
* Zac Hatfield-Dodds -- doctest reporting improvements, intersphinx performance
Expand Down
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ Deprecated
Features added
--------------

* #13384: Add Unicode normalization option for search indexing.
This allows users to specify the type of Unicode normalization
(NFC, NFD, NFKC, NFKD) to apply during searches, improving the
accuracy and reliability of search results.
Patch by Tokuhiro Matsuno.

Bugs fixed
----------

Expand Down
20 changes: 20 additions & 0 deletions doc/usage/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2030,6 +2030,26 @@ and also make use of these options.

.. versionadded:: 1.0

.. confval:: html_search_unicode_normalization
:type: :code-py:`str`
:default: :code-py:`"NFKD"`

Specify the type of Unicode normalization applied to terms when the
search index is built (and to queries at search time). It can take one
of the following values:

* **None** -- Disables Unicode normalization.
* **"NFD"** -- Decomposes characters into their canonical decomposed form.
* **"NFC"** -- Composes characters into their canonical composed form.
* **"NFKD"** -- Decomposes characters into their compatibility decomposed form.
* **"NFKC"** -- Composes characters into their compatibility composed form.

This setting ensures that text is consistently normalized, improving the
accuracy and reliability of search results by handling different Unicode
representations of the same characters.

.. versionadded:: 8.3

.. confval:: html_search_language
:type: :code-py:`str`
:default: The value of **language**
Expand Down
5 changes: 5 additions & 0 deletions sphinx/builders/html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
lang,
self.config.html_search_options,
self.config.html_search_scorer,
self.config.html_search_unicode_normalization,
)
self.load_indexer(docnames)

Expand Down Expand Up @@ -544,6 +545,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
'has_source': self.config.html_copy_source,
'show_source': self.config.html_show_sourcelink,
'sourcelink_suffix': self.config.html_sourcelink_suffix,
'search_unicode_normalization': self.config.html_search_unicode_normalization,
'file_suffix': self.out_suffix,
'link_suffix': self.link_suffix,
'script_files': self._js_files,
Expand Down Expand Up @@ -1490,6 +1492,9 @@ def setup(app: Sphinx) -> ExtensionMetadata:
app.add_config_value(
'html_show_search_summary', True, 'html', types=frozenset({bool})
)
app.add_config_value(
'html_search_unicode_normalization', 'NFKD', 'html', types=frozenset({str})
)
app.add_config_value('html_show_sphinx', True, 'html', types=frozenset({bool}))
app.add_config_value('html_context', {}, 'html', types=frozenset({dict}))
app.add_config_value(
Expand Down
44 changes: 35 additions & 9 deletions sphinx/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os
import pickle
import re
import unicodedata
from importlib import import_module
from typing import TYPE_CHECKING

Expand All @@ -21,7 +22,7 @@

if TYPE_CHECKING:
from collections.abc import Callable, Iterable
from typing import Any, Protocol, TypeVar
from typing import Any, Literal, Protocol, TypeVar

from docutils.nodes import Node

Expand Down Expand Up @@ -275,7 +276,12 @@ class IndexBuilder:
}

def __init__(
self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str
self,
env: BuildEnvironment,
lang: str,
options: dict[str, str],
scoring: str,
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None = None,
) -> None:
self._domains = env.domains
self._env_version = env.version
Expand All @@ -301,6 +307,7 @@ def __init__(
self._objnames: dict[int, tuple[str, str, str]] = env._search_index_objnames
# add language-specific SearchLanguage instance
lang_class = languages.get(lang)
self._unicode_normalization = normalization

# fallback; try again with language-code
if lang_class is None and '_' in lang:
Expand Down Expand Up @@ -552,7 +559,11 @@ def _word_collector(self, doctree: nodes.document) -> WordStore:
split = self.lang.split
language = self.lang.lang
_feed_visit_nodes(
doctree, word_store=word_store, split=split, language=language
doctree,
word_store=word_store,
split=split,
language=language,
normalization=self._unicode_normalization,
)
return word_store

Expand Down Expand Up @@ -602,7 +613,14 @@ def _feed_visit_nodes(
word_store: WordStore,
split: Callable[[str], list[str]],
language: str,
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None,
) -> None:
def normalize(text: str) -> str:
    """Apply the configured Unicode normalization form to *text*.

    When *normalization* is ``None`` (normalization disabled), the
    text is returned unchanged.
    """
    return unicodedata.normalize(normalization, text) if normalization else text

if isinstance(node, nodes.comment):
return
elif isinstance(node, nodes.Element) and 'no-search' in node['classes']:
Expand All @@ -626,18 +644,26 @@ def _feed_visit_nodes(
flags=re.IGNORECASE | re.DOTALL,
)
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
word_store.words.extend(split(nodetext))
word_store.words.extend(split(normalize(nodetext)))
return
elif isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
keywords = [keyword.strip() for keyword in node['content'].split(',')]
keywords = [
normalize(keyword.strip()) for keyword in node['content'].split(',')
]
word_store.words.extend(keywords)
elif isinstance(node, nodes.Text):
word_store.words.extend(split(node.astext()))
word_store.words.extend(split(normalize(node.astext())))
elif isinstance(node, nodes.title):
title, is_main_title = node.astext(), len(word_store.titles) == 0
ids = node.parent['ids']
title_node_id = None if is_main_title else ids[0] if ids else None
word_store.titles.append((title, title_node_id))
word_store.title_words.extend(split(title))
word_store.titles.append((normalize(title), title_node_id))
word_store.title_words.extend(split(normalize(title)))
for child in node.children:
_feed_visit_nodes(child, word_store=word_store, split=split, language=language)
_feed_visit_nodes(
child,
word_store=word_store,
split=split,
language=language,
normalization=normalization,
)
1 change: 1 addition & 0 deletions sphinx/themes/basic/static/documentation_options.js.jinja
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ const DOCUMENTATION_OPTIONS = {
NAVIGATION_WITH_KEYS: {{ 'true' if theme_navigation_with_keys|tobool else 'false'}},
SHOW_SEARCH_SUMMARY: {{ 'true' if show_search_summary else 'false' }},
ENABLE_SEARCH_SHORTCUTS: {{ 'true' if theme_enable_search_shortcuts|tobool else 'false'}},
SEARCH_UNICODE_NORMALIZATION: {{ '"' + search_unicode_normalization + '"' if search_unicode_normalization else 'null' }},
};
8 changes: 8 additions & 0 deletions sphinx/themes/basic/static/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,15 @@ const Search = {
else Search.deferQuery(query);
},

_normalizeQuery: (query, form) => {
return query.normalize(form);
},
Comment on lines +279 to +281
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_normalizeQuery: (query, form) => {
return query.normalize(form);
},
_normalizeQuery: (query) => {
const form = DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION;
if (!form) return query;
return query.normalize(form);
},


_parseQuery: (query) => {
if (DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION) {
query = Search._normalizeQuery(query, DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION);
}

// stem the search terms and add them to the correct list
Comment on lines 283 to 288
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
_parseQuery: (query) => {
if (DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION) {
query = Search._normalizeQuery(query, DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION);
}
// stem the search terms and add them to the correct list
_parseQuery: (query) => {
query = Search._normalizeQuery(query);
// stem the search terms and add them to the correct list

const stemmer = new Stemmer();
const searchTerms = new Set();
Expand Down
1 change: 1 addition & 0 deletions tests/js/fixtures/normalization/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions tests/js/roots/normalization/conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Enable NFKC Unicode normalization for search indexing so that
# compatibility variants (e.g. full-width characters) are folded to
# their canonical half-width forms in the search index.
html_search_unicode_normalization = 'NFKC'
5 changes: 5 additions & 0 deletions tests/js/roots/normalization/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Sphinx
======

This is the main page of the ``normalization`` test project.

37 changes: 37 additions & 0 deletions tests/js/searchtools.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,44 @@ describe('Basic html theme search', function() {
]];
expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits);
});
});

describe('unicode normalization', function() {
  it('should find documents indexed with half-width characters using a full-width query', function() {
    DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION = 'NFKC';

    eval(loadFixture("normalization/searchindex.js"));

    // Use a genuinely *full-width* query ("Ｓｐｈｉｎｘ"): NFKC must fold
    // it to the half-width "sphinx" stored in the index. With plain
    // ASCII 'Sphinx' this test would pass even without normalization.
    [_searchQuery, searchterms, excluded, ..._remainingItems] = Search._parseQuery('Ｓｐｈｉｎｘ');

    terms = Search._index.terms;
    titleterms = Search._index.titleterms;

    hits = [[
      "index",
      "Sphinx",
      "",
      null,
      15,
      "index.rst",
      "text"],
    ];

    expect(Search.performTermsSearch(searchterms, excluded, terms, titleterms)).toEqual(hits);
  });

  it('should parse queries with half-width and full-width characters equivalently', function() {
    // The two spellings must differ *before* normalization and agree
    // after it — comparing two identical ASCII strings proves nothing.
    const halfWidthQuery = Search._normalizeQuery('Sphinx', 'NFKC');
    const fullWidthQuery = Search._normalizeQuery('Ｓｐｈｉｎｘ', 'NFKC');

    expect(halfWidthQuery).toEqual(fullWidthQuery);
  });

  afterEach(() => {
    // Reset DOCUMENTATION_OPTIONS so the normalization setting does not
    // leak into other test suites in this spec file.
    Object.keys(DOCUMENTATION_OPTIONS).forEach(key => {
      delete DOCUMENTATION_OPTIONS[key];
    });
  });
});

describe('aggregation of search results', function() {
Expand Down
2 changes: 2 additions & 0 deletions tests/roots/test-search/tocitem.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ lorem ipsum
模块中 CAS service部分

可以Chinesetesttwo查看

Ｐｙｔｈｏｎ
13 changes: 13 additions & 0 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,3 +484,16 @@ def test_check_js_search_indexes(make_app, sphinx_test_tempdir, directory):
f'Search index fixture {existing_searchindex} does not match regenerated copy.'
)
assert fresh_searchindex.read_bytes() == existing_searchindex.read_bytes(), msg


@pytest.mark.sphinx(
    'html',
    testroot='search',
    confoverrides={'html_search_unicode_normalization': 'NFKC'},
    srcdir='search_normalize',
)
def test_search_index_unicode_normalize(app: SphinxTestApp) -> None:
    """Index terms must be NFKC-normalized before being stored.

    With NFKC enabled, a full-width word in the sources must be folded
    to its ASCII form (normalized, then lower-cased) in the index.
    """
    app.build(force_all=True)
    index = load_searchindex(app.outdir / 'searchindex.js')
    # Assert on the *full-width* spelling: plain lower-casing alone would
    # already remove ASCII 'Python' from the terms even with normalization
    # disabled, so checking 'Python' proves nothing. Only NFKC folds
    # 'ｐｙｔｈｏｎ' into 'python'.
    assert 'ｐｙｔｈｏｎ' not in index['terms']
    assert 'python' in index['terms']
Loading