Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions src/module-elasticsuite-core/Helper/Text.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php
/**
* DISCLAIMER
*
* Do not edit or add to this file if you wish to upgrade Smile ElasticSuite to newer
* versions in the future.
*
* @category Smile
* @package Smile\ElasticsuiteCore
* @author Pierre Gauthier <pigau@smile.fr>
* @copyright 2025 Smile
* @license Open Software License ("OSL") v. 3.0
*/

namespace Smile\ElasticsuiteCore\Helper;

/**
 * Multi-byte (UTF-8) aware text manipulation helper.
 *
 * @category Smile
 * @package  Smile\ElasticsuiteCore
 * @author   Pierre Gauthier <pigau@smile.fr>
 */
class Text
{
    /**
     * Partial implementation of a multi-byte aware version of substr_replace.
     * Required because the token offsets passed as the start and length parameters
     * are expressed as a number of (UTF-8) characters, independently of the number of bytes.
     * Does not accept arrays as first and second parameters.
     * Source: https://github.com/fluxbb/utf8/blob/master/functions/substr_replace.php
     * Alternative: https://gist.github.com/bantya/563d7d070c286ba1b5a83b9036f0561a
     *
     * If either the input or the replacement is not valid UTF-8, no replacement can be
     * performed safely and the input string is returned unchanged.
     *
     * @param string $string      Input string
     * @param string $replacement Replacement string
     * @param mixed  $start       Start offset, in characters
     * @param mixed  $length      Number of characters to replace; any non-integer value
     *                            (including the default null) means "up to the end of the string"
     *
     * @return mixed The string with the requested character range replaced
     */
    public function mbSubstrReplace($string, $replacement, $start, $length = null)
    {
        // With the "u" modifier "." matches one whole UTF-8 character ("s" makes it match newlines too),
        // so each capture is one character regardless of how many bytes encode it.
        $stringSplit      = preg_match_all('/./us', $string, $stringChars);
        $replacementSplit = preg_match_all('/./us', $replacement, $replacementChars);

        if (false === $stringSplit || false === $replacementSplit) {
            // preg_match_all() fails on malformed UTF-8; splicing the (empty) result would
            // silently drop the original text, so leave the input untouched instead.
            return $string;
        }

        if (!is_int($length)) {
            $length = count($stringChars[0]);
        }

        array_splice($stringChars[0], $start, $length, $replacementChars[0]);

        return implode($stringChars[0]);
    }

    /**
     * Count the number of words in a given text.
     *
     * A word is a run of Unicode letters, digits and apostrophes. Combining marks
     * (accents in decomposed form, vowel signs of Indic scripts, ...) are kept inside
     * the word they decorate instead of splitting it into several words.
     * Text that is not valid UTF-8 yields 0.
     *
     * @param string $text The input text.
     *
     * @return int
     */
    public function mbWordCount(string $text): int
    {
        // A word starts with a letter, digit or apostrophe; \p{M} is only accepted after that
        // first character so that texts without combining marks are counted exactly as before.
        // preg_match_all() returns the number of full matches, or false on malformed UTF-8.
        return (int) preg_match_all('/[\p{L}\p{N}\'][\p{L}\p{N}\p{M}\']*/u', $text);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,16 @@

namespace Smile\ElasticsuiteCore\Search\Request\Query\Fulltext;

use Smile\ElasticsuiteCore\Model\Search\Request\RelevanceConfig\Reader\Container;
use Smile\ElasticsuiteCore\Search\Request\Query\SpanQueryInterface;
use Smile\ElasticsuiteCore\Search\Request\QueryInterface;
use Smile\ElasticsuiteCore\Api\Index\MappingInterface;
use Smile\ElasticsuiteCore\Api\Index\Mapping\FieldFilterInterface;
use Smile\ElasticsuiteCore\Api\Index\Mapping\FieldInterface;
use Smile\ElasticsuiteCore\Search\Request\Query\QueryFactory;
use Smile\ElasticsuiteCore\Api\Index\MappingInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\Container\RelevanceConfiguration\FuzzinessConfigurationInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\ContainerConfigurationInterface;
use Smile\ElasticsuiteCore\Api\Search\SpellcheckerInterface;
use Smile\ElasticsuiteCore\Api\Index\Mapping\FieldFilterInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\Container\RelevanceConfiguration\FuzzinessConfigurationInterface;
use Smile\ElasticsuiteCore\Helper\Text;
use Smile\ElasticsuiteCore\Search\Request\Query\QueryFactory;
use Smile\ElasticsuiteCore\Search\Request\Query\SpanQueryInterface;
use Smile\ElasticsuiteCore\Search\Request\QueryInterface;

/**
* Prepare a fulltext search query.
Expand All @@ -39,6 +39,11 @@ class QueryBuilder
*/
private $queryFactory;

/**
* @var Text
*/
private $textHelper;

/**
*
* @var FieldFilterInterface[]
Expand All @@ -49,11 +54,16 @@ class QueryBuilder
* Constructor.
*
* @param QueryFactory $queryFactory Query factory (used to build subqueries).
* @param Text $textHelper Text helper (multibyte-aware word counting).
* @param FieldFilterInterface[] $fieldFilters Field filters models.
*/
public function __construct(QueryFactory $queryFactory, array $fieldFilters = [])
{
public function __construct(
QueryFactory $queryFactory,
Text $textHelper,
array $fieldFilters = []
) {
$this->queryFactory = $queryFactory;
$this->textHelper = $textHelper;
$this->fieldFilters = $fieldFilters;
}

Expand Down Expand Up @@ -176,7 +186,7 @@ private function getWeightedSearchQuery(ContainerConfigurationInterface $contain
$phraseAnalyzer = FieldInterface::ANALYZER_WHITESPACE;
$sortableMatchBoost = 2 * $phraseMatchBoost;

if (is_string($queryText) && str_word_count($queryText) > 1) {
if (is_string($queryText) && $this->textHelper->mbWordCount($queryText) > 1) {
$phraseAnalyzer = FieldInterface::ANALYZER_SHINGLE;
} elseif ($relevanceConfig->areExactMatchSingleTermBoostsCustomized()) {
$phraseMatchBoost = $relevanceConfig->getExactMatchSingleTermPhraseMatchBoost();
Expand Down Expand Up @@ -214,7 +224,7 @@ private function getPureStopwordsQuery(ContainerConfigurationInterface $containe
$relevanceConfig = $containerConfig->getRelevanceConfig();

$analyzer = FieldInterface::ANALYZER_WHITESPACE;
if (is_string($queryText) && str_word_count($queryText) > 1) {
if (is_string($queryText) && $this->textHelper->mbWordCount($queryText) > 1) {
$analyzer = FieldInterface::ANALYZER_SHINGLE;
}

Expand Down Expand Up @@ -289,7 +299,7 @@ private function getFuzzyQuery(ContainerConfigurationInterface $containerConfig,

$standardAnalyzer = FieldInterface::ANALYZER_WHITESPACE;
$phraseAnalyzer = FieldInterface::ANALYZER_WHITESPACE;
if (is_string($queryText) && str_word_count($queryText) > 1) {
if (is_string($queryText) && $this->textHelper->mbWordCount($queryText) > 1) {
$phraseAnalyzer = FieldInterface::ANALYZER_SHINGLE;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
*/
namespace Smile\ElasticsuiteCore\Test\Unit\Search\Request\Query\Fulltext;

use PHPUnit\Framework\MockObject\MockObject;
use Smile\ElasticsuiteCore\Api\Search\Request\Container\RelevanceConfiguration\FuzzinessConfigurationInterface;
use Smile\ElasticsuiteCore\Search\Request\Query\Fulltext\QueryBuilder;
use Smile\ElasticsuiteCore\Index\Mapping\Field;
Expand All @@ -21,6 +22,7 @@
use Smile\ElasticsuiteCore\Api\Index\Mapping\FieldFilterInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\Container\RelevanceConfigurationInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\ContainerConfigurationInterface;
use Smile\ElasticsuiteCore\Helper\Text;
use Magento\Framework\ObjectManagerInterface;
use Smile\ElasticsuiteCore\Search\Request\Query\QueryFactory;
use Smile\ElasticsuiteCore\Index\Mapping;
Expand Down Expand Up @@ -148,8 +150,9 @@ private function runTestQueryBuilder($searchTerms, $spellingType, $expectedQuery
$queryFactory = $this->getQueryFactory($this->mockedQueryTypes);
$fieldFilters = $this->getFieldFilters();
$containerConfig = $this->getContainerConfigMock($this->fields);
$textHelper = $this->getTextHelperMock();

$builder = new QueryBuilder($queryFactory, $fieldFilters);
$builder = new QueryBuilder($queryFactory, $textHelper, $fieldFilters);

$query = $builder->create($containerConfig, $searchTerms, $spellingType);

Expand Down Expand Up @@ -202,6 +205,18 @@ private function getContainerConfigMock($fields)
return $config;
}

/**
 * Get the Elasticsuite text helper handed to the query builder.
 *
 * A real helper instance is returned instead of a PHPUnit test double: the helper has no
 * dependencies, and an unconfigured mock would make mbWordCount() return 0 for every input,
 * so the builder's "more than one word" branches could never be taken by the tests.
 *
 * @return Text
 */
private function getTextHelperMock()
{
    return new Text();
}

/**
* Mock the relevance configuration object used by the query builder.
*
Expand Down
40 changes: 12 additions & 28 deletions src/module-elasticsuite-thesaurus/Model/Index.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
use Smile\ElasticsuiteCore\Helper\IndexSettings as IndexSettingsHelper;
use Smile\ElasticsuiteCore\Api\Client\ClientInterface;
use Smile\ElasticsuiteCore\Api\Search\Request\ContainerConfigurationInterface;
use Smile\ElasticsuiteCore\Helper\Text;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfigFactory;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusConfig;
use Smile\ElasticsuiteThesaurus\Config\ThesaurusCacheConfig;
Expand Down Expand Up @@ -67,6 +68,11 @@ class Index
*/
private $cacheHelper;

/**
* @var Text
*/
private $textHelper;

/**
* @var ThesaurusConfig
*/
Expand All @@ -78,20 +84,23 @@ class Index
* @param ClientInterface $client ES client.
* @param IndexSettingsHelper $indexSettingsHelper Index Settings Helper.
* @param CacheHelper $cacheHelper ES caching helper.
* @param Text $textHelper Text helper (multibyte-aware word counting and substring replacement).
* @param ThesaurusConfigFactory $thesaurusConfigFactory Thesaurus configuration factory.
* @param ThesaurusCacheConfig $thesaurusCacheConfig Thesaurus cache configuration helper.
*/
public function __construct(
    ClientInterface $client,
    IndexSettingsHelper $indexSettingsHelper,
    CacheHelper $cacheHelper,
    Text $textHelper,
    ThesaurusConfigFactory $thesaurusConfigFactory,
    ThesaurusCacheConfig $thesaurusCacheConfig
) {
    // Keep each collaborator on the instance, in the same order as the parameters.
    $this->client                 = $client;
    $this->indexSettingsHelper    = $indexSettingsHelper;
    $this->cacheHelper            = $cacheHelper;
    $this->textHelper             = $textHelper;
    $this->thesaurusConfigFactory = $thesaurusConfigFactory;
    $this->thesaurusCacheConfig   = $thesaurusCacheConfig;
}

Expand Down Expand Up @@ -275,7 +284,7 @@ private function getSynonymRewrites($storeId, $queryText, $type, $maxRewrites)
*/
private function getQueryCombinations($storeId, $queryText)
{
if (str_word_count($queryText) < 2) {
if ($this->textHelper->mbWordCount($queryText) < 2) {
return [$queryText]; // No need to compute variations of shingles with a one-word-query.
}

Expand All @@ -297,7 +306,7 @@ private function getQueryCombinations($storeId, $queryText)
foreach ($analysis['tokens'] ?? [] as $token) {
$startOffset = $token['start_offset'];
$length = $token['end_offset'] - $token['start_offset'];
$rewrittenQueryText = $this->mbSubstrReplace($queryText, $token['token'], $startOffset, $length);
$rewrittenQueryText = $this->textHelper->mbSubstrReplace($queryText, $token['token'], $startOffset, $length);
$queries[] = $rewrittenQueryText;
}
$queries = array_unique($queries);
Expand Down Expand Up @@ -327,7 +336,7 @@ private function combineSynonyms($queryText, $synonymByPositions, $maxRewrites,
foreach ($currentPositionSynonyms as $synonym) {
$startOffset = $synonym['start_offset'] + $offset;
$length = $synonym['end_offset'] - $synonym['start_offset'];
$rewrittenQueryText = $this->mbSubstrReplace($queryText, $synonym['token'], $startOffset, $length);
$rewrittenQueryText = $this->textHelper->mbSubstrReplace($queryText, $synonym['token'], $startOffset, $length);
$newOffset = mb_strlen($rewrittenQueryText) - mb_strlen($queryText) + $offset;
$combinations[$rewrittenQueryText] = $substitutions + 1;

Expand Down Expand Up @@ -367,29 +376,4 @@ private function getWeightedRewrites($queryRewrites, $divider, $baseWeight = 1)

return array_map($mapper, $queryRewrites);
}

/**
 * Partial implementation of a multi-byte aware version of substr_replace.
 * Required because the token offsets passed as the start and length parameters
 * are expressed as a number of (UTF-8) characters, independently of the number of bytes.
 * Does not accept arrays as first and second parameters.
 * Source: https://github.com/fluxbb/utf8/blob/master/functions/substr_replace.php
 * Alternative: https://gist.github.com/bantya/563d7d070c286ba1b5a83b9036f0561a
 *
 * If either the input or the replacement is not valid UTF-8, no replacement can be
 * performed safely and the input string is returned unchanged.
 *
 * @param string $string      Input string
 * @param string $replacement Replacement string
 * @param mixed  $start       Start offset, in characters
 * @param mixed  $length      Number of characters to replace; any non-integer value
 *                            (including the default null) means "up to the end of the string"
 *
 * @return mixed The string with the requested character range replaced
 */
private function mbSubstrReplace($string, $replacement, $start, $length = null)
{
    // With the "u" modifier "." matches one whole UTF-8 character ("s" makes it match newlines too).
    $stringSplit      = preg_match_all('/./us', $string, $stringChars);
    $replacementSplit = preg_match_all('/./us', $replacement, $replacementChars);

    if (false === $stringSplit || false === $replacementSplit) {
        // preg_match_all() fails on malformed UTF-8; splicing the (empty) result would
        // silently drop the original text, so leave the input untouched instead.
        return $string;
    }

    if (!is_int($length)) {
        $length = count($stringChars[0]);
    }

    array_splice($stringChars[0], $start, $length, $replacementChars[0]);

    return implode($stringChars[0]);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,17 @@ class FulltextQueryBuilderInterceptor extends \Smile\ElasticsuiteCore\Search\Req
* Constructor
*
* @param \Smile\ElasticsuiteCore\Search\Request\Query\QueryFactory $queryFactory Query factory
* @param \Smile\ElasticsuiteCore\Helper\Text $textHelper Text helper
* @param array $fieldFilters Field filters
*/
public function __construct(\Smile\ElasticsuiteCore\Search\Request\Query\QueryFactory $queryFactory, array $fieldFilters = [])
{
public function __construct(
\Smile\ElasticsuiteCore\Search\Request\Query\QueryFactory $queryFactory,
\Smile\ElasticsuiteCore\Helper\Text $textHelper,
array $fieldFilters = []
) {
$this->___init();

parent::__construct($queryFactory, $fieldFilters);
parent::__construct($queryFactory, $textHelper, $fieldFilters);
}

/**
Expand Down
Loading