Skip to content

Commit b88f870

Browse files
authored
fix jieba_query error (#187)
* fix jieba_query issue #176 * add tests * update github action
1 parent 7ae9998 commit b88f870

File tree

5 files changed

+18
-4
lines changed

5 files changed

+18
-4
lines changed

.github/workflows/main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ jobs:
148148

149149
- uses: actions/checkout@v4
150150
with:
151+
fetch-depth: 0
151152
submodules: true
152153

153154
- name: Update apt-get

build-and-run

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ simple.example() {
7676
run "cd output/bin/"
7777
run "cat ${ProjectRoot}/example.sql ${ProjectRoot}/example-jieba.sql | ./sqlite3"
7878
run "./simple_cpp_example"
79-
run "cd ${ProjectRoot}"
80-
run "python3 examples/python3/db_connector.py './output/bin/libsimple'"
79+
# run "cd ${ProjectRoot}"
80+
# run "python3 examples/python3/db_connector.py './output/bin/libsimple'"
8181
}
8282

8383
main() {

build-and-run-no-jieba

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ simple.example() {
7676
run "cd output-no-jieba/bin/"
7777
run "./sqlite3 < ${ProjectRoot}/example.sql"
7878
run "./simple_cpp_example"
79-
run "cd ${ProjectRoot}"
80-
run "python3 examples/python3/db_connector.py"
79+
# run "cd ${ProjectRoot}"
80+
# run "python3 examples/python3/db_connector.py"
8181
}
8282

8383
main() {

src/simple_tokenizer.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,16 @@ std::string SimpleTokenizer::tokenize_jieba_query(const char *text, int textLen,
7373
std::vector<cppjieba::Word> words;
7474
jieba.Cut(text, words);
7575
for (auto word : words) {
76+
// if all chars are of the same category, then use that category
77+
// otherwise use OTHER
78+
// fix https://github.com/wangfenjin/simple/issues/176
7679
TokenCategory category = from_char(text[word.offset]);
80+
for (auto c : word.word) {
81+
if (from_char(c) != category) {
82+
category = TokenCategory::OTHER;
83+
break;
84+
}
85+
}
7786
append_result(result, word.word, category, word.offset, flags);
7887
}
7988
return result;

test/tokenizer_test.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ TEST(simple, tokenizer_with_pinyin) {
1818
query.push_back(R"VAGON(( z+h+o+u* OR zhou* ) AND "杰" AND "伦")VAGON");
1919
arr.push_back("杰伦 zhou 123");
2020
query.push_back(R"VAGON("杰" AND "伦" AND ( z+h+o+u* OR zhou* ) AND "123"*)VAGON");
21+
arr.push_back("c#");
22+
query.push_back(R"VAGON(c* AND "#")VAGON");
2123
for (int i = 0; i < arr.size(); i++) {
2224
std::string s = arr[i];
2325
std::cout << s << " as doc:\n";
@@ -66,6 +68,8 @@ TEST(simple, jieba_tokenizer_with_pinyin) {
6668
query.push_back(R"VAGON(( z+h+o+u* OR zhou* ) AND "杰伦")VAGON");
6769
arr.push_back("杰伦 zhou 123");
6870
query.push_back(R"VAGON("杰伦" AND ( z+h+o+u* OR zhou* ) AND "123"*)VAGON");
71+
arr.push_back("c#");
72+
query.push_back(R"VAGON("c#")VAGON");
6973
for (int i = 0; i < arr.size(); i++) {
7074
std::string s = arr[i];
7175
std::cout << s << " as doc:\n";

0 commit comments

Comments (0)