Skip to content

Commit 5de0ba0

Browse files
committed
support jieba_query
Change-Id: Ie657171a2b69490dfc437113ec6fe20af7b76ffd
1 parent c04304f commit 5de0ba0

15 files changed

+306
-53
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ libsimple.*
66
build/
77
*.gch
88
bin/
9-
output/
9+
output/
10+
output-no-jieba/

CMakeLists.txt

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ cmrc_add_resource_library(PINYIN_TEXT NAMESPACE pinyin_text contrib/pinyin.txt)
2727
# https://github.com/vector-of-bool/cmrc/issues/17#issuecomment-659501280
2828
set_property(TARGET PINYIN_TEXT PROPERTY POSITION_INDEPENDENT_CODE ON)
2929

30-
3130
# Code Coverage Configuration
3231
if(NOT TARGET coverage_config)
3332
add_library(coverage_config INTERFACE)
@@ -49,7 +48,14 @@ if(CODE_COVERAGE)
4948
endif(CODE_COVERAGE)
5049
# endif(CODE_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
5150

51+
# https://stackoverflow.com/a/15212881/1203241
52+
# Build switch for the cppjieba integration (enabled by default).
# When it is ON, every target configured after this point is compiled with
# USE_JIEBA=1, which the sources test with `#ifdef USE_JIEBA`.
option(SIMPLE_WITH_JIEBA "Option to build with cppjieba" ON)
if(SIMPLE_WITH_JIEBA)
  add_definitions(-DUSE_JIEBA=1)
endif()
56+
5257
add_subdirectory(src)
58+
5359
add_subdirectory(examples/cpp)
5460
enable_testing()
5561
add_subdirectory(test)

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ simple 是一个支持中文和拼音的 [sqlite3 fts5](https://www.sqlite.org/f
88

99
实现相关介绍:https://www.wangfenjin.com/posts/simple-tokenizer/
1010

11+
在此基础上,我们还支持通过 [cppjieba](https://github.com/yanyiwu/cppjieba) 实现更精准的词组匹配。
12+
1113
## 用法
1214

1315
首先需要确认你用到的 sqlite 版本支持 fts5 拓展,确认方法是:
@@ -23,6 +25,7 @@ select fts5(?1);
2325
3. simple_highlight() 实现连续高亮 match 的词汇,与 sqlite 自带的 highlight 类似,但是 simple_highlight 实现了连续 match 的词汇分到同一组的逻辑,理论上用户更需要这样
2426
4. simple_highlight_pos() 实现返回 match 的词汇位置,用户可以自行决定怎么使用
2527
5. simple_snippet() 实现截取 match 片段的功能,与 sqlite 自带的 snippet 功能类似,同样是增强连续 match 的词汇分到同一组的逻辑
28+
6. jieba_query() 实现jieba分词的效果,在索引不变的情况下,可以实现更精准的匹配。
2629

2730
## 开发
2831

build-and-run

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ simple.clean() {
5151
simple.build() {
5252
hl.subtle "build..."
5353
run "cd build/run"
54+
find . -name "*.gcda" -print0 | xargs -0 rm
5455
run "cmake -DCODE_COVERAGE=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_INSTALL_PREFIX=${ProjectRoot}/output ../.."
5556
run.set-next show-output-on
5657
run "make -j 12"
@@ -73,7 +74,7 @@ simple.example() {
7374
}
7475
hl.subtle "run example..."
7576
run "cd output/bin/"
76-
run "./sqlite3 < ${ProjectRoot}/example.sql"
77+
run "cat ${ProjectRoot}/example.sql ${ProjectRoot}/example-jieba.sql | ./sqlite3"
7778
run "./simple_cpp_example"
7879
run "cd ${ProjectRoot}"
7980

build-and-run-no-jieba

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/usr/bin/env bash
2+
#
3+
# © 2018-2019 Konstantin Gredeskoul, All Rights Reserved.
4+
# MIT License
5+
#
6+
# WARNING: This BASH script is completely optional. You don't need it to build this project.
7+
#
8+
# If you choose to run this script to build the project, run:
9+
#
10+
# $ ./build-and-run-no-jieba
11+
#
12+
# It will create the build folder, build, run the tests and then run the example.
13+
#
14+
15+
[[ -z $(which git) ]] && {
16+
echo "You need git installed. Please run 'xcode-select --install' first."
17+
exit 1
18+
}
19+
20+
export BashMatic="${HOME}/.bashmatic"
21+
[[ ! -f "${BashMatic}/init.sh" ]] && {
22+
bash -c "$(curl -fsSL https://bashmatic.re1.re); bashmatic-install"
23+
}
24+
source "${BashMatic}/init.sh"
25+
26+
export ProjectRoot=$(pwd)
27+
export BuildDir="${ProjectRoot}/build/run"
28+
export BashLibRoot="${ProjectRoot}/bin/lib-bash"
29+
export LibBashRepo="https://github.com/kigster/lib-bash"
30+
31+
# Print a banner with the project title and the versions of the tools
# (gcc, git, cmake) that the build is about to use.
simple.header() {
  h1.purple "Simple Tokenizer no jieba"
  # Split the multi-line `gcc --version` output into an array, one line per
  # element. The IFS override is a prefix assignment, so it only applies to
  # this `read` and the shell's IFS needs no save/restore afterwards.
  IFS="|" read -r -a gcc_info <<< "$(gcc --version 2>&1 | tr '\n' '|')"
  # Elements 1-4 are printed; element 0 (the first output line) is skipped.
  h1 "${bldylw}GCC" "${gcc_info[1]}" "${gcc_info[2]}" "${gcc_info[3]}" "${gcc_info[4]}"
  h1 "${bldylw}GIT: ${bldblu}$(git --version)"
  h1 "${bldylw}CMAKE: ${bldblu}$(cmake --version | tr '\n' ' ')"
}
40+
41+
# Create the out-of-tree build directory used by the no-jieba configuration.
# `mkdir -p` makes this safe to call when the directory already exists.
function simple.setup {
  hl.subtle "Creating Build Folder..."
  run "mkdir -p build/run-no-jieba"
}
45+
46+
# Empty the generated folders (binaries, headers, libraries, build trees).
# Not invoked by main(); it has to be called explicitly.
function simple.clean {
  hl.subtle "Cleaning output folders..."
  run 'rm -rf bin-no-jieba/* include/* lib/* build/*'
}
50+
51+
# Configure, compile and install the project with cppjieba disabled.
# Output is installed under ${ProjectRoot}/output-no-jieba.
simple.build() {
  hl.subtle "build..."
  run "cd build/run-no-jieba"
  # Remove coverage counters left over from a previous build.
  # `-exec … +` does nothing when no file matches, whereas `xargs rm` may
  # invoke `rm` without operands and fail on a fresh build directory.
  find . -name "*.gcda" -exec rm -f {} +
  run "cmake -DCODE_COVERAGE=ON -DSIMPLE_WITH_JIEBA=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_INSTALL_PREFIX=${ProjectRoot}/output-no-jieba ../.."
  run.set-next show-output-on
  run "make -j 12"
  # Hide the gmock/gtest install lines; `grep -E` replaces the obsolescent `egrep`.
  run "make install | grep -E -v 'gmock|gtest'"
  run "cd ${ProjectRoot}"
}
61+
62+
# Run the CTest suite verbosely from the no-jieba build tree, then return
# to the project root.
function simple.tests {
  hl.subtle "testing..."
  run.set-all show-output-on
  run "cd build/run-no-jieba"
  run "ctest . -V"
  run "cd ${ProjectRoot}"
}
69+
70+
# Run the SQL example and the C++ example against the installed no-jieba
# binaries. Exits with status 3 when the sqlite3 binary has not been
# installed yet (i.e. simple.build has not run successfully).
simple.example() {
  [[ ! -f ./output-no-jieba/bin/sqlite3 ]] && {
    error "You don't have the compiled sqlite3 binary yet."
    exit 3
  }
  hl.subtle "run example..."
  run "cd output-no-jieba/bin/"
  # Only example.sql is fed in here; example-jieba.sql needs jieba_query().
  run "./sqlite3 < ${ProjectRoot}/example.sql"
  run "./simple_cpp_example"
  run "cd ${ProjectRoot}"

}
82+
83+
# Entry point: run every stage in order — banner, build-folder setup,
# build, tests, example.
main() {
  local stage
  for stage in header setup build tests example; do
    "simple.${stage}"
  done
}
90+
91+
(( $_s_ )) || main

example-jieba.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
select '使用jieba分词:';
2+
-- will match
3+
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match simple_query('国中woai');
4+
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match jieba_query('中国woai');
5+
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match jieba_query('中国woai', 0);
6+
-- will not match: with jieba_query the character order matters
7+
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match jieba_query('国中woai');

example.sql

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ select '搜索 love zg:';
4949
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match simple_query('love zg');
5050
select ' ', simple_highlight_pos(t1, 0) from t1 where x match simple_query('love zg');
5151

52-
5352
select '';
5453
select '';
5554
select '--------------------------------------------------------------------------------';

examples/cpp/main.cc

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ int main() {
7474
"simple_query('@\"._''-&%')";
7575
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
7676
handle_rc(db, rc);
77+
#ifdef USE_JIEBA
78+
// case 4: jieba, no match
79+
sql = "select simple_highlight(t1, 0, '[', ']') as no_matched_jieba from t1 where x match jieba_query('国中')";
80+
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
81+
handle_rc(db, rc);
82+
// case 5: jieba, match
83+
sql = "select simple_highlight(t1, 0, '[', ']') as matched_jieba from t1 where x match jieba_query('中国')";
84+
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
85+
handle_rc(db, rc);
86+
#endif
7787

7888
// Close the connection
7989
sqlite3_close(db);

src/CMakeLists.txt

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
11
cmake_minimum_required(VERSION 3.2)
22
project(simple CXX)
33

4+
if(SIMPLE_WITH_JIEBA)
  # Fetch cppjieba at build time. Only the checkout is needed: configure is
  # disabled and the build/install steps merely echo a message.
  # NOTE(review): no GIT_TAG is given, so the repository's default branch head
  # is fetched; consider pinning a tag or commit for reproducible builds.
  include(ExternalProject)
  ExternalProject_Add(
    cppjieba
    PREFIX ${CMAKE_BINARY_DIR}/cppjieba
    GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git
    CONFIGURE_COMMAND ""
    # Use the running CMake binary instead of whatever `cmake` is on PATH.
    BUILD_COMMAND ${CMAKE_COMMAND} -E echo "Skipping build cppjieba."
    INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install cppjieba."
    LOG_DOWNLOAD ON
  )
  # Exposes `source_dir`, the path of the cppjieba checkout, to the code below.
  ExternalProject_Get_Property(cppjieba source_dir)
endif()
17+
418
set(SOURCE_FILES
519
pinyin.h
620
simple_highlight.h
@@ -11,8 +25,25 @@ set(SOURCE_FILES
1125
entry.cc
1226
)
1327

14-
include_directories(${SQLITE3_HEADERS_DIR})
28+
# The SQLite headers are always required. The cppjieba headers, and the
# installation of its *.utf8 dictionaries next to the binaries, are added
# only when the jieba integration is enabled.
include_directories(${SQLITE3_HEADERS_DIR})
if(SIMPLE_WITH_JIEBA)
  include_directories(${source_dir}/include ${source_dir}/deps)
  install(DIRECTORY ${source_dir}/dict/ DESTINATION bin/dict FILES_MATCHING PATTERN "*.utf8")
endif()
34+
1535
add_library(simple SHARED ${SOURCE_FILES})
16-
target_link_libraries(simple PUBLIC coverage_config PRIVATE PINYIN_TEXT SQLite3)
36+
37+
if(SIMPLE_WITH_JIEBA)
  # The cppjieba headers only exist after the ExternalProject checkout has
  # run, so `simple` must not start compiling before the `cppjieba` target is
  # done. Without this edge a parallel clean build can fail on missing headers.
  add_dependencies(simple cppjieba)
  target_include_directories(simple INTERFACE ${SQLITE3_HEADERS_DIR} ${source_dir}/include ${source_dir}/deps)
  # for tests only: copy the cppjieba dictionaries to the test/dict folder
  # that sits beside the library's output directory
  add_custom_command(TARGET simple PRE_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_directory
            ${source_dir}/dict/ $<TARGET_FILE_DIR:simple>/../test/dict/
    VERBATIM)
else()
  target_include_directories(simple INTERFACE ${SQLITE3_HEADERS_DIR})
endif()
46+
47+
target_link_libraries(simple PUBLIC coverage_config PRIVATE PINYIN_TEXT SQLite3)
1748

1849
install(TARGETS simple DESTINATION bin)

src/entry.cc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,25 @@ static int fts5_api_from_db(sqlite3 *db, fts5_api **ppApi) {
4444
return rc;
4545
}
4646

47+
#ifdef USE_JIEBA
48+
// SQL function jieba_query(text [, flags]).
//
// Passes `text` to SimpleTokenizer::tokenize_jieba_query() and returns the
// resulting string as the SQL result.
//
//   apVal[0]  text to convert; a NULL value yields a NULL result.
//   apVal[1]  optional integer flags forwarded to the tokenizer. Defaults to 1
//             when the argument is omitted or is SQL NULL.
//
// The result is NULL when no argument is supplied.
static void jieba_query(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
  if (nVal >= 1) {
    const char *text = (const char *)sqlite3_value_text(apVal[0]);
    if (text) {
      int flags = 1;
      // sqlite3_value_text() returns NULL for an SQL NULL, which the previous
      // atoi() call would dereference. Read the flag through SQLite's own
      // integer conversion and keep the default for NULL.
      if (nVal >= 2 && sqlite3_value_type(apVal[1]) != SQLITE_NULL) {
        flags = sqlite3_value_int(apVal[1]);
      }
      std::string result = simple_tokenizer::SimpleTokenizer::tokenize_jieba_query(text, std::strlen(text), flags);
      // SQLITE_TRANSIENT makes SQLite copy the buffer before `result` is destroyed.
      sqlite3_result_text(pCtx, result.c_str(), -1, SQLITE_TRANSIENT);
      return;
    }
  }
  sqlite3_result_null(pCtx);
}
64+
#endif
65+
4766
static void simple_query(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
4867
int rc;
4968
if (nVal >= 1) {
@@ -67,6 +86,10 @@ int sqlite3_simple_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines
6786

6887
rc = sqlite3_create_function(db, "simple_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &simple_query, NULL,
6988
NULL);
89+
#ifdef USE_JIEBA
90+
rc = sqlite3_create_function(db, "jieba_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &jieba_query, NULL,
91+
NULL);
92+
#endif
7093

7194
// fts5_tokenizer tokenizer = {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize };
7295
fts5_tokenizer tokenizer = {fts5_simple_xCreate, fts5_simple_xDelete, fts5_simple_xTokenize};

0 commit comments

Comments
 (0)