Skip to content

Commit 1b9baba

Browse files
committed
fix: invalid utf-8 parsing
1 parent 72cc8fa commit 1b9baba

File tree

2 files changed

+7
-6
lines changed

2 files changed

+7
-6
lines changed

src/lib.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use base64;
12
use fancy_regex::Regex;
23
use mlua::prelude::*;
34
use rustc_hash::FxHashMap as HashMap;
@@ -6,7 +7,6 @@ use std::fs::File;
67
use std::io::{BufRead, BufReader};
78
use std::sync::{Arc, Mutex};
89
use std::thread;
9-
use base64;
1010

1111
#[cfg(feature = "multithreading")]
1212
const MAX_NUM_THREADS: usize = 128;
@@ -203,7 +203,7 @@ pub fn tiktoken_core(lua: &mlua::Lua) -> LuaResult<LuaTable> {
203203
Ok(())
204204
},
205205
)?;
206-
let _encode = lua.create_function(move |_, text: String| encode(&*state2, text))?;
206+
let _encode = lua.create_function(move |_, text: mlua::String| encode(&*state2, text))?;
207207

208208
let exports = lua.create_table()?;
209209
exports.set("new", _new)?;
@@ -261,7 +261,8 @@ fn new(
261261
});
262262
}
263263

264-
fn encode(state: &State, text: String) -> LuaResult<(Vec<usize>, usize, usize)> {
264+
fn encode(state: &State, text: mlua::String) -> LuaResult<(Vec<usize>, usize, usize)> {
265+
let encoded_str = String::from_utf8_lossy(text.as_bytes());
265266
let allowed_special = HashSet::new();
266267
let max_tokens = None;
267268
Ok(state
@@ -270,7 +271,7 @@ fn encode(state: &State, text: String) -> LuaResult<(Vec<usize>, usize, usize)>
270271
.unwrap()
271272
.as_ref()
272273
.unwrap()
273-
._encode_native(&text, &allowed_special, max_tokens))
274+
._encode_native(&encoded_str, &allowed_special, max_tokens))
274275
}
275276

276277
pub struct CoreBPENative {

tiktoken_core-0.2.2-1.rockspec renamed to tiktoken_core-0.2.3-1.rockspec

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
package = "tiktoken_core"
2-
version = "0.2.2-1"
2+
version = "0.2.3-1"
33

44
source = {
55
url = "git+https://github.com/gptlang/lua-tiktoken",
6-
tag = "v0.2.2",
6+
tag = "v0.2.3",
77
}
88

99
description = {

0 commit comments

Comments
 (0)