-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy pathqwen_asr_tokenizer.h
More file actions
38 lines (30 loc) · 1.22 KB
/
qwen_asr_tokenizer.h
File metadata and controls
38 lines (30 loc) · 1.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/*
* qwen_asr_tokenizer.h - Qwen2/Qwen3 BPE tokenizer (GPT-2 byte-level)
*
* Supports:
* - decode token ID -> UTF-8 text
* - encode UTF-8 text -> token IDs (vocab.json + merges.txt)
*/
#ifndef QWEN_ASR_TOKENIZER_H
#define QWEN_ASR_TOKENIZER_H
typedef struct {
char **id_to_text; /* [vocab_size] decoded text strings */
char **id_to_bpe; /* [vocab_size] raw BPE token strings from vocab.json */
int vocab_size;
/* Internal hash maps (opaque to callers) */
void *vocab_map;
int vocab_map_cap;
void *merge_map;
int merge_map_cap;
} qwen_tokenizer_t;
/* Load tokenizer from vocab.json in model directory */
qwen_tokenizer_t *qwen_tokenizer_load(const char *vocab_json_path);
/* Decode a single token ID to text. Returns pointer to internal string. */
const char *qwen_tokenizer_decode(const qwen_tokenizer_t *tok, int token_id);
/* Encode UTF-8 text into token IDs using BPE.
* Returns malloc'd array of token IDs and sets *out_n_tokens.
* Returns NULL on error (and sets *out_n_tokens to 0). */
int *qwen_tokenizer_encode(const qwen_tokenizer_t *tok, const char *text, int *out_n_tokens);
/* Free tokenizer */
void qwen_tokenizer_free(qwen_tokenizer_t *tok);
#endif /* QWEN_ASR_TOKENIZER_H */