Commit ed833b9

Merge pull request #244 from instructlab/fix-pretrain-max
Fix pretrain token list->int for masking
2 parents 63f128c + 5c368ee commit ed833b9

File tree

1 file changed, +3 -3 lines

src/instructlab/training/data_process.py

Lines changed: 3 additions & 3 deletions
@@ -310,7 +310,7 @@ def main(args: DataProcessArgs):
     print("\033[92mCategorizing training data type...\033[0m")
     data_with_input_ids = data_with_input_ids.map(
         lambda x: {
-            "is_pretrain": get_sp_token(tokenizer, "<|pretrain|>") in x["input_ids"]
+            "is_pretrain": get_sp_token(tokenizer, "<|pretrain|>")[0] in x["input_ids"]
         },
         num_proc=NUM_PROC,
     )
@@ -320,8 +320,8 @@ def main(args: DataProcessArgs):
         user_tokens=user_tk,
         assist_tokens=assistant_tk,
         system_tokens=system_tk,
-        pretrain_token=get_sp_token(tokenizer, "<|pretrain|>"),
-        pretrain_end_token=get_sp_token(tokenizer, "<|/pretrain|>"),
+        pretrain_token=get_sp_token(tokenizer, "<|pretrain|>")[0],
+        pretrain_end_token=get_sp_token(tokenizer, "<|/pretrain|>")[0],
     )
     print("\033[92munmasking the appropriate message content...\033[0m")
     data_with_labels = data_with_input_ids.map(
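
The underlying issue, as the commit title says, is a list-vs-int mismatch: get_sp_token returns a list of token ids, so the `in x["input_ids"]` membership test and the pretrain_token/pretrain_end_token arguments were receiving a list where a single integer id is expected. Below is a minimal, self-contained sketch of the failure mode and the fix, using toy ids in place of a real tokenizer; the stub helper and the id values are illustrative, not the project's code.

# Toy reproduction of the bug fixed in this commit (illustrative values only;
# 32001 stands in for the id of "<|pretrain|>").

def get_sp_token_stub(token_str):
    # Stand-in for the project's get_sp_token helper, which hands back a
    # *list* of ids even when the string maps to a single special token.
    return {"<|pretrain|>": [32001], "<|/pretrain|>": [32002]}[token_str]

input_ids = [1, 32001, 529, 7, 32002, 2]   # example tokenized sample

pretrain = get_sp_token_stub("<|pretrain|>")

# Before the fix: a list is never an element of a flat list of ints,
# so the "is_pretrain" check never matched.
print(pretrain in input_ids)        # False

# After the fix: index [0] to compare the integer id itself.
print(pretrain[0] in input_ids)     # True

The same [0] indexing is applied to the pretrain_token and pretrain_end_token keyword arguments, so unmask_message_content receives plain int ids rather than single-element lists when masking.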
