Skip to content

Commit 8e6c160

Browse files
authored
Merge pull request #238 from RobotSail/fix-checkpoint-selection
fix: updates sorting logic to correctly compare numbers
2 parents 8b252d8 + 0310cae commit 8e6c160

File tree

1 file changed

+7
-2
lines changed

1 file changed

+7
-2
lines changed

src/instructlab/training/utils.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -870,8 +870,13 @@ def load_latest_full_state(args, accelerator) -> None:
870870
if not output_dir.is_dir():
871871
return
872872

873-
# picks checkpoint with the largest number of samples seen, by name.
874-
checkpoint_list = sorted(list(output_dir.iterdir()), reverse=True)
873+
# picks checkpoint with the largest number of samples by splitting the "samples_NNNN" string on _
874+
# and comparing the number at the end of the string
875+
checkpoint_list = sorted(
876+
list(output_dir.iterdir()),
877+
reverse=True,
878+
key=lambda x: int(str(x).rsplit("_", maxsplit=1)[-1]),
879+
)
875880

876881
if len(checkpoint_list) == 0:
877882
log_rank_0(

0 commit comments

Comments
 (0)