From d96825f4869ba75aafb12b32fad6109a5ebe945d Mon Sep 17 00:00:00 2001
From: SinanAkkoyun
Date: Tue, 25 Jul 2023 12:25:54 +0200
Subject: [PATCH 1/5] Added pad bos eos functionality (and .gitignore)

---
 .gitignore   |  2 ++
 tokenizer.py | 24 +++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..ef1f3072
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# ignore __pycache__ folder
+__pycache__/
\ No newline at end of file
diff --git a/tokenizer.py b/tokenizer.py
index 8a64c905..dd52437c 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -19,15 +19,29 @@ def __init__(self, tokenizer_model_path):
 
         self.newline_token_id = 13
 
+        print(self.encode("hello world"))
+        print(self.encode("hello world", pad_bos=True, pad_eos=True))
+
+        print(self.encode(["hello world", "hello world", "hello world"], pad_bos=False, pad_eos=False))
+        print(self.encode(["hello world", "hello world", "hello world"], pad_bos=True, pad_eos=True))
+
     # Encode string
 
-    def encode(self, text, return_mask = False, max_seq_len = 2048):
+    def encode(self, text, return_mask = False, max_seq_len = 2048, pad_eos = False, pad_bos = False):
 
         if isinstance(text, list):
 
             # text is a list of strings
 
             list_ids = self.tokenizer.EncodeAsIds(text)
+
+            # pad bos and eos
+
+            if pad_bos:
+                for ids in list_ids: ids.insert(0, self.bos_token_id)
+            if pad_eos:
+                for ids in list_ids: ids.append(self.eos_token_id)
+
             max_length = max([len(ids) for ids in list_ids])
 
             needs_mask = False
@@ -56,6 +70,14 @@ def encode(self, text, return_mask = False, max_seq_len = 2048):
             # text is a single string
 
             ids = self.tokenizer.EncodeAsIds(text)
+
+            # pad bos and eos
+
+            if pad_bos:
+                    ids = [self.bos_token_id] + ids
+            if pad_eos:
+                    ids = ids + [self.eos_token_id]
+
             stacked_ids = torch.tensor(ids).unsqueeze(0)
 
             if return_mask:

From f20c14903bccb680abd1cb7d9a7cb6f2041e0476 Mon Sep 17 00:00:00 2001
From: SinanAkkoyun
Date: Tue, 25 Jul 2023 12:28:40 +0200
Subject: [PATCH 2/5] Deleted debug lines

---
 tokenizer.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tokenizer.py b/tokenizer.py
index dd52437c..4a3027fe 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -18,13 +18,6 @@ def __init__(self, tokenizer_model_path):
 
         self.pad_token_id = 0 # self.tokenizer.pad_id()
         self.newline_token_id = 13
-
-        print(self.encode("hello world"))
-        print(self.encode("hello world", pad_bos=True, pad_eos=True))
-
-        print(self.encode(["hello world", "hello world", "hello world"], pad_bos=False, pad_eos=False))
-        print(self.encode(["hello world", "hello world", "hello world"], pad_bos=True, pad_eos=True))
-
     # Encode string
 
     def encode(self, text, return_mask = False, max_seq_len = 2048, pad_eos = False, pad_bos = False):
@@ -72,7 +65,7 @@ def encode(self, text, return_mask = False, max_seq_len = 2048, pad_eos = False,
             ids = self.tokenizer.EncodeAsIds(text)
 
             # pad bos and eos
-            
+
             if pad_bos:
                    ids = [self.bos_token_id] + ids
             if pad_eos:
                    ids = ids + [self.eos_token_id]

From 12aac5299c2fa2ca9900ce647cbf751de8576a8c Mon Sep 17 00:00:00 2001
From: SinanAkkoyun
Date: Tue, 25 Jul 2023 16:24:36 +0200
Subject: [PATCH 3/5] Changed 'pad' to 'add'

---
 tokenizer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tokenizer.py b/tokenizer.py
index 4a3027fe..b73f942f 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -20,7 +20,7 @@ def __init__(self, tokenizer_model_path):
 
     # Encode string
 
-    def encode(self, text, return_mask = False, max_seq_len = 2048, pad_eos = False, pad_bos = False):
+    def encode(self, text, return_mask = False, max_seq_len = 2048, add_bos = False, add_eos = False):
 
         if isinstance(text, list):
 
@@ -30,9 +30,9 @@ def encode(self, text, return_mask = False, max_seq_len = 2048, pad_eos = False,
 
             # pad bos and eos
 
-            if pad_bos:
+            if add_bos:
                 for ids in list_ids: ids.insert(0, self.bos_token_id)
-            if pad_eos:
+            if add_eos:
                 for ids in list_ids: ids.append(self.eos_token_id)
 
             max_length = max([len(ids) for ids in list_ids])

From bf884877fc0020c7dc3fe26f5f0a041cf4d258a8 Mon Sep 17 00:00:00 2001
From: SinanAkkoyun
Date: Tue, 25 Jul 2023 22:42:37 +0200
Subject: [PATCH 4/5] fix

---
 tokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tokenizer.py b/tokenizer.py
index b73f942f..e2ace47e 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -66,9 +66,9 @@ def encode(self, text, return_mask = False, max_seq_len = 2048, add_bos = False,
 
             # pad bos and eos
 
-            if pad_bos:
+            if add_bos:
                    ids = [self.bos_token_id] + ids
-            if pad_eos:
+            if add_eos:
                    ids = ids + [self.eos_token_id]
 
             stacked_ids = torch.tensor(ids).unsqueeze(0)

From 0fae40bed63228b027de6826f755cfb7cd296a30 Mon Sep 17 00:00:00 2001
From: SinanAkkoyun
Date: Tue, 25 Jul 2023 23:55:08 +0200
Subject: [PATCH 5/5] fix inconsistent indentation

---
 tokenizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tokenizer.py b/tokenizer.py
index e2ace47e..c2befc26 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -67,9 +67,9 @@ def encode(self, text, return_mask = False, max_seq_len = 2048, add_bos = False,
             # pad bos and eos
 
             if add_bos:
-                   ids = [self.bos_token_id] + ids
+                ids = [self.bos_token_id] + ids
             if add_eos:
-                   ids = ids + [self.eos_token_id]
+                ids = ids + [self.eos_token_id]
 
             stacked_ids = torch.tensor(ids).unsqueeze(0)
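
For context, a minimal usage sketch of the add_bos/add_eos flags this series ends up with. The tokenizer class name and the model path below are not part of the diff and are assumptions for illustration; the keyword arguments and the [1, seq_len] tensor shape come from the patched encode signature and body.

    # Sketch: exercising the add_bos / add_eos flags added by this series.
    # ExLlamaTokenizer and the model path are illustrative assumptions; only
    # encode() and its parameters appear in the patches themselves.
    from tokenizer import ExLlamaTokenizer

    tokenizer = ExLlamaTokenizer("models/llama-7b/tokenizer.model")

    # Single string: returns a [1, seq_len] tensor of token ids, no special tokens.
    ids = tokenizer.encode("hello world")

    # With both flags, the sequence is wrapped as [bos_token_id] ... [eos_token_id].
    ids = tokenizer.encode("hello world", add_bos = True, add_eos = True)

    # Batch of strings: bos/eos are inserted into each sequence first, and the
    # batch is then padded out to the length of the longest sequence.
    batch = tokenizer.encode(["hello", "hello world"], add_bos = True, add_eos = True)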