Skip to content
This repository was archived by the owner on Mar 20, 2026. It is now read-only.

Commit d13e14a

Browse files
authored
Remove index caching support from fasta due to security concerns (#5619)
1 parent ecbf110 commit d13e14a

1 file changed

Lines changed: 3 additions & 19 deletions

File tree

fairseq/data/fasta_dataset.py

Lines changed: 3 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,9 @@ def __init__(self, path: str, cache_indices=False):
3232
self.offsets, self.sizes = self._build_index(path)
3333
np.save(self.cache, np.stack([self.offsets, self.sizes]))
3434
else:
35-
self.offsets, self.sizes = self._build_index(path)
35+
raise ValueError(
36+
"`cache_indices` is not supported anymore due to security concerns."
37+
)
3638

3739
def _get_file(self):
3840
if not hasattr(self.threadlocal, "f"):
@@ -53,24 +55,6 @@ def __getitem__(self, idx):
5355
def __len__(self):
5456
return self.offsets.size
5557

56-
def _build_index(self, path: str):
57-
# Use grep and awk to get 100M/s on local SSD.
58-
# Should process your enormous 100G fasta in ~10 min single core...
59-
path = fasta_file_path(path)
60-
bytes_offsets = subprocess.check_output(
61-
f"cat {path} | tqdm --bytes --total $(wc -c < {path})"
62-
"| grep --byte-offset '^>' -o | cut -d: -f1",
63-
shell=True,
64-
)
65-
fasta_lengths = subprocess.check_output(
66-
f"cat {path} | tqdm --bytes --total $(wc -c < {path})"
67-
"| awk '/^>/ {print \"\";next;} { printf(\"%s\",$0);}' | tail -n+2 | awk '{print length($1)}'",
68-
shell=True,
69-
)
70-
bytes_np = np.fromstring(bytes_offsets, dtype=np.int64, sep=" ")
71-
sizes_np = np.fromstring(fasta_lengths, dtype=np.int64, sep=" ")
72-
return bytes_np, sizes_np
73-
7458
def __setstate__(self, state):
7559
self.__dict__ = state
7660
self.threadlocal = threading.local()

0 commit comments

Comments
 (0)