@@ -32,7 +32,9 @@ def __init__(self, path: str, cache_indices=False):
3232 self .offsets , self .sizes = self ._build_index (path )
3333 np .save (self .cache , np .stack ([self .offsets , self .sizes ]))
3434 else :
35- self .offsets , self .sizes = self ._build_index (path )
35+ raise ValueError (
36+ "`cache_indices` is not supported anymore due to security concerns."
37+ )
3638
3739 def _get_file (self ):
3840 if not hasattr (self .threadlocal , "f" ):
@@ -53,24 +55,6 @@ def __getitem__(self, idx):
def __len__(self):
    """Return the number of entries held by the ``offsets`` index array."""
    entry_count = self.offsets.size
    return entry_count
5557
def _build_index(self, path: str):
    """Index a FASTA file by shelling out to grep/awk.

    The file location is resolved with ``fasta_file_path(path)`` and then
    streamed twice through a shell pipeline (with a ``tqdm`` byte progress
    bar): once to locate the headers, once to measure the sequences.

    Args:
        path: Value handed to ``fasta_file_path`` to obtain the file to scan.

    Returns:
        A ``(bytes_np, sizes_np)`` tuple of ``np.int64`` arrays:
        ``bytes_np`` holds the byte offset of every line starting with
        ``>``; ``sizes_np`` holds, per record, the length reported by awk's
        ``length($1)`` for that record's non-header lines joined together.

    Raises:
        subprocess.CalledProcessError: If a shell pipeline exits non-zero.

    Note:
        Requires ``cat``, ``tqdm``, ``wc``, ``grep``, ``cut``, ``awk`` and
        ``tail`` to be available to the shell.
    """
    # Local import so the file's top-level import block needs no change.
    import shlex

    # Use grep and awk to get 100M/s on local SSD.
    # Should process your enormous 100G fasta in ~10 min single core...
    #
    # The path is spliced into a shell command line, so it must be quoted:
    # otherwise spaces break the command and shell metacharacters in the
    # file name would be executed.
    quoted = shlex.quote(str(fasta_file_path(path)))
    stream = f"cat {quoted} | tqdm --bytes --total $(wc -c < {quoted})"

    # `grep -o --byte-offset` prints "<offset>:<match>"; cut keeps the offset.
    bytes_offsets = subprocess.check_output(
        stream + "| grep --byte-offset '^>' -o | cut -d: -f1",
        shell=True,
    )
    # The first awk emits a newline at each header and glues the remaining
    # lines together, giving one output line per record. Its string literals
    # must carry no padding, or `length($1)` would only see the first line.
    # `tail -n+2` drops the empty line produced by the very first header.
    fasta_lengths = subprocess.check_output(
        stream
        + "| awk '/^>/ {print \"\";next;} { printf(\"%s\",$0);}'"
        " | tail -n+2 | awk '{print length($1)}'",
        shell=True,
    )

    # Text-mode parse: a whitespace separator also matches the newlines
    # between the numbers printed by the pipelines.
    bytes_np = np.fromstring(bytes_offsets, dtype=np.int64, sep=" ")
    sizes_np = np.fromstring(fasta_lengths, dtype=np.int64, sep=" ")
    return bytes_np, sizes_np
73-
7458 def __setstate__ (self , state ):
7559 self .__dict__ = state
7660 self .threadlocal = threading .local ()
0 commit comments