@@ -9,36 +9,41 @@ Author: Edward S. Rice (erice11@unl.edu)
99import argparse
1010import os
1111import sys
12- import math
1312import random
1413
15- from subprocess import Popen , PIPE , check_call , check_output
14+ from subprocess import check_call
15+
1616
1717class HistogramError (Exception ):
1818 def __init__ (self , histo_cmd ):
1919 self .message = "Could not find min and max counts in histogram. " + \
20- "Try running the following command yourself and manually " + \
21- "choosing cutoffs: {}" .format (histo_cmd )
20+ "Try running the following command yourself and manually " + \
21+ "choosing cutoffs: {}" .format (histo_cmd )
22+
2223
2324def parse_args ():
24- parser = argparse .ArgumentParser (description = 'Given multiple short-read '
25- 'libraries, find k-mers that are unique to each library.' )
25+ parser = argparse .ArgumentParser (
26+ description = 'Given multiple short-read '
27+ 'libraries, find k-mers that are unique to each library.' )
2628 parser .add_argument ('-k' , '--kmer-size' , type = int , required = True )
27- parser .add_argument ('--path-to-kmc' , default = 'kmc' ,
28- help = "path to the kmc binary, in case it's not in PATH" )
29+ parser .add_argument (
30+ '--path-to-kmc' , default = 'kmc' ,
31+ help = "path to the kmc binary, in case it's not in PATH" )
2932 parser .add_argument ('-p' , '--threads' , type = int , default = 1 ,
30- help = 'number of threads to use in kmc [1]' )
33+ help = 'number of threads to use in kmc [1]' )
3134 parser .add_argument ('-o' , '--outpath' , default = '.' , help = 'Prefix to write '
32- 'output haplotypes to' )
35+ 'output haplotypes to' )
3336 parser .add_argument ('-s' , '--scratch-dir' , default = '.' , help = 'Directory '
34- 'for large temporary files' )
35- parser .add_argument ('read_files' , nargs = 2 , help = 'one comma-separated '
36- 'list of file paths for both libraries being compared. Files can '
37- 'be in fasta or fastq format, and uncompressed or gzipped.' )
37+ 'for large temporary files' )
38+ parser .add_argument (
39+ 'read_files' , nargs = 2 , help = 'one comma-separated '
40+ 'list of file paths for both libraries being compared. Files can '
41+ 'be in fasta or fastq format, and uncompressed or gzipped.' )
3842 return parser .parse_args ()
3943
44+
4045def run_kmc (infile_paths , outfile_path , k , threads = 1 , kmc_path = 'kmc' ,
41- scratch_dir = '.' ):
46+ scratch_dir = '.' ):
4247 """
4348 Given a list of input fastq files, run kmc on them.
4449
@@ -60,19 +65,20 @@ def run_kmc(infile_paths, outfile_path, k, threads=1, kmc_path='kmc',
6065
6166 # run kmc
6267 kmc_cmd = [kmc_path , '-k{}' .format (k ), '-t{}' .format (threads ),
63- '@' + paths_list_file_path , outfile_path , scratch_dir ]
68+ '@' + paths_list_file_path , outfile_path , scratch_dir ]
6469
6570 try :
6671 check_call (kmc_cmd )
6772 except FileNotFoundError :
6873 print ('Cannot execute kmc. Make sure kmc is installed and either in\n '
69- 'your PATH or specified by the --path-to-kmc option.' ,
70- file = sys .stderr )
74+ 'your PATH or specified by the --path-to-kmc option.' ,
75+ file = sys .stderr )
7176 sys .exit (1 )
7277 finally :
7378 # clean up temp file
7479 os .remove (paths_list_file_path )
7580
81+
7682def analyze_histogram (kmc_db , kmc_path = 'kmc' , scratch_path = '.' ):
7783 """
7884 Given a jellyfish database, run jellyfish histo to
@@ -89,17 +95,17 @@ def analyze_histogram(kmc_db, kmc_path='kmc', scratch_path='.'):
8995 """
9096
9197 # run the kmc histogram program
92- temp_histogram_path = '{}/{}' .format (scratch_path ,
93- random .randint (1 , 9999999 ))
98+ temp_histogram_path = '{}/{}' .format (
99+ scratch_path , random .randint (1 , 9999999 ))
94100 histo_cmd = [kmc_path + '_tools' , 'transform' , kmc_db , 'histogram' ,
95- temp_histogram_path ]
101+ temp_histogram_path ]
96102 check_call (histo_cmd )
97103
98104 # go through the histogram file looking for the first local minimum
99105 min_coverage , max_coverage = False , False
100106 for line in open (temp_histogram_path ):
101107 coverage , count = map (int , line .strip ().split ())
102- if coverage != 2 : # don't do anything except record the first entry
108+ if coverage != 2 : # don't do anything except record the first entry
103109
104110 # if we haven't yet found the minimum coverage, we're looking for
105111 # a local minimum, i.e., a place where count starts increasing
@@ -122,14 +128,15 @@ def analyze_histogram(kmc_db, kmc_path='kmc', scratch_path='.'):
122128
123129 if max_coverage - min_coverage < 5 :
124130 print ('WARNING: min and max coverage not very far apart. This may be '
125- 'a result of coverage being too low. Try taking a look at ' +
126- 'the histogram in "{}" yourself.' .format (temp_histogram_path ),
127- file = sys .stderr )
131+ 'a result of coverage being too low. Try taking a look at '
132+ 'the histogram in "{}" yourself.' .format (temp_histogram_path ),
133+ file = sys .stderr )
128134 else :
129135 os .remove (temp_histogram_path )
130136
131137 return min_coverage , max_coverage
132138
139+
133140def run_kmc_subtract (kmc_db_a , kmc_db_b , kmc_cmd = 'kmc' ):
134141 """
135142 Given two kmc databases along with min and max counts
@@ -152,6 +159,7 @@ def run_kmc_subtract(kmc_db_a, kmc_db_b, kmc_cmd='kmc'):
152159 check_call (subtract_cmd )
153160 return out_path
154161
162+
155163def run_kmc_dump (kmc_db , out_path , min_count , max_count , kmc_cmd = 'kmc' ):
156164 """
157165 Given a kmc database, dump it into a text file.
@@ -167,7 +175,7 @@ def run_kmc_dump(kmc_db, out_path, min_count, max_count, kmc_cmd='kmc'):
167175 """
168176
169177 dump_cmd = [kmc_cmd + '_dump' , kmc_db , out_path ,
170- '-ci' + str (min_count ), '-cx' + str (max_count )]
178+ '-ci' + str (min_count ), '-cx' + str (max_count )]
171179 check_call (dump_cmd )
172180
173181 num_lines = 0
@@ -176,26 +184,27 @@ def run_kmc_dump(kmc_db, out_path, min_count, max_count, kmc_cmd='kmc'):
176184 num_lines += 1
177185 return num_lines
178186
187+
179188def main ():
180189 args = parse_args ()
181190
182191 # Count k-mers in all haplotypes
183- kmc_databases = [] # list of databases (db_path, min_count, max_count)
192+ kmc_databases = [] # list of databases (db_path, min_count, max_count)
184193 for hap_ID , haplotype_files_string in zip (['A' , 'B' ], args .read_files ):
185194 print ('\033 [92mCounting k-mers in haplotype {}...\033 [0m'
186- .format (hap_ID ), file = sys .stderr )
195+ .format (hap_ID ), file = sys .stderr )
187196 haplotype_files = haplotype_files_string .split (',' )
188197 outfile_path = "{}/haplotype{}" .format (args .outpath , hap_ID )
189198 run_kmc (haplotype_files , outfile_path , args .kmer_size ,
190199 args .threads , args .path_to_kmc , args .scratch_dir )
191200
192201 # get the histogram for this haplotype and analyze it
193202 print ("\033 [92mComputing and analyzing histogram...\033 [0m" ,
194- file = sys .stderr )
195- min_count , max_count = analyze_histogram (outfile_path ,
196- args .path_to_kmc , args .scratch_dir )
203+ file = sys .stderr )
204+ min_count , max_count = analyze_histogram (
205+ outfile_path , args .path_to_kmc , args .scratch_dir )
197206 print ("\033 [92mUsing counts in range [{},{}].\033 [0m"
198- .format (min_count , max_count ), file = sys .stderr )
207+ .format (min_count , max_count ), file = sys .stderr )
199208
200209 kmc_databases .append ((outfile_path , min_count , max_count ))
201210
@@ -205,26 +214,29 @@ def main():
205214
206215 # make a database of k-mers that appear only in haplotype A & dump
207216 print ('\033 [92mFinding k-mers unique to haplotype A...\033 [0m' ,
208- file = sys .stderr )
217+ file = sys .stderr )
209218 hap_a_only_db = run_kmc_subtract (kmc_db_a , kmc_db_b , args .path_to_kmc )
210219 print ('\033 [92mDumping k-mers unique to haplotype A...\033 [0m' ,
211- file = sys .stderr )
212- hap_a_num_kmers = run_kmc_dump (hap_a_only_db , args .outpath + \
213- '/hapA_only_kmers.txt' , min_count_a , max_count_a , args .path_to_kmc )
220+ file = sys .stderr )
221+ hap_a_num_kmers = run_kmc_dump (
222+ hap_a_only_db , args .outpath + '/hapA_only_kmers.txt' ,
223+ min_count_a , max_count_a , args .path_to_kmc )
214224
215225 # make a database of k-mers that appear only in haplotype B & dump
216226 print ('\033 [92mFinding k-mers unique to haplotype B...\033 [0m' ,
217- file = sys .stderr )
227+ file = sys .stderr )
218228 hap_b_only_db = run_kmc_subtract (kmc_db_b , kmc_db_a , args .path_to_kmc )
219229 print ('\033 [92mDumping k-mers unique to haplotype A...\033 [0m' ,
220- file = sys .stderr )
221- hap_b_num_kmers = run_kmc_dump (hap_b_only_db , args .outpath + \
222- '/hapB_only_kmers.txt' , min_count_b , max_count_b , args .path_to_kmc )
230+ file = sys .stderr )
231+ hap_b_num_kmers = run_kmc_dump (
232+ hap_b_only_db , args .outpath + '/hapB_only_kmers.txt' ,
233+ min_count_b , max_count_b , args .path_to_kmc )
223234
224235 print ('\n \n \033 [94m# of unique k-mers in haplotype A: {}\033 [0m'
225- .format (hap_a_num_kmers ), file = sys .stderr )
236+ .format (hap_a_num_kmers ), file = sys .stderr )
226237 print ('\033 [94m# of unique k-mers in haplotype B: {}\033 [0m'
227- .format (hap_b_num_kmers ), file = sys .stderr )
238+ .format (hap_b_num_kmers ), file = sys .stderr )
239+
228240
229241if __name__ == '__main__' :
230242 main ()
0 commit comments