Skip to content

Commit bae4095

Browse files
committed
Reformatting per flake8 only
1 parent ad55d71 commit bae4095

File tree

1 file changed

+54
-42
lines changed

1 file changed

+54
-42
lines changed

find-unique-kmers

Lines changed: 54 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9,36 +9,41 @@ Author: Edward S. Rice (erice11@unl.edu)
99
import argparse
1010
import os
1111
import sys
12-
import math
1312
import random
1413

15-
from subprocess import Popen, PIPE, check_call, check_output
14+
from subprocess import check_call
15+
1616

1717
class HistogramError(Exception):
1818
def __init__(self, histo_cmd):
1919
self.message = "Could not find min and max counts in histogram. " + \
20-
"Try running the following command yourself and manually " + \
21-
"choosing cutoffs: {}".format(histo_cmd)
20+
"Try running the following command yourself and manually " + \
21+
"choosing cutoffs: {}".format(histo_cmd)
22+
2223

2324
def parse_args():
24-
parser = argparse.ArgumentParser(description='Given multiple short-read '
25-
'libraries, find k-mers that are unique to each library.')
25+
parser = argparse.ArgumentParser(
26+
description='Given multiple short-read '
27+
'libraries, find k-mers that are unique to each library.')
2628
parser.add_argument('-k', '--kmer-size', type=int, required=True)
27-
parser.add_argument('--path-to-kmc', default='kmc',
28-
help="path to the kmc binary, in case it's not in PATH")
29+
parser.add_argument(
30+
'--path-to-kmc', default='kmc',
31+
help="path to the kmc binary, in case it's not in PATH")
2932
parser.add_argument('-p', '--threads', type=int, default=1,
30-
help='number of threads to use in kmc [1]')
33+
help='number of threads to use in kmc [1]')
3134
parser.add_argument('-o', '--outpath', default='.', help='Prefix to write '
32-
'output haplotypes to')
35+
'output haplotypes to')
3336
parser.add_argument('-s', '--scratch-dir', default='.', help='Directory '
34-
'for large temporary files')
35-
parser.add_argument('read_files', nargs=2, help='one comma-separated '
36-
'list of file paths for both libraries being compared. Files can '
37-
'be in fasta or fastq format, and uncompressed or gzipped.')
37+
'for large temporary files')
38+
parser.add_argument(
39+
'read_files', nargs=2, help='one comma-separated '
40+
'list of file paths for both libraries being compared. Files can '
41+
'be in fasta or fastq format, and uncompressed or gzipped.')
3842
return parser.parse_args()
3943

44+
4045
def run_kmc(infile_paths, outfile_path, k, threads=1, kmc_path='kmc',
41-
scratch_dir='.'):
46+
scratch_dir='.'):
4247
"""
4348
Given a list of input fastq files, run kmc on them.
4449
@@ -60,19 +65,20 @@ def run_kmc(infile_paths, outfile_path, k, threads=1, kmc_path='kmc',
6065

6166
# run kmc
6267
kmc_cmd = [kmc_path, '-k{}'.format(k), '-t{}'.format(threads),
63-
'@' + paths_list_file_path, outfile_path, scratch_dir]
68+
'@' + paths_list_file_path, outfile_path, scratch_dir]
6469

6570
try:
6671
check_call(kmc_cmd)
6772
except FileNotFoundError:
6873
print('Cannot execute kmc. Make sure kmc is installed and either in\n'
69-
'your PATH or specified by the --path-to-kmc option.',
70-
file=sys.stderr)
74+
'your PATH or specified by the --path-to-kmc option.',
75+
file=sys.stderr)
7176
sys.exit(1)
7277
finally:
7378
# clean up temp file
7479
os.remove(paths_list_file_path)
7580

81+
7682
def analyze_histogram(kmc_db, kmc_path='kmc', scratch_path='.'):
7783
"""
7884
Given a jellyfish database, run jellyfish histo to
@@ -89,17 +95,17 @@ def analyze_histogram(kmc_db, kmc_path='kmc', scratch_path='.'):
8995
"""
9096

9197
# run the kmc histogram program
92-
temp_histogram_path = '{}/{}'.format(scratch_path,
93-
random.randint(1, 9999999))
98+
temp_histogram_path = '{}/{}'.format(
99+
scratch_path, random.randint(1, 9999999))
94100
histo_cmd = [kmc_path + '_tools', 'transform', kmc_db, 'histogram',
95-
temp_histogram_path]
101+
temp_histogram_path]
96102
check_call(histo_cmd)
97103

98104
# go through the histogram file looking for the first local minimum
99105
min_coverage, max_coverage = False, False
100106
for line in open(temp_histogram_path):
101107
coverage, count = map(int, line.strip().split())
102-
if coverage != 2: # don't do anything except record the first entry
108+
if coverage != 2: # don't do anything except record the first entry
103109

104110
# if we haven't yet found the minimum coverage, we're looking for
105111
# a local minimum, i.e., a place where count starts increasing
@@ -122,14 +128,15 @@ def analyze_histogram(kmc_db, kmc_path='kmc', scratch_path='.'):
122128

123129
if max_coverage - min_coverage < 5:
124130
print('WARNING: min and max coverage not very far apart. This may be '
125-
'a result of coverage being too low. Try taking a look at ' +
126-
'the histogram in "{}" yourself.'.format(temp_histogram_path),
127-
file=sys.stderr)
131+
'a result of coverage being too low. Try taking a look at '
132+
'the histogram in "{}" yourself.'.format(temp_histogram_path),
133+
file=sys.stderr)
128134
else:
129135
os.remove(temp_histogram_path)
130136

131137
return min_coverage, max_coverage
132138

139+
133140
def run_kmc_subtract(kmc_db_a, kmc_db_b, kmc_cmd='kmc'):
134141
"""
135142
Given two kmc databases along with min and max counts
@@ -152,6 +159,7 @@ def run_kmc_subtract(kmc_db_a, kmc_db_b, kmc_cmd='kmc'):
152159
check_call(subtract_cmd)
153160
return out_path
154161

162+
155163
def run_kmc_dump(kmc_db, out_path, min_count, max_count, kmc_cmd='kmc'):
156164
"""
157165
Given a kmc database, dump it into a text file.
@@ -167,7 +175,7 @@ def run_kmc_dump(kmc_db, out_path, min_count, max_count, kmc_cmd='kmc'):
167175
"""
168176

169177
dump_cmd = [kmc_cmd + '_dump', kmc_db, out_path,
170-
'-ci' + str(min_count), '-cx' + str(max_count)]
178+
'-ci' + str(min_count), '-cx' + str(max_count)]
171179
check_call(dump_cmd)
172180

173181
num_lines = 0
@@ -176,26 +184,27 @@ def run_kmc_dump(kmc_db, out_path, min_count, max_count, kmc_cmd='kmc'):
176184
num_lines += 1
177185
return num_lines
178186

187+
179188
def main():
180189
args = parse_args()
181190

182191
# Count k-mers in all haplotypes
183-
kmc_databases = [] # list of databases (db_path, min_count, max_count)
192+
kmc_databases = [] # list of databases (db_path, min_count, max_count)
184193
for hap_ID, haplotype_files_string in zip(['A', 'B'], args.read_files):
185194
print('\033[92mCounting k-mers in haplotype {}...\033[0m'
186-
.format(hap_ID), file=sys.stderr)
195+
.format(hap_ID), file=sys.stderr)
187196
haplotype_files = haplotype_files_string.split(',')
188197
outfile_path = "{}/haplotype{}".format(args.outpath, hap_ID)
189198
run_kmc(haplotype_files, outfile_path, args.kmer_size,
190199
args.threads, args.path_to_kmc, args.scratch_dir)
191200

192201
# get the histogram for this haplotype and analyze it
193202
print("\033[92mComputing and analyzing histogram...\033[0m",
194-
file=sys.stderr)
195-
min_count, max_count = analyze_histogram(outfile_path,
196-
args.path_to_kmc, args.scratch_dir)
203+
file=sys.stderr)
204+
min_count, max_count = analyze_histogram(
205+
outfile_path, args.path_to_kmc, args.scratch_dir)
197206
print("\033[92mUsing counts in range [{},{}].\033[0m"
198-
.format(min_count, max_count), file=sys.stderr)
207+
.format(min_count, max_count), file=sys.stderr)
199208

200209
kmc_databases.append((outfile_path, min_count, max_count))
201210

@@ -205,26 +214,29 @@ def main():
205214

206215
# make a database of k-mers that appear only in haplotype A & dump
207216
print('\033[92mFinding k-mers unique to haplotype A...\033[0m',
208-
file=sys.stderr)
217+
file=sys.stderr)
209218
hap_a_only_db = run_kmc_subtract(kmc_db_a, kmc_db_b, args.path_to_kmc)
210219
print('\033[92mDumping k-mers unique to haplotype A...\033[0m',
211-
file=sys.stderr)
212-
hap_a_num_kmers = run_kmc_dump(hap_a_only_db, args.outpath + \
213-
'/hapA_only_kmers.txt', min_count_a, max_count_a, args.path_to_kmc)
220+
file=sys.stderr)
221+
hap_a_num_kmers = run_kmc_dump(
222+
hap_a_only_db, args.outpath + '/hapA_only_kmers.txt',
223+
min_count_a, max_count_a, args.path_to_kmc)
214224

215225
# make a database of k-mers that appear only in haplotype B & dump
216226
print('\033[92mFinding k-mers unique to haplotype B...\033[0m',
217-
file=sys.stderr)
227+
file=sys.stderr)
218228
hap_b_only_db = run_kmc_subtract(kmc_db_b, kmc_db_a, args.path_to_kmc)
219229
print('\033[92mDumping k-mers unique to haplotype A...\033[0m',
220-
file=sys.stderr)
221-
hap_b_num_kmers = run_kmc_dump(hap_b_only_db, args.outpath + \
222-
'/hapB_only_kmers.txt', min_count_b, max_count_b, args.path_to_kmc)
230+
file=sys.stderr)
231+
hap_b_num_kmers = run_kmc_dump(
232+
hap_b_only_db, args.outpath + '/hapB_only_kmers.txt',
233+
min_count_b, max_count_b, args.path_to_kmc)
223234

224235
print('\n\n\033[94m# of unique k-mers in haplotype A: {}\033[0m'
225-
.format(hap_a_num_kmers), file=sys.stderr)
236+
.format(hap_a_num_kmers), file=sys.stderr)
226237
print('\033[94m# of unique k-mers in haplotype B: {}\033[0m'
227-
.format(hap_b_num_kmers), file=sys.stderr)
238+
.format(hap_b_num_kmers), file=sys.stderr)
239+
228240

229241
if __name__ == '__main__':
230242
main()

0 commit comments

Comments
 (0)