1+ # Copyright 2016-2022 Kevin Murray/Gekkonid Consulting
2+ #
3+ # This Source Code Form is subject to the terms of the Mozilla Public License,
4+ # v. 2.0. If a copy of the MPL was not distributed with this file, You can
5+ # obtain one at http://mozilla.org/MPL/2.0/.
6+
17import csv
28from collections import defaultdict
39from glob import glob
1622
# Absolute path of the directory containing this module; bundled rules,
# profiles and other resources are located relative to it.
HERE = os.path.abspath(os.path.dirname(__file__))
1824
class __Rules:
    """Namespace exposing every bundled ``*.rules`` file as an attribute.

    The attribute name is the file's basename without its extension and the
    attribute value is the path returned by the glob.
    """

    def __init__(self):
        pattern = f"{HERE}/rules/*.rules"
        for path in glob(pattern):
            stem, _ext = splitext(basename(path))
            setattr(self, stem, path)


# Module-level instance through which the rule file paths are looked up.
rules = __Rules()
26-
2725profiles = {}
2826for profiledir in glob (f"{ HERE } /profiles/*" ):
2927 profile = basename (profiledir )
@@ -32,156 +30,3 @@ def __init__(self):
3230
def get_resource(file):
    """Return the path of ``file`` underneath this module's directory."""
    return "{}/{}".format(HERE, file)
35-
def rule_resources(config, rule, **defaults):
    """Build the resource mapping for ``rule`` from the cluster configuration.

    Values are layered, later sources overriding earlier ones:

    1. ``config["cluster_resources"]["defaults"]``
    2. the keyword ``defaults`` given by the caller
    3. ``config["cluster_resources"]["rules"][rule]``

    String values (e.g. a cluster queue name) are returned unchanged. Every
    other value is wrapped in a callable taking ``(wildcards, attempt)`` that
    doubles the value with each successive attempt, capped at the matching
    entry of ``config["cluster_resources"]["max_values"]`` (uncapped when
    there is no such entry).

    Args:
        config: Mapping holding the optional ``cluster_resources`` section.
        rule: Name of the rule whose resources are requested.
        **defaults: Per-call default resource values.

    Returns:
        dict mapping each resource name to a string or to a callable.
    """
    def resource(wildcards, attempt, value, maxvalue):
        # ``**`` is exponentiation. The previous ``^`` was bitwise XOR, which
        # did not double the value per attempt and failed on float values.
        return int(min(value * 2 ** (attempt - 1), maxvalue))

    cluster = config.get("cluster_resources", {})
    maxes = cluster.get("max_values", {})
    global_defaults = cluster.get("defaults", {})
    rule_overrides = cluster.get("rules", {})
    debug = cluster.get("DEBUG", False)

    values = {}
    values.update(global_defaults)
    values.update(defaults)
    values.update(rule_overrides.get(rule, {}))

    ret = {}
    for res, val in values.items():
        if isinstance(val, str):
            # A string cannot be scaled between attempts, so it is passed
            # through as a constant (used for things like cluster queues).
            ret[res] = val
            if debug:
                print(rule, res, val)
            continue
        maxval = maxes.get(res, inf)
        if debug:
            print(rule, res, val, maxval)
        ret[res] = partial(resource, value=val, maxvalue=maxval)
    return ret
64-
65-
def populate_metadata(config, runlib2samp=None, sample_meta=None, setfile_glob=None):
    """Fill ``config`` in place with metadata derived from the input files.

    Any of ``runlib2samp``, ``sample_meta`` and ``setfile_glob`` left as
    ``None`` is read from ``config["metadata"]``. The keys ``RUNLIB2SAMP``,
    ``SAMP2RUNLIB``, ``SAMPLESETS`` and ``CHROMS`` are then set, plus
    ``VARCALL_REGIONS`` when ``config`` has a ``varcall`` section.

    Raises:
        ValueError: a metadata path was neither passed nor configured.
        RuntimeError: ``config`` has no ``refs`` entry.
    """
    try:
        runlib2samp = (
            config["metadata"]["runlib2samp_file"] if runlib2samp is None else runlib2samp
        )
        sample_meta = (
            config["metadata"]["sample_meta_file"] if sample_meta is None else sample_meta
        )
        setfile_glob = (
            config["metadata"]["setfile_glob"] if setfile_glob is None else setfile_glob
        )
    except KeyError:
        raise ValueError("ERROR: metadata files must be configured in config, or passed to populate_metadata()")

    runlib_to_sample, sample_to_runlibs = make_runlib2samp(runlib2samp)
    config["RUNLIB2SAMP"] = runlib_to_sample
    config["SAMP2RUNLIB"] = sample_to_runlibs
    config["SAMPLESETS"] = make_samplesets(runlib2samp, setfile_glob)

    if "refs" not in config:
        raise RuntimeError("ERROR: reference(s) must be configured in config file")
    config["CHROMS"] = make_chroms(config["refs"])

    if "varcall" not in config:
        return
    # One set of regions per variant caller, each using its own chunk size.
    regions = {}
    for caller in config["varcall"]["chunksize"]:
        chunk = config["varcall"]["chunksize"][caller]
        regions[caller] = make_regions(config["refs"], window=chunk)
    config["VARCALL_REGIONS"] = regions
88-
89-
def parsefai(fai):
    """Yield ``(name, length)`` for every sequence listed in a ``.fai`` index.

    Only the first two whitespace-separated columns are used, so indexes with
    extra columns are accepted as well. Blank lines are skipped.

    Args:
        fai: Path to the index file.

    Yields:
        Tuples of sequence name (str) and sequence length (int).

    Raises:
        ValueError: a non-blank line has fewer than two columns, or its
            second column is not an integer.
    """
    with open(fai) as fh:
        for line in fh:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (typically a trailing one).
                continue
            cname, clen = fields[:2]
            yield cname, int(clen)
96-
97-
def make_regions(rdict, window=1e6, base=1):
    """Split every reference into fixed-size windows.

    For each entry of ``rdict`` the index ``<fasta>.fai`` is read and each
    sequence is cut into consecutive windows of at most ``window`` bases,
    formatted as ``name:start-end`` with zero-padded, 9-digit coordinates.
    ``base`` is added to each window's start coordinate.

    Returns:
        dict mapping reference name to its list of region strings.
    """
    step = int(window)
    regions = {}
    for refname, refbits in rdict.items():
        index_path = refbits['fasta'] + ".fai"
        regions[refname] = [
            "{}:{:09d}-{:09d}".format(
                cname, start + base, start + min(clen - start, step)
            )
            for cname, clen in parsefai(index_path)
            for start in range(0, clen, step)
        ]
    return regions
112-
113-
def make_chroms(rdict):
    """Map each reference name to a ``{sequence name: length}`` dict.

    Lengths are read from the ``<fasta>.fai`` index of every entry in
    ``rdict``.
    """
    return {
        refname: dict(parsefai(refbits['fasta'] + ".fai"))
        for refname, refbits in rdict.items()
    }
123-
124-
def _iter_metadata(s2rl_file):
    """Yield each row of a metadata table as a dict with lower-cased keys.

    Files whose name ends in ``.tsv`` are parsed as tab-separated; everything
    else is parsed as comma-separated.

    Args:
        s2rl_file: Path to the metadata table.

    Yields:
        One dict per data row, keyed by the lower-cased column headers.
    """
    dialect = "excel-tab" if s2rl_file.endswith(".tsv") else "excel"
    # newline="" lets the csv module do its own line-ending handling, so line
    # breaks embedded in quoted fields are preserved rather than translated.
    with open(s2rl_file, newline="") as fh:
        for row in csv.DictReader(fh, dialect=dialect):
            yield {key.lower(): value for key, value in row.items()}
132-
133-
def make_runlib2samp(s2rl_file):
    """Build the (run, library) <-> sample lookup tables.

    Rows with an empty library, a library whose name starts with "blank"
    (case-insensitive), or an ``include`` column other than ``"Y"`` are
    ignored.

    Returns:
        A pair ``(rl2s, s2rl)``: ``rl2s`` maps ``(run, library)`` to the
        sample name, ``s2rl`` maps a sample name to its list of
        ``(run, library)`` tuples.
    """
    runlib_to_sample = {}
    sample_to_runlibs = {}
    for row in _iter_metadata(s2rl_file):
        library = row["library"]
        # Blank libraries carry no sample.
        is_blank = not library or library.lower().startswith("blank")
        # Rows not flagged "Y" were not sequenced.
        if is_blank or row.get("include", "Y") != "Y":
            continue
        key = (row["run"], row["library"])
        sample = row["sample"]
        runlib_to_sample[key] = sample
        sample_to_runlibs.setdefault(sample, []).append(key)
    return runlib_to_sample, sample_to_runlibs
149-
150-
def stripext(path, exts=".txt"):
    """Remove trailing extension(s) from ``path``.

    Args:
        path: The file name or path to trim.
        exts: A single extension, or an iterable of extensions. They are
            tried in the given order and each one that matches the current
            end of the path is stripped, so several may be removed.

    Returns:
        ``path`` without the matched extension(s).
    """
    if isinstance(exts, str):
        exts = [exts]
    for ext in exts:
        # An empty extension matches every string, and ``path[:-0]`` would
        # then wipe out the whole path, so it is skipped.
        if ext and path.endswith(ext):
            path = path[:-len(ext)]
    return path
158-
159-
def make_samplesets(s2rl_file, setfile_glob):
    """Load the sample sets and mirror them under ``data/samplelists``.

    Each file matched by ``setfile_glob`` defines one set, named after the
    file's basename without ``.txt``, with one sample per line. An extra set
    ``all_samples`` holds the union of all sets. A list file
    ``data/samplelists/<set>.txt`` is (re)written only when it is missing or
    its content differs from the set, so unchanged sets keep their file
    untouched.

    Args:
        s2rl_file: Not used by this function.
        setfile_glob: Glob pattern matching the sample set files.

    Returns:
        dict mapping set name to the sorted list of its unique samples.
    """
    ssets = defaultdict(list)
    everything = set()
    for setfile in glob(setfile_glob):
        setname = stripext(basename(setfile), ".txt")
        with open(setfile) as fh:
            # Drop blank lines, which would otherwise yield a sample named "".
            samples = [name for name in (line.strip() for line in fh) if name]
        ssets[setname] = samples
        everything.update(samples)
    ssets["all_samples"] = everything

    os.makedirs("data/samplelists", exist_ok=True)
    with open("data/samplelists/GENERATED_FILES_DO_NOT_EDIT", "w") as fh:
        print("you're probably looking for", setfile_glob, file=fh)
    for setname, setsamps in ssets.items():
        fname = "data/samplelists/{}.txt".format(setname)
        try:
            with open(fname) as fh:
                currsamps = {line.strip() for line in fh}
        except IOError:
            # None marks a missing/unreadable file so that even an empty set
            # gets its list file created.
            currsamps = None
        if currsamps is None or set(setsamps) != currsamps:
            print("WARNING: updating sample sets, this will trigger reruns", setname, file=stderr)
            with open(fname, "w") as fh:
                for samp in sorted(setsamps):
                    print(samp, file=fh)
    return {name: sorted(set(samps)) for name, samps in ssets.items()}
0 commit comments