10
|
1 """
|
|
2 Parameters:
|
|
3 - star: arguments to pass to STAR aligner
|
|
4 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
|
|
5 """
|
|
6
|
|
7 STEPS = ['align', 'sort', 'bamcount']
|
|
8
|
|
9 FILES = ['sjout.zst',
|
|
10 'bamcount_nonref.csv.zst',
|
|
11 'bamcount_auc.tsv',
|
|
12 'bamcount_frag.tsv',
|
|
13 'Chimeric.out.junction.zst',
|
|
14 'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
|
|
15 'manifest']
|
|
16
|
14
|
17 #get rid of the pesky warnings about current paths
|
|
18 #and make sure we're not assuming any wrong relative paths
|
12
|
19 import os
|
|
20 config['output']=os.path.abspath(config['output'])
|
13
|
21 config['temp']=os.path.abspath(config['temp'])
|
14
|
22 config['exon_bed']=os.path.abspath(config['exon_bed'])
|
|
23 config['ref']=os.path.abspath(config['ref'])
|
12
|
24
|
10
|
25 import subprocess
|
|
26 def run_command(cmd_args):
|
|
27 cmd_args = ' '.join(cmd_args)
|
|
28 try:
|
|
29 subprocess.check_call(args=cmd_args, shell=True)
|
|
30 except subprocess.CalledProcessError as cpe:
|
|
31 sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
|
|
32 raise cpe
|
|
33
|
|
34
|
|
35 import re
|
11
|
36 FASTQ_PATT=re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$')
|
10
|
37 def prep_for_galaxy_run():
|
|
38 try:
|
|
39 os.mkdir(config['temp'])
|
|
40 except OSError as ose:
|
|
41 pass
|
|
42 fastqs = config['inputs'].split(',')
|
|
43 m = FASTQ_PATT.search(fastqs[0])
|
11
|
44 run_acc = 'sample'
|
|
45 if m is not None:
|
|
46 run_acc = m.group(1)
|
12
|
47 study_acc = 'study'
|
10
|
48 if 'study' in config:
|
|
49 study_acc = config['study']
|
|
50 genome = 'hg38'
|
|
51 if 'genome' in config:
|
|
52 genome = config['genome']
|
|
53 method = 'sra'
|
|
54 # SRR,SRP,genome
|
|
55 # e.g. SRR1557855,SRP045778,ce10
|
|
56 #create links which will be used in the align step
|
|
57 i = 1
|
|
58 for f in fastqs:
|
|
59 newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i)
|
15
|
60 if 'compressed' in config:
|
|
61 run_command(['zcat',f,'>',newf])
|
|
62 else:
|
|
63 os.symlink(os.path.abspath(f), newf)
|
10
|
64 #create fastq 0
|
|
65 if i == 1:
|
15
|
66 os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0))
|
10
|
67 i += 1
|
|
68 #create fastq 2 if not paired
|
|
69 if i == 2:
|
|
70 open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close()
|
|
71 #create expected file structure for annotated exon bed file & reference index
|
|
72 ref = config['ref']
|
|
73 config['ref'] = '.'
|
|
74 os.makedirs('%s/%s' % (config['ref'], genome))
|
|
75 os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome))
|
|
76 os.makedirs('%s/%s/gtf' % (config['ref'], genome))
|
|
77 os.symlink(config['exon_bed'], 'exons.tmp')
|
|
78 os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed')))
|
|
79 return([run_acc, study_acc, genome, method])
|
|
80
|
|
81
|
|
82 def get_accessions(wildcards):
|
|
83 """
|
|
84 Grouping of SRRs with the same SRP could happen here
|
|
85 """
|
|
86 #if running under galaxy where the user will input the
|
|
87 #FASTQs, study, and genome directly
|
|
88 if 'inputs' in config:
|
|
89 (run_acc, study_acc, genome, method) = prep_for_galaxy_run()
|
|
90 for ext in FILES:
|
|
91 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext))
|
|
92 #here to get the make_galaxy_links rule to fire
|
|
93 yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method))
|
|
94 #not running under galaxy, takes a list of accessions
|
|
95 else:
|
|
96 for fn in config['input'].split():
|
|
97 with open(fn, 'r') as fh:
|
|
98 for ln in fh:
|
|
99 if ln.count(',') < 2:
|
|
100 continue
|
|
101 toks = ln.rstrip().split(',')
|
|
102 assert 3 <= len(toks) <= 4
|
|
103 method = 'sra'
|
|
104 if len(toks) == 4:
|
|
105 method = toks[3]
|
|
106 # SRR,SRP,genome
|
|
107 # e.g. SRR1557855,SRP045778,ce10
|
|
108 for ext in FILES:
|
|
109 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext))
|
|
110
|
|
111 rule all:
|
|
112 input:
|
|
113 get_accessions
|
|
114
|
|
115
|
|
116 rule make_manifest:
|
|
117 input:
|
|
118 config['output'] + '/{quad}.sjout.zst',
|
|
119 config['output'] + '/{quad}.Chimeric.out.junction.zst',
|
|
120 config['output'] + '/{quad}.bamcount_nonref.csv.zst',
|
|
121 config['output'] + '/{quad}.bamcount_auc.tsv',
|
|
122 config['output'] + '/{quad}.bamcount_frag.tsv',
|
|
123 config['output'] + '/{quad}.all.exon_bw_count.zst',
|
|
124 config['output'] + '/{quad}.unique.exon_bw_count.zst',
|
|
125 output:
|
|
126 config['output'] + '/{quad}.manifest'
|
|
127 params:
|
|
128 quad=lambda wildcards: wildcards.quad
|
|
129 run:
|
|
130 with open(output[0], 'wt') as fh:
|
|
131 for fn in FILES:
|
|
132 fh.write(params.quad + "." + fn + '\n')
|
|
133
|
|
134 def galaxy_link_files(op):
|
|
135 a = [op + '/' + f for f in FILES]
|
|
136 a.extend([op + '/align.log', op + '/bamcount.log'])
|
|
137 return a
|
|
138
|
|
139 rule make_galaxy_links:
|
|
140 input:
|
|
141 config['output'] + '/{quad}.sjout.zst',
|
|
142 config['output'] + '/{quad}.bamcount_nonref.csv.zst',
|
|
143 config['output'] + '/{quad}.bamcount_auc.tsv',
|
|
144 config['output'] + '/{quad}.bamcount_frag.tsv',
|
|
145 config['output'] + '/{quad}.Chimeric.out.junction.zst',
|
|
146 config['output'] + '/{quad}.all.exon_bw_count.zst',
|
|
147 config['output'] + '/{quad}.unique.exon_bw_count.zst',
|
|
148 config['output'] + '/{quad}.manifest'
|
|
149 output:
|
|
150 config['output'] + '/{quad}.done'
|
|
151 params:
|
|
152 quad=lambda wildcards: wildcards.quad,
|
|
153 out=galaxy_link_files(config['output'])
|
|
154 run:
|
|
155 inputs = input.extend([config['output'] + '/' + params.quad + '.align.log',
|
|
156 config['output'] + '/' + params.quad + '.bamcount.log'])
|
|
157 for (i,fn) in enumerate(input):
|
|
158 os.symlink(os.path.abspath(fn), params.out[i])
|
|
159 os.symlink(os.path.abspath(input[-3]), output[0])
|
|
160
|
|
161
|
|
162 rule bamcount:
|
|
163 input:
|
|
164 bam=config['temp'] + '/{quad}-sorted.bam',
|
|
165 bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
|
|
166 bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed'))
|
|
167 output:
|
|
168 nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst',
|
|
169 auc=config['output'] + '/{quad}.bamcount_auc.tsv',
|
|
170 frag=config['output'] + '/{quad}.bamcount_frag.tsv',
|
|
171 all_bw=config['output'] + '/{quad}.all.bw.zst',
|
|
172 unique_bw=config['output'] + '/{quad}.unique.bw.zst',
|
|
173 all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst',
|
|
174 unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst'
|
|
175 log:
|
|
176 config['output'] + '/{quad}.bamcount.log'
|
|
177 params:
|
|
178 srr=lambda wildcards: wildcards.quad.split('_')[0],
|
|
179 uniq_qual=config.get('unique_qual', 10)
|
|
180 threads: 4
|
|
181 shell:
|
|
182 """
|
|
183 TMP={config[temp]}/{params.srr}_bamcount
|
14
|
184 bamcount {input.bam} \
|
10
|
185 --threads {threads} \
|
|
186 --coverage \
|
|
187 --no-head \
|
|
188 --require-mdz \
|
|
189 --min-unique-qual {params.uniq_qual} \
|
|
190 --frag-dist ${{TMP}} \
|
|
191 --bigwig ${{TMP}} \
|
|
192 --annotation {input.bed} ${{TMP}} \
|
|
193 --auc ${{TMP}} \
|
|
194 --alts ${{TMP}} \
|
|
195 2>&1 | tee -a {log}
|
|
196
|
|
197 #
|
|
198 # --alts
|
|
199 #
|
|
200
|
|
201 (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log}
|
|
202 size=$(wc -c < {output.nonref})
|
|
203 echo "COUNT_NonrefSize ${{size}}"
|
|
204 rm -f ${{TMP}}.alts.tsv
|
|
205
|
|
206 #
|
|
207 # --auc
|
|
208 #
|
|
209 mv ${{TMP}}.auc.tsv {output.auc}
|
|
210 size=$(wc -c < {output.auc})
|
|
211 echo "COUNT_AucSize ${{size}}"
|
|
212 rm -f ${{TMP}}.auc.tsv
|
|
213
|
|
214 #
|
|
215 # --frag-dist
|
|
216 #
|
|
217 mv ${{TMP}}.frags.tsv {output.frag}
|
|
218 size=$(wc -c < {output.frag})
|
|
219 echo "COUNT_FragDistSize ${{size}}"
|
|
220 rm -f ${{TMP}}.frags.tsv
|
|
221
|
|
222 #
|
|
223 # --bigwig
|
|
224 #
|
|
225
|
|
226 (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log}
|
|
227 size=$(wc -c < {output.all_bw})
|
|
228 echo "COUNT_BwSize ${{size}}"
|
|
229 rm -f ${{TMP}}.all.bw
|
|
230
|
|
231 (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log}
|
|
232 size=$(wc -c < {output.unique_bw})
|
|
233 echo "COUNT_BwSize ${{size}}"
|
|
234 rm -f ${{TMP}}.unique.bw
|
|
235
|
|
236 #
|
|
237 # --annotation
|
|
238 #
|
|
239
|
|
240 (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log}
|
|
241 size=$(wc -c < {output.all_bw_count})
|
|
242 echo "COUNT_BwQuantSize ${{size}}"
|
|
243 rm -f ${{TMP}}.all.tsv
|
|
244
|
|
245 (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log}
|
|
246 size=$(wc -c < {output.unique_bw_count})
|
|
247 echo "COUNT_BwQuantSize ${{size}}"
|
|
248 rm -f ${{TMP}}.unique.tsv
|
|
249
|
|
250 # Check that all temporaries were properly purged
|
|
251 set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l)
|
|
252 if (( $num_files > 0 )) ; then
|
|
253 echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)"
|
|
254 exit 1
|
|
255 fi
|
|
256
|
|
257 echo "COUNT_BamcountComplete 1"
|
|
258 """
|
|
259
|
|
260 rule sort:
|
|
261 input:
|
|
262 config['temp'] + '/{quad}.bam'
|
|
263 wildcard_constraints:
|
|
264 quad="[^-]+"
|
|
265 output:
|
|
266 bam=temp(config['temp'] + '/{quad}-sorted.bam'),
|
|
267 bai=temp(config['temp'] + '/{quad}-sorted.bam.bai')
|
|
268 log:
|
|
269 config['output'] + '/{quad}.sort.log'
|
|
270 params:
|
|
271 srr=lambda wildcards: wildcards.quad.split('_')[0]
|
|
272 threads: 8
|
|
273 shell:
|
|
274 """
|
|
275 TMP="{config[temp]}/sort_temp.{params.srr}"
|
|
276 mkdir -p ${{TMP}}
|
|
277 time samtools sort \
|
|
278 -T ${{TMP}}/samtools_temp \
|
|
279 -@ {threads} \
|
|
280 -m 64M \
|
|
281 -o {output.bam} {input} 2>&1 | tee -a {log}
|
|
282 rm -rf ${{TMP}}
|
|
283 size=$(wc -c < {output.bam})
|
|
284 echo "COUNT_SortedBAMBytes ${{size}}"
|
|
285
|
|
286 time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log}
|
|
287 echo "COUNT_SortComplete 1"
|
|
288 """
|
|
289
|
|
290 rule align:
|
|
291 input:
|
|
292 reads0=config['temp'] + '/{quad}_0.fastq',
|
|
293 reads1=config['temp'] + '/{quad}_1.fastq',
|
|
294 reads2=config['temp'] + '/{quad}_2.fastq',
|
|
295 index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]),
|
|
296 index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2])
|
|
297 wildcard_constraints:
|
|
298 quad="[^-]+"
|
|
299 output:
|
|
300 bam=temp(config['temp'] + '/{quad}.bam'),
|
|
301 jxs=config['output'] + '/{quad}.sjout.zst',
|
|
302 chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst',
|
|
303 unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
|
|
304 unmapped2=config['temp'] + '/{quad}_2.unmappedfastq'
|
|
305 log:
|
|
306 config['output'] + '/{quad}.align.log'
|
|
307 params:
|
|
308 index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]),
|
|
309 srr=lambda wildcards: wildcards.quad.split('_')[0],
|
|
310 star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '')
|
|
311 threads: 16
|
|
312 shell:
|
|
313 """
|
|
314 READ_FILES="{input.reads0}"
|
|
315 if [[ -s {input.reads2} ]] ; then
|
|
316 READ_FILES="{input.reads1} {input.reads2}"
|
|
317 fi
|
|
318 TMP="{config[temp]}/align_temp.{params.srr}"
|
|
319 rm -rf ${{TMP}}
|
|
320 time STAR \
|
|
321 {params.star_params} \
|
|
322 --runMode alignReads \
|
|
323 --runThreadN {threads} \
|
|
324 --genomeDir {params.index_base} \
|
|
325 --readFilesIn ${{READ_FILES}} \
|
|
326 --twopassMode None \
|
|
327 --outTmpDir ${{TMP}} \
|
|
328 --outReadsUnmapped Fastx \
|
|
329 --outMultimapperOrder Random \
|
|
330 --outSAMreadID Number \
|
|
331 --outSAMtype BAM Unsorted \
|
|
332 --outSAMmode NoQS \
|
|
333 --outSAMattributes NH MD \
|
|
334 --chimOutType Junctions \
|
|
335 --chimOutJunctionFormat 1 \
|
|
336 --chimSegmentReadGapMax 3 \
|
|
337 --chimJunctionOverhangMin 12 \
|
|
338 --chimSegmentMin 12 2>&1 | tee -a {log}
|
|
339
|
|
340 # Full set of output files:
|
|
341 #
|
|
342 # Aligned.out.bam
|
|
343 # Chimeric.out.junction
|
|
344 # Log.final.out
|
|
345 # Log.out
|
|
346 # Log.progress.out
|
|
347 # SJ.out.tab
|
|
348 # Unmapped.out.mate1
|
|
349 # Unmapped.out.mate2 (if any reads were paired-end)
|
|
350
|
|
351 #
|
|
352 # Logs
|
|
353 #
|
|
354 rm -rf ${{TMP}}
|
|
355 cat Log.out >> {log}
|
|
356 cat Log.final.out >> {log}
|
|
357 rm -f Log*.out
|
|
358
|
|
359 #
|
|
360 # Junctions
|
|
361 #
|
|
362 test -f SJ.out.tab
|
|
363 time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log}
|
|
364 rm -f SJ.out.tab
|
|
365 size=$(wc -c < {output.jxs})
|
|
366 echo "COUNT_CompressedJxBytes ${{size}}"
|
|
367
|
|
368 #
|
|
369 # Chimerics
|
|
370 #
|
|
371 test -f Chimeric.out.junction
|
|
372 test -s Chimeric.out.junction
|
|
373 sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted
|
|
374 time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log}
|
|
375 rm -f Chimeric.out.junction Chimeric.out.junction.sorted
|
|
376 size=$(wc -c < {output.chimeric})
|
|
377 echo "COUNT_ChimericBytes ${{size}}"
|
|
378
|
|
379 #
|
|
380 # Unmapped
|
|
381 #
|
|
382 touch {output.unmapped2}
|
|
383 test -f Unmapped.out.mate1
|
|
384 mv Unmapped.out.mate1 {output.unmapped1}
|
|
385 if [[ -f Unmapped.out.mate2 ]] ; then
|
|
386 mv Unmapped.out.mate2 {output.unmapped2}
|
|
387 fi
|
|
388
|
|
389 #
|
|
390 # Alignments
|
|
391 #
|
|
392 size=$(wc -c < Aligned.out.bam)
|
|
393 echo "COUNT_BAMBytes ${{size}}"
|
|
394 mv Aligned.out.bam {output.bam}
|
|
395 echo "COUNT_AlignComplete 1"
|
|
396 """
|