1 """
|
|
2 Parameters:
|
16
|
3 - fastq_dump_args: arguments to pass to fastq dumping tool
|
|
4 - fastq_dump_retries: number of retry attempts before dying
|
|
5 - fastq_dump_tool: name of tool to run, e.g. 'fastq-dump' or 'fasterq-dump'
|
10
|
6 - star: arguments to pass to STAR aligner
|
16
|
7 - salmon_args: arguments to pass to salmon quant
|
10
|
8 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
|
16
|
9 - bw_bed: name of BED file to use with bwtool
|
|
10 - max_unalign: maximum number of unaligned reads to save per run accession
|
10
|
11 """

STEPS = ['download', 'fastq_check', 'align', 'sort',
         'bamcount',
         'salmon',
         'align_unmapped',
         'extract_jx']

FILES = ['sjout.zst', 'fastq_check.tsv.zst',
         'unmapped.bam', 'unmapped.fastq.zst',
         'bamcount_nonref.csv.zst',
         'bamcount_auc.tsv',
         'bamcount_frag.tsv',
         'Chimeric.out.junction.zst',
         'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
         'all.bw.zst', 'unique.bw.zst',
         'salmon.tsv.zst',
         'jx_bed.zst',
         'manifest'] + list(map(lambda x: x + '.log', STEPS))

# Convert the user-supplied paths to absolute paths: this gets rid of the
# pesky warnings about relative paths and makes sure we're not assuming any
# wrong relative paths later on
import os
config['output'] = os.path.abspath(config['output'])
config['temp'] = os.path.abspath(config['temp'])
config['exon_bed'] = os.path.abspath(config['exon_bed'])
config['ref'] = os.path.abspath(config['ref'])

import sys  # needed for sys.stderr below
import subprocess

def run_command(cmd_args):
    cmd_args = ' '.join(cmd_args)
    try:
        subprocess.check_call(args=cmd_args, shell=True)
    except subprocess.CalledProcessError as cpe:
        sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
        raise cpe
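
# Example usage (illustrative, mirroring the zcat call further below):
#   run_command(['zcat', 'reads_1.fastq.gz', '>', 'reads_1.fastq'])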

import re
# Limit the outputs/steps to only: 1) STAR-called junctions, 2) all-reads
# per-base coverage (BigWigs), 3) all-reads exon-summarized coverage, and
# 4) AUC (for QC)
STEPS_FILES_FILTER = re.compile(r'(unmapped)|(download)|(salmon)|(extract_jx)|(jx_bed)|(manifest)|(nonref)|(Chimeric)|(fastq_check)|(frag)|(unique)')

def remove_steps_files():
    # modify STEPS and FILES in place so we don't run the download, fastq_check,
    # salmon, unmapped-read or junction-extraction steps
    global FILES
    global STEPS
    for i in reversed(range(0, len(STEPS))):
        f = STEPS[i]
        if STEPS_FILES_FILTER.search(f):
            del(STEPS[i])
    for i in reversed(range(0, len(FILES))):
        f = FILES[i]
        if STEPS_FILES_FILTER.search(f):
            del(FILES[i])
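
# For reference (derived from the lists above): after filtering, STEPS is
# reduced to ['align', 'sort', 'bamcount'] and FILES to ['sjout.zst',
# 'bamcount_auc.tsv', 'all.exon_bw_count.zst', 'all.bw.zst'] plus the
# align/sort/bamcount logs.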

FASTQ_PATT = re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$')
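# Illustrative match: for 'SRR1557855_1.fastq.gz', group(1) is 'SRR1557855'
# (used below as the run accession) and group(3) is '1' (the mate number);
# plain '.fq'/'.fastq' names with no mate suffix also match.
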
def prep_for_galaxy_run():
    remove_steps_files()
    try:
        os.mkdir(config['temp'])
    except OSError as ose:
        pass
    fastqs = config['inputs'].split(',')
    m = FASTQ_PATT.search(fastqs[0])
    run_acc = 'sample'
    if m is not None:
        run_acc = m.group(1)
    study_acc = 'study'
    if 'study' in config:
        study_acc = config['study']
    genome = 'hg38'
    if 'genome' in config:
        genome = config['genome']
    method = 'sra'
    # SRR,SRP,genome
    # e.g. SRR1557855,SRP045778,ce10
    # create links which will be used in the align step
    i = 1
    for f in fastqs:
        newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i)
        if 'compressed' in config:
            run_command(['zcat', f, '>', newf])
        else:
            os.symlink(os.path.abspath(f), newf)
        # create fastq 0
        if i == 1:
            os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0))
        i += 1
    # create fastq 2 if not paired
    if i == 2:
        open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close()
    return([run_acc, study_acc, genome, method])
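
# Note on naming: each per-run file is keyed by a "quad" of
# <run accession>_<study accession>_<genome>_<method>, e.g. (illustrative)
# SRR1557855_SRP045778_ce10_sra, which is what the {quad} wildcard in the
# rules below matches and splits on '_'.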

def get_accessions(wildcards):
    """
    Grouping of SRRs with the same SRP could happen here
    """
    # if running under galaxy where the user will input the
    # FASTQs, study, and genome directly
    if 'inputs' in config:
        (run_acc, study_acc, genome, method) = prep_for_galaxy_run()
        for ext in FILES:
            yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext))
        # here to get the make_galaxy_links rule to fire
        yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method))
    # not running under galaxy, takes a list of accessions
    else:
        for fn in config['input'].split():
            with open(fn, 'r') as fh:
                for ln in fh:
                    if ln.count(',') < 2:
                        continue
                    toks = ln.rstrip().split(',')
                    assert 3 <= len(toks) <= 4
                    method = 'sra'
                    if len(toks) == 4:
                        method = toks[3]
                    # SRR,SRP,genome
                    # e.g. SRR1557855,SRP045778,ce10
                    for ext in FILES:
                        yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext))
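
# Illustrative accession list (the file(s) named by config['input']); each line
# is run,study,genome[,method], with the method defaulting to 'sra':
#
#   SRR1557855,SRP045778,ce10
#   SRR1557855,SRP045778,ce10,sra    (equivalent; the 4th column picks the download method)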

rule all:
    input:
        get_accessions

rule make_manifest:
    input:
        config['output'] + '/{quad}.salmon.tsv.zst',
        config['output'] + '/{quad}.sjout.zst',
        config['output'] + '/{quad}.jx_bed.zst',
        config['output'] + '/{quad}.Chimeric.out.junction.zst',
        config['output'] + '/{quad}.unmapped.bam',
        config['output'] + '/{quad}.unmapped.fastq.zst',
        config['output'] + '/{quad}.bamcount_nonref.csv.zst',
        config['output'] + '/{quad}.bamcount_auc.tsv',
        config['output'] + '/{quad}.bamcount_frag.tsv',
        config['output'] + '/{quad}.fastq_check.tsv.zst',
        config['output'] + '/{quad}.all.exon_bw_count.zst',
        config['output'] + '/{quad}.unique.exon_bw_count.zst',
        config['output'] + '/{quad}.all.bw.zst',
        config['output'] + '/{quad}.unique.bw.zst',
        config['output'] + '/{quad}.align.log',
        config['output'] + '/{quad}.extract_jx.log',
        config['output'] + '/{quad}.bamcount.log',
        config['output'] + '/{quad}.align_unmapped.log',
        config['output'] + '/{quad}.download.log',
        config['output'] + '/{quad}.fastq_check.log',
        config['output'] + '/{quad}.sort.log',
        config['output'] + '/{quad}.salmon.log'
    output:
        config['output'] + '/{quad}.manifest'
    params:
        quad=lambda wildcards: wildcards.quad
    run:
        with open(output[0], 'wt') as fh:
            for fn in FILES:
                fh.write(params.quad + "." + fn + '\n')
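
# The manifest is a plain-text listing of that run's output names, one per line
# and one per entry in FILES, e.g. (illustrative):
#   SRR1557855_SRP045778_ce10_sra.sjout.zst
#   SRR1557855_SRP045778_ce10_sra.all.bw.zst
#   ...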

def galaxy_link_files(op):
    remove_steps_files()
    a = [op + '/' + f for f in FILES]
    return a

rule make_galaxy_links:
    input:
        config['output'] + '/{quad}.sjout.zst',
        #config['output'] + '/{quad}.fastq_check.tsv.zst',
        config['output'] + '/{quad}.bamcount_auc.tsv',
        #config['output'] + '/{quad}.bamcount_frag.tsv',
        config['output'] + '/{quad}.all.exon_bw_count.zst',
        #config['output'] + '/{quad}.unique.exon_bw_count.zst',
        config['output'] + '/{quad}.all.bw.zst',
        #config['output'] + '/{quad}.unique.bw.zst',
        #config['output'] + '/{quad}.fastq_check.log',
        #config['output'] + '/{quad}.align.log',
        #config['output'] + '/{quad}.sort.log',
        #config['output'] + '/{quad}.bamcount.log',
    output:
        config['output'] + '/{quad}.done'
    params:
        quad=lambda wildcards: wildcards.quad,
        out=galaxy_link_files(config['output'])
    run:
        for (i, fn) in enumerate(input):
            os.symlink(os.path.abspath(fn), params.out[i])
        os.symlink(os.path.abspath(input[-1]), output[0])

rule bamcount:
    input:
        bam=config['temp'] + '/{quad}-sorted.bam',
        bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
        bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed'))
    output:
        nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst',
        auc=config['output'] + '/{quad}.bamcount_auc.tsv',
        frag=config['output'] + '/{quad}.bamcount_frag.tsv',
        all_bw=config['output'] + '/{quad}.all.bw.zst',
        unique_bw=config['output'] + '/{quad}.unique.bw.zst',
        all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst',
        unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst'
    log:
        config['output'] + '/{quad}.bamcount.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        uniq_qual=config.get('unique_qual', 10)
    threads: 4
    shell:
        """
        TMP={config[temp]}/{params.srr}_bamcount
        bamcount {input.bam} \
            --threads {threads} \
            --coverage \
            --no-head \
            --require-mdz \
            --min-unique-qual {params.uniq_qual} \
            --frag-dist ${{TMP}} \
            --bigwig ${{TMP}} \
            --annotation {input.bed} ${{TMP}} \
            --auc ${{TMP}} \
            --alts ${{TMP}} \
            2>&1 | tee -a {log}

        #
        # --alts
        #
        (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.nonref})
        echo "COUNT_NonrefSize ${{size}}"
        rm -f ${{TMP}}.alts.tsv

        #
        # --auc
        #
        mv ${{TMP}}.auc.tsv {output.auc}
        size=$(wc -c < {output.auc})
        echo "COUNT_AucSize ${{size}}"
        rm -f ${{TMP}}.auc.tsv

        #
        # --frag-dist
        #
        mv ${{TMP}}.frags.tsv {output.frag}
        size=$(wc -c < {output.frag})
        echo "COUNT_FragDistSize ${{size}}"
        rm -f ${{TMP}}.frags.tsv

        #
        # --bigwig
        #
        (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.all_bw})
        echo "COUNT_BwSize ${{size}}"
        rm -f ${{TMP}}.all.bw

        (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.unique_bw})
        echo "COUNT_BwSize ${{size}}"
        rm -f ${{TMP}}.unique.bw

        #
        # --annotation
        #
        (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.all_bw_count})
        echo "COUNT_BwQuantSize ${{size}}"
        rm -f ${{TMP}}.all.tsv

        (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.unique_bw_count})
        echo "COUNT_BwQuantSize ${{size}}"
        rm -f ${{TMP}}.unique.tsv

        # Check that all temporaries were properly purged
        set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l)
        if (( $num_files > 0 )) ; then
            echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)"
            exit 1
        fi

        echo "COUNT_BamcountComplete 1"
        """

rule bw_zstd:
    input:
        config['temp'] + '/{prefix}.bw'
    output:
        config['output'] + '/{prefix}.bw.zst'
    shell:
        """
        zstd {input} -o {output}
        size=$(wc -c < {output})
        echo "COUNT_BwBytes ${{size}}"
        echo "COUNT_BwZstdComplete 1"
        """

rule sort:
    input:
        config['temp'] + '/{quad}.bam'
    wildcard_constraints:
        quad="[^-]+"
    output:
        bam=temp(config['temp'] + '/{quad}-sorted.bam'),
        bai=temp(config['temp'] + '/{quad}-sorted.bam.bai')
    log:
        config['output'] + '/{quad}.sort.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0]
    threads: 8
    shell:
        """
        TMP="{config[temp]}/sort_temp.{params.srr}"
        mkdir -p ${{TMP}}
        time samtools sort \
            -T ${{TMP}}/samtools_temp \
            -@ {threads} \
            -m 64M \
            -o {output.bam} {input} 2>&1 | tee -a {log}
        rm -rf ${{TMP}}
        size=$(wc -c < {output.bam})
        echo "COUNT_SortedBAMBytes ${{size}}"

        time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log}
        echo "COUNT_SortComplete 1"
        """

rule salmon:
    input:
        reads0=config['temp'] + '/{quad}_0.fastq',
        reads1=config['temp'] + '/{quad}_1.fastq',
        reads2=config['temp'] + '/{quad}_2.fastq',
        index1=lambda wildcards: '%s/%s/salmon_index/hash.bin' % (config['ref'], wildcards.quad.split('_')[2]),
        index2=lambda wildcards: '%s/%s/salmon_index/sa.bin' % (config['ref'], wildcards.quad.split('_')[2])
    output:
        config['output'] + '/{quad}.salmon.tsv.zst'
    log:
        config['output'] + '/{quad}.salmon.log'
    params:
        index_base=lambda wildcards: '%s/%s/salmon_index' % (config['ref'], wildcards.quad.split('_')[2]),
        salmon_args=config.get('salmon_args', '')
    threads: 8
    shell:
        """
        READ_FILES="-r {input.reads0}"
        if [[ -s {input.reads2} ]] ; then
            READ_FILES="-1 {input.reads1} -2 {input.reads2}"
        fi
        if set -o pipefail && time salmon quant \
                --libType U \
                --quiet \
                --validateMappings \
                -i {params.index_base} \
                -p {threads} \
                {params.salmon_args} \
                ${{READ_FILES}} \
                --output salmon_quant \
                --minAssignedFrags 1 \
                2>&1 | tee -a {log}
        then
            echo "COUNT_SalmonSuccess 1"

            time zstd salmon_quant/quant.sf -o {output} 2>&1 | tee -a {log}
            size=$(wc -c < {output})
            echo "COUNT_SalmonQuantBytes ${{size}}"
        else
            touch {output}
            echo "COUNT_SalmonFailure 1"
        fi

        rm -rf salmon_quant
        echo "COUNT_SalmonComplete 1"
        """

rule align_unmapped:
    input:
        unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
        unmapped2=config['temp'] + '/{quad}_2.unmappedfastq',
        index=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome.1.ht2' % (config['ref'], wildcards.quad.split('_')[2])
    output:
        bam=config['output'] + '/{quad}.unmapped.bam',
        sample=config['output'] + '/{quad}.unmapped.fastq.zst'
    log:
        config['output'] + '/{quad}.align_unmapped.log'
    params:
        index_base=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome' % (config['ref'], wildcards.quad.split('_')[2]),
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        hisat2_params=config.get('hisat2', ''),
        max_unalign=config.get('max_unalign', 100000)
    threads: 16
    shell:
        """
        TMP="{config[temp]}/align_unmapped_temp.{params.srr}"
        READ_FILES="-1 {input.unmapped1} -2 {input.unmapped2}"
        if [[ ! -s {input.unmapped2} ]] ; then
            READ_FILES="-U {input.unmapped1}"
        fi
        time hisat2 \
            $READ_FILES \
            -t --mm \
            -x {params.index_base} \
            --threads {threads} \
            {params.hisat2_params} \
            --un ${{TMP}}.un \
            --un-conc ${{TMP}}.un_conc \
            -S ${{TMP}}.sam \
            2>&1 | tee -a {log}

        #
        # Make BAM file out of aligned reads
        #
        time samtools view \
            -b -F 4 \
            -o {output.bam} \
            ${{TMP}}.sam \
            2>&1 | tee -a {log}
        rm -f ${{TMP}}.sam

        #
        # Save a subset of the doubly unaligned reads
        #
        if [[ ! -s {input.unmapped2} ]] ; then
            # unpaired
            fastq-sample \
                -n {params.max_unalign} \
                -o ${{TMP}}.samp \
                ${{TMP}}.un
            rm -f ${{TMP}}.un
        else
            # paired-end
            fastq-sample \
                -n {params.max_unalign} \
                -o ${{TMP}}.samp \
                ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
            rm -f ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc

            # interleave the two output fastqs into a single file
            paste ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq \
                | paste - - - - \
                | awk -v OFS="\n" -v FS="\t" '{{print($1,$3,$5,$7,$2,$4,$6,$8)}}' \
                > ${{TMP}}.samp.fastq
            rm -f ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq
        fi

        test -f ${{TMP}}.samp.fastq
        time zstd ${{TMP}}.samp.fastq -o {output.sample} 2>&1 | tee -a {log}
        size=$(wc -c < {output.sample})
        echo "COUNT_UnmappedSampleBytes ${{size}}"
        rm -f ${{TMP}}.samp.fastq

        size=$(wc -c < {output.bam})
        echo "COUNT_UnmappedBamBytes ${{size}}"

        echo "COUNT_AlignUnmappedComplete 1"
        """

rule extract_jx:
    input:
        bam=config['temp'] + '/{quad}-sorted.bam',
        bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
        fa=lambda wildcards: '%s/%s/fasta/genome.fa' % (config['ref'], wildcards.quad.split('_')[2]),
        gtf=lambda wildcards: '%s/%s/gtf/genes.gtf' % (config['ref'], wildcards.quad.split('_')[2])
    output:
        config['output'] + '/{quad}.jx_bed.zst'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0]
    log:
        config['output'] + '/{quad}.extract_jx.log'
    shell:
        """
        TMP="{config[temp]}/extract_jx.{params.srr}"
        time regtools junctions extract \
            -i 20 -a 1 \
            -o ${{TMP}}.jx_tmp \
            {input.bam} 2>&1 | tee -a {log}
        time zstd ${{TMP}}.jx_tmp -o {output} 2>&1 | tee -a {log}
        rm -f ${{TMP}}.jx_tmp

        size=$(wc -c < {output})
        echo "COUNT_ExtractJxBytes ${{size}}"

        echo "COUNT_ExtractJxComplete 1"
        """

rule align:
    input:
        reads0=config['temp'] + '/{quad}_0.fastq',
        reads1=config['temp'] + '/{quad}_1.fastq',
        reads2=config['temp'] + '/{quad}_2.fastq',
        index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]),
        index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2])
    wildcard_constraints:
        quad="[^-]+"
    output:
        bam=temp(config['temp'] + '/{quad}.bam'),
        jxs=config['output'] + '/{quad}.sjout.zst',
        chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst',
        unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
        unmapped2=config['temp'] + '/{quad}_2.unmappedfastq'
    log:
        config['output'] + '/{quad}.align.log'
    params:
        index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]),
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '')
    threads: 16
    shell:
        """
        READ_FILES="{input.reads0}"
        if [[ -s {input.reads2} ]] ; then
            READ_FILES="{input.reads1} {input.reads2}"
        fi
        TMP="{config[temp]}/align_temp.{params.srr}"
        rm -rf ${{TMP}}
        time STAR \
            {params.star_params} \
            --runMode alignReads \
            --runThreadN {threads} \
            --genomeDir {params.index_base} \
            --readFilesIn ${{READ_FILES}} \
            --twopassMode None \
            --outTmpDir ${{TMP}} \
            --outReadsUnmapped Fastx \
            --outMultimapperOrder Random \
            --outSAMreadID Number \
            --outSAMtype BAM Unsorted \
            --outSAMmode NoQS \
            --outSAMattributes NH MD \
            --chimOutType Junctions \
            --chimOutJunctionFormat 1 \
            --chimSegmentReadGapMax 3 \
            --chimJunctionOverhangMin 12 \
            --chimSegmentMin 12 2>&1 | tee -a {log}

        # Full set of output files:
        #
        # Aligned.out.bam
        # Chimeric.out.junction
        # Log.final.out
        # Log.out
        # Log.progress.out
        # SJ.out.tab
        # Unmapped.out.mate1
        # Unmapped.out.mate2 (if any reads were paired-end)

        #
        # Logs
        #
        rm -rf ${{TMP}}
        cat Log.out >> {log}
        cat Log.final.out >> {log}
        rm -f Log*.out

        #
        # Junctions
        #
        test -f SJ.out.tab
        time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log}
        rm -f SJ.out.tab
        size=$(wc -c < {output.jxs})
        echo "COUNT_CompressedJxBytes ${{size}}"

        #
        # Chimerics
        #
        test -f Chimeric.out.junction
        test -s Chimeric.out.junction
        sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted
        time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log}
        rm -f Chimeric.out.junction Chimeric.out.junction.sorted
        size=$(wc -c < {output.chimeric})
        echo "COUNT_ChimericBytes ${{size}}"

        #
        # Unmapped
        #
        touch {output.unmapped2}
        test -f Unmapped.out.mate1
        mv Unmapped.out.mate1 {output.unmapped1}
        if [[ -f Unmapped.out.mate2 ]] ; then
            mv Unmapped.out.mate2 {output.unmapped2}
        fi

        #
        # Alignments
        #
        size=$(wc -c < Aligned.out.bam)
        echo "COUNT_BAMBytes ${{size}}"
        mv Aligned.out.bam {output.bam}

        echo "COUNT_AlignComplete 1"
        """

rule fastq_check:
    input:
        reads0=config['temp'] + '/{quad}_0.fastq',
        reads1=config['temp'] + '/{quad}_1.fastq',
        reads2=config['temp'] + '/{quad}_2.fastq'
    output:
        config['output'] + '/{quad}.fastq_check.tsv.zst'
    log:
        config['output'] + '/{quad}.fastq_check.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0]
    shell:
        """
        TMP="{config[temp]}/fastq_check-{params.srr}.tsv"
        touch ${{TMP}}
        if [[ -s {input.reads0} ]] ; then
            time seqtk fqchk -q0 {input.reads0} >>${{TMP}} 2>>{log}
        fi
        if [[ -s {input.reads1} ]] ; then
            time seqtk fqchk -q0 {input.reads1} >>${{TMP}} 2>>{log}
        fi
        if [[ -s {input.reads2} ]] ; then
            time seqtk fqchk -q0 {input.reads2} >>${{TMP}} 2>>{log}
        fi
        time zstd ${{TMP}} -o {output} 2>&1 | tee -a {log}
        size=$(wc -c < {output})
        echo "COUNT_FastqCheckBytes ${{size}}"

        echo "COUNT_FastqCheckComplete 1"
        """

rule download:
    output:
        temp(config['temp'] + '/{quad}_0.fastq'),
        temp(config['temp'] + '/{quad}_1.fastq'),
        temp(config['temp'] + '/{quad}_2.fastq')
    log:
        config['output'] + '/{quad}.download.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        method=lambda wildcards: wildcards.quad.split('_')[3],
        fd_params=config.get('fastq_dump_args', ''),
        retries=config.get('fastq_dump_retries', '5')
    threads: 4
    shell:
        """
        set -x
        SUCCESS=0
        TIMEOUT=10
        PARAMS=""
        TMP="{config[temp]}/dl-{params.srr}"
        ! test -d ${{TMP}}

        if [[ {params.method} == "sra" ]] ; then
            USE_FASTERQ=1
            for i in {{1..{params.retries}}} ; do
                if time prefetch -t fasp -O ${{TMP}} -L info {params.srr} 2>&1 >> {log} ; then
                    SUCCESS=1
                    break
                else
                    echo "COUNT_SraRetries 1"
                    TIMEOUT=$((${{TIMEOUT}} * 2))
                    sleep ${{TIMEOUT}}
                fi
            done
            if (( $SUCCESS == 0 )) ; then
                echo "COUNT_SraFailures 1"
                exit 1
            fi
            test -f ${{TMP}}/*.sra
            size=$(cat ${{TMP}}/*.sra | wc -c)
            echo "COUNT_SraBytesDownloaded ${{size}}"
            if (( ${{USE_FASTERQ}} == 1 )) ; then
                time fasterq-dump ${{TMP}}/*.sra \
                    -e {threads} \
                    -t ${{TMP}} \
                    -L info \
                    --split-files \
                    --skip-technical \
                    -o {params.srr}.fastq \
                    2>&1 >> {log}
                test -f {params.srr}_2.fastq || mv {params.srr}.fastq {params.srr}_0.fastq
            else
                time fastq-dump ${{TMP}}/*.sra \
                    -L info \
                    --split-files \
                    --skip-technical \
                    -O . \
                    2>&1 >> {log}
                test -f {params.srr}_2.fastq || mv {params.srr}_1.fastq {params.srr}_0.fastq
            fi
            rm -rf ${{TMP}}
        elif [[ {params.method} == "gdc" ]] ; then
            TOKEN=~/gdc/latest.txt
            if [[ ! -f ${{TOKEN}} ]] ; then
                echo "ERROR: no GDC token file found at ${{TOKEN}}"
                exit 1
            fi
            mkdir -p ${{TMP}}  # gdc-client download/log directory
            for i in {{1..{params.retries}}} ; do
                if time gdc-client download \
                        -t ${{TOKEN}} \
                        --log-file ${{TMP}}/log.txt \
                        -d ${{TMP}} \
                        {params.srr} 2>&1 >> {log}
                then
                    SUCCESS=1
                    break
                else
                    echo "COUNT_GdcRetries 1"
                    TIMEOUT=$((${{TIMEOUT}} * 2))
                    sleep ${{TIMEOUT}}
                fi
            done
            if (( $SUCCESS == 0 )) ; then
                echo "COUNT_GdcFailures 1"
                exit 1
            fi
            test -d ${{TMP}}/{params.srr}
            test -f ${{TMP}}/{params.srr}/*.tar.gz

            echo "=== gdc-client log.txt begin ===" >> {log}
            cat ${{TMP}}/log.txt >> {log}
            echo "=== gdc-client log.txt end ===" >> {log}

            size=$(cat ${{TMP}}/{params.srr}/*.tar.gz | wc -c)
            echo "COUNT_GdcBytesDownloaded ${{size}}"

            tar zxvf ${{TMP}}/{params.srr}/*.tar.gz
            rm -rf ${{TMP}}

            num_1s=$(ls -1 *_1.fastq | wc -l)
            num_2s=$(ls -1 *_2.fastq | wc -l)
            if (( ${{num_1s}} == 0 )) ; then
                echo "ERROR: No _1.fastq files output"
                exit 1
            fi
            if (( ${{num_1s}} > 1 )) ; then
                echo "ERROR: More than one _1.fastq file found"
                exit 1
            fi
            if (( ${{num_2s}} == 0 )) ; then
                # unpaired
                mv *_1.fastq {params.srr}_0.fastq
            else
                # paired-end
                mv *_1.fastq {params.srr}_1.fastq
                mv *_2.fastq {params.srr}_2.fastq
            fi
        fi

        # Next chunk expects the FASTQ files to exist in the current directory
        # named {{params.srr}}_{{0,1,2}}.fastq
        size=0
        for i in {{0..2}} ; do
            fn={params.srr}_${{i}}.fastq
            if [[ -f ${{fn}} ]] ; then
                echo "COUNT_FilesDownloaded 1"
            else
                touch ${{fn}}
            fi
            size=$((${{size}} + $(wc -c < ${{fn}})))
            mv ${{fn}} {config[temp]}/{wildcards.quad}_${{i}}.fastq
        done
        echo "COUNT_BytesDownloaded ${{size}}"
        echo "COUNT_DownloadComplete 1"
        """
|