"""
Parameters:
- fastq_dump_args: arguments to pass to the fastq dumping tool
- fastq_dump_retries: number of retry attempts before dying
- fastq_dump_tool: name of tool to run, e.g. 'fastq-dump' or 'fasterq-dump'
- star: arguments to pass to the STAR aligner
- salmon_args: arguments to pass to salmon quant
- unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
- bw_bed: name of BED file to use with bwtool
- max_unalign: maximum number of unaligned reads to save per run accession
"""

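# A minimal example invocation (paths and filenames here are hypothetical;
# the config keys are the ones this Snakefile actually reads):
#
#   snakemake --config \
#       input='accessions.txt' output='/path/to/out' temp='/path/to/tmp' \
#       ref='/path/to/refs' exon_bed='/path/to/exons.bed'
#
# where each line of accessions.txt looks like: SRR1557855,SRP045778,ce10
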
STEPS = ['download', 'fastq_check', 'align', 'sort',
         'bamcount',
         'salmon',
         'align_unmapped',
         'extract_jx']

FILES = ['sjout.zst', 'fastq_check.tsv.zst',
         'unmapped.bam', 'unmapped.fastq.zst',
         'bamcount_nonref.csv.zst',
         'bamcount_auc.tsv',
         'bamcount_frag.tsv',
         'Chimeric.out.junction.zst',
         'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
         'all.bw.zst', 'unique.bw.zst',
         'salmon.tsv.zst',
         'jx_bed.zst',
         'manifest'] + list(map(lambda x: x + '.log', STEPS))

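# Every sample is identified by a "quad" of run_study_genome_method
# (e.g. SRR1557855_SRP045778_ce10_sra); each FILES entry becomes one
# per-sample output named <quad>.<ext> under config['output'].
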
# get rid of the pesky warnings about current paths
# and make sure we're not assuming any wrong relative paths
import os
config['output'] = os.path.abspath(config['output'])
config['temp'] = os.path.abspath(config['temp'])
config['exon_bed'] = os.path.abspath(config['exon_bed'])
config['ref'] = os.path.abspath(config['ref'])

import sys
import subprocess


def run_command(cmd_args):
    cmd_args = ' '.join(cmd_args)
    try:
        subprocess.check_call(args=cmd_args, shell=True)
    except subprocess.CalledProcessError as cpe:
        sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
        raise cpe

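# Example usage (mirrors the zcat call in prep_for_galaxy_run below;
# the filenames are hypothetical):
#   run_command(['zcat', 'reads_1.fastq.gz', '>', 'reads_1.fastq'])
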
import re
STEPS_FILES_FILTER = re.compile(r'(unmapped)|(download)|(salmon)|(extract_jx)|(jx_bed)|(manifest)')


def remove_steps_files():
    # modify STEPS and FILES so we don't run the download, unmapped,
    # salmon, extract_jx, or manifest steps (used for galaxy runs)
    global FILES
    global STEPS
    for i in reversed(range(0, len(STEPS))):
        f = STEPS[i]
        if STEPS_FILES_FILTER.search(f):
            del STEPS[i]
    for i in reversed(range(0, len(FILES))):
        f = FILES[i]
        if STEPS_FILES_FILTER.search(f):
            del FILES[i]


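# Pulls the run accession (and optional mate number) out of an input FASTQ
# filename, e.g. "SRR1557855_1.fastq.gz" -> accession "SRR1557855", mate "1"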
FASTQ_PATT = re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$')
def prep_for_galaxy_run():
    remove_steps_files()
    try:
        os.mkdir(config['temp'])
    except OSError as ose:
        pass
    fastqs = config['inputs'].split(',')
    m = FASTQ_PATT.search(fastqs[0])
    run_acc = 'sample'
    if m is not None:
        run_acc = m.group(1)
    study_acc = 'study'
    if 'study' in config:
        study_acc = config['study']
    genome = 'hg38'
    if 'genome' in config:
        genome = config['genome']
    method = 'sra'
    # SRR,SRP,genome
    # e.g. SRR1557855,SRP045778,ce10
    # create links which will be used in the align step
    i = 1
    for f in fastqs:
        newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i)
        if 'compressed' in config:
            run_command(['zcat', f, '>', newf])
        else:
            os.symlink(os.path.abspath(f), newf)
        # create fastq 0
        if i == 1:
            os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0))
        i += 1
    # create an empty fastq 2 if the input was not paired
    if i == 2:
        open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close()
    # create expected file structure for annotated exon bed file & reference index
    ref = config['ref']
    config['ref'] = os.path.abspath('.')
    os.makedirs('%s/%s' % (config['ref'], genome))
    os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome))
    os.makedirs('%s/%s/gtf' % (config['ref'], genome))
    os.symlink(config['exon_bed'], 'exons.tmp')
    os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed')))
    # TODO: may need to add additional symlinking for HISAT2 unmapped alignments
    return [run_acc, study_acc, genome, method]


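# When not running under galaxy, config['input'] is a whitespace-separated list
# of text files, each holding one sample per line as run,study,genome[,method],
# e.g. SRR1557855,SRP045778,ce10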
def get_accessions(wildcards):
    """
    Grouping of SRRs with the same SRP could happen here
    """
    # if running under galaxy where the user will input the
    # FASTQs, study, and genome directly
    if 'inputs' in config:
        (run_acc, study_acc, genome, method) = prep_for_galaxy_run()
        for ext in FILES:
            yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext))
        # here to get the make_galaxy_links rule to fire
        yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method))
    # not running under galaxy; takes a list of accession files
    else:
        for fn in config['input'].split():
            with open(fn, 'r') as fh:
                for ln in fh:
                    if ln.count(',') < 2:
                        continue
                    toks = ln.rstrip().split(',')
                    assert 3 <= len(toks) <= 4
                    method = 'sra'
                    if len(toks) == 4:
                        method = toks[3]
                    # SRR,SRP,genome
                    # e.g. SRR1557855,SRP045778,ce10
                    for ext in FILES:
                        yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext))

rule all:
    input:
        get_accessions

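# Write the per-sample manifest: one line per expected output file for the quad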
rule make_manifest:
    input:
        config['output'] + '/{quad}.salmon.tsv.zst',
        config['output'] + '/{quad}.sjout.zst',
        config['output'] + '/{quad}.jx_bed.zst',
        config['output'] + '/{quad}.Chimeric.out.junction.zst',
        config['output'] + '/{quad}.unmapped.bam',
        config['output'] + '/{quad}.unmapped.fastq.zst',
        config['output'] + '/{quad}.bamcount_nonref.csv.zst',
        config['output'] + '/{quad}.bamcount_auc.tsv',
        config['output'] + '/{quad}.bamcount_frag.tsv',
        config['output'] + '/{quad}.fastq_check.tsv.zst',
        config['output'] + '/{quad}.all.exon_bw_count.zst',
        config['output'] + '/{quad}.unique.exon_bw_count.zst',
        config['output'] + '/{quad}.all.bw.zst',
        config['output'] + '/{quad}.unique.bw.zst',
        config['output'] + '/{quad}.align.log',
        config['output'] + '/{quad}.extract_jx.log',
        config['output'] + '/{quad}.bamcount.log',
        config['output'] + '/{quad}.align_unmapped.log',
        config['output'] + '/{quad}.download.log',
        config['output'] + '/{quad}.fastq_check.log',
        config['output'] + '/{quad}.sort.log',
        config['output'] + '/{quad}.salmon.log'
    output:
        config['output'] + '/{quad}.manifest'
    params:
        quad=lambda wildcards: wildcards.quad
    run:
        with open(output[0], 'wt') as fh:
            for fn in FILES:
                fh.write(params.quad + "." + fn + '\n')

def galaxy_link_files(op):
    remove_steps_files()
    a = [op + '/' + f for f in FILES]
    return a

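# Under galaxy, link each per-quad output to the plain per-step filename that
# the wrapper expects, then create the {quad}.done sentinel as a link to the
# last input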
rule make_galaxy_links:
    input:
        config['output'] + '/{quad}.sjout.zst',
        config['output'] + '/{quad}.fastq_check.tsv.zst',
        config['output'] + '/{quad}.bamcount_nonref.csv.zst',
        config['output'] + '/{quad}.bamcount_auc.tsv',
        config['output'] + '/{quad}.bamcount_frag.tsv',
        config['output'] + '/{quad}.Chimeric.out.junction.zst',
        config['output'] + '/{quad}.all.exon_bw_count.zst',
        config['output'] + '/{quad}.unique.exon_bw_count.zst',
        config['output'] + '/{quad}.all.bw.zst',
        config['output'] + '/{quad}.unique.bw.zst',
        config['output'] + '/{quad}.fastq_check.log',
        config['output'] + '/{quad}.align.log',
        config['output'] + '/{quad}.sort.log',
        config['output'] + '/{quad}.bamcount.log'
    output:
        config['output'] + '/{quad}.done'
    params:
        quad=lambda wildcards: wildcards.quad,
        out=galaxy_link_files(config['output'])
    run:
        for (i, fn) in enumerate(input):
            os.symlink(os.path.abspath(fn), params.out[i])
        os.symlink(os.path.abspath(input[-1]), output[0])


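# bamcount: compute coverage-based outputs from the sorted BAM: non-reference
# alleles (--alts), AUC, fragment-length distribution, all/unique bigWigs, and
# exon-level quantifications over the annotation BED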
rule bamcount:
    input:
        bam=config['temp'] + '/{quad}-sorted.bam',
        bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
        bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed'))
    output:
        nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst',
        auc=config['output'] + '/{quad}.bamcount_auc.tsv',
        frag=config['output'] + '/{quad}.bamcount_frag.tsv',
        all_bw=config['output'] + '/{quad}.all.bw.zst',
        unique_bw=config['output'] + '/{quad}.unique.bw.zst',
        all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst',
        unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst'
    log:
        config['output'] + '/{quad}.bamcount.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        uniq_qual=config.get('unique_qual', 10)
    threads: 4
    shell:
        """
        TMP={config[temp]}/{params.srr}_bamcount
        bamcount {input.bam} \
            --threads {threads} \
            --coverage \
            --no-head \
            --require-mdz \
            --min-unique-qual {params.uniq_qual} \
            --frag-dist ${{TMP}} \
            --bigwig ${{TMP}} \
            --annotation {input.bed} ${{TMP}} \
            --auc ${{TMP}} \
            --alts ${{TMP}} \
            2>&1 | tee -a {log}

        #
        # --alts
        #
        (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.nonref})
        echo "COUNT_NonrefSize ${{size}}"
        rm -f ${{TMP}}.alts.tsv

        #
        # --auc
        #
        mv ${{TMP}}.auc.tsv {output.auc}
        size=$(wc -c < {output.auc})
        echo "COUNT_AucSize ${{size}}"
        rm -f ${{TMP}}.auc.tsv

        #
        # --frag-dist
        #
        mv ${{TMP}}.frags.tsv {output.frag}
        size=$(wc -c < {output.frag})
        echo "COUNT_FragDistSize ${{size}}"
        rm -f ${{TMP}}.frags.tsv

        #
        # --bigwig
        #
        (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.all_bw})
        echo "COUNT_BwSize ${{size}}"
        rm -f ${{TMP}}.all.bw

        (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.unique_bw})
        echo "COUNT_BwSize ${{size}}"
        rm -f ${{TMP}}.unique.bw

        #
        # --annotation
        #
        (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.all_bw_count})
        echo "COUNT_BwQuantSize ${{size}}"
        rm -f ${{TMP}}.all.tsv

        (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log}
        size=$(wc -c < {output.unique_bw_count})
        echo "COUNT_BwQuantSize ${{size}}"
        rm -f ${{TMP}}.unique.tsv

        # Check that all temporaries were properly purged
        set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l)
        if (( $num_files > 0 )) ; then
            echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)"
            exit 1
        fi

        echo "COUNT_BamcountComplete 1"
        """

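# bw_zstd: generic rule to zstd-compress a bigWig from the temp directory into
# the output directory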
rule bw_zstd:
    input:
        config['temp'] + '/{prefix}.bw'
    output:
        config['output'] + '/{prefix}.bw.zst'
    shell:
        """
        zstd {input} -o {output}
        size=$(wc -c < {output})
        echo "COUNT_BwBytes ${{size}}"
        echo "COUNT_BwZstdComplete 1"
        """

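# sort: coordinate-sort and index the STAR BAM with samtools; both outputs are
# temporary and feed the bamcount and extract_jx rules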
rule sort:
    input:
        config['temp'] + '/{quad}.bam'
    wildcard_constraints:
        quad="[^-]+"
    output:
        bam=temp(config['temp'] + '/{quad}-sorted.bam'),
        bai=temp(config['temp'] + '/{quad}-sorted.bam.bai')
    log:
        config['output'] + '/{quad}.sort.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0]
    threads: 8
    shell:
        """
        TMP="{config[temp]}/sort_temp.{params.srr}"
        mkdir -p ${{TMP}}
        time samtools sort \
            -T ${{TMP}}/samtools_temp \
            -@ {threads} \
            -m 64M \
            -o {output.bam} {input} 2>&1 | tee -a {log}
        rm -rf ${{TMP}}
        size=$(wc -c < {output.bam})
        echo "COUNT_SortedBAMBytes ${{size}}"

        time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log}
        echo "COUNT_SortComplete 1"
        """

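# salmon: quantify against the per-genome salmon_index; paired vs. unpaired is
# decided by whether {quad}_2.fastq is non-empty, and a salmon failure is
# tolerated (empty output + COUNT_SalmonFailure) rather than failing the run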
rule salmon:
    input:
        reads0=config['temp'] + '/{quad}_0.fastq',
        reads1=config['temp'] + '/{quad}_1.fastq',
        reads2=config['temp'] + '/{quad}_2.fastq',
        index1=lambda wildcards: '%s/%s/salmon_index/hash.bin' % (config['ref'], wildcards.quad.split('_')[2]),
        index2=lambda wildcards: '%s/%s/salmon_index/sa.bin' % (config['ref'], wildcards.quad.split('_')[2])
    output:
        config['output'] + '/{quad}.salmon.tsv.zst'
    log:
        config['output'] + '/{quad}.salmon.log'
    params:
        index_base=lambda wildcards: '%s/%s/salmon_index' % (config['ref'], wildcards.quad.split('_')[2]),
        salmon_args=config.get('salmon_args', '')
    threads: 8
    shell:
        """
        READ_FILES="-r {input.reads0}"
        if [[ -s {input.reads2} ]] ; then
            READ_FILES="-1 {input.reads1} -2 {input.reads2}"
        fi
        if set -o pipefail && time salmon quant \
                --libType U \
                --quiet \
                --validateMappings \
                -i {params.index_base} \
                -p {threads} \
                {params.salmon_args} \
                ${{READ_FILES}} \
                --output salmon_quant \
                --minAssignedFrags 1 \
                2>&1 | tee -a {log}
        then
            echo "COUNT_SalmonSuccess 1"

            time zstd salmon_quant/quant.sf -o {output} 2>&1 | tee -a {log}
            size=$(wc -c < {output})
            echo "COUNT_SalmonQuantBytes ${{size}}"
        else
            touch {output}
            echo "COUNT_SalmonFailure 1"
        fi

        rm -rf salmon_quant
        echo "COUNT_SalmonComplete 1"
        """

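# align_unmapped: re-align the reads STAR left unmapped using HISAT2 against
# unmapped_hisat2_idx, keep the aligned reads as a BAM, and save a sample of at
# most max_unalign still-unaligned reads (via fastq-sample) as compressed FASTQ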
rule align_unmapped:
    input:
        unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
        unmapped2=config['temp'] + '/{quad}_2.unmappedfastq',
        index=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome.1.ht2' % (config['ref'], wildcards.quad.split('_')[2])
    output:
        bam=config['output'] + '/{quad}.unmapped.bam',
        sample=config['output'] + '/{quad}.unmapped.fastq.zst'
    log:
        config['output'] + '/{quad}.align_unmapped.log'
    params:
        index_base=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome' % (config['ref'], wildcards.quad.split('_')[2]),
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        hisat2_params=config.get('hisat2', ''),
        max_unalign=config.get('max_unalign', 100000)
    threads: 16
    shell:
        """
        TMP="{config[temp]}/align_unmapped_temp.{params.srr}"
        READ_FILES="-1 {input.unmapped1} -2 {input.unmapped2}"
        if [[ ! -s {input.unmapped2} ]] ; then
            READ_FILES="-U {input.unmapped1}"
        fi
        time hisat2 \
            $READ_FILES \
            -t --mm \
            -x {params.index_base} \
            --threads {threads} \
            {params.hisat2_params} \
            --un ${{TMP}}.un \
            --un-conc ${{TMP}}.un_conc \
            -S ${{TMP}}.sam \
            2>&1 | tee -a {log}

        #
        # Make BAM file out of aligned reads
        #
        time samtools view \
            -b -F 4 \
            -o {output.bam} \
            ${{TMP}}.sam \
            2>&1 | tee -a {log}
        rm -f ${{TMP}}.sam

        #
        # Save a subset of the doubly unaligned reads
        #
        if [[ ! -s {input.unmapped2} ]] ; then
            # unpaired
            fastq-sample \
                -n {params.max_unalign} \
                -o ${{TMP}}.samp \
                ${{TMP}}.un
            rm -f ${{TMP}}.un
        else
            # paired-end
            fastq-sample \
                -n {params.max_unalign} \
                -o ${{TMP}}.samp \
                ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
            rm -f ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc

            # interleave the two output fastqs into a single file
            paste ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq \
                | paste - - - - \
                | awk -v OFS="\n" -v FS="\t" '{{print($1,$3,$5,$7,$2,$4,$6,$8)}}' \
                > ${{TMP}}.samp.fastq
            rm -f ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq
        fi

        test -f ${{TMP}}.samp.fastq
        time zstd ${{TMP}}.samp.fastq -o {output.sample} 2>&1 | tee -a {log}
        size=$(wc -c < {output.sample})
        echo "COUNT_UnmappedSampleBytes ${{size}}"
        rm -f ${{TMP}}.samp.fastq

        size=$(wc -c < {output.bam})
        echo "COUNT_UnmappedBamBytes ${{size}}"

        echo "COUNT_AlignUnmappedComplete 1"
        """

rule extract_jx:
    input:
        bam=config['temp'] + '/{quad}-sorted.bam',
        bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
        fa=lambda wildcards: '%s/%s/fasta/genome.fa' % (config['ref'], wildcards.quad.split('_')[2]),
        gtf=lambda wildcards: '%s/%s/gtf/genes.gtf' % (config['ref'], wildcards.quad.split('_')[2])
    output:
        config['output'] + '/{quad}.jx_bed.zst'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0]
    log:
        config['output'] + '/{quad}.extract_jx.log'
    shell:
        """
        TMP="{config[temp]}/extract_jx.{params.srr}"
        time regtools junctions extract \
            -i 20 -a 1 \
            -o ${{TMP}}.jx_tmp \
            {input.bam} 2>&1 | tee -a {log}
        time zstd ${{TMP}}.jx_tmp -o {output} 2>&1 | tee -a {log}
        rm -f ${{TMP}}.jx_tmp

        size=$(wc -c < {output})
        echo "COUNT_ExtractJxBytes ${{size}}"

        echo "COUNT_ExtractJxComplete 1"
        """

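# align: STAR alignment for the quad; inputs follow the _0/_1/_2 FASTQ
# convention set up by the download/galaxy-prep steps (_0 holds unpaired reads,
# _1/_2 the mates of a paired-end run, with an empty _2 meaning unpaired)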
rule align:
    input:
        reads0=config['temp'] + '/{quad}_0.fastq',
        reads1=config['temp'] + '/{quad}_1.fastq',
        reads2=config['temp'] + '/{quad}_2.fastq',
        index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]),
        index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2])
    wildcard_constraints:
        quad="[^-]+"
    output:
        bam=temp(config['temp'] + '/{quad}.bam'),
        jxs=config['output'] + '/{quad}.sjout.zst',
        chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst',
        unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
        unmapped2=config['temp'] + '/{quad}_2.unmappedfastq'
    log:
        config['output'] + '/{quad}.align.log'
    params:
        index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]),
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '')
    threads: 16
    shell:
        """
        READ_FILES="{input.reads0}"
        if [[ -s {input.reads2} ]] ; then
            READ_FILES="{input.reads1} {input.reads2}"
        fi
        TMP="{config[temp]}/align_temp.{params.srr}"
        rm -rf ${{TMP}}
        time STAR \
            {params.star_params} \
            --runMode alignReads \
            --runThreadN {threads} \
            --genomeDir {params.index_base} \
            --readFilesIn ${{READ_FILES}} \
            --twopassMode None \
            --outTmpDir ${{TMP}} \
            --outReadsUnmapped Fastx \
            --outMultimapperOrder Random \
            --outSAMreadID Number \
            --outSAMtype BAM Unsorted \
            --outSAMmode NoQS \
            --outSAMattributes NH MD \
            --chimOutType Junctions \
            --chimOutJunctionFormat 1 \
            --chimSegmentReadGapMax 3 \
            --chimJunctionOverhangMin 12 \
            --chimSegmentMin 12 2>&1 | tee -a {log}

        # Full set of output files:
        #
        # Aligned.out.bam
        # Chimeric.out.junction
        # Log.final.out
        # Log.out
        # Log.progress.out
        # SJ.out.tab
        # Unmapped.out.mate1
        # Unmapped.out.mate2 (if any reads were paired-end)

        #
        # Logs
        #
        rm -rf ${{TMP}}
        cat Log.out >> {log}
        cat Log.final.out >> {log}
        rm -f Log*.out

        #
        # Junctions
        #
        test -f SJ.out.tab
        time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log}
        rm -f SJ.out.tab
        size=$(wc -c < {output.jxs})
        echo "COUNT_CompressedJxBytes ${{size}}"

        #
        # Chimerics
        #
        test -f Chimeric.out.junction
        test -s Chimeric.out.junction
        sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted
        time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log}
        rm -f Chimeric.out.junction Chimeric.out.junction.sorted
        size=$(wc -c < {output.chimeric})
        echo "COUNT_ChimericBytes ${{size}}"

        #
        # Unmapped
        #
        touch {output.unmapped2}
        test -f Unmapped.out.mate1
        mv Unmapped.out.mate1 {output.unmapped1}
        if [[ -f Unmapped.out.mate2 ]] ; then
            mv Unmapped.out.mate2 {output.unmapped2}
        fi

        #
        # Alignments
        #
        size=$(wc -c < Aligned.out.bam)
        echo "COUNT_BAMBytes ${{size}}"
        mv Aligned.out.bam {output.bam}

        echo "COUNT_AlignComplete 1"
        """

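# fastq_check: per-mate seqtk fqchk quality/composition summaries, concatenated
# and zstd-compressed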
rule fastq_check:
    input:
        reads0=config['temp'] + '/{quad}_0.fastq',
        reads1=config['temp'] + '/{quad}_1.fastq',
        reads2=config['temp'] + '/{quad}_2.fastq'
    output:
        config['output'] + '/{quad}.fastq_check.tsv.zst'
    log:
        config['output'] + '/{quad}.fastq_check.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0]
    shell:
        """
        TMP="{config[temp]}/fastq_check-{params.srr}.tsv"
        touch ${{TMP}}
        if [[ -s {input.reads0} ]] ; then
            time seqtk fqchk -q0 {input.reads0} >>${{TMP}} 2>>{log}
        fi
        if [[ -s {input.reads1} ]] ; then
            time seqtk fqchk -q0 {input.reads1} >>${{TMP}} 2>>{log}
        fi
        if [[ -s {input.reads2} ]] ; then
            time seqtk fqchk -q0 {input.reads2} >>${{TMP}} 2>>{log}
        fi
        time zstd ${{TMP}} -o {output} 2>&1 | tee -a {log}
        size=$(wc -c < {output})
        echo "COUNT_FastqCheckBytes ${{size}}"

        echo "COUNT_FastqCheckComplete 1"
        """

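# download: fetch FASTQs for the quad; method 'sra' uses prefetch plus
# fasterq-dump (or fastq-dump), method 'gdc' uses gdc-client with a token at
# ~/gdc/latest.txt; results are normalized to {quad}_0/_1/_2.fastq in temp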
rule download:
    output:
        temp(config['temp'] + '/{quad}_0.fastq'),
        temp(config['temp'] + '/{quad}_1.fastq'),
        temp(config['temp'] + '/{quad}_2.fastq')
    log:
        config['output'] + '/{quad}.download.log'
    params:
        srr=lambda wildcards: wildcards.quad.split('_')[0],
        method=lambda wildcards: wildcards.quad.split('_')[3],
        fd_params=config.get('fastq_dump_args', ''),
        retries=config.get('fastq_dump_retries', '5')
    threads: 4
    shell:
        """
        set -x
        SUCCESS=0
        TIMEOUT=10
        PARAMS=""
        TMP="{config[temp]}/dl-{params.srr}"
        ! test -d ${{TMP}}

        if [[ {params.method} == "sra" ]] ; then
            USE_FASTERQ=1
            for i in {{1..{params.retries}}} ; do
                if time prefetch -t fasp -O ${{TMP}} -L info {params.srr} 2>&1 >> {log} ; then
                    SUCCESS=1
                    break
                else
                    echo "COUNT_SraRetries 1"
                    TIMEOUT=$((${{TIMEOUT}} * 2))
                    sleep ${{TIMEOUT}}
                fi
            done
            if (( $SUCCESS == 0 )) ; then
                echo "COUNT_SraFailures 1"
                exit 1
            fi
            test -f ${{TMP}}/*.sra
            size=$(cat ${{TMP}}/*.sra | wc -c)
            echo "COUNT_SraBytesDownloaded ${{size}}"
            if (( ${{USE_FASTERQ}} == 1 )) ; then
                time fasterq-dump ${{TMP}}/*.sra \
                    -e {threads} \
                    -t ${{TMP}} \
                    -L info \
                    --split-files \
                    --skip-technical \
                    -o {params.srr}.fastq \
                    2>&1 >> {log}
                test -f {params.srr}_2.fastq || mv {params.srr}.fastq {params.srr}_0.fastq
            else
                time fastq-dump ${{TMP}}/*.sra \
                    -L info \
                    --split-files \
                    --skip-technical \
                    -O . \
                    2>&1 >> {log}
                test -f {params.srr}_2.fastq || mv {params.srr}_1.fastq {params.srr}_0.fastq
            fi
            rm -rf ${{TMP}}
        elif [[ {params.method} == "gdc" ]] ; then
            TOKEN=~/gdc/latest.txt
            if [[ ! -f ${{TOKEN}} ]] ; then
                echo "ERROR: no GDC token file found at ${{TOKEN}}"
                exit 1
            fi
            mkdir -p ${{TMP}}
            for i in {{1..{params.retries}}} ; do
                if time gdc-client download \
                        -t ${{TOKEN}} \
                        --log-file ${{TMP}}/log.txt \
                        -d ${{TMP}} \
                        {params.srr} 2>&1 >> {log}
                then
                    SUCCESS=1
                    break
                else
                    echo "COUNT_GdcRetries 1"
                    TIMEOUT=$((${{TIMEOUT}} * 2))
                    sleep ${{TIMEOUT}}
                fi
            done
            if (( $SUCCESS == 0 )) ; then
                echo "COUNT_GdcFailures 1"
                exit 1
            fi
            test -d ${{TMP}}/{params.srr}
            test -f ${{TMP}}/{params.srr}/*.tar.gz

            echo "=== gdc-client log.txt begin ===" >> {log}
            cat ${{TMP}}/log.txt >> {log}
            echo "=== gdc-client log.txt end ===" >> {log}

            size=$(cat ${{TMP}}/{params.srr}/*.tar.gz | wc -c)
            echo "COUNT_GdcBytesDownloaded ${{size}}"

            tar zxvf ${{TMP}}/{params.srr}/*.tar.gz
            rm -rf ${{TMP}}

            num_1s=$(ls -1 *_1.fastq | wc -l)
            num_2s=$(ls -1 *_2.fastq | wc -l)
            if (( ${{num_1s}} == 0 )) ; then
                echo "ERROR: No _1.fastq files output"
                exit 1
            fi
            if (( ${{num_1s}} > 1 )) ; then
                echo "ERROR: More than one _1.fastq file found"
                exit 1
            fi
            if (( ${{num_2s}} == 0 )) ; then
                # unpaired
                mv *_1.fastq {params.srr}_0.fastq
            else
                # paired-end
                mv *_1.fastq {params.srr}_1.fastq
                mv *_2.fastq {params.srr}_2.fastq
            fi
        fi

        # Next chunk expects the FASTQ files to exist in the current directory
        # named {{params.srr}}_{{0,1,2}}.fastq
        size=0
        for i in {{0..2}} ; do
            fn={params.srr}_${{i}}.fastq
            if [[ -f ${{fn}} ]] ; then
                echo "COUNT_FilesDownloaded 1"
            else
                touch ${{fn}}
            fi
            size=$((${{size}} + $(wc -c < ${{fn}})))
            mv ${{fn}} {config[temp]}/{wildcards.quad}_${{i}}.fastq
        done
        echo "COUNT_BytesDownloaded ${{size}}"
        echo "COUNT_DownloadComplete 1"
        """