comparison Snakefile @ 10:f43dd6f7c687 draft

Uploaded
author chrisw
date Wed, 13 Feb 2019 15:37:51 -0500
parents
children 35ed7314038d
comparison
equal deleted inserted replaced
9:05932011f686 10:f43dd6f7c687
1 """
2 Parameters:
3 - star: arguments to pass to STAR aligner
4 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
5 """
6
7 STEPS = ['align', 'sort', 'bamcount']
8
9 FILES = ['sjout.zst',
10 'bamcount_nonref.csv.zst',
11 'bamcount_auc.tsv',
12 'bamcount_frag.tsv',
13 'Chimeric.out.junction.zst',
14 'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
15 'manifest']
16
17 import subprocess
18 def run_command(cmd_args):
19 cmd_args = ' '.join(cmd_args)
20 try:
21 subprocess.check_call(args=cmd_args, shell=True)
22 except subprocess.CalledProcessError as cpe:
23 sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
24 raise cpe
25
26
27 import re
28 FASTQ_PATT=re.compile(r'([^_\.]+)_(\d+)\.((fastq)|fq)(\.gz)?$')
29 import os
30 def prep_for_galaxy_run():
31 try:
32 os.mkdir(config['temp'])
33 except OSError as ose:
34 pass
35 fastqs = config['inputs'].split(',')
36 m = FASTQ_PATT.search(fastqs[0])
37 run_acc = m.group(1)
38 study_acc = run_acc
39 if 'study' in config:
40 study_acc = config['study']
41 genome = 'hg38'
42 if 'genome' in config:
43 genome = config['genome']
44 method = 'sra'
45 # SRR,SRP,genome
46 # e.g. SRR1557855,SRP045778,ce10
47 #create links which will be used in the align step
48 i = 1
49 for f in fastqs:
50 newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i)
51 run_command(['zcat',f,'>',newf])
52 #create fastq 0
53 if i == 1:
54 try:
55 os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0))
56 except FileExistsError as fee:
57 pass
58 i += 1
59 #create fastq 2 if not paired
60 if i == 2:
61 open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close()
62 #create expected file structure for annotated exon bed file & reference index
63 ref = config['ref']
64 config['ref'] = '.'
65 os.makedirs('%s/%s' % (config['ref'], genome))
66 os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome))
67 os.makedirs('%s/%s/gtf' % (config['ref'], genome))
68 os.symlink(config['exon_bed'], 'exons.tmp')
69 os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed')))
70 return([run_acc, study_acc, genome, method])
71
72
73 def get_accessions(wildcards):
74 """
75 Grouping of SRRs with the same SRP could happen here
76 """
77 #if running under galaxy where the user will input the
78 #FASTQs, study, and genome directly
79 if 'inputs' in config:
80 (run_acc, study_acc, genome, method) = prep_for_galaxy_run()
81 for ext in FILES:
82 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext))
83 #here to get the make_galaxy_links rule to fire
84 yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method))
85 #not running under galaxy, takes a list of accessions
86 else:
87 for fn in config['input'].split():
88 with open(fn, 'r') as fh:
89 for ln in fh:
90 if ln.count(',') < 2:
91 continue
92 toks = ln.rstrip().split(',')
93 assert 3 <= len(toks) <= 4
94 method = 'sra'
95 if len(toks) == 4:
96 method = toks[3]
97 # SRR,SRP,genome
98 # e.g. SRR1557855,SRP045778,ce10
99 for ext in FILES:
100 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext))
101
102 rule all:
103 input:
104 get_accessions
105
106
107 rule make_manifest:
108 input:
109 config['output'] + '/{quad}.sjout.zst',
110 config['output'] + '/{quad}.Chimeric.out.junction.zst',
111 config['output'] + '/{quad}.bamcount_nonref.csv.zst',
112 config['output'] + '/{quad}.bamcount_auc.tsv',
113 config['output'] + '/{quad}.bamcount_frag.tsv',
114 config['output'] + '/{quad}.all.exon_bw_count.zst',
115 config['output'] + '/{quad}.unique.exon_bw_count.zst',
116 output:
117 config['output'] + '/{quad}.manifest'
118 params:
119 quad=lambda wildcards: wildcards.quad
120 run:
121 with open(output[0], 'wt') as fh:
122 for fn in FILES:
123 fh.write(params.quad + "." + fn + '\n')
124
125 def galaxy_link_files(op):
126 a = [op + '/' + f for f in FILES]
127 a.extend([op + '/align.log', op + '/bamcount.log'])
128 return a
129
130 rule make_galaxy_links:
131 input:
132 config['output'] + '/{quad}.sjout.zst',
133 config['output'] + '/{quad}.bamcount_nonref.csv.zst',
134 config['output'] + '/{quad}.bamcount_auc.tsv',
135 config['output'] + '/{quad}.bamcount_frag.tsv',
136 config['output'] + '/{quad}.Chimeric.out.junction.zst',
137 config['output'] + '/{quad}.all.exon_bw_count.zst',
138 config['output'] + '/{quad}.unique.exon_bw_count.zst',
139 config['output'] + '/{quad}.manifest'
140 output:
141 config['output'] + '/{quad}.done'
142 params:
143 quad=lambda wildcards: wildcards.quad,
144 out=galaxy_link_files(config['output'])
145 run:
146 inputs = input.extend([config['output'] + '/' + params.quad + '.align.log',
147 config['output'] + '/' + params.quad + '.bamcount.log'])
148 for (i,fn) in enumerate(input):
149 os.symlink(os.path.abspath(fn), params.out[i])
150 os.symlink(os.path.abspath(input[-3]), output[0])
151
152
153 rule bamcount:
154 input:
155 bam=config['temp'] + '/{quad}-sorted.bam',
156 bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
157 #exe='/bamcount/bamcount',
158 exe='bamcount',
159 bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed'))
160 output:
161 nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst',
162 auc=config['output'] + '/{quad}.bamcount_auc.tsv',
163 frag=config['output'] + '/{quad}.bamcount_frag.tsv',
164 all_bw=config['output'] + '/{quad}.all.bw.zst',
165 unique_bw=config['output'] + '/{quad}.unique.bw.zst',
166 all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst',
167 unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst'
168 log:
169 config['output'] + '/{quad}.bamcount.log'
170 params:
171 srr=lambda wildcards: wildcards.quad.split('_')[0],
172 uniq_qual=config.get('unique_qual', 10)
173 threads: 4
174 shell:
175 """
176 TMP={config[temp]}/{params.srr}_bamcount
177 {input.exe} {input.bam} \
178 --threads {threads} \
179 --coverage \
180 --no-head \
181 --require-mdz \
182 --min-unique-qual {params.uniq_qual} \
183 --frag-dist ${{TMP}} \
184 --bigwig ${{TMP}} \
185 --annotation {input.bed} ${{TMP}} \
186 --auc ${{TMP}} \
187 --alts ${{TMP}} \
188 2>&1 | tee -a {log}
189
190 #
191 # --alts
192 #
193
194 (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log}
195 size=$(wc -c < {output.nonref})
196 echo "COUNT_NonrefSize ${{size}}"
197 rm -f ${{TMP}}.alts.tsv
198
199 #
200 # --auc
201 #
202 mv ${{TMP}}.auc.tsv {output.auc}
203 size=$(wc -c < {output.auc})
204 echo "COUNT_AucSize ${{size}}"
205 rm -f ${{TMP}}.auc.tsv
206
207 #
208 # --frag-dist
209 #
210 mv ${{TMP}}.frags.tsv {output.frag}
211 size=$(wc -c < {output.frag})
212 echo "COUNT_FragDistSize ${{size}}"
213 rm -f ${{TMP}}.frags.tsv
214
215 #
216 # --bigwig
217 #
218
219 (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log}
220 size=$(wc -c < {output.all_bw})
221 echo "COUNT_BwSize ${{size}}"
222 rm -f ${{TMP}}.all.bw
223
224 (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log}
225 size=$(wc -c < {output.unique_bw})
226 echo "COUNT_BwSize ${{size}}"
227 rm -f ${{TMP}}.unique.bw
228
229 #
230 # --annotation
231 #
232
233 (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log}
234 size=$(wc -c < {output.all_bw_count})
235 echo "COUNT_BwQuantSize ${{size}}"
236 rm -f ${{TMP}}.all.tsv
237
238 (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log}
239 size=$(wc -c < {output.unique_bw_count})
240 echo "COUNT_BwQuantSize ${{size}}"
241 rm -f ${{TMP}}.unique.tsv
242
243 # Check that all temporaries were properly purged
244 set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l)
245 if (( $num_files > 0 )) ; then
246 echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)"
247 exit 1
248 fi
249
250 echo "COUNT_BamcountComplete 1"
251 """
252
253 rule sort:
254 input:
255 config['temp'] + '/{quad}.bam'
256 wildcard_constraints:
257 quad="[^-]+"
258 output:
259 bam=temp(config['temp'] + '/{quad}-sorted.bam'),
260 bai=temp(config['temp'] + '/{quad}-sorted.bam.bai')
261 log:
262 config['output'] + '/{quad}.sort.log'
263 params:
264 srr=lambda wildcards: wildcards.quad.split('_')[0]
265 threads: 8
266 shell:
267 """
268 TMP="{config[temp]}/sort_temp.{params.srr}"
269 mkdir -p ${{TMP}}
270 time samtools sort \
271 -T ${{TMP}}/samtools_temp \
272 -@ {threads} \
273 -m 64M \
274 -o {output.bam} {input} 2>&1 | tee -a {log}
275 rm -rf ${{TMP}}
276 size=$(wc -c < {output.bam})
277 echo "COUNT_SortedBAMBytes ${{size}}"
278
279 time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log}
280 echo "COUNT_SortComplete 1"
281 """
282
283 rule align:
284 input:
285 reads0=config['temp'] + '/{quad}_0.fastq',
286 reads1=config['temp'] + '/{quad}_1.fastq',
287 reads2=config['temp'] + '/{quad}_2.fastq',
288 index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]),
289 index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2])
290 wildcard_constraints:
291 quad="[^-]+"
292 output:
293 bam=temp(config['temp'] + '/{quad}.bam'),
294 jxs=config['output'] + '/{quad}.sjout.zst',
295 chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst',
296 unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
297 unmapped2=config['temp'] + '/{quad}_2.unmappedfastq'
298 log:
299 config['output'] + '/{quad}.align.log'
300 params:
301 index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]),
302 srr=lambda wildcards: wildcards.quad.split('_')[0],
303 star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '')
304 threads: 16
305 shell:
306 """
307 READ_FILES="{input.reads0}"
308 if [[ -s {input.reads2} ]] ; then
309 READ_FILES="{input.reads1} {input.reads2}"
310 fi
311 TMP="{config[temp]}/align_temp.{params.srr}"
312 rm -rf ${{TMP}}
313 time STAR \
314 {params.star_params} \
315 --runMode alignReads \
316 --runThreadN {threads} \
317 --genomeDir {params.index_base} \
318 --readFilesIn ${{READ_FILES}} \
319 --twopassMode None \
320 --outTmpDir ${{TMP}} \
321 --outReadsUnmapped Fastx \
322 --outMultimapperOrder Random \
323 --outSAMreadID Number \
324 --outSAMtype BAM Unsorted \
325 --outSAMmode NoQS \
326 --outSAMattributes NH MD \
327 --chimOutType Junctions \
328 --chimOutJunctionFormat 1 \
329 --chimSegmentReadGapMax 3 \
330 --chimJunctionOverhangMin 12 \
331 --chimSegmentMin 12 2>&1 | tee -a {log}
332
333 # Full set of output files:
334 #
335 # Aligned.out.bam
336 # Chimeric.out.junction
337 # Log.final.out
338 # Log.out
339 # Log.progress.out
340 # SJ.out.tab
341 # Unmapped.out.mate1
342 # Unmapped.out.mate2 (if any reads were paired-end)
343
344 #
345 # Logs
346 #
347 rm -rf ${{TMP}}
348 cat Log.out >> {log}
349 cat Log.final.out >> {log}
350 rm -f Log*.out
351
352 #
353 # Junctions
354 #
355 test -f SJ.out.tab
356 time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log}
357 rm -f SJ.out.tab
358 size=$(wc -c < {output.jxs})
359 echo "COUNT_CompressedJxBytes ${{size}}"
360
361 #
362 # Chimerics
363 #
364 test -f Chimeric.out.junction
365 test -s Chimeric.out.junction
366 sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted
367 time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log}
368 rm -f Chimeric.out.junction Chimeric.out.junction.sorted
369 size=$(wc -c < {output.chimeric})
370 echo "COUNT_ChimericBytes ${{size}}"
371
372 #
373 # Unmapped
374 #
375 touch {output.unmapped2}
376 test -f Unmapped.out.mate1
377 mv Unmapped.out.mate1 {output.unmapped1}
378 if [[ -f Unmapped.out.mate2 ]] ; then
379 mv Unmapped.out.mate2 {output.unmapped2}
380 fi
381
382 #
383 # Alignments
384 #
385 size=$(wc -c < Aligned.out.bam)
386 echo "COUNT_BAMBytes ${{size}}"
387 mv Aligned.out.bam {output.bam}
388 echo "COUNT_AlignComplete 1"
389 """