annotate Snakefile @ 15:3fa68a0d59c5 draft

Uploaded
author chrisw
date Wed, 13 Feb 2019 17:03:56 -0500
parents f2258de365ee
children d2770bc432e1
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
1 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
2 Parameters:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
3 - star: arguments to pass to STAR aligner
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
4 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
5 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
6
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
7 STEPS = ['align', 'sort', 'bamcount']
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
8
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
9 FILES = ['sjout.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
10 'bamcount_nonref.csv.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
11 'bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
12 'bamcount_frag.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
13 'Chimeric.out.junction.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
14 'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
15 'manifest']
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
16
14
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
17 #get rid of the pesky warnings about current paths
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
18 #and make sure we're not assuming any wrong relative paths
12
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
19 import os
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
20 config['output']=os.path.abspath(config['output'])
13
af1f49e5242d Uploaded
chrisw
parents: 12
diff changeset
21 config['temp']=os.path.abspath(config['temp'])
14
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
22 config['exon_bed']=os.path.abspath(config['exon_bed'])
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
23 config['ref']=os.path.abspath(config['ref'])
12
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
24
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
25 import subprocess
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
26 def run_command(cmd_args):
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
27 cmd_args = ' '.join(cmd_args)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
28 try:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
29 subprocess.check_call(args=cmd_args, shell=True)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
30 except subprocess.CalledProcessError as cpe:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
31 sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
32 raise cpe
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
33
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
34
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
35 import re
11
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
36 FASTQ_PATT=re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$')
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
37 def prep_for_galaxy_run():
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
38 try:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
39 os.mkdir(config['temp'])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
40 except OSError as ose:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
41 pass
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
42 fastqs = config['inputs'].split(',')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
43 m = FASTQ_PATT.search(fastqs[0])
11
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
44 run_acc = 'sample'
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
45 if m is not None:
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
46 run_acc = m.group(1)
12
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
47 study_acc = 'study'
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
48 if 'study' in config:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
49 study_acc = config['study']
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
50 genome = 'hg38'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
51 if 'genome' in config:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
52 genome = config['genome']
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
53 method = 'sra'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
54 # SRR,SRP,genome
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
55 # e.g. SRR1557855,SRP045778,ce10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
56 #create links which will be used in the align step
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
57 i = 1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
58 for f in fastqs:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
59 newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i)
15
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
60 if 'compressed' in config:
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
61 run_command(['zcat',f,'>',newf])
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
62 else:
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
63 os.symlink(os.path.abspath(f), newf)
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
64 #create fastq 0
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
65 if i == 1:
15
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
66 os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0))
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
67 i += 1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
68 #create fastq 2 if not paired
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
69 if i == 2:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
70 open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close()
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
71 #create expected file structure for annotated exon bed file & reference index
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
72 ref = config['ref']
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
73 config['ref'] = '.'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
74 os.makedirs('%s/%s' % (config['ref'], genome))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
75 os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
76 os.makedirs('%s/%s/gtf' % (config['ref'], genome))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
77 os.symlink(config['exon_bed'], 'exons.tmp')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
78 os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed')))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
79 return([run_acc, study_acc, genome, method])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
80
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
81
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
82 def get_accessions(wildcards):
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
83 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
84 Grouping of SRRs with the same SRP could happen here
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
85 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
86 #if running under galaxy where the user will input the
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
87 #FASTQs, study, and genome directly
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
88 if 'inputs' in config:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
89 (run_acc, study_acc, genome, method) = prep_for_galaxy_run()
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
90 for ext in FILES:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
91 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
92 #here to get the make_galaxy_links rule to fire
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
93 yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
94 #not running under galaxy, takes a list of accessions
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
95 else:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
96 for fn in config['input'].split():
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
97 with open(fn, 'r') as fh:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
98 for ln in fh:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
99 if ln.count(',') < 2:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
100 continue
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
101 toks = ln.rstrip().split(',')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
102 assert 3 <= len(toks) <= 4
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
103 method = 'sra'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
104 if len(toks) == 4:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
105 method = toks[3]
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
106 # SRR,SRP,genome
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
107 # e.g. SRR1557855,SRP045778,ce10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
108 for ext in FILES:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
109 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
110
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
111 rule all:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
112 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
113 get_accessions
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
114
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
115
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
116 rule make_manifest:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
117 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
118 config['output'] + '/{quad}.sjout.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
119 config['output'] + '/{quad}.Chimeric.out.junction.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
120 config['output'] + '/{quad}.bamcount_nonref.csv.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
121 config['output'] + '/{quad}.bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
122 config['output'] + '/{quad}.bamcount_frag.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
123 config['output'] + '/{quad}.all.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
124 config['output'] + '/{quad}.unique.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
125 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
126 config['output'] + '/{quad}.manifest'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
127 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
128 quad=lambda wildcards: wildcards.quad
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
129 run:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
130 with open(output[0], 'wt') as fh:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
131 for fn in FILES:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
132 fh.write(params.quad + "." + fn + '\n')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
133
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
134 def galaxy_link_files(op):
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
135 a = [op + '/' + f for f in FILES]
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
136 a.extend([op + '/align.log', op + '/bamcount.log'])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
137 return a
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
138
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
139 rule make_galaxy_links:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
140 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
141 config['output'] + '/{quad}.sjout.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
142 config['output'] + '/{quad}.bamcount_nonref.csv.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
143 config['output'] + '/{quad}.bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
144 config['output'] + '/{quad}.bamcount_frag.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
145 config['output'] + '/{quad}.Chimeric.out.junction.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
146 config['output'] + '/{quad}.all.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
147 config['output'] + '/{quad}.unique.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
148 config['output'] + '/{quad}.manifest'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
149 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
150 config['output'] + '/{quad}.done'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
151 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
152 quad=lambda wildcards: wildcards.quad,
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
153 out=galaxy_link_files(config['output'])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
154 run:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
155 inputs = input.extend([config['output'] + '/' + params.quad + '.align.log',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
156 config['output'] + '/' + params.quad + '.bamcount.log'])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
157 for (i,fn) in enumerate(input):
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
158 os.symlink(os.path.abspath(fn), params.out[i])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
159 os.symlink(os.path.abspath(input[-3]), output[0])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
160
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
161
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
162 rule bamcount:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
163 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
164 bam=config['temp'] + '/{quad}-sorted.bam',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
165 bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
166 bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed'))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
167 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
168 nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
169 auc=config['output'] + '/{quad}.bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
170 frag=config['output'] + '/{quad}.bamcount_frag.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
171 all_bw=config['output'] + '/{quad}.all.bw.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
172 unique_bw=config['output'] + '/{quad}.unique.bw.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
173 all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
174 unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
175 log:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
176 config['output'] + '/{quad}.bamcount.log'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
177 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
178 srr=lambda wildcards: wildcards.quad.split('_')[0],
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
179 uniq_qual=config.get('unique_qual', 10)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
180 threads: 4
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
181 shell:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
182 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
183 TMP={config[temp]}/{params.srr}_bamcount
14
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
184 bamcount {input.bam} \
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
185 --threads {threads} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
186 --coverage \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
187 --no-head \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
188 --require-mdz \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
189 --min-unique-qual {params.uniq_qual} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
190 --frag-dist ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
191 --bigwig ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
192 --annotation {input.bed} ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
193 --auc ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
194 --alts ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
195 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
196
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
197 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
198 # --alts
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
199 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
200
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
201 (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
202 size=$(wc -c < {output.nonref})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
203 echo "COUNT_NonrefSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
204 rm -f ${{TMP}}.alts.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
205
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
206 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
207 # --auc
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
208 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
209 mv ${{TMP}}.auc.tsv {output.auc}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
210 size=$(wc -c < {output.auc})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
211 echo "COUNT_AucSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
212 rm -f ${{TMP}}.auc.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
213
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
214 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
215 # --frag-dist
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
216 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
217 mv ${{TMP}}.frags.tsv {output.frag}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
218 size=$(wc -c < {output.frag})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
219 echo "COUNT_FragDistSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
220 rm -f ${{TMP}}.frags.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
221
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
222 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
223 # --bigwig
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
224 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
225
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
226 (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
227 size=$(wc -c < {output.all_bw})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
228 echo "COUNT_BwSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
229 rm -f ${{TMP}}.all.bw
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
230
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
231 (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
232 size=$(wc -c < {output.unique_bw})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
233 echo "COUNT_BwSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
234 rm -f ${{TMP}}.unique.bw
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
235
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
236 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
237 # --annotation
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
238 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
239
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
240 (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
241 size=$(wc -c < {output.all_bw_count})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
242 echo "COUNT_BwQuantSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
243 rm -f ${{TMP}}.all.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
244
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
245 (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
246 size=$(wc -c < {output.unique_bw_count})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
247 echo "COUNT_BwQuantSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
248 rm -f ${{TMP}}.unique.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
249
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
250 # Check that all temporaries were properly purged
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
251 set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
252 if (( $num_files > 0 )) ; then
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
253 echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
254 exit 1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
255 fi
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
256
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
257 echo "COUNT_BamcountComplete 1"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
258 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
259
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
260 rule sort:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
261 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
262 config['temp'] + '/{quad}.bam'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
263 wildcard_constraints:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
264 quad="[^-]+"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
265 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
266 bam=temp(config['temp'] + '/{quad}-sorted.bam'),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
267 bai=temp(config['temp'] + '/{quad}-sorted.bam.bai')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
268 log:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
269 config['output'] + '/{quad}.sort.log'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
270 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
271 srr=lambda wildcards: wildcards.quad.split('_')[0]
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
272 threads: 8
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
273 shell:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
274 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
275 TMP="{config[temp]}/sort_temp.{params.srr}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
276 mkdir -p ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
277 time samtools sort \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
278 -T ${{TMP}}/samtools_temp \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
279 -@ {threads} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
280 -m 64M \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
281 -o {output.bam} {input} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
282 rm -rf ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
283 size=$(wc -c < {output.bam})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
284 echo "COUNT_SortedBAMBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
285
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
286 time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
287 echo "COUNT_SortComplete 1"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
288 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
289
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
290 rule align:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
291 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
292 reads0=config['temp'] + '/{quad}_0.fastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
293 reads1=config['temp'] + '/{quad}_1.fastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
294 reads2=config['temp'] + '/{quad}_2.fastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
295 index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
296 index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
297 wildcard_constraints:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
298 quad="[^-]+"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
299 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
300 bam=temp(config['temp'] + '/{quad}.bam'),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
301 jxs=config['output'] + '/{quad}.sjout.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
302 chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
303 unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
304 unmapped2=config['temp'] + '/{quad}_2.unmappedfastq'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
305 log:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
306 config['output'] + '/{quad}.align.log'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
307 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
308 index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
309 srr=lambda wildcards: wildcards.quad.split('_')[0],
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
310 star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
311 threads: 16
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
312 shell:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
313 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
314 READ_FILES="{input.reads0}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
315 if [[ -s {input.reads2} ]] ; then
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
316 READ_FILES="{input.reads1} {input.reads2}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
317 fi
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
318 TMP="{config[temp]}/align_temp.{params.srr}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
319 rm -rf ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
320 time STAR \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
321 {params.star_params} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
322 --runMode alignReads \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
323 --runThreadN {threads} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
324 --genomeDir {params.index_base} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
325 --readFilesIn ${{READ_FILES}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
326 --twopassMode None \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
327 --outTmpDir ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
328 --outReadsUnmapped Fastx \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
329 --outMultimapperOrder Random \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
330 --outSAMreadID Number \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
331 --outSAMtype BAM Unsorted \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
332 --outSAMmode NoQS \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
333 --outSAMattributes NH MD \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
334 --chimOutType Junctions \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
335 --chimOutJunctionFormat 1 \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
336 --chimSegmentReadGapMax 3 \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
337 --chimJunctionOverhangMin 12 \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
338 --chimSegmentMin 12 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
339
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
340 # Full set of output files:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
341 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
342 # Aligned.out.bam
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
343 # Chimeric.out.junction
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
344 # Log.final.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
345 # Log.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
346 # Log.progress.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
347 # SJ.out.tab
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
348 # Unmapped.out.mate1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
349 # Unmapped.out.mate2 (if any reads were paired-end)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
350
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
351 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
352 # Logs
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
353 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
354 rm -rf ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
355 cat Log.out >> {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
356 cat Log.final.out >> {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
357 rm -f Log*.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
358
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
359 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
360 # Junctions
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
361 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
362 test -f SJ.out.tab
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
363 time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
364 rm -f SJ.out.tab
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
365 size=$(wc -c < {output.jxs})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
366 echo "COUNT_CompressedJxBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
367
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
368 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
369 # Chimerics
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
370 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
371 test -f Chimeric.out.junction
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
372 test -s Chimeric.out.junction
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
373 sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
374 time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
375 rm -f Chimeric.out.junction Chimeric.out.junction.sorted
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
376 size=$(wc -c < {output.chimeric})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
377 echo "COUNT_ChimericBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
378
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
379 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
380 # Unmapped
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
381 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
382 touch {output.unmapped2}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
383 test -f Unmapped.out.mate1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
384 mv Unmapped.out.mate1 {output.unmapped1}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
385 if [[ -f Unmapped.out.mate2 ]] ; then
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
386 mv Unmapped.out.mate2 {output.unmapped2}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
387 fi
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
388
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
389 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
390 # Alignments
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
391 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
392 size=$(wc -c < Aligned.out.bam)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
393 echo "COUNT_BAMBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
394 mv Aligned.out.bam {output.bam}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
395 echo "COUNT_AlignComplete 1"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
396 """