annotate Snakefile @ 34:358a1cf83b7b draft

Uploaded
author chrisw
date Tue, 19 Nov 2019 05:44:09 +0000
parents 13df910d0683
children e4dc3d0c31de
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
1 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
2 Parameters:
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
3 - fastq_dump_args: arguments to pass to fastq dumping tool
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
4 - fastq_dump_retries: number of retry attempts before dying
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
5 - fastq_dump_tool: name of tool to run, e.g. 'fastq-dump' or 'fasterq-dump'
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
6 - star: arguments to pass to STAR aligner
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
7 - salmon_args: arguments to pass to salmon quant
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
8 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
9 - bw_bed: name of BED file to use with bwtool
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
10 - max_unalign: maximum number of unaligned reads to save per run accession
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
11 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
12
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
13 STEPS = ['download', 'fastq_check', 'align', 'sort',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
14 'bamcount',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
15 'salmon',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
16 'align_unmapped',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
17 'extract_jx']
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
18
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
19 FILES = ['sjout.zst', 'fastq_check.tsv.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
20 'unmapped.bam', 'unmapped.fastq.zst',
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
21 'bamcount_nonref.csv.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
22 'bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
23 'bamcount_frag.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
24 'Chimeric.out.junction.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
25 'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
26 'all.bw.zst', 'unique.bw.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
27 'salmon.tsv.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
28 'jx_bed.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
29 'manifest'] + list(map(lambda x: x + '.log', STEPS))
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
30
14
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
31 #get rid of the pesky warnings about current paths
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
32 #and make sure we're not assuming any wrong relative paths
12
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
33 import os
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
34 config['output']=os.path.abspath(config['output'])
13
af1f49e5242d Uploaded
chrisw
parents: 12
diff changeset
35 config['temp']=os.path.abspath(config['temp'])
14
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
36 config['exon_bed']=os.path.abspath(config['exon_bed'])
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
37 config['ref']=os.path.abspath(config['ref'])
12
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
38
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
39 import subprocess
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
40 def run_command(cmd_args):
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
41 cmd_args = ' '.join(cmd_args)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
42 try:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
43 subprocess.check_call(args=cmd_args, shell=True)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
44 except subprocess.CalledProcessError as cpe:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
45 sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
46 raise cpe
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
47
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
48 import re
32
13df910d0683 Uploaded
chrisw
parents: 31
diff changeset
49 STEPS_FILES_FILTER=re.compile(r'(unmapped)|(download)|(salmon)|(extract_jx)|(jx_bed)|(manifest)|(nonref)|(Chimeric)')
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
50 def remove_steps_files():
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
51 #modify STEP and FILES
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
52 #so we don't run download or unmapped steps
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
53 global FILES
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
54 global STEPS
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
55 for i in reversed(range(0,len(STEPS))):
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
56 f = STEPS[i]
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
57 if STEPS_FILES_FILTER.search(f):
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
58 del(STEPS[i])
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
59 for i in reversed(range(0,len(FILES))):
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
60 f = FILES[i]
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
61 if STEPS_FILES_FILTER.search(f):
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
62 del(FILES[i])
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
63
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
64
11
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
65 FASTQ_PATT=re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$')
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
66 def prep_for_galaxy_run():
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
67 remove_steps_files()
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
68 try:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
69 os.mkdir(config['temp'])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
70 except OSError as ose:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
71 pass
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
72 fastqs = config['inputs'].split(',')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
73 m = FASTQ_PATT.search(fastqs[0])
11
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
74 run_acc = 'sample'
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
75 if m is not None:
35ed7314038d Uploaded
chrisw
parents: 10
diff changeset
76 run_acc = m.group(1)
12
849ab444b4f1 Uploaded
chrisw
parents: 11
diff changeset
77 study_acc = 'study'
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
78 if 'study' in config:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
79 study_acc = config['study']
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
80 genome = 'hg38'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
81 if 'genome' in config:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
82 genome = config['genome']
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
83 method = 'sra'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
84 # SRR,SRP,genome
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
85 # e.g. SRR1557855,SRP045778,ce10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
86 #create links which will be used in the align step
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
87 i = 1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
88 for f in fastqs:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
89 newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i)
15
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
90 if 'compressed' in config:
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
91 run_command(['zcat',f,'>',newf])
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
92 else:
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
93 os.symlink(os.path.abspath(f), newf)
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
94 #create fastq 0
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
95 if i == 1:
15
3fa68a0d59c5 Uploaded
chrisw
parents: 14
diff changeset
96 os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0))
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
97 i += 1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
98 #create fastq 2 if not paired
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
99 if i == 2:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
100 open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close()
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
101 return([run_acc, study_acc, genome, method])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
102
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
103
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
104 def get_accessions(wildcards):
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
105 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
106 Grouping of SRRs with the same SRP could happen here
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
107 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
108 #if running under galaxy where the user will input the
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
109 #FASTQs, study, and genome directly
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
110 if 'inputs' in config:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
111 (run_acc, study_acc, genome, method) = prep_for_galaxy_run()
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
112 for ext in FILES:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
113 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
114 #here to get the make_galaxy_links rule to fire
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
115 yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
116 #not running under galaxy, takes a list of accessions
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
117 else:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
118 for fn in config['input'].split():
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
119 with open(fn, 'r') as fh:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
120 for ln in fh:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
121 if ln.count(',') < 2:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
122 continue
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
123 toks = ln.rstrip().split(',')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
124 assert 3 <= len(toks) <= 4
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
125 method = 'sra'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
126 if len(toks) == 4:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
127 method = toks[3]
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
128 # SRR,SRP,genome
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
129 # e.g. SRR1557855,SRP045778,ce10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
130 for ext in FILES:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
131 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
132
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
133 rule all:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
134 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
135 get_accessions
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
136
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
137 rule make_manifest:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
138 input:
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
139 config['output'] + '/{quad}.salmon.tsv.zst',
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
140 config['output'] + '/{quad}.sjout.zst',
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
141 config['output'] + '/{quad}.jx_bed.zst',
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
142 config['output'] + '/{quad}.Chimeric.out.junction.zst',
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
143 config['output'] + '/{quad}.unmapped.bam',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
144 config['output'] + '/{quad}.unmapped.fastq.zst',
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
145 config['output'] + '/{quad}.bamcount_nonref.csv.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
146 config['output'] + '/{quad}.bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
147 config['output'] + '/{quad}.bamcount_frag.tsv',
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
148 config['output'] + '/{quad}.fastq_check.tsv.zst',
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
149 config['output'] + '/{quad}.all.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
150 config['output'] + '/{quad}.unique.exon_bw_count.zst',
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
151 config['output'] + '/{quad}.all.bw.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
152 config['output'] + '/{quad}.unique.bw.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
153 config['output'] + '/{quad}.align.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
154 config['output'] + '/{quad}.extract_jx.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
155 config['output'] + '/{quad}.bamcount.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
156 config['output'] + '/{quad}.align_unmapped.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
157 config['output'] + '/{quad}.download.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
158 config['output'] + '/{quad}.fastq_check.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
159 config['output'] + '/{quad}.sort.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
160 config['output'] + '/{quad}.salmon.log'
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
161 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
162 config['output'] + '/{quad}.manifest'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
163 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
164 quad=lambda wildcards: wildcards.quad
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
165 run:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
166 with open(output[0], 'wt') as fh:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
167 for fn in FILES:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
168 fh.write(params.quad + "." + fn + '\n')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
169
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
170 def galaxy_link_files(op):
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
171 remove_steps_files()
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
172 a = [op + '/' + f for f in FILES]
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
173 return a
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
174
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
175 rule make_galaxy_links:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
176 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
177 config['output'] + '/{quad}.sjout.zst',
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
178 config['output'] + '/{quad}.fastq_check.tsv.zst',
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
179 config['output'] + '/{quad}.bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
180 config['output'] + '/{quad}.bamcount_frag.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
181 config['output'] + '/{quad}.all.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
182 config['output'] + '/{quad}.unique.exon_bw_count.zst',
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
183 config['output'] + '/{quad}.all.bw.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
184 config['output'] + '/{quad}.unique.bw.zst',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
185 config['output'] + '/{quad}.fastq_check.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
186 config['output'] + '/{quad}.align.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
187 config['output'] + '/{quad}.sort.log',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
188 config['output'] + '/{quad}.bamcount.log',
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
189 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
190 config['output'] + '/{quad}.done'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
191 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
192 quad=lambda wildcards: wildcards.quad,
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
193 out=galaxy_link_files(config['output'])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
194 run:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
195 for (i,fn) in enumerate(input):
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
196 os.symlink(os.path.abspath(fn), params.out[i])
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
197 os.symlink(os.path.abspath(input[-1]), output[0])
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
198
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
199
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
200 rule bamcount:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
201 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
202 bam=config['temp'] + '/{quad}-sorted.bam',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
203 bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
204 bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed'))
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
205 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
206 nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
207 auc=config['output'] + '/{quad}.bamcount_auc.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
208 frag=config['output'] + '/{quad}.bamcount_frag.tsv',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
209 all_bw=config['output'] + '/{quad}.all.bw.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
210 unique_bw=config['output'] + '/{quad}.unique.bw.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
211 all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
212 unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
213 log:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
214 config['output'] + '/{quad}.bamcount.log'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
215 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
216 srr=lambda wildcards: wildcards.quad.split('_')[0],
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
217 uniq_qual=config.get('unique_qual', 10)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
218 threads: 4
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
219 shell:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
220 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
221 TMP={config[temp]}/{params.srr}_bamcount
14
f2258de365ee Uploaded
chrisw
parents: 13
diff changeset
222 bamcount {input.bam} \
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
223 --threads {threads} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
224 --coverage \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
225 --no-head \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
226 --require-mdz \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
227 --min-unique-qual {params.uniq_qual} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
228 --frag-dist ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
229 --bigwig ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
230 --annotation {input.bed} ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
231 --auc ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
232 --alts ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
233 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
234
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
235 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
236 # --alts
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
237 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
238
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
239 (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
240 size=$(wc -c < {output.nonref})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
241 echo "COUNT_NonrefSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
242 rm -f ${{TMP}}.alts.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
243
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
244 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
245 # --auc
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
246 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
247 mv ${{TMP}}.auc.tsv {output.auc}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
248 size=$(wc -c < {output.auc})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
249 echo "COUNT_AucSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
250 rm -f ${{TMP}}.auc.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
251
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
252 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
253 # --frag-dist
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
254 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
255 mv ${{TMP}}.frags.tsv {output.frag}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
256 size=$(wc -c < {output.frag})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
257 echo "COUNT_FragDistSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
258 rm -f ${{TMP}}.frags.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
259
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
260 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
261 # --bigwig
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
262 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
263
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
264 (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
265 size=$(wc -c < {output.all_bw})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
266 echo "COUNT_BwSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
267 rm -f ${{TMP}}.all.bw
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
268
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
269 (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
270 size=$(wc -c < {output.unique_bw})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
271 echo "COUNT_BwSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
272 rm -f ${{TMP}}.unique.bw
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
273
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
274 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
275 # --annotation
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
276 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
277
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
278 (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
279 size=$(wc -c < {output.all_bw_count})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
280 echo "COUNT_BwQuantSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
281 rm -f ${{TMP}}.all.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
282
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
283 (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
284 size=$(wc -c < {output.unique_bw_count})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
285 echo "COUNT_BwQuantSize ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
286 rm -f ${{TMP}}.unique.tsv
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
287
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
288 # Check that all temporaries were properly purged
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
289 set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
290 if (( $num_files > 0 )) ; then
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
291 echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
292 exit 1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
293 fi
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
294
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
295 echo "COUNT_BamcountComplete 1"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
296 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
297
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
298 rule bw_zstd:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
299 input:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
300 config['temp'] + '/{prefix}.bw'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
301 output:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
302 config['output'] + '/{prefix}.bw.zst'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
303 shell:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
304 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
305 zstd {input} -o {output}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
306 size=$(wc -c < {output})
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
307 echo "COUNT_BwBytes ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
308 echo "COUNT_BwZstdComplete 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
309 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
310
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
311 rule sort:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
312 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
313 config['temp'] + '/{quad}.bam'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
314 wildcard_constraints:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
315 quad="[^-]+"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
316 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
317 bam=temp(config['temp'] + '/{quad}-sorted.bam'),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
318 bai=temp(config['temp'] + '/{quad}-sorted.bam.bai')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
319 log:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
320 config['output'] + '/{quad}.sort.log'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
321 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
322 srr=lambda wildcards: wildcards.quad.split('_')[0]
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
323 threads: 8
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
324 shell:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
325 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
326 TMP="{config[temp]}/sort_temp.{params.srr}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
327 mkdir -p ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
328 time samtools sort \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
329 -T ${{TMP}}/samtools_temp \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
330 -@ {threads} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
331 -m 64M \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
332 -o {output.bam} {input} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
333 rm -rf ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
334 size=$(wc -c < {output.bam})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
335 echo "COUNT_SortedBAMBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
336
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
337 time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
338 echo "COUNT_SortComplete 1"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
339 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
340
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
341 rule salmon:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
342 input:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
343 reads0=config['temp'] + '/{quad}_0.fastq',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
344 reads1=config['temp'] + '/{quad}_1.fastq',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
345 reads2=config['temp'] + '/{quad}_2.fastq',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
346 index1=lambda wildcards: '%s/%s/salmon_index/hash.bin' % (config['ref'], wildcards.quad.split('_')[2]),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
347 index2=lambda wildcards: '%s/%s/salmon_index/sa.bin' % (config['ref'], wildcards.quad.split('_')[2])
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
348 output:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
349 config['output'] + '/{quad}.salmon.tsv.zst'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
350 log:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
351 config['output'] + '/{quad}.salmon.log'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
352 params:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
353 index_base=lambda wildcards: '%s/%s/salmon_index' % (config['ref'], wildcards.quad.split('_')[2]),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
354 salmon_args=config.get('salmon_args', '')
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
355 threads: 8
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
356 shell:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
357 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
358 READ_FILES="-r {input.reads0}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
359 if [[ -s {input.reads2} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
360 READ_FILES="-1 {input.reads1} -2 {input.reads2}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
361 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
362 if set -o pipefail && time salmon quant \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
363 --libType U \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
364 --quiet \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
365 --validateMappings \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
366 -i {params.index_base} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
367 -p {threads} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
368 {params.salmon_args} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
369 ${{READ_FILES}} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
370 --output salmon_quant \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
371 --minAssignedFrags 1 \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
372 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
373 then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
374 echo "COUNT_SalmonSuccess 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
375
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
376 time zstd salmon_quant/quant.sf -o {output} 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
377 size=$(wc -c < {output})
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
378 echo "COUNT_SalmonQuantBytes ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
379 else
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
380 touch {output}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
381 echo "COUNT_SalmonFailure 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
382 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
383
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
384 rm -rf salmon_quant
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
385 echo "COUNT_SalmonComplete 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
386 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
387
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
388 rule align_unmapped:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
389 input:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
390 unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
391 unmapped2=config['temp'] + '/{quad}_2.unmappedfastq',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
392 index=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome.1.ht2' % (config['ref'], wildcards.quad.split('_')[2])
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
393 output:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
394 bam=config['output'] + '/{quad}.unmapped.bam',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
395 sample=config['output'] + '/{quad}.unmapped.fastq.zst'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
396 log:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
397 config['output'] + '/{quad}.align_unmapped.log'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
398 params:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
399 index_base=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome' % (config['ref'], wildcards.quad.split('_')[2]),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
400 srr=lambda wildcards: wildcards.quad.split('_')[0],
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
401 hisat2_params=config.get('hisat2', ''),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
402 max_unalign=config.get('max_unalign', 100000)
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
403 threads: 16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
404 shell:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
405 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
406 TMP="{config[temp]}/align_unmapped_temp.{params.srr}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
407 READ_FILES="-1 {input.unmapped1} -2 {input.unmapped2}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
408 if [[ ! -s {input.unmapped2} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
409 READ_FILES="-U {input.unmapped1}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
410 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
411 time hisat2 \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
412 $READ_FILES \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
413 -t --mm \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
414 -x {params.index_base} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
415 --threads {threads} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
416 {params.hisat2_params} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
417 --un ${{TMP}}.un \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
418 --un-conc ${{TMP}}.un_conc \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
419 -S ${{TMP}}.sam \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
420 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
421
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
422 #
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
423 # Make BAM file out of aligned reads
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
424 #
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
425 time samtools view \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
426 -b -F 4 \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
427 -o {output.bam} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
428 ${{TMP}}.sam \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
429 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
430 rm -f ${{TMP}}.sam
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
431
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
432 #
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
433 # Save a subset of the doubly unaligned reads
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
434 #
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
435 if [[ ! -s {input.unmapped2} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
436 # unpaired
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
437 fastq-sample \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
438 -n {params.max_unalign} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
439 -o ${{TMP}}.samp \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
440 ${{TMP}}.un
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
441 rm -f ${{TMP}}.un
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
442 else
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
443 # paired-end
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
444 fastq-sample \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
445 -n {params.max_unalign} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
446 -o ${{TMP}}.samp \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
447 ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
448 rm -f ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
449
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
450 # interleave the two output fastqs into single file
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
451 paste ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
452 | paste - - - - \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
453 | awk -v OFS="\n" -v FS="\t" '{{print($1,$3,$5,$7,$2,$4,$6,$8)}}' \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
454 > ${{TMP}}.samp.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
455 rm -f ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
456 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
457
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
458 test -f ${{TMP}}.samp.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
459 time zstd ${{TMP}}.samp.fastq -o {output.sample} 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
460 size=$(wc -c < {output.sample})
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
461 echo "COUNT_UnmappedSampleBytes ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
462 rm -f ${{TMP}}.samp.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
463
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
464 size=$(wc -c < {output.bam})
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
465 echo "COUNT_UnmappedBamBytes ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
466
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
467 echo "COUNT_AlignUnmappedComplete 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
468 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
469
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
470 rule extract_jx:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
471 input:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
472 bam=config['temp'] + '/{quad}-sorted.bam',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
473 bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
474 fa=lambda wildcards: '%s/%s/fasta/genome.fa' % (config['ref'], wildcards.quad.split('_')[2]),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
475 gtf=lambda wildcards: '%s/%s/gtf/genes.gtf' % (config['ref'], wildcards.quad.split('_')[2])
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
476 output:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
477 config['output'] + '/{quad}.jx_bed.zst'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
478 params:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
479 srr=lambda wildcards: wildcards.quad.split('_')[0]
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
480 log:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
481 config['output'] + '/{quad}.extract_jx.log'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
482 shell:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
483 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
484 TMP="{config[temp]}/extract_jx.{params.srr}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
485 time regtools junctions extract \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
486 -i 20 -a 1 \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
487 -o ${{TMP}}.jx_tmp \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
488 {input.bam} 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
489 time zstd ${{TMP}}.jx_tmp -o {output} 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
490 rm -f ${{TMP}}.jx_tmp
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
491
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
492 size=$(wc -c < {output})
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
493 echo "COUNT_ExtractJxBytes ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
494
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
495 echo "COUNT_ExtractJxComplete 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
496 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
497
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
498 rule align:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
499 input:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
500 reads0=config['temp'] + '/{quad}_0.fastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
501 reads1=config['temp'] + '/{quad}_1.fastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
502 reads2=config['temp'] + '/{quad}_2.fastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
503 index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
504 index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2])
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
505 wildcard_constraints:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
506 quad="[^-]+"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
507 output:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
508 bam=temp(config['temp'] + '/{quad}.bam'),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
509 jxs=config['output'] + '/{quad}.sjout.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
510 chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
511 unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
512 unmapped2=config['temp'] + '/{quad}_2.unmappedfastq'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
513 log:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
514 config['output'] + '/{quad}.align.log'
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
515 params:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
516 index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]),
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
517 srr=lambda wildcards: wildcards.quad.split('_')[0],
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
518 star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '')
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
519 threads: 16
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
520 shell:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
521 """
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
522 READ_FILES="{input.reads0}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
523 if [[ -s {input.reads2} ]] ; then
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
524 READ_FILES="{input.reads1} {input.reads2}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
525 fi
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
526 TMP="{config[temp]}/align_temp.{params.srr}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
527 rm -rf ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
528 time STAR \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
529 {params.star_params} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
530 --runMode alignReads \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
531 --runThreadN {threads} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
532 --genomeDir {params.index_base} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
533 --readFilesIn ${{READ_FILES}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
534 --twopassMode None \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
535 --outTmpDir ${{TMP}} \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
536 --outReadsUnmapped Fastx \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
537 --outMultimapperOrder Random \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
538 --outSAMreadID Number \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
539 --outSAMtype BAM Unsorted \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
540 --outSAMmode NoQS \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
541 --outSAMattributes NH MD \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
542 --chimOutType Junctions \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
543 --chimOutJunctionFormat 1 \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
544 --chimSegmentReadGapMax 3 \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
545 --chimJunctionOverhangMin 12 \
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
546 --chimSegmentMin 12 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
547
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
548 # Full set of output files:
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
549 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
550 # Aligned.out.bam
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
551 # Chimeric.out.junction
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
552 # Log.final.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
553 # Log.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
554 # Log.progress.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
555 # SJ.out.tab
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
556 # Unmapped.out.mate1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
557 # Unmapped.out.mate2 (if any reads were paired-end)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
558
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
559 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
560 # Logs
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
561 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
562 rm -rf ${{TMP}}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
563 cat Log.out >> {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
564 cat Log.final.out >> {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
565 rm -f Log*.out
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
566
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
567 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
568 # Junctions
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
569 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
570 test -f SJ.out.tab
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
571 time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
572 rm -f SJ.out.tab
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
573 size=$(wc -c < {output.jxs})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
574 echo "COUNT_CompressedJxBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
575
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
576 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
577 # Chimerics
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
578 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
579 test -f Chimeric.out.junction
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
580 test -s Chimeric.out.junction
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
581 sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
582 time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
583 rm -f Chimeric.out.junction Chimeric.out.junction.sorted
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
584 size=$(wc -c < {output.chimeric})
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
585 echo "COUNT_ChimericBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
586
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
587 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
588 # Unmapped
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
589 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
590 touch {output.unmapped2}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
591 test -f Unmapped.out.mate1
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
592 mv Unmapped.out.mate1 {output.unmapped1}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
593 if [[ -f Unmapped.out.mate2 ]] ; then
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
594 mv Unmapped.out.mate2 {output.unmapped2}
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
595 fi
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
596
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
597 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
598 # Alignments
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
599 #
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
600 size=$(wc -c < Aligned.out.bam)
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
601 echo "COUNT_BAMBytes ${{size}}"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
602 mv Aligned.out.bam {output.bam}
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
603
10
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
604 echo "COUNT_AlignComplete 1"
f43dd6f7c687 Uploaded
chrisw
parents:
diff changeset
605 """
16
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
606
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
607 rule fastq_check:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
608 input:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
609 reads0=config['temp'] + '/{quad}_0.fastq',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
610 reads1=config['temp'] + '/{quad}_1.fastq',
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
611 reads2=config['temp'] + '/{quad}_2.fastq'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
612 output:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
613 config['output'] + '/{quad}.fastq_check.tsv.zst'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
614 log:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
615 config['output'] + '/{quad}.fastq_check.log'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
616 params:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
617 srr=lambda wildcards: wildcards.quad.split('_')[0]
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
618 shell:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
619 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
620 TMP="{config[temp]}/fastq_check-{params.srr}.tsv"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
621 touch ${{TMP}}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
622 if [[ -s {input.reads0} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
623 time seqtk fqchk -q0 {input.reads0} >>${{TMP}} 2>>{log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
624 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
625 if [[ -s {input.reads1} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
626 time seqtk fqchk -q0 {input.reads1} >>${{TMP}} 2>>{log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
627 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
628 if [[ -s {input.reads2} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
629 time seqtk fqchk -q0 {input.reads2} >>${{TMP}} 2>>{log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
630 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
631 time zstd ${{TMP}} -o {output} 2>&1 | tee -a {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
632 size=$(wc -c < {output})
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
633 echo "COUNT_FastqCheckBytes ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
634
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
635 echo "COUNT_FastqCheckComplete 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
636 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
637
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
638 rule download:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
639 output:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
640 temp(config['temp'] + '/{quad}_0.fastq'),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
641 temp(config['temp'] + '/{quad}_1.fastq'),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
642 temp(config['temp'] + '/{quad}_2.fastq')
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
643 log:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
644 config['output'] + '/{quad}.download.log'
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
645 params:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
646 srr=lambda wildcards: wildcards.quad.split('_')[0],
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
647 method=lambda wildcards: wildcards.quad.split('_')[3],
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
648 fd_params=config.get('fastq_dump_args', ''),
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
649 retries=config.get('fastq_dump_retries', '5')
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
650 threads: 4
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
651 shell:
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
652 """
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
653 set -x
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
654 SUCCESS=0
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
655 TIMEOUT=10
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
656 PARAMS=""
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
657 TMP="{config[temp]}/dl-{params.srr}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
658 ! test -d ${{TMP}}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
659
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
660 if [[ {params.method} == "sra" ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
661 USE_FASTERQ=1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
662 for i in {{ 1..{params.retries} }} ; do
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
663 if time prefetch -t fasp -O ${{TMP}} -L info {params.srr} 2>&1 >> {log} ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
664 SUCCESS=1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
665 break
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
666 else
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
667 echo "COUNT_SraRetries 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
668 TIMEOUT=$((${{TIMEOUT}} * 2))
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
669 sleep ${{TIMEOUT}}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
670 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
671 done
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
672 if (( $SUCCESS == 0 )) ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
673 echo "COUNT_SraFailures 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
674 exit 1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
675 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
676 test -f ${{TMP}}/*.sra
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
677 size=$(cat ${{TMP}}/*.sra | wc -c)
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
678 echo "COUNT_SraBytesDownloaded ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
679 if (( ${{USE_FASTERQ}} == 1 )) ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
680 time fasterq-dump ${{TMP}}/*.sra \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
681 -e {threads} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
682 -t ${{TMP}} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
683 -L info \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
684 --split-files \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
685 --skip-technical \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
686 -o {params.srr}.fastq \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
687 2>&1 >> {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
688 test -f {params.srr}_2.fastq || mv {params.srr}.fastq {params.srr}_0.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
689 else
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
690 time fastq-dump ${{TMP}}/*.sra \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
691 -L info \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
692 --split-files \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
693 --skip-technical \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
694 -O . \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
695 2>&1 >> {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
696 test -f {params.srr}_2.fastq || mv {params.srr}_1.fastq {params.srr}_0.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
697 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
698 rm -rf ${{TMP}}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
699 elif [[ {params.method} == "gdc" ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
700 TOKEN=~/gdc/latest.txt
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
701 if [[ ! -f ${{TOKEN}} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
702 echo "ERROR: no GDC token file found at ${{TOKEN}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
703 exit 1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
704 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
705 for i in {{ 1..{params.retries} }} ; do
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
706 if time gdc-client download \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
707 -t ${{TOKEN}} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
708 --log-file {{TMP}}/log.txt \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
709 -d {{TEMP}} \
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
710 {params.srr} 2>&1 >> {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
711 then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
712 SUCCESS=1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
713 break
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
714 else
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
715 echo "COUNT_GdcRetries 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
716 TIMEOUT=$((${{TIMEOUT}} * 2))
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
717 sleep ${{TIMEOUT}}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
718 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
719 done
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
720 if (( $SUCCESS == 0 )) ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
721 echo "COUNT_GdcFailures 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
722 exit 1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
723 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
724 test -d {{TEMP}}/{params.srr}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
725 test -f {{TEMP}}/{params.srr}/*.tar.gz
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
726
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
727 echo "=== gdc-client log.txt begin ===" >> {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
728 cat {{TMP}}/log.txt >> {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
729 echo "=== gdc-client log.txt end===" >> {log}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
730
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
731 size=$(cat {{TEMP}}/{params.srr}/*.tar.gz | wc -c)
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
732 echo "COUNT_GdcBytesDownloaded ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
733
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
734 tar zxvf {{TEMP}}/{params.srr}/*.tar.gz
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
735 rm -rf {{TEMP}}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
736
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
737 num_1s=$(ls -1 *_1.fastq | wc -l)
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
738 num_2s=$(ls -1 *_2.fastq | wc -l)
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
739 if (( ${{num_1s}} == 0 )) ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
740 echo "ERROR: No _1.fastq files output"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
741 exit 1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
742 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
743 if (( ${{num_1s}} > 1 )) ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
744 echo "ERROR: More than one _1.fastq file found"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
745 exit 1
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
746 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
747 if (( ${{num_2s}} == 0 )) ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
748 # unpaired
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
749 mv *_1.fastq {params.srr}_0.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
750 else
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
751 # paired-end
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
752 mv *_1.fastq {params.srr}_1.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
753 mv *_2.fastq {params.srr}_2.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
754 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
755 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
756
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
757 # Next chunk expects the FASTQ files to exist in the current directory
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
758 # named {{params.srr}}_{{0,1,2}}.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
759 size=0
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
760 for i in {{0..2}} ; do
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
761 fn={params.srr}_${{i}}.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
762 if [[ -f ${{fn}} ]] ; then
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
763 echo "COUNT_FilesDownloaded 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
764 else
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
765 touch ${{fn}}
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
766 fi
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
767 size=$((${{size}} + $(wc -c < ${{fn}})))
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
768 mv ${{fn}} {config[temp]}/{wildcards.quad}_${{i}}.fastq
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
769 done
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
770 echo "COUNT_BytesDownloaded ${{size}}"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
771 echo "COUNT_DownloadComplete 1"
d2770bc432e1 Uploaded
chrisw
parents: 15
diff changeset
772 """