Mercurial > repos > chrisw > monorail_test
comparison Snakefile @ 10:f43dd6f7c687 draft
Uploaded
| author | chrisw |
|---|---|
| date | Wed, 13 Feb 2019 15:37:51 -0500 |
| parents | |
| children | 35ed7314038d |
comparison
equal
deleted
inserted
replaced
| 9:05932011f686 | 10:f43dd6f7c687 |
|---|---|
| 1 """ | |
| 2 Parameters: | |
| 3 - star: arguments to pass to STAR aligner | |
| 4 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10] | |
| 5 """ | |
| 6 | |
| 7 STEPS = ['align', 'sort', 'bamcount'] | |
| 8 | |
| 9 FILES = ['sjout.zst', | |
| 10 'bamcount_nonref.csv.zst', | |
| 11 'bamcount_auc.tsv', | |
| 12 'bamcount_frag.tsv', | |
| 13 'Chimeric.out.junction.zst', | |
| 14 'all.exon_bw_count.zst', 'unique.exon_bw_count.zst', | |
| 15 'manifest'] | |
| 16 | |
| 17 import subprocess | |
| 18 def run_command(cmd_args): | |
| 19 cmd_args = ' '.join(cmd_args) | |
| 20 try: | |
| 21 subprocess.check_call(args=cmd_args, shell=True) | |
| 22 except subprocess.CalledProcessError as cpe: | |
| 23 sys.stderr.write("error in run_command for command: %s\n" % cmd_args) | |
| 24 raise cpe | |
| 25 | |
| 26 | |
| 27 import re | |
| 28 FASTQ_PATT=re.compile(r'([^_\.]+)_(\d+)\.((fastq)|fq)(\.gz)?$') | |
| 29 import os | |
| 30 def prep_for_galaxy_run(): | |
| 31 try: | |
| 32 os.mkdir(config['temp']) | |
| 33 except OSError as ose: | |
| 34 pass | |
| 35 fastqs = config['inputs'].split(',') | |
| 36 m = FASTQ_PATT.search(fastqs[0]) | |
| 37 run_acc = m.group(1) | |
| 38 study_acc = run_acc | |
| 39 if 'study' in config: | |
| 40 study_acc = config['study'] | |
| 41 genome = 'hg38' | |
| 42 if 'genome' in config: | |
| 43 genome = config['genome'] | |
| 44 method = 'sra' | |
| 45 # SRR,SRP,genome | |
| 46 # e.g. SRR1557855,SRP045778,ce10 | |
| 47 #create links which will be used in the align step | |
| 48 i = 1 | |
| 49 for f in fastqs: | |
| 50 newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i) | |
| 51 run_command(['zcat',f,'>',newf]) | |
| 52 #create fastq 0 | |
| 53 if i == 1: | |
| 54 try: | |
| 55 os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0)) | |
| 56 except FileExistsError as fee: | |
| 57 pass | |
| 58 i += 1 | |
| 59 #create fastq 2 if not paired | |
| 60 if i == 2: | |
| 61 open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close() | |
| 62 #create expected file structure for annotated exon bed file & reference index | |
| 63 ref = config['ref'] | |
| 64 config['ref'] = '.' | |
| 65 os.makedirs('%s/%s' % (config['ref'], genome)) | |
| 66 os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome)) | |
| 67 os.makedirs('%s/%s/gtf' % (config['ref'], genome)) | |
| 68 os.symlink(config['exon_bed'], 'exons.tmp') | |
| 69 os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed'))) | |
| 70 return([run_acc, study_acc, genome, method]) | |
| 71 | |
| 72 | |
| 73 def get_accessions(wildcards): | |
| 74 """ | |
| 75 Grouping of SRRs with the same SRP could happen here | |
| 76 """ | |
| 77 #if running under galaxy where the user will input the | |
| 78 #FASTQs, study, and genome directly | |
| 79 if 'inputs' in config: | |
| 80 (run_acc, study_acc, genome, method) = prep_for_galaxy_run() | |
| 81 for ext in FILES: | |
| 82 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext)) | |
| 83 #here to get the make_galaxy_links rule to fire | |
| 84 yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method)) | |
| 85 #not running under galaxy, takes a list of accessions | |
| 86 else: | |
| 87 for fn in config['input'].split(): | |
| 88 with open(fn, 'r') as fh: | |
| 89 for ln in fh: | |
| 90 if ln.count(',') < 2: | |
| 91 continue | |
| 92 toks = ln.rstrip().split(',') | |
| 93 assert 3 <= len(toks) <= 4 | |
| 94 method = 'sra' | |
| 95 if len(toks) == 4: | |
| 96 method = toks[3] | |
| 97 # SRR,SRP,genome | |
| 98 # e.g. SRR1557855,SRP045778,ce10 | |
| 99 for ext in FILES: | |
| 100 yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext)) | |
| 101 | |
| 102 rule all: | |
| 103 input: | |
| 104 get_accessions | |
| 105 | |
| 106 | |
| 107 rule make_manifest: | |
| 108 input: | |
| 109 config['output'] + '/{quad}.sjout.zst', | |
| 110 config['output'] + '/{quad}.Chimeric.out.junction.zst', | |
| 111 config['output'] + '/{quad}.bamcount_nonref.csv.zst', | |
| 112 config['output'] + '/{quad}.bamcount_auc.tsv', | |
| 113 config['output'] + '/{quad}.bamcount_frag.tsv', | |
| 114 config['output'] + '/{quad}.all.exon_bw_count.zst', | |
| 115 config['output'] + '/{quad}.unique.exon_bw_count.zst', | |
| 116 output: | |
| 117 config['output'] + '/{quad}.manifest' | |
| 118 params: | |
| 119 quad=lambda wildcards: wildcards.quad | |
| 120 run: | |
| 121 with open(output[0], 'wt') as fh: | |
| 122 for fn in FILES: | |
| 123 fh.write(params.quad + "." + fn + '\n') | |
| 124 | |
| 125 def galaxy_link_files(op): | |
| 126 a = [op + '/' + f for f in FILES] | |
| 127 a.extend([op + '/align.log', op + '/bamcount.log']) | |
| 128 return a | |
| 129 | |
| 130 rule make_galaxy_links: | |
| 131 input: | |
| 132 config['output'] + '/{quad}.sjout.zst', | |
| 133 config['output'] + '/{quad}.bamcount_nonref.csv.zst', | |
| 134 config['output'] + '/{quad}.bamcount_auc.tsv', | |
| 135 config['output'] + '/{quad}.bamcount_frag.tsv', | |
| 136 config['output'] + '/{quad}.Chimeric.out.junction.zst', | |
| 137 config['output'] + '/{quad}.all.exon_bw_count.zst', | |
| 138 config['output'] + '/{quad}.unique.exon_bw_count.zst', | |
| 139 config['output'] + '/{quad}.manifest' | |
| 140 output: | |
| 141 config['output'] + '/{quad}.done' | |
| 142 params: | |
| 143 quad=lambda wildcards: wildcards.quad, | |
| 144 out=galaxy_link_files(config['output']) | |
| 145 run: | |
| 146 inputs = input.extend([config['output'] + '/' + params.quad + '.align.log', | |
| 147 config['output'] + '/' + params.quad + '.bamcount.log']) | |
| 148 for (i,fn) in enumerate(input): | |
| 149 os.symlink(os.path.abspath(fn), params.out[i]) | |
| 150 os.symlink(os.path.abspath(input[-3]), output[0]) | |
| 151 | |
| 152 | |
| 153 rule bamcount: | |
| 154 input: | |
| 155 bam=config['temp'] + '/{quad}-sorted.bam', | |
| 156 bamidx=config['temp'] + '/{quad}-sorted.bam.bai', | |
| 157 #exe='/bamcount/bamcount', | |
| 158 exe='bamcount', | |
| 159 bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed')) | |
| 160 output: | |
| 161 nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst', | |
| 162 auc=config['output'] + '/{quad}.bamcount_auc.tsv', | |
| 163 frag=config['output'] + '/{quad}.bamcount_frag.tsv', | |
| 164 all_bw=config['output'] + '/{quad}.all.bw.zst', | |
| 165 unique_bw=config['output'] + '/{quad}.unique.bw.zst', | |
| 166 all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst', | |
| 167 unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst' | |
| 168 log: | |
| 169 config['output'] + '/{quad}.bamcount.log' | |
| 170 params: | |
| 171 srr=lambda wildcards: wildcards.quad.split('_')[0], | |
| 172 uniq_qual=config.get('unique_qual', 10) | |
| 173 threads: 4 | |
| 174 shell: | |
| 175 """ | |
| 176 TMP={config[temp]}/{params.srr}_bamcount | |
| 177 {input.exe} {input.bam} \ | |
| 178 --threads {threads} \ | |
| 179 --coverage \ | |
| 180 --no-head \ | |
| 181 --require-mdz \ | |
| 182 --min-unique-qual {params.uniq_qual} \ | |
| 183 --frag-dist ${{TMP}} \ | |
| 184 --bigwig ${{TMP}} \ | |
| 185 --annotation {input.bed} ${{TMP}} \ | |
| 186 --auc ${{TMP}} \ | |
| 187 --alts ${{TMP}} \ | |
| 188 2>&1 | tee -a {log} | |
| 189 | |
| 190 # | |
| 191 # --alts | |
| 192 # | |
| 193 | |
| 194 (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log} | |
| 195 size=$(wc -c < {output.nonref}) | |
| 196 echo "COUNT_NonrefSize ${{size}}" | |
| 197 rm -f ${{TMP}}.alts.tsv | |
| 198 | |
| 199 # | |
| 200 # --auc | |
| 201 # | |
| 202 mv ${{TMP}}.auc.tsv {output.auc} | |
| 203 size=$(wc -c < {output.auc}) | |
| 204 echo "COUNT_AucSize ${{size}}" | |
| 205 rm -f ${{TMP}}.auc.tsv | |
| 206 | |
| 207 # | |
| 208 # --frag-dist | |
| 209 # | |
| 210 mv ${{TMP}}.frags.tsv {output.frag} | |
| 211 size=$(wc -c < {output.frag}) | |
| 212 echo "COUNT_FragDistSize ${{size}}" | |
| 213 rm -f ${{TMP}}.frags.tsv | |
| 214 | |
| 215 # | |
| 216 # --bigwig | |
| 217 # | |
| 218 | |
| 219 (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log} | |
| 220 size=$(wc -c < {output.all_bw}) | |
| 221 echo "COUNT_BwSize ${{size}}" | |
| 222 rm -f ${{TMP}}.all.bw | |
| 223 | |
| 224 (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log} | |
| 225 size=$(wc -c < {output.unique_bw}) | |
| 226 echo "COUNT_BwSize ${{size}}" | |
| 227 rm -f ${{TMP}}.unique.bw | |
| 228 | |
| 229 # | |
| 230 # --annotation | |
| 231 # | |
| 232 | |
| 233 (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log} | |
| 234 size=$(wc -c < {output.all_bw_count}) | |
| 235 echo "COUNT_BwQuantSize ${{size}}" | |
| 236 rm -f ${{TMP}}.all.tsv | |
| 237 | |
| 238 (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log} | |
| 239 size=$(wc -c < {output.unique_bw_count}) | |
| 240 echo "COUNT_BwQuantSize ${{size}}" | |
| 241 rm -f ${{TMP}}.unique.tsv | |
| 242 | |
| 243 # Check that all temporaries were properly purged | |
| 244 set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l) | |
| 245 if (( $num_files > 0 )) ; then | |
| 246 echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)" | |
| 247 exit 1 | |
| 248 fi | |
| 249 | |
| 250 echo "COUNT_BamcountComplete 1" | |
| 251 """ | |
| 252 | |
| 253 rule sort: | |
| 254 input: | |
| 255 config['temp'] + '/{quad}.bam' | |
| 256 wildcard_constraints: | |
| 257 quad="[^-]+" | |
| 258 output: | |
| 259 bam=temp(config['temp'] + '/{quad}-sorted.bam'), | |
| 260 bai=temp(config['temp'] + '/{quad}-sorted.bam.bai') | |
| 261 log: | |
| 262 config['output'] + '/{quad}.sort.log' | |
| 263 params: | |
| 264 srr=lambda wildcards: wildcards.quad.split('_')[0] | |
| 265 threads: 8 | |
| 266 shell: | |
| 267 """ | |
| 268 TMP="{config[temp]}/sort_temp.{params.srr}" | |
| 269 mkdir -p ${{TMP}} | |
| 270 time samtools sort \ | |
| 271 -T ${{TMP}}/samtools_temp \ | |
| 272 -@ {threads} \ | |
| 273 -m 64M \ | |
| 274 -o {output.bam} {input} 2>&1 | tee -a {log} | |
| 275 rm -rf ${{TMP}} | |
| 276 size=$(wc -c < {output.bam}) | |
| 277 echo "COUNT_SortedBAMBytes ${{size}}" | |
| 278 | |
| 279 time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log} | |
| 280 echo "COUNT_SortComplete 1" | |
| 281 """ | |
| 282 | |
| 283 rule align: | |
| 284 input: | |
| 285 reads0=config['temp'] + '/{quad}_0.fastq', | |
| 286 reads1=config['temp'] + '/{quad}_1.fastq', | |
| 287 reads2=config['temp'] + '/{quad}_2.fastq', | |
| 288 index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]), | |
| 289 index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2]) | |
| 290 wildcard_constraints: | |
| 291 quad="[^-]+" | |
| 292 output: | |
| 293 bam=temp(config['temp'] + '/{quad}.bam'), | |
| 294 jxs=config['output'] + '/{quad}.sjout.zst', | |
| 295 chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst', | |
| 296 unmapped1=config['temp'] + '/{quad}_1.unmappedfastq', | |
| 297 unmapped2=config['temp'] + '/{quad}_2.unmappedfastq' | |
| 298 log: | |
| 299 config['output'] + '/{quad}.align.log' | |
| 300 params: | |
| 301 index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]), | |
| 302 srr=lambda wildcards: wildcards.quad.split('_')[0], | |
| 303 star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '') | |
| 304 threads: 16 | |
| 305 shell: | |
| 306 """ | |
| 307 READ_FILES="{input.reads0}" | |
| 308 if [[ -s {input.reads2} ]] ; then | |
| 309 READ_FILES="{input.reads1} {input.reads2}" | |
| 310 fi | |
| 311 TMP="{config[temp]}/align_temp.{params.srr}" | |
| 312 rm -rf ${{TMP}} | |
| 313 time STAR \ | |
| 314 {params.star_params} \ | |
| 315 --runMode alignReads \ | |
| 316 --runThreadN {threads} \ | |
| 317 --genomeDir {params.index_base} \ | |
| 318 --readFilesIn ${{READ_FILES}} \ | |
| 319 --twopassMode None \ | |
| 320 --outTmpDir ${{TMP}} \ | |
| 321 --outReadsUnmapped Fastx \ | |
| 322 --outMultimapperOrder Random \ | |
| 323 --outSAMreadID Number \ | |
| 324 --outSAMtype BAM Unsorted \ | |
| 325 --outSAMmode NoQS \ | |
| 326 --outSAMattributes NH MD \ | |
| 327 --chimOutType Junctions \ | |
| 328 --chimOutJunctionFormat 1 \ | |
| 329 --chimSegmentReadGapMax 3 \ | |
| 330 --chimJunctionOverhangMin 12 \ | |
| 331 --chimSegmentMin 12 2>&1 | tee -a {log} | |
| 332 | |
| 333 # Full set of output files: | |
| 334 # | |
| 335 # Aligned.out.bam | |
| 336 # Chimeric.out.junction | |
| 337 # Log.final.out | |
| 338 # Log.out | |
| 339 # Log.progress.out | |
| 340 # SJ.out.tab | |
| 341 # Unmapped.out.mate1 | |
| 342 # Unmapped.out.mate2 (if any reads were paired-end) | |
| 343 | |
| 344 # | |
| 345 # Logs | |
| 346 # | |
| 347 rm -rf ${{TMP}} | |
| 348 cat Log.out >> {log} | |
| 349 cat Log.final.out >> {log} | |
| 350 rm -f Log*.out | |
| 351 | |
| 352 # | |
| 353 # Junctions | |
| 354 # | |
| 355 test -f SJ.out.tab | |
| 356 time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log} | |
| 357 rm -f SJ.out.tab | |
| 358 size=$(wc -c < {output.jxs}) | |
| 359 echo "COUNT_CompressedJxBytes ${{size}}" | |
| 360 | |
| 361 # | |
| 362 # Chimerics | |
| 363 # | |
| 364 test -f Chimeric.out.junction | |
| 365 test -s Chimeric.out.junction | |
| 366 sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted | |
| 367 time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log} | |
| 368 rm -f Chimeric.out.junction Chimeric.out.junction.sorted | |
| 369 size=$(wc -c < {output.chimeric}) | |
| 370 echo "COUNT_ChimericBytes ${{size}}" | |
| 371 | |
| 372 # | |
| 373 # Unmapped | |
| 374 # | |
| 375 touch {output.unmapped2} | |
| 376 test -f Unmapped.out.mate1 | |
| 377 mv Unmapped.out.mate1 {output.unmapped1} | |
| 378 if [[ -f Unmapped.out.mate2 ]] ; then | |
| 379 mv Unmapped.out.mate2 {output.unmapped2} | |
| 380 fi | |
| 381 | |
| 382 # | |
| 383 # Alignments | |
| 384 # | |
| 385 size=$(wc -c < Aligned.out.bam) | |
| 386 echo "COUNT_BAMBytes ${{size}}" | |
| 387 mv Aligned.out.bam {output.bam} | |
| 388 echo "COUNT_AlignComplete 1" | |
| 389 """ |
