| 10 | 1 """ | 
|  | 2 Parameters: | 
|  | 3 - star: arguments to pass to STAR aligner | 
|  | 4 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10] | 
|  | 5 """ | 
|  | 6 | 
|  | 7 STEPS = ['align', 'sort', 'bamcount'] | 
|  | 8 | 
|  | 9 FILES = ['sjout.zst', | 
|  | 10          'bamcount_nonref.csv.zst', | 
|  | 11          'bamcount_auc.tsv', | 
|  | 12          'bamcount_frag.tsv', | 
|  | 13          'Chimeric.out.junction.zst', | 
|  | 14          'all.exon_bw_count.zst', 'unique.exon_bw_count.zst', | 
|  | 15          'manifest'] | 
|  | 16 | 
|  | 17 import subprocess | 
|  | 18 def run_command(cmd_args): | 
|  | 19     cmd_args = ' '.join(cmd_args) | 
|  | 20     try: | 
|  | 21         subprocess.check_call(args=cmd_args, shell=True) | 
|  | 22     except subprocess.CalledProcessError as cpe: | 
|  | 23         sys.stderr.write("error in run_command for command: %s\n" % cmd_args) | 
|  | 24         raise cpe | 
|  | 25 | 
|  | 26 | 
|  | 27 import re | 
|  | 28 FASTQ_PATT=re.compile(r'([^_\.]+)_(\d+)\.((fastq)|fq)(\.gz)?$') | 
|  | 29 import os | 
|  | 30 def prep_for_galaxy_run(): | 
|  | 31     try: | 
|  | 32         os.mkdir(config['temp']) | 
|  | 33     except OSError as ose: | 
|  | 34         pass | 
|  | 35     fastqs = config['inputs'].split(',') | 
|  | 36     m = FASTQ_PATT.search(fastqs[0]) | 
|  | 37     run_acc = m.group(1) | 
|  | 38     study_acc = run_acc | 
|  | 39     if 'study' in config: | 
|  | 40         study_acc = config['study'] | 
|  | 41     genome = 'hg38' | 
|  | 42     if 'genome' in config: | 
|  | 43         genome = config['genome'] | 
|  | 44     method = 'sra' | 
|  | 45     # SRR,SRP,genome | 
|  | 46     # e.g. SRR1557855,SRP045778,ce10 | 
|  | 47     #create links which will be used in the align step | 
|  | 48     i = 1 | 
|  | 49     for f in fastqs: | 
|  | 50         newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i) | 
|  | 51         run_command(['zcat',f,'>',newf]) | 
|  | 52         #create fastq 0 | 
|  | 53         if i == 1: | 
|  | 54             try: | 
|  | 55                 os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0)) | 
|  | 56             except FileExistsError as fee: | 
|  | 57                 pass | 
|  | 58         i += 1 | 
|  | 59     #create fastq 2 if not paired | 
|  | 60     if i == 2: | 
|  | 61         open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close() | 
|  | 62     #create expected file structure for annotated exon bed file & reference index | 
|  | 63     ref = config['ref'] | 
|  | 64     config['ref'] = '.' | 
|  | 65     os.makedirs('%s/%s' % (config['ref'], genome)) | 
|  | 66     os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome)) | 
|  | 67     os.makedirs('%s/%s/gtf' % (config['ref'], genome)) | 
|  | 68     os.symlink(config['exon_bed'], 'exons.tmp') | 
|  | 69     os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed'))) | 
|  | 70     return([run_acc, study_acc, genome, method]) | 
|  | 71 | 
|  | 72 | 
|  | 73 def get_accessions(wildcards): | 
|  | 74     """ | 
|  | 75     Grouping of SRRs with the same SRP could happen here | 
|  | 76     """ | 
|  | 77     #if running under galaxy where the user will input the | 
|  | 78     #FASTQs, study, and genome directly | 
|  | 79     if 'inputs' in config: | 
|  | 80         (run_acc, study_acc, genome, method) = prep_for_galaxy_run() | 
|  | 81         for ext in FILES: | 
|  | 82             yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext)) | 
|  | 83         #here to get the make_galaxy_links rule to fire | 
|  | 84         yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method)) | 
|  | 85     #not running under galaxy, takes a list of accessions | 
|  | 86     else: | 
|  | 87         for fn in config['input'].split(): | 
|  | 88             with open(fn, 'r') as fh: | 
|  | 89                 for ln in fh: | 
|  | 90                     if ln.count(',') < 2: | 
|  | 91                         continue | 
|  | 92                     toks = ln.rstrip().split(',') | 
|  | 93                     assert 3 <= len(toks) <= 4 | 
|  | 94                     method = 'sra' | 
|  | 95                     if len(toks) == 4: | 
|  | 96                         method = toks[3] | 
|  | 97                     # SRR,SRP,genome | 
|  | 98                     # e.g. SRR1557855,SRP045778,ce10 | 
|  | 99                     for ext in FILES: | 
|  | 100                         yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext)) | 
|  | 101 | 
|  | 102 rule all: | 
|  | 103     input: | 
|  | 104         get_accessions | 
|  | 105 | 
|  | 106 | 
|  | 107 rule make_manifest: | 
|  | 108     input: | 
|  | 109         config['output'] + '/{quad}.sjout.zst', | 
|  | 110         config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
|  | 111         config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 112         config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 113         config['output'] + '/{quad}.bamcount_frag.tsv', | 
|  | 114         config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 115         config['output'] + '/{quad}.unique.exon_bw_count.zst', | 
|  | 116     output: | 
|  | 117         config['output'] + '/{quad}.manifest' | 
|  | 118     params: | 
|  | 119         quad=lambda wildcards: wildcards.quad | 
|  | 120     run: | 
|  | 121         with open(output[0], 'wt') as fh: | 
|  | 122             for fn in FILES: | 
|  | 123                 fh.write(params.quad + "." + fn + '\n') | 
|  | 124 | 
|  | 125 def galaxy_link_files(op): | 
|  | 126     a = [op + '/' + f for f in FILES] | 
|  | 127     a.extend([op + '/align.log', op + '/bamcount.log']) | 
|  | 128     return a | 
|  | 129 | 
|  | 130 rule make_galaxy_links: | 
|  | 131     input: | 
|  | 132         config['output'] + '/{quad}.sjout.zst', | 
|  | 133         config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 134         config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 135         config['output'] + '/{quad}.bamcount_frag.tsv', | 
|  | 136         config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
|  | 137         config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 138         config['output'] + '/{quad}.unique.exon_bw_count.zst', | 
|  | 139         config['output'] + '/{quad}.manifest' | 
|  | 140     output: | 
|  | 141         config['output'] + '/{quad}.done' | 
|  | 142     params: | 
|  | 143         quad=lambda wildcards: wildcards.quad, | 
|  | 144         out=galaxy_link_files(config['output']) | 
|  | 145     run: | 
|  | 146         inputs = input.extend([config['output'] + '/' + params.quad + '.align.log', | 
|  | 147                                config['output'] + '/' + params.quad + '.bamcount.log']) | 
|  | 148         for (i,fn) in enumerate(input): | 
|  | 149             os.symlink(os.path.abspath(fn), params.out[i]) | 
|  | 150         os.symlink(os.path.abspath(input[-3]), output[0]) | 
|  | 151 | 
|  | 152 | 
|  | 153 rule bamcount: | 
|  | 154     input: | 
|  | 155         bam=config['temp'] + '/{quad}-sorted.bam', | 
|  | 156         bamidx=config['temp'] + '/{quad}-sorted.bam.bai', | 
|  | 157         #exe='/bamcount/bamcount', | 
|  | 158         exe='bamcount', | 
|  | 159         bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed')) | 
|  | 160     output: | 
|  | 161         nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 162         auc=config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 163         frag=config['output'] + '/{quad}.bamcount_frag.tsv', | 
|  | 164         all_bw=config['output'] + '/{quad}.all.bw.zst', | 
|  | 165         unique_bw=config['output'] + '/{quad}.unique.bw.zst', | 
|  | 166         all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 167         unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst' | 
|  | 168     log: | 
|  | 169         config['output'] + '/{quad}.bamcount.log' | 
|  | 170     params: | 
|  | 171         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 172         uniq_qual=config.get('unique_qual', 10) | 
|  | 173     threads: 4 | 
|  | 174     shell: | 
|  | 175         """ | 
|  | 176         TMP={config[temp]}/{params.srr}_bamcount | 
|  | 177         {input.exe} {input.bam} \ | 
|  | 178             --threads {threads} \ | 
|  | 179             --coverage \ | 
|  | 180             --no-head \ | 
|  | 181             --require-mdz \ | 
|  | 182             --min-unique-qual {params.uniq_qual} \ | 
|  | 183             --frag-dist ${{TMP}} \ | 
|  | 184             --bigwig ${{TMP}} \ | 
|  | 185             --annotation {input.bed} ${{TMP}} \ | 
|  | 186             --auc ${{TMP}} \ | 
|  | 187             --alts ${{TMP}} \ | 
|  | 188             2>&1 | tee -a {log} | 
|  | 189 | 
|  | 190         # | 
|  | 191         # --alts | 
|  | 192         # | 
|  | 193 | 
|  | 194         (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log} | 
|  | 195         size=$(wc -c < {output.nonref}) | 
|  | 196         echo "COUNT_NonrefSize ${{size}}" | 
|  | 197         rm -f ${{TMP}}.alts.tsv | 
|  | 198 | 
|  | 199         # | 
|  | 200         # --auc | 
|  | 201         # | 
|  | 202         mv ${{TMP}}.auc.tsv {output.auc} | 
|  | 203         size=$(wc -c < {output.auc}) | 
|  | 204         echo "COUNT_AucSize ${{size}}" | 
|  | 205         rm -f ${{TMP}}.auc.tsv | 
|  | 206 | 
|  | 207         # | 
|  | 208         # --frag-dist | 
|  | 209         # | 
|  | 210         mv ${{TMP}}.frags.tsv {output.frag} | 
|  | 211         size=$(wc -c < {output.frag}) | 
|  | 212         echo "COUNT_FragDistSize ${{size}}" | 
|  | 213         rm -f ${{TMP}}.frags.tsv | 
|  | 214 | 
|  | 215         # | 
|  | 216         # --bigwig | 
|  | 217         # | 
|  | 218 | 
|  | 219         (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log} | 
|  | 220         size=$(wc -c < {output.all_bw}) | 
|  | 221         echo "COUNT_BwSize ${{size}}" | 
|  | 222         rm -f ${{TMP}}.all.bw | 
|  | 223 | 
|  | 224         (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log} | 
|  | 225         size=$(wc -c < {output.unique_bw}) | 
|  | 226         echo "COUNT_BwSize ${{size}}" | 
|  | 227         rm -f ${{TMP}}.unique.bw | 
|  | 228 | 
|  | 229         # | 
|  | 230         # --annotation | 
|  | 231         # | 
|  | 232 | 
|  | 233         (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log} | 
|  | 234         size=$(wc -c < {output.all_bw_count}) | 
|  | 235         echo "COUNT_BwQuantSize ${{size}}" | 
|  | 236         rm -f ${{TMP}}.all.tsv | 
|  | 237 | 
|  | 238         (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log} | 
|  | 239         size=$(wc -c < {output.unique_bw_count}) | 
|  | 240         echo "COUNT_BwQuantSize ${{size}}" | 
|  | 241         rm -f ${{TMP}}.unique.tsv | 
|  | 242 | 
|  | 243         # Check that all temporaries were properly purged | 
|  | 244         set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l) | 
|  | 245         if (( $num_files > 0 )) ; then | 
|  | 246             echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)" | 
|  | 247             exit 1 | 
|  | 248         fi | 
|  | 249 | 
|  | 250         echo "COUNT_BamcountComplete 1" | 
|  | 251         """ | 
|  | 252 | 
|  | 253 rule sort: | 
|  | 254     input: | 
|  | 255         config['temp'] + '/{quad}.bam' | 
|  | 256     wildcard_constraints: | 
|  | 257         quad="[^-]+" | 
|  | 258     output: | 
|  | 259         bam=temp(config['temp'] + '/{quad}-sorted.bam'), | 
|  | 260         bai=temp(config['temp'] + '/{quad}-sorted.bam.bai') | 
|  | 261     log: | 
|  | 262         config['output'] + '/{quad}.sort.log' | 
|  | 263     params: | 
|  | 264         srr=lambda wildcards: wildcards.quad.split('_')[0] | 
|  | 265     threads: 8 | 
|  | 266     shell: | 
|  | 267         """ | 
|  | 268         TMP="{config[temp]}/sort_temp.{params.srr}" | 
|  | 269         mkdir -p ${{TMP}} | 
|  | 270         time samtools sort \ | 
|  | 271             -T ${{TMP}}/samtools_temp \ | 
|  | 272             -@ {threads} \ | 
|  | 273             -m 64M \ | 
|  | 274             -o {output.bam} {input} 2>&1 | tee -a {log} | 
|  | 275         rm -rf ${{TMP}} | 
|  | 276         size=$(wc -c < {output.bam}) | 
|  | 277         echo "COUNT_SortedBAMBytes ${{size}}" | 
|  | 278 | 
|  | 279         time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log} | 
|  | 280         echo "COUNT_SortComplete 1" | 
|  | 281         """ | 
|  | 282 | 
|  | 283 rule align: | 
|  | 284     input: | 
|  | 285         reads0=config['temp'] + '/{quad}_0.fastq', | 
|  | 286         reads1=config['temp'] + '/{quad}_1.fastq', | 
|  | 287         reads2=config['temp'] + '/{quad}_2.fastq', | 
|  | 288         index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 289         index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 290     wildcard_constraints: | 
|  | 291         quad="[^-]+" | 
|  | 292     output: | 
|  | 293         bam=temp(config['temp'] + '/{quad}.bam'), | 
|  | 294         jxs=config['output'] + '/{quad}.sjout.zst', | 
|  | 295         chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
|  | 296         unmapped1=config['temp'] + '/{quad}_1.unmappedfastq', | 
|  | 297         unmapped2=config['temp'] + '/{quad}_2.unmappedfastq' | 
|  | 298     log: | 
|  | 299         config['output'] + '/{quad}.align.log' | 
|  | 300     params: | 
|  | 301         index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 302         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 303         star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '') | 
|  | 304     threads: 16 | 
|  | 305     shell: | 
|  | 306         """ | 
|  | 307         READ_FILES="{input.reads0}" | 
|  | 308         if [[ -s {input.reads2} ]] ; then | 
|  | 309             READ_FILES="{input.reads1} {input.reads2}" | 
|  | 310         fi | 
|  | 311         TMP="{config[temp]}/align_temp.{params.srr}" | 
|  | 312         rm -rf ${{TMP}} | 
|  | 313         time STAR \ | 
|  | 314             {params.star_params} \ | 
|  | 315             --runMode alignReads \ | 
|  | 316             --runThreadN {threads} \ | 
|  | 317             --genomeDir {params.index_base} \ | 
|  | 318             --readFilesIn ${{READ_FILES}} \ | 
|  | 319             --twopassMode None \ | 
|  | 320             --outTmpDir ${{TMP}} \ | 
|  | 321             --outReadsUnmapped Fastx \ | 
|  | 322             --outMultimapperOrder Random \ | 
|  | 323             --outSAMreadID Number \ | 
|  | 324             --outSAMtype BAM Unsorted \ | 
|  | 325             --outSAMmode NoQS \ | 
|  | 326             --outSAMattributes NH MD \ | 
|  | 327             --chimOutType Junctions \ | 
|  | 328             --chimOutJunctionFormat 1 \ | 
|  | 329             --chimSegmentReadGapMax 3 \ | 
|  | 330             --chimJunctionOverhangMin 12 \ | 
|  | 331             --chimSegmentMin 12 2>&1 | tee -a {log} | 
|  | 332 | 
|  | 333         # Full set of output files: | 
|  | 334         # | 
|  | 335         # Aligned.out.bam | 
|  | 336         # Chimeric.out.junction | 
|  | 337         # Log.final.out | 
|  | 338         # Log.out | 
|  | 339         # Log.progress.out | 
|  | 340         # SJ.out.tab | 
|  | 341         # Unmapped.out.mate1 | 
|  | 342         # Unmapped.out.mate2 (if any reads were paired-end) | 
|  | 343 | 
|  | 344         # | 
|  | 345         # Logs | 
|  | 346         # | 
|  | 347         rm -rf ${{TMP}} | 
|  | 348         cat Log.out >> {log} | 
|  | 349         cat Log.final.out >> {log} | 
|  | 350         rm -f Log*.out | 
|  | 351 | 
|  | 352         # | 
|  | 353         # Junctions | 
|  | 354         # | 
|  | 355         test -f SJ.out.tab | 
|  | 356         time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log} | 
|  | 357         rm -f SJ.out.tab | 
|  | 358         size=$(wc -c < {output.jxs}) | 
|  | 359         echo "COUNT_CompressedJxBytes ${{size}}" | 
|  | 360 | 
|  | 361         # | 
|  | 362         # Chimerics | 
|  | 363         # | 
|  | 364         test -f Chimeric.out.junction | 
|  | 365         test -s Chimeric.out.junction | 
|  | 366         sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted | 
|  | 367         time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log} | 
|  | 368         rm -f Chimeric.out.junction Chimeric.out.junction.sorted | 
|  | 369         size=$(wc -c < {output.chimeric}) | 
|  | 370         echo "COUNT_ChimericBytes ${{size}}" | 
|  | 371 | 
|  | 372         # | 
|  | 373         # Unmapped | 
|  | 374         # | 
|  | 375         touch {output.unmapped2} | 
|  | 376         test -f Unmapped.out.mate1 | 
|  | 377         mv Unmapped.out.mate1 {output.unmapped1} | 
|  | 378         if [[ -f Unmapped.out.mate2 ]] ; then | 
|  | 379             mv Unmapped.out.mate2 {output.unmapped2} | 
|  | 380         fi | 
|  | 381 | 
|  | 382         # | 
|  | 383         # Alignments | 
|  | 384         # | 
|  | 385         size=$(wc -c < Aligned.out.bam) | 
|  | 386         echo "COUNT_BAMBytes ${{size}}" | 
|  | 387         mv Aligned.out.bam {output.bam} | 
|  | 388         echo "COUNT_AlignComplete 1" | 
|  | 389         """ |