| 10 | 1 """ | 
|  | 2 Parameters: | 
| 16 | 3 - fastq_dump_args: arguments to pass to fastq dumping tool | 
|  | 4 - fastq_dump_retries: number of retry attempts before dying | 
|  | 5 - fastq_dump_tool: name of tool to run, e.g. 'fastq-dump' or 'fasterq-dump' | 
| 10 | 6 - star: arguments to pass to STAR aligner | 
| 16 | 7 - salmon_args: arguments to pass to salmon quant | 
| 10 | 8 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10] | 
| 16 | 9 - bw_bed: name of BED file to use with bwtool | 
|  | 10 - max_unalign: maximum number of unaligned reads to save per run accession | 
| 10 | 11 """ | 
|  | 12 | 
| 16 | 13 STEPS = ['download', 'fastq_check', 'align', 'sort', | 
|  | 14          'bamcount', | 
|  | 15          'salmon', | 
|  | 16          'align_unmapped', | 
|  | 17          'extract_jx'] | 
| 10 | 18 | 
| 16 | 19 FILES = ['sjout.zst', 'fastq_check.tsv.zst', | 
|  | 20          'unmapped.bam', 'unmapped.fastq.zst', | 
| 10 | 21          'bamcount_nonref.csv.zst', | 
|  | 22          'bamcount_auc.tsv', | 
|  | 23          'bamcount_frag.tsv', | 
|  | 24          'Chimeric.out.junction.zst', | 
|  | 25          'all.exon_bw_count.zst', 'unique.exon_bw_count.zst', | 
| 16 | 26          'all.bw.zst', 'unique.bw.zst', | 
|  | 27          'salmon.tsv.zst', | 
|  | 28          'jx_bed.zst', | 
|  | 29          'manifest'] + list(map(lambda x: x + '.log', STEPS)) | 
| 10 | 30 | 
| 14 | 31 #get rid of the pesky warnings about current paths | 
|  | 32 #and make sure we're not assuming any wrong relative paths | 
| 12 | 33 import os | 
|  | 34 config['output']=os.path.abspath(config['output']) | 
| 13 | 35 config['temp']=os.path.abspath(config['temp']) | 
| 14 | 36 config['exon_bed']=os.path.abspath(config['exon_bed']) | 
|  | 37 config['ref']=os.path.abspath(config['ref']) | 
| 12 | 38 | 
| 10 | 39 import subprocess | 
|  | 40 def run_command(cmd_args): | 
|  | 41     cmd_args = ' '.join(cmd_args) | 
|  | 42     try: | 
|  | 43         subprocess.check_call(args=cmd_args, shell=True) | 
|  | 44     except subprocess.CalledProcessError as cpe: | 
|  | 45         sys.stderr.write("error in run_command for command: %s\n" % cmd_args) | 
|  | 46         raise cpe | 
|  | 47 | 
| 16 | 48 import re | 
|  | 49 STEPS_FILES_FILTER=re.compile(r'(unmapped)|(download)|(salmon)|(extract_jx)|(jx_bed)|(manifest)') | 
|  | 50 def remove_steps_files(): | 
|  | 51     #modify STEP and FILES | 
|  | 52     #so we don't run download or unmapped steps | 
|  | 53     global FILES | 
|  | 54     global STEPS | 
|  | 55     for i in reversed(range(0,len(STEPS))): | 
|  | 56         f = STEPS[i] | 
|  | 57         if STEPS_FILES_FILTER.search(f): | 
|  | 58             del(STEPS[i]) | 
|  | 59     for i in reversed(range(0,len(FILES))): | 
|  | 60         f = FILES[i] | 
|  | 61         if STEPS_FILES_FILTER.search(f): | 
|  | 62             del(FILES[i]) | 
| 10 | 63 | 
| 16 | 64 | 
| 11 | 65 FASTQ_PATT=re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$') | 
| 10 | 66 def prep_for_galaxy_run(): | 
| 16 | 67     remove_steps_files() | 
| 10 | 68     try: | 
|  | 69         os.mkdir(config['temp']) | 
|  | 70     except OSError as ose: | 
|  | 71         pass | 
|  | 72     fastqs = config['inputs'].split(',') | 
|  | 73     m = FASTQ_PATT.search(fastqs[0]) | 
| 11 | 74     run_acc = 'sample' | 
|  | 75     if m is not None: | 
|  | 76         run_acc = m.group(1) | 
| 12 | 77     study_acc = 'study' | 
| 10 | 78     if 'study' in config: | 
|  | 79         study_acc = config['study'] | 
|  | 80     genome = 'hg38' | 
|  | 81     if 'genome' in config: | 
|  | 82         genome = config['genome'] | 
|  | 83     method = 'sra' | 
|  | 84     # SRR,SRP,genome | 
|  | 85     # e.g. SRR1557855,SRP045778,ce10 | 
|  | 86     #create links which will be used in the align step | 
|  | 87     i = 1 | 
|  | 88     for f in fastqs: | 
|  | 89         newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i) | 
| 15 | 90         if 'compressed' in config: | 
|  | 91             run_command(['zcat',f,'>',newf]) | 
|  | 92         else: | 
|  | 93             os.symlink(os.path.abspath(f), newf) | 
| 10 | 94         #create fastq 0 | 
|  | 95         if i == 1: | 
| 15 | 96             os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0)) | 
| 10 | 97         i += 1 | 
|  | 98     #create fastq 2 if not paired | 
|  | 99     if i == 2: | 
|  | 100         open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close() | 
|  | 101     #create expected file structure for annotated exon bed file & reference index | 
|  | 102     ref = config['ref'] | 
| 16 | 103     config['ref'] = os.path.abspath('.') | 
| 10 | 104     os.makedirs('%s/%s' % (config['ref'], genome)) | 
|  | 105     os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome)) | 
|  | 106     os.makedirs('%s/%s/gtf' % (config['ref'], genome)) | 
|  | 107     os.symlink(config['exon_bed'], 'exons.tmp') | 
|  | 108     os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed'))) | 
| 16 | 109     #TODO may need to add additional symlinking for HISAT2 unmapped alignments | 
| 10 | 110     return([run_acc, study_acc, genome, method]) | 
|  | 111 | 
|  | 112 | 
|  | 113 def get_accessions(wildcards): | 
|  | 114     """ | 
|  | 115     Grouping of SRRs with the same SRP could happen here | 
|  | 116     """ | 
|  | 117     #if running under galaxy where the user will input the | 
|  | 118     #FASTQs, study, and genome directly | 
|  | 119     if 'inputs' in config: | 
|  | 120         (run_acc, study_acc, genome, method) = prep_for_galaxy_run() | 
|  | 121         for ext in FILES: | 
|  | 122             yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext)) | 
|  | 123         #here to get the make_galaxy_links rule to fire | 
|  | 124         yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method)) | 
|  | 125     #not running under galaxy, takes a list of accessions | 
|  | 126     else: | 
|  | 127         for fn in config['input'].split(): | 
|  | 128             with open(fn, 'r') as fh: | 
|  | 129                 for ln in fh: | 
|  | 130                     if ln.count(',') < 2: | 
|  | 131                         continue | 
|  | 132                     toks = ln.rstrip().split(',') | 
|  | 133                     assert 3 <= len(toks) <= 4 | 
|  | 134                     method = 'sra' | 
|  | 135                     if len(toks) == 4: | 
|  | 136                         method = toks[3] | 
|  | 137                     # SRR,SRP,genome | 
|  | 138                     # e.g. SRR1557855,SRP045778,ce10 | 
|  | 139                     for ext in FILES: | 
|  | 140                         yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext)) | 
|  | 141 | 
|  | 142 rule all: | 
|  | 143     input: | 
|  | 144         get_accessions | 
|  | 145 | 
|  | 146 rule make_manifest: | 
|  | 147     input: | 
| 16 | 148         config['output'] + '/{quad}.salmon.tsv.zst', | 
| 10 | 149         config['output'] + '/{quad}.sjout.zst', | 
| 16 | 150         config['output'] + '/{quad}.jx_bed.zst', | 
| 10 | 151         config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
| 16 | 152         config['output'] + '/{quad}.unmapped.bam', | 
|  | 153         config['output'] + '/{quad}.unmapped.fastq.zst', | 
| 10 | 154         config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 155         config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 156         config['output'] + '/{quad}.bamcount_frag.tsv', | 
| 16 | 157         config['output'] + '/{quad}.fastq_check.tsv.zst', | 
| 10 | 158         config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 159         config['output'] + '/{quad}.unique.exon_bw_count.zst', | 
| 16 | 160         config['output'] + '/{quad}.all.bw.zst', | 
|  | 161         config['output'] + '/{quad}.unique.bw.zst', | 
|  | 162         config['output'] + '/{quad}.align.log', | 
|  | 163         config['output'] + '/{quad}.extract_jx.log', | 
|  | 164         config['output'] + '/{quad}.bamcount.log', | 
|  | 165         config['output'] + '/{quad}.align_unmapped.log', | 
|  | 166         config['output'] + '/{quad}.download.log', | 
|  | 167         config['output'] + '/{quad}.fastq_check.log', | 
|  | 168         config['output'] + '/{quad}.sort.log', | 
|  | 169         config['output'] + '/{quad}.salmon.log' | 
| 10 | 170     output: | 
|  | 171         config['output'] + '/{quad}.manifest' | 
|  | 172     params: | 
|  | 173         quad=lambda wildcards: wildcards.quad | 
|  | 174     run: | 
|  | 175         with open(output[0], 'wt') as fh: | 
|  | 176             for fn in FILES: | 
|  | 177                 fh.write(params.quad + "." + fn + '\n') | 
|  | 178 | 
|  | 179 def galaxy_link_files(op): | 
| 16 | 180     remove_steps_files() | 
| 10 | 181     a = [op + '/' + f for f in FILES] | 
|  | 182     return a | 
|  | 183 | 
|  | 184 rule make_galaxy_links: | 
|  | 185     input: | 
|  | 186         config['output'] + '/{quad}.sjout.zst', | 
| 16 | 187         config['output'] + '/{quad}.fastq_check.tsv.zst', | 
| 10 | 188         config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 189         config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 190         config['output'] + '/{quad}.bamcount_frag.tsv', | 
|  | 191         config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
|  | 192         config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 193         config['output'] + '/{quad}.unique.exon_bw_count.zst', | 
| 16 | 194         config['output'] + '/{quad}.all.bw.zst', | 
|  | 195         config['output'] + '/{quad}.unique.bw.zst', | 
|  | 196         config['output'] + '/{quad}.fastq_check.log', | 
|  | 197         config['output'] + '/{quad}.align.log', | 
|  | 198         config['output'] + '/{quad}.sort.log', | 
|  | 199         config['output'] + '/{quad}.bamcount.log', | 
| 10 | 200     output: | 
|  | 201         config['output'] + '/{quad}.done' | 
|  | 202     params: | 
|  | 203         quad=lambda wildcards: wildcards.quad, | 
|  | 204         out=galaxy_link_files(config['output']) | 
|  | 205     run: | 
|  | 206         for (i,fn) in enumerate(input): | 
|  | 207             os.symlink(os.path.abspath(fn), params.out[i]) | 
| 16 | 208         os.symlink(os.path.abspath(input[-1]), output[0]) | 
| 10 | 209 | 
|  | 210 | 
|  | 211 rule bamcount: | 
|  | 212     input: | 
|  | 213         bam=config['temp'] + '/{quad}-sorted.bam', | 
|  | 214         bamidx=config['temp'] + '/{quad}-sorted.bam.bai', | 
|  | 215         bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed')) | 
|  | 216     output: | 
|  | 217         nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 218         auc=config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 219         frag=config['output'] + '/{quad}.bamcount_frag.tsv', | 
|  | 220         all_bw=config['output'] + '/{quad}.all.bw.zst', | 
|  | 221         unique_bw=config['output'] + '/{quad}.unique.bw.zst', | 
|  | 222         all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 223         unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst' | 
|  | 224     log: | 
|  | 225         config['output'] + '/{quad}.bamcount.log' | 
|  | 226     params: | 
|  | 227         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 228         uniq_qual=config.get('unique_qual', 10) | 
|  | 229     threads: 4 | 
|  | 230     shell: | 
|  | 231         """ | 
|  | 232         TMP={config[temp]}/{params.srr}_bamcount | 
| 14 | 233         bamcount {input.bam} \ | 
| 10 | 234             --threads {threads} \ | 
|  | 235             --coverage \ | 
|  | 236             --no-head \ | 
|  | 237             --require-mdz \ | 
|  | 238             --min-unique-qual {params.uniq_qual} \ | 
|  | 239             --frag-dist ${{TMP}} \ | 
|  | 240             --bigwig ${{TMP}} \ | 
|  | 241             --annotation {input.bed} ${{TMP}} \ | 
|  | 242             --auc ${{TMP}} \ | 
|  | 243             --alts ${{TMP}} \ | 
|  | 244             2>&1 | tee -a {log} | 
|  | 245 | 
|  | 246         # | 
|  | 247         # --alts | 
|  | 248         # | 
|  | 249 | 
|  | 250         (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log} | 
|  | 251         size=$(wc -c < {output.nonref}) | 
|  | 252         echo "COUNT_NonrefSize ${{size}}" | 
|  | 253         rm -f ${{TMP}}.alts.tsv | 
|  | 254 | 
|  | 255         # | 
|  | 256         # --auc | 
|  | 257         # | 
|  | 258         mv ${{TMP}}.auc.tsv {output.auc} | 
|  | 259         size=$(wc -c < {output.auc}) | 
|  | 260         echo "COUNT_AucSize ${{size}}" | 
|  | 261         rm -f ${{TMP}}.auc.tsv | 
|  | 262 | 
|  | 263         # | 
|  | 264         # --frag-dist | 
|  | 265         # | 
|  | 266         mv ${{TMP}}.frags.tsv {output.frag} | 
|  | 267         size=$(wc -c < {output.frag}) | 
|  | 268         echo "COUNT_FragDistSize ${{size}}" | 
|  | 269         rm -f ${{TMP}}.frags.tsv | 
|  | 270 | 
|  | 271         # | 
|  | 272         # --bigwig | 
|  | 273         # | 
|  | 274 | 
|  | 275         (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log} | 
|  | 276         size=$(wc -c < {output.all_bw}) | 
|  | 277         echo "COUNT_BwSize ${{size}}" | 
|  | 278         rm -f ${{TMP}}.all.bw | 
|  | 279 | 
|  | 280         (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log} | 
|  | 281         size=$(wc -c < {output.unique_bw}) | 
|  | 282         echo "COUNT_BwSize ${{size}}" | 
|  | 283         rm -f ${{TMP}}.unique.bw | 
|  | 284 | 
|  | 285         # | 
|  | 286         # --annotation | 
|  | 287         # | 
|  | 288 | 
|  | 289         (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log} | 
|  | 290         size=$(wc -c < {output.all_bw_count}) | 
|  | 291         echo "COUNT_BwQuantSize ${{size}}" | 
|  | 292         rm -f ${{TMP}}.all.tsv | 
|  | 293 | 
|  | 294         (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log} | 
|  | 295         size=$(wc -c < {output.unique_bw_count}) | 
|  | 296         echo "COUNT_BwQuantSize ${{size}}" | 
|  | 297         rm -f ${{TMP}}.unique.tsv | 
|  | 298 | 
|  | 299         # Check that all temporaries were properly purged | 
|  | 300         set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l) | 
|  | 301         if (( $num_files > 0 )) ; then | 
|  | 302             echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)" | 
|  | 303             exit 1 | 
|  | 304         fi | 
|  | 305 | 
|  | 306         echo "COUNT_BamcountComplete 1" | 
|  | 307         """ | 
|  | 308 | 
| 16 | 309 rule bw_zstd: | 
|  | 310     input: | 
|  | 311         config['temp'] + '/{prefix}.bw' | 
|  | 312     output: | 
|  | 313         config['output'] + '/{prefix}.bw.zst' | 
|  | 314     shell: | 
|  | 315         """ | 
|  | 316         zstd {input} -o {output} | 
|  | 317         size=$(wc -c < {output}) | 
|  | 318         echo "COUNT_BwBytes ${{size}}" | 
|  | 319         echo "COUNT_BwZstdComplete 1" | 
|  | 320         """ | 
|  | 321 | 
| 10 | 322 rule sort: | 
|  | 323     input: | 
|  | 324         config['temp'] + '/{quad}.bam' | 
|  | 325     wildcard_constraints: | 
|  | 326         quad="[^-]+" | 
|  | 327     output: | 
|  | 328         bam=temp(config['temp'] + '/{quad}-sorted.bam'), | 
|  | 329         bai=temp(config['temp'] + '/{quad}-sorted.bam.bai') | 
|  | 330     log: | 
|  | 331         config['output'] + '/{quad}.sort.log' | 
|  | 332     params: | 
|  | 333         srr=lambda wildcards: wildcards.quad.split('_')[0] | 
|  | 334     threads: 8 | 
|  | 335     shell: | 
|  | 336         """ | 
|  | 337         TMP="{config[temp]}/sort_temp.{params.srr}" | 
|  | 338         mkdir -p ${{TMP}} | 
|  | 339         time samtools sort \ | 
|  | 340             -T ${{TMP}}/samtools_temp \ | 
|  | 341             -@ {threads} \ | 
|  | 342             -m 64M \ | 
|  | 343             -o {output.bam} {input} 2>&1 | tee -a {log} | 
|  | 344         rm -rf ${{TMP}} | 
|  | 345         size=$(wc -c < {output.bam}) | 
|  | 346         echo "COUNT_SortedBAMBytes ${{size}}" | 
|  | 347 | 
|  | 348         time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log} | 
|  | 349         echo "COUNT_SortComplete 1" | 
|  | 350         """ | 
|  | 351 | 
| 16 | 352 rule salmon: | 
|  | 353     input: | 
|  | 354         reads0=config['temp'] + '/{quad}_0.fastq', | 
|  | 355         reads1=config['temp'] + '/{quad}_1.fastq', | 
|  | 356         reads2=config['temp'] + '/{quad}_2.fastq', | 
|  | 357         index1=lambda wildcards: '%s/%s/salmon_index/hash.bin' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 358         index2=lambda wildcards: '%s/%s/salmon_index/sa.bin' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 359     output: | 
|  | 360         config['output'] + '/{quad}.salmon.tsv.zst' | 
|  | 361     log: | 
|  | 362         config['output'] + '/{quad}.salmon.log' | 
|  | 363     params: | 
|  | 364         index_base=lambda wildcards: '%s/%s/salmon_index' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 365         salmon_args=config.get('salmon_args', '') | 
|  | 366     threads: 8 | 
|  | 367     shell: | 
|  | 368         """ | 
|  | 369         READ_FILES="-r {input.reads0}" | 
|  | 370         if [[ -s {input.reads2} ]] ; then | 
|  | 371             READ_FILES="-1 {input.reads1} -2 {input.reads2}" | 
|  | 372         fi | 
|  | 373         if set -o pipefail && time salmon quant \ | 
|  | 374             --libType U \ | 
|  | 375             --quiet \ | 
|  | 376             --validateMappings \ | 
|  | 377             -i {params.index_base} \ | 
|  | 378             -p {threads} \ | 
|  | 379             {params.salmon_args} \ | 
|  | 380             ${{READ_FILES}} \ | 
|  | 381             --output salmon_quant \ | 
|  | 382             --minAssignedFrags 1 \ | 
|  | 383             2>&1 | tee -a {log} | 
|  | 384         then | 
|  | 385             echo "COUNT_SalmonSuccess 1" | 
|  | 386 | 
|  | 387             time zstd salmon_quant/quant.sf -o {output} 2>&1 | tee -a {log} | 
|  | 388             size=$(wc -c < {output}) | 
|  | 389             echo "COUNT_SalmonQuantBytes ${{size}}" | 
|  | 390         else | 
|  | 391             touch {output} | 
|  | 392             echo "COUNT_SalmonFailure 1" | 
|  | 393         fi | 
|  | 394 | 
|  | 395         rm -rf salmon_quant | 
|  | 396         echo "COUNT_SalmonComplete 1" | 
|  | 397         """ | 
|  | 398 | 
|  | 399 rule align_unmapped: | 
|  | 400     input: | 
|  | 401         unmapped1=config['temp'] + '/{quad}_1.unmappedfastq', | 
|  | 402         unmapped2=config['temp'] + '/{quad}_2.unmappedfastq', | 
|  | 403         index=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome.1.ht2' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 404     output: | 
|  | 405         bam=config['output'] + '/{quad}.unmapped.bam', | 
|  | 406         sample=config['output'] + '/{quad}.unmapped.fastq.zst' | 
|  | 407     log: | 
|  | 408         config['output'] + '/{quad}.align_unmapped.log' | 
|  | 409     params: | 
|  | 410         index_base=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 411         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 412         hisat2_params=config.get('hisat2', ''), | 
|  | 413         max_unalign=config.get('max_unalign', 100000) | 
|  | 414     threads: 16 | 
|  | 415     shell: | 
|  | 416         """ | 
|  | 417         TMP="{config[temp]}/align_unmapped_temp.{params.srr}" | 
|  | 418         READ_FILES="-1 {input.unmapped1} -2 {input.unmapped2}" | 
|  | 419         if [[ ! -s {input.unmapped2} ]] ; then | 
|  | 420             READ_FILES="-U {input.unmapped1}" | 
|  | 421         fi | 
|  | 422         time hisat2 \ | 
|  | 423             $READ_FILES \ | 
|  | 424             -t --mm \ | 
|  | 425             -x {params.index_base} \ | 
|  | 426             --threads {threads} \ | 
|  | 427             {params.hisat2_params} \ | 
|  | 428             --un ${{TMP}}.un \ | 
|  | 429             --un-conc ${{TMP}}.un_conc \ | 
|  | 430             -S ${{TMP}}.sam \ | 
|  | 431             2>&1 | tee -a {log} | 
|  | 432 | 
|  | 433         # | 
|  | 434         # Make BAM file out of aligned reads | 
|  | 435         # | 
|  | 436         time samtools view \ | 
|  | 437             -b -F 4 \ | 
|  | 438             -o {output.bam} \ | 
|  | 439             ${{TMP}}.sam \ | 
|  | 440             2>&1 | tee -a {log} | 
|  | 441         rm -f ${{TMP}}.sam | 
|  | 442 | 
|  | 443         # | 
|  | 444         # Save a subset of the doubly unaligned reads | 
|  | 445         # | 
|  | 446         if [[ ! -s {input.unmapped2} ]] ; then | 
|  | 447             # unpaired | 
|  | 448             fastq-sample \ | 
|  | 449                 -n {params.max_unalign} \ | 
|  | 450                 -o ${{TMP}}.samp \ | 
|  | 451                 ${{TMP}}.un | 
|  | 452             rm -f ${{TMP}}.un | 
|  | 453         else | 
|  | 454             # paired-end | 
|  | 455             fastq-sample \ | 
|  | 456                 -n {params.max_unalign} \ | 
|  | 457                 -o ${{TMP}}.samp \ | 
|  | 458                 ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc | 
|  | 459             rm -f ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc | 
|  | 460 | 
|  | 461             # interleave the two output fastqs into single file | 
|  | 462             paste ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq \ | 
|  | 463                 | paste - - - - \ | 
|  | 464                 | awk -v OFS="\n" -v FS="\t" '{{print($1,$3,$5,$7,$2,$4,$6,$8)}}' \ | 
|  | 465                 > ${{TMP}}.samp.fastq | 
|  | 466             rm -f ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq | 
|  | 467         fi | 
|  | 468 | 
|  | 469         test -f ${{TMP}}.samp.fastq | 
|  | 470         time zstd ${{TMP}}.samp.fastq -o {output.sample} 2>&1 | tee -a {log} | 
|  | 471         size=$(wc -c < {output.sample}) | 
|  | 472         echo "COUNT_UnmappedSampleBytes ${{size}}" | 
|  | 473         rm -f ${{TMP}}.samp.fastq | 
|  | 474 | 
|  | 475         size=$(wc -c < {output.bam}) | 
|  | 476         echo "COUNT_UnmappedBamBytes ${{size}}" | 
|  | 477 | 
|  | 478         echo "COUNT_AlignUnmappedComplete 1" | 
|  | 479         """ | 
|  | 480 | 
|  | 481 rule extract_jx: | 
|  | 482     input: | 
|  | 483         bam=config['temp'] + '/{quad}-sorted.bam', | 
|  | 484         bamidx=config['temp'] + '/{quad}-sorted.bam.bai', | 
|  | 485         fa=lambda wildcards: '%s/%s/fasta/genome.fa' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 486         gtf=lambda wildcards: '%s/%s/gtf/genes.gtf' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 487     output: | 
|  | 488         config['output'] + '/{quad}.jx_bed.zst' | 
|  | 489     params: | 
|  | 490         srr=lambda wildcards: wildcards.quad.split('_')[0] | 
|  | 491     log: | 
|  | 492         config['output'] + '/{quad}.extract_jx.log' | 
|  | 493     shell: | 
|  | 494         """ | 
|  | 495         TMP="{config[temp]}/extract_jx.{params.srr}" | 
|  | 496         time regtools junctions extract \ | 
|  | 497             -i 20 -a 1 \ | 
|  | 498             -o ${{TMP}}.jx_tmp \ | 
|  | 499             {input.bam} 2>&1 | tee -a {log} | 
|  | 500         time zstd ${{TMP}}.jx_tmp -o {output} 2>&1 | tee -a {log} | 
|  | 501         rm -f ${{TMP}}.jx_tmp | 
|  | 502 | 
|  | 503         size=$(wc -c < {output}) | 
|  | 504         echo "COUNT_ExtractJxBytes ${{size}}" | 
|  | 505 | 
|  | 506         echo "COUNT_ExtractJxComplete 1" | 
|  | 507         """ | 
|  | 508 | 
| 10 | 509 rule align: | 
|  | 510     input: | 
|  | 511         reads0=config['temp'] + '/{quad}_0.fastq', | 
|  | 512         reads1=config['temp'] + '/{quad}_1.fastq', | 
|  | 513         reads2=config['temp'] + '/{quad}_2.fastq', | 
|  | 514         index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 515         index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 516     wildcard_constraints: | 
|  | 517         quad="[^-]+" | 
|  | 518     output: | 
|  | 519         bam=temp(config['temp'] + '/{quad}.bam'), | 
|  | 520         jxs=config['output'] + '/{quad}.sjout.zst', | 
|  | 521         chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
|  | 522         unmapped1=config['temp'] + '/{quad}_1.unmappedfastq', | 
|  | 523         unmapped2=config['temp'] + '/{quad}_2.unmappedfastq' | 
|  | 524     log: | 
|  | 525         config['output'] + '/{quad}.align.log' | 
|  | 526     params: | 
|  | 527         index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 528         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 529         star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '') | 
|  | 530     threads: 16 | 
|  | 531     shell: | 
|  | 532         """ | 
|  | 533         READ_FILES="{input.reads0}" | 
|  | 534         if [[ -s {input.reads2} ]] ; then | 
|  | 535             READ_FILES="{input.reads1} {input.reads2}" | 
|  | 536         fi | 
|  | 537         TMP="{config[temp]}/align_temp.{params.srr}" | 
|  | 538         rm -rf ${{TMP}} | 
|  | 539         time STAR \ | 
|  | 540             {params.star_params} \ | 
|  | 541             --runMode alignReads \ | 
|  | 542             --runThreadN {threads} \ | 
|  | 543             --genomeDir {params.index_base} \ | 
|  | 544             --readFilesIn ${{READ_FILES}} \ | 
|  | 545             --twopassMode None \ | 
|  | 546             --outTmpDir ${{TMP}} \ | 
|  | 547             --outReadsUnmapped Fastx \ | 
|  | 548             --outMultimapperOrder Random \ | 
|  | 549             --outSAMreadID Number \ | 
|  | 550             --outSAMtype BAM Unsorted \ | 
|  | 551             --outSAMmode NoQS \ | 
|  | 552             --outSAMattributes NH MD \ | 
|  | 553             --chimOutType Junctions \ | 
|  | 554             --chimOutJunctionFormat 1 \ | 
|  | 555             --chimSegmentReadGapMax 3 \ | 
|  | 556             --chimJunctionOverhangMin 12 \ | 
|  | 557             --chimSegmentMin 12 2>&1 | tee -a {log} | 
|  | 558 | 
|  | 559         # Full set of output files: | 
|  | 560         # | 
|  | 561         # Aligned.out.bam | 
|  | 562         # Chimeric.out.junction | 
|  | 563         # Log.final.out | 
|  | 564         # Log.out | 
|  | 565         # Log.progress.out | 
|  | 566         # SJ.out.tab | 
|  | 567         # Unmapped.out.mate1 | 
|  | 568         # Unmapped.out.mate2 (if any reads were paired-end) | 
|  | 569 | 
|  | 570         # | 
|  | 571         # Logs | 
|  | 572         # | 
|  | 573         rm -rf ${{TMP}} | 
|  | 574         cat Log.out >> {log} | 
|  | 575         cat Log.final.out >> {log} | 
|  | 576         rm -f Log*.out | 
|  | 577 | 
|  | 578         # | 
|  | 579         # Junctions | 
|  | 580         # | 
|  | 581         test -f SJ.out.tab | 
|  | 582         time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log} | 
|  | 583         rm -f SJ.out.tab | 
|  | 584         size=$(wc -c < {output.jxs}) | 
|  | 585         echo "COUNT_CompressedJxBytes ${{size}}" | 
|  | 586 | 
|  | 587         # | 
|  | 588         # Chimerics | 
|  | 589         # | 
|  | 590         test -f Chimeric.out.junction | 
|  | 591         test -s Chimeric.out.junction | 
|  | 592         sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted | 
|  | 593         time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log} | 
|  | 594         rm -f Chimeric.out.junction Chimeric.out.junction.sorted | 
|  | 595         size=$(wc -c < {output.chimeric}) | 
|  | 596         echo "COUNT_ChimericBytes ${{size}}" | 
|  | 597 | 
|  | 598         # | 
|  | 599         # Unmapped | 
|  | 600         # | 
|  | 601         touch {output.unmapped2} | 
|  | 602         test -f Unmapped.out.mate1 | 
|  | 603         mv Unmapped.out.mate1 {output.unmapped1} | 
|  | 604         if [[ -f Unmapped.out.mate2 ]] ; then | 
|  | 605             mv Unmapped.out.mate2 {output.unmapped2} | 
|  | 606         fi | 
|  | 607 | 
|  | 608         # | 
|  | 609         # Alignments | 
|  | 610         # | 
|  | 611         size=$(wc -c < Aligned.out.bam) | 
|  | 612         echo "COUNT_BAMBytes ${{size}}" | 
|  | 613         mv Aligned.out.bam {output.bam} | 
| 16 | 614 | 
| 10 | 615         echo "COUNT_AlignComplete 1" | 
|  | 616         """ | 
| 16 | 617 | 
|  | 618 rule fastq_check: | 
|  | 619     input: | 
|  | 620         reads0=config['temp'] + '/{quad}_0.fastq', | 
|  | 621         reads1=config['temp'] + '/{quad}_1.fastq', | 
|  | 622         reads2=config['temp'] + '/{quad}_2.fastq' | 
|  | 623     output: | 
|  | 624         config['output'] + '/{quad}.fastq_check.tsv.zst' | 
|  | 625     log: | 
|  | 626         config['output'] + '/{quad}.fastq_check.log' | 
|  | 627     params: | 
|  | 628         srr=lambda wildcards: wildcards.quad.split('_')[0] | 
|  | 629     shell: | 
|  | 630         """ | 
|  | 631         TMP="{config[temp]}/fastq_check-{params.srr}.tsv" | 
|  | 632         touch ${{TMP}} | 
|  | 633         if [[ -s {input.reads0} ]] ; then | 
|  | 634             time seqtk fqchk -q0 {input.reads0} >>${{TMP}} 2>>{log} | 
|  | 635         fi | 
|  | 636         if [[ -s {input.reads1} ]] ; then | 
|  | 637             time seqtk fqchk -q0 {input.reads1} >>${{TMP}} 2>>{log} | 
|  | 638         fi | 
|  | 639         if [[ -s {input.reads2} ]] ; then | 
|  | 640             time seqtk fqchk -q0 {input.reads2} >>${{TMP}} 2>>{log} | 
|  | 641         fi | 
|  | 642         time zstd ${{TMP}} -o {output} 2>&1 | tee -a {log} | 
|  | 643         size=$(wc -c < {output}) | 
|  | 644         echo "COUNT_FastqCheckBytes ${{size}}" | 
|  | 645 | 
|  | 646         echo "COUNT_FastqCheckComplete 1" | 
|  | 647         """ | 
|  | 648 | 
|  | 649 rule download: | 
|  | 650     output: | 
|  | 651         temp(config['temp'] + '/{quad}_0.fastq'), | 
|  | 652         temp(config['temp'] + '/{quad}_1.fastq'), | 
|  | 653         temp(config['temp'] + '/{quad}_2.fastq') | 
|  | 654     log: | 
|  | 655         config['output'] + '/{quad}.download.log' | 
|  | 656     params: | 
|  | 657         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 658         method=lambda wildcards: wildcards.quad.split('_')[3], | 
|  | 659         fd_params=config.get('fastq_dump_args', ''), | 
|  | 660         retries=config.get('fastq_dump_retries', '5') | 
|  | 661     threads: 4 | 
|  | 662     shell: | 
|  | 663         """ | 
|  | 664                 set -x | 
|  | 665         SUCCESS=0 | 
|  | 666         TIMEOUT=10 | 
|  | 667         PARAMS="" | 
|  | 668         TMP="{config[temp]}/dl-{params.srr}" | 
|  | 669         ! test -d ${{TMP}} | 
|  | 670 | 
|  | 671         if [[ {params.method} == "sra" ]] ; then | 
|  | 672             USE_FASTERQ=1 | 
|  | 673             for i in {{ 1..{params.retries} }} ; do | 
|  | 674                 if time prefetch -t fasp -O ${{TMP}} -L info {params.srr} 2>&1 >> {log} ; then | 
|  | 675                     SUCCESS=1 | 
|  | 676                     break | 
|  | 677                 else | 
|  | 678                     echo "COUNT_SraRetries 1" | 
|  | 679                     TIMEOUT=$((${{TIMEOUT}} * 2)) | 
|  | 680                     sleep ${{TIMEOUT}} | 
|  | 681                 fi | 
|  | 682             done | 
|  | 683             if (( $SUCCESS == 0 )) ; then | 
|  | 684                 echo "COUNT_SraFailures 1" | 
|  | 685                 exit 1 | 
|  | 686             fi | 
|  | 687             test -f ${{TMP}}/*.sra | 
|  | 688             size=$(cat ${{TMP}}/*.sra | wc -c) | 
|  | 689             echo "COUNT_SraBytesDownloaded ${{size}}" | 
|  | 690             if (( ${{USE_FASTERQ}} == 1 )) ; then | 
|  | 691                 time fasterq-dump ${{TMP}}/*.sra \ | 
|  | 692                     -e {threads} \ | 
|  | 693                     -t ${{TMP}} \ | 
|  | 694                     -L info \ | 
|  | 695                     --split-files \ | 
|  | 696                     --skip-technical \ | 
|  | 697                     -o {params.srr}.fastq \ | 
|  | 698                     2>&1 >> {log} | 
|  | 699                 test -f {params.srr}_2.fastq || mv {params.srr}.fastq {params.srr}_0.fastq | 
|  | 700             else | 
|  | 701                 time fastq-dump ${{TMP}}/*.sra \ | 
|  | 702                     -L info \ | 
|  | 703                     --split-files \ | 
|  | 704                     --skip-technical \ | 
|  | 705                     -O . \ | 
|  | 706                     2>&1 >> {log} | 
|  | 707                 test -f {params.srr}_2.fastq || mv {params.srr}_1.fastq {params.srr}_0.fastq | 
|  | 708             fi | 
|  | 709             rm -rf ${{TMP}} | 
|  | 710         elif [[ {params.method} == "gdc" ]] ; then | 
|  | 711             TOKEN=~/gdc/latest.txt | 
|  | 712             if [[ ! -f ${{TOKEN}} ]] ; then | 
|  | 713                 echo "ERROR: no GDC token file found at ${{TOKEN}}" | 
|  | 714                 exit 1 | 
|  | 715             fi | 
|  | 716             for i in {{ 1..{params.retries} }} ; do | 
|  | 717                 if time gdc-client download \ | 
|  | 718                     -t ${{TOKEN}} \ | 
|  | 719                     --log-file {{TMP}}/log.txt \ | 
|  | 720                     -d {{TEMP}} \ | 
|  | 721                     {params.srr} 2>&1 >> {log} | 
|  | 722                 then | 
|  | 723                     SUCCESS=1 | 
|  | 724                     break | 
|  | 725                 else | 
|  | 726                     echo "COUNT_GdcRetries 1" | 
|  | 727                     TIMEOUT=$((${{TIMEOUT}} * 2)) | 
|  | 728                     sleep ${{TIMEOUT}} | 
|  | 729                 fi | 
|  | 730             done | 
|  | 731             if (( $SUCCESS == 0 )) ; then | 
|  | 732                 echo "COUNT_GdcFailures 1" | 
|  | 733                 exit 1 | 
|  | 734             fi | 
|  | 735             test -d {{TEMP}}/{params.srr} | 
|  | 736             test -f {{TEMP}}/{params.srr}/*.tar.gz | 
|  | 737 | 
|  | 738             echo "=== gdc-client log.txt begin ===" >> {log} | 
|  | 739             cat {{TMP}}/log.txt >> {log} | 
|  | 740             echo "=== gdc-client log.txt end===" >> {log} | 
|  | 741 | 
|  | 742             size=$(cat {{TEMP}}/{params.srr}/*.tar.gz | wc -c) | 
|  | 743             echo "COUNT_GdcBytesDownloaded ${{size}}" | 
|  | 744 | 
|  | 745             tar zxvf {{TEMP}}/{params.srr}/*.tar.gz | 
|  | 746             rm -rf {{TEMP}} | 
|  | 747 | 
|  | 748             num_1s=$(ls -1 *_1.fastq | wc -l) | 
|  | 749             num_2s=$(ls -1 *_2.fastq | wc -l) | 
|  | 750             if (( ${{num_1s}} == 0 )) ; then | 
|  | 751                 echo "ERROR: No _1.fastq files output" | 
|  | 752                 exit 1 | 
|  | 753             fi | 
|  | 754             if (( ${{num_1s}} > 1 )) ; then | 
|  | 755                 echo "ERROR: More than one _1.fastq file found" | 
|  | 756                 exit 1 | 
|  | 757             fi | 
|  | 758             if (( ${{num_2s}} == 0 )) ; then | 
|  | 759                 # unpaired | 
|  | 760                 mv *_1.fastq {params.srr}_0.fastq | 
|  | 761             else | 
|  | 762                 # paired-end | 
|  | 763                 mv *_1.fastq {params.srr}_1.fastq | 
|  | 764                 mv *_2.fastq {params.srr}_2.fastq | 
|  | 765             fi | 
|  | 766         fi | 
|  | 767 | 
|  | 768         # Next chunk expects the FASTQ files to exist in the current directory | 
|  | 769         # named {{params.srr}}_{{0,1,2}}.fastq | 
|  | 770         size=0 | 
|  | 771         for i in {{0..2}} ; do | 
|  | 772             fn={params.srr}_${{i}}.fastq | 
|  | 773             if [[ -f ${{fn}} ]] ; then | 
|  | 774                 echo "COUNT_FilesDownloaded 1" | 
|  | 775             else | 
|  | 776                 touch ${{fn}} | 
|  | 777             fi | 
|  | 778             size=$((${{size}} + $(wc -c < ${{fn}}))) | 
|  | 779             mv ${{fn}} {config[temp]}/{wildcards.quad}_${{i}}.fastq | 
|  | 780         done | 
|  | 781         echo "COUNT_BytesDownloaded ${{size}}" | 
|  | 782         echo "COUNT_DownloadComplete 1" | 
|  | 783         """ |