| 10 | 1 """ | 
|  | 2 Parameters: | 
| 16 | 3 - fastq_dump_args: arguments to pass to fastq dumping tool | 
|  | 4 - fastq_dump_retries: number of retry attempts before dying | 
|  | 5 - fastq_dump_tool: name of tool to run, e.g. 'fastq-dump' or 'fasterq-dump' | 
| 10 | 6 - star: arguments to pass to STAR aligner | 
| 16 | 7 - salmon_args: arguments to pass to salmon quant | 
| 10 | 8 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10] | 
| 16 | 9 - bw_bed: name of BED file to use with bwtool | 
|  | 10 - max_unalign: maximum number of unaligned reads to save per run accession | 
| 10 | 11 """ | 
|  | 12 | 
| 16 | 13 STEPS = ['download', 'fastq_check', 'align', 'sort', | 
|  | 14          'bamcount', | 
|  | 15          'salmon', | 
|  | 16          'align_unmapped', | 
|  | 17          'extract_jx'] | 
| 10 | 18 | 
| 16 | 19 FILES = ['sjout.zst', 'fastq_check.tsv.zst', | 
|  | 20          'unmapped.bam', 'unmapped.fastq.zst', | 
| 10 | 21          'bamcount_nonref.csv.zst', | 
|  | 22          'bamcount_auc.tsv', | 
|  | 23          'bamcount_frag.tsv', | 
|  | 24          'Chimeric.out.junction.zst', | 
|  | 25          'all.exon_bw_count.zst', 'unique.exon_bw_count.zst', | 
| 16 | 26          'all.bw.zst', 'unique.bw.zst', | 
|  | 27          'salmon.tsv.zst', | 
|  | 28          'jx_bed.zst', | 
|  | 29          'manifest'] + list(map(lambda x: x + '.log', STEPS)) | 
| 10 | 30 | 
| 14 | 31 #get rid of the pesky warnings about current paths | 
|  | 32 #and make sure we're not assuming any wrong relative paths | 
| 12 | 33 import os | 
|  | 34 config['output']=os.path.abspath(config['output']) | 
| 13 | 35 config['temp']=os.path.abspath(config['temp']) | 
| 14 | 36 config['exon_bed']=os.path.abspath(config['exon_bed']) | 
|  | 37 config['ref']=os.path.abspath(config['ref']) | 
| 12 | 38 | 
| 10 | 39 import subprocess | 
|  | 40 def run_command(cmd_args): | 
|  | 41     cmd_args = ' '.join(cmd_args) | 
|  | 42     try: | 
|  | 43         subprocess.check_call(args=cmd_args, shell=True) | 
|  | 44     except subprocess.CalledProcessError as cpe: | 
|  | 45         sys.stderr.write("error in run_command for command: %s\n" % cmd_args) | 
|  | 46         raise cpe | 
|  | 47 | 
| 16 | 48 import re | 
| 35 | 49 #limit the outputs/steps to only 1) STAR called junctions 2) all reads per-base coverage (BigWigs) 3) all reads exon summarized coverage 4) AUC (for QC) | 
|  | 50 STEPS_FILES_FILTER=re.compile(r'(unmapped)|(download)|(salmon)|(extract_jx)|(jx_bed)|(manifest)|(nonref)|(Chimeric)(fastq_check)|(frag)|(unique)') | 
| 16 | 51 def remove_steps_files(): | 
|  | 52     #modify STEP and FILES | 
|  | 53     #so we don't run download or unmapped steps | 
|  | 54     global FILES | 
|  | 55     global STEPS | 
|  | 56     for i in reversed(range(0,len(STEPS))): | 
|  | 57         f = STEPS[i] | 
|  | 58         if STEPS_FILES_FILTER.search(f): | 
|  | 59             del(STEPS[i]) | 
|  | 60     for i in reversed(range(0,len(FILES))): | 
|  | 61         f = FILES[i] | 
|  | 62         if STEPS_FILES_FILTER.search(f): | 
|  | 63             del(FILES[i]) | 
| 10 | 64 | 
| 16 | 65 | 
| 11 | 66 FASTQ_PATT=re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$') | 
| 10 | 67 def prep_for_galaxy_run(): | 
| 16 | 68     remove_steps_files() | 
| 10 | 69     try: | 
|  | 70         os.mkdir(config['temp']) | 
|  | 71     except OSError as ose: | 
|  | 72         pass | 
|  | 73     fastqs = config['inputs'].split(',') | 
|  | 74     m = FASTQ_PATT.search(fastqs[0]) | 
| 11 | 75     run_acc = 'sample' | 
|  | 76     if m is not None: | 
|  | 77         run_acc = m.group(1) | 
| 12 | 78     study_acc = 'study' | 
| 10 | 79     if 'study' in config: | 
|  | 80         study_acc = config['study'] | 
|  | 81     genome = 'hg38' | 
|  | 82     if 'genome' in config: | 
|  | 83         genome = config['genome'] | 
|  | 84     method = 'sra' | 
|  | 85     # SRR,SRP,genome | 
|  | 86     # e.g. SRR1557855,SRP045778,ce10 | 
|  | 87     #create links which will be used in the align step | 
|  | 88     i = 1 | 
|  | 89     for f in fastqs: | 
|  | 90         newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i) | 
| 15 | 91         if 'compressed' in config: | 
|  | 92             run_command(['zcat',f,'>',newf]) | 
|  | 93         else: | 
|  | 94             os.symlink(os.path.abspath(f), newf) | 
| 10 | 95         #create fastq 0 | 
|  | 96         if i == 1: | 
| 15 | 97             os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0)) | 
| 10 | 98         i += 1 | 
|  | 99     #create fastq 2 if not paired | 
|  | 100     if i == 2: | 
|  | 101         open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close() | 
|  | 102     return([run_acc, study_acc, genome, method]) | 
|  | 103 | 
|  | 104 | 
|  | 105 def get_accessions(wildcards): | 
|  | 106     """ | 
|  | 107     Grouping of SRRs with the same SRP could happen here | 
|  | 108     """ | 
|  | 109     #if running under galaxy where the user will input the | 
|  | 110     #FASTQs, study, and genome directly | 
|  | 111     if 'inputs' in config: | 
|  | 112         (run_acc, study_acc, genome, method) = prep_for_galaxy_run() | 
|  | 113         for ext in FILES: | 
|  | 114             yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext)) | 
|  | 115         #here to get the make_galaxy_links rule to fire | 
|  | 116         yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method)) | 
|  | 117     #not running under galaxy, takes a list of accessions | 
|  | 118     else: | 
|  | 119         for fn in config['input'].split(): | 
|  | 120             with open(fn, 'r') as fh: | 
|  | 121                 for ln in fh: | 
|  | 122                     if ln.count(',') < 2: | 
|  | 123                         continue | 
|  | 124                     toks = ln.rstrip().split(',') | 
|  | 125                     assert 3 <= len(toks) <= 4 | 
|  | 126                     method = 'sra' | 
|  | 127                     if len(toks) == 4: | 
|  | 128                         method = toks[3] | 
|  | 129                     # SRR,SRP,genome | 
|  | 130                     # e.g. SRR1557855,SRP045778,ce10 | 
|  | 131                     for ext in FILES: | 
|  | 132                         yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext)) | 
|  | 133 | 
|  | 134 rule all: | 
|  | 135     input: | 
|  | 136         get_accessions | 
|  | 137 | 
|  | 138 rule make_manifest: | 
|  | 139     input: | 
| 16 | 140         config['output'] + '/{quad}.salmon.tsv.zst', | 
| 10 | 141         config['output'] + '/{quad}.sjout.zst', | 
| 16 | 142         config['output'] + '/{quad}.jx_bed.zst', | 
| 10 | 143         config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
| 16 | 144         config['output'] + '/{quad}.unmapped.bam', | 
|  | 145         config['output'] + '/{quad}.unmapped.fastq.zst', | 
| 10 | 146         config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 147         config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 148         config['output'] + '/{quad}.bamcount_frag.tsv', | 
| 16 | 149         config['output'] + '/{quad}.fastq_check.tsv.zst', | 
| 10 | 150         config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 151         config['output'] + '/{quad}.unique.exon_bw_count.zst', | 
| 16 | 152         config['output'] + '/{quad}.all.bw.zst', | 
|  | 153         config['output'] + '/{quad}.unique.bw.zst', | 
|  | 154         config['output'] + '/{quad}.align.log', | 
|  | 155         config['output'] + '/{quad}.extract_jx.log', | 
|  | 156         config['output'] + '/{quad}.bamcount.log', | 
|  | 157         config['output'] + '/{quad}.align_unmapped.log', | 
|  | 158         config['output'] + '/{quad}.download.log', | 
|  | 159         config['output'] + '/{quad}.fastq_check.log', | 
|  | 160         config['output'] + '/{quad}.sort.log', | 
|  | 161         config['output'] + '/{quad}.salmon.log' | 
| 10 | 162     output: | 
|  | 163         config['output'] + '/{quad}.manifest' | 
|  | 164     params: | 
|  | 165         quad=lambda wildcards: wildcards.quad | 
|  | 166     run: | 
|  | 167         with open(output[0], 'wt') as fh: | 
|  | 168             for fn in FILES: | 
|  | 169                 fh.write(params.quad + "." + fn + '\n') | 
|  | 170 | 
|  | 171 def galaxy_link_files(op): | 
| 16 | 172     remove_steps_files() | 
| 10 | 173     a = [op + '/' + f for f in FILES] | 
|  | 174     return a | 
|  | 175 | 
|  | 176 rule make_galaxy_links: | 
|  | 177     input: | 
|  | 178         config['output'] + '/{quad}.sjout.zst', | 
| 35 | 179         #config['output'] + '/{quad}.fastq_check.tsv.zst', | 
| 10 | 180         config['output'] + '/{quad}.bamcount_auc.tsv', | 
| 35 | 181         #config['output'] + '/{quad}.bamcount_frag.tsv', | 
| 10 | 182         config['output'] + '/{quad}.all.exon_bw_count.zst', | 
| 35 | 183         #config['output'] + '/{quad}.unique.exon_bw_count.zst', | 
| 16 | 184         config['output'] + '/{quad}.all.bw.zst', | 
| 35 | 185         #config['output'] + '/{quad}.unique.bw.zst', | 
|  | 186         #config['output'] + '/{quad}.fastq_check.log', | 
|  | 187         #config['output'] + '/{quad}.align.log', | 
|  | 188         #config['output'] + '/{quad}.sort.log', | 
|  | 189         #config['output'] + '/{quad}.bamcount.log', | 
| 10 | 190     output: | 
|  | 191         config['output'] + '/{quad}.done' | 
|  | 192     params: | 
|  | 193         quad=lambda wildcards: wildcards.quad, | 
|  | 194         out=galaxy_link_files(config['output']) | 
|  | 195     run: | 
|  | 196         for (i,fn) in enumerate(input): | 
|  | 197             os.symlink(os.path.abspath(fn), params.out[i]) | 
| 16 | 198         os.symlink(os.path.abspath(input[-1]), output[0]) | 
| 10 | 199 | 
|  | 200 | 
|  | 201 rule bamcount: | 
|  | 202     input: | 
|  | 203         bam=config['temp'] + '/{quad}-sorted.bam', | 
|  | 204         bamidx=config['temp'] + '/{quad}-sorted.bam.bai', | 
|  | 205         bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed')) | 
|  | 206     output: | 
|  | 207         nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst', | 
|  | 208         auc=config['output'] + '/{quad}.bamcount_auc.tsv', | 
|  | 209         frag=config['output'] + '/{quad}.bamcount_frag.tsv', | 
|  | 210         all_bw=config['output'] + '/{quad}.all.bw.zst', | 
|  | 211         unique_bw=config['output'] + '/{quad}.unique.bw.zst', | 
|  | 212         all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst', | 
|  | 213         unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst' | 
|  | 214     log: | 
|  | 215         config['output'] + '/{quad}.bamcount.log' | 
|  | 216     params: | 
|  | 217         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 218         uniq_qual=config.get('unique_qual', 10) | 
|  | 219     threads: 4 | 
|  | 220     shell: | 
|  | 221         """ | 
|  | 222         TMP={config[temp]}/{params.srr}_bamcount | 
| 14 | 223         bamcount {input.bam} \ | 
| 10 | 224             --threads {threads} \ | 
|  | 225             --coverage \ | 
|  | 226             --no-head \ | 
|  | 227             --require-mdz \ | 
|  | 228             --min-unique-qual {params.uniq_qual} \ | 
|  | 229             --frag-dist ${{TMP}} \ | 
|  | 230             --bigwig ${{TMP}} \ | 
|  | 231             --annotation {input.bed} ${{TMP}} \ | 
|  | 232             --auc ${{TMP}} \ | 
|  | 233             --alts ${{TMP}} \ | 
|  | 234             2>&1 | tee -a {log} | 
|  | 235 | 
|  | 236         # | 
|  | 237         # --alts | 
|  | 238         # | 
|  | 239 | 
|  | 240         (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log} | 
|  | 241         size=$(wc -c < {output.nonref}) | 
|  | 242         echo "COUNT_NonrefSize ${{size}}" | 
|  | 243         rm -f ${{TMP}}.alts.tsv | 
|  | 244 | 
|  | 245         # | 
|  | 246         # --auc | 
|  | 247         # | 
|  | 248         mv ${{TMP}}.auc.tsv {output.auc} | 
|  | 249         size=$(wc -c < {output.auc}) | 
|  | 250         echo "COUNT_AucSize ${{size}}" | 
|  | 251         rm -f ${{TMP}}.auc.tsv | 
|  | 252 | 
|  | 253         # | 
|  | 254         # --frag-dist | 
|  | 255         # | 
|  | 256         mv ${{TMP}}.frags.tsv {output.frag} | 
|  | 257         size=$(wc -c < {output.frag}) | 
|  | 258         echo "COUNT_FragDistSize ${{size}}" | 
|  | 259         rm -f ${{TMP}}.frags.tsv | 
|  | 260 | 
|  | 261         # | 
|  | 262         # --bigwig | 
|  | 263         # | 
|  | 264 | 
|  | 265         (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log} | 
|  | 266         size=$(wc -c < {output.all_bw}) | 
|  | 267         echo "COUNT_BwSize ${{size}}" | 
|  | 268         rm -f ${{TMP}}.all.bw | 
|  | 269 | 
|  | 270         (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log} | 
|  | 271         size=$(wc -c < {output.unique_bw}) | 
|  | 272         echo "COUNT_BwSize ${{size}}" | 
|  | 273         rm -f ${{TMP}}.unique.bw | 
|  | 274 | 
|  | 275         # | 
|  | 276         # --annotation | 
|  | 277         # | 
|  | 278 | 
|  | 279         (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log} | 
|  | 280         size=$(wc -c < {output.all_bw_count}) | 
|  | 281         echo "COUNT_BwQuantSize ${{size}}" | 
|  | 282         rm -f ${{TMP}}.all.tsv | 
|  | 283 | 
|  | 284         (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log} | 
|  | 285         size=$(wc -c < {output.unique_bw_count}) | 
|  | 286         echo "COUNT_BwQuantSize ${{size}}" | 
|  | 287         rm -f ${{TMP}}.unique.tsv | 
|  | 288 | 
|  | 289         # Check that all temporaries were properly purged | 
|  | 290         set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l) | 
|  | 291         if (( $num_files > 0 )) ; then | 
|  | 292             echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)" | 
|  | 293             exit 1 | 
|  | 294         fi | 
|  | 295 | 
|  | 296         echo "COUNT_BamcountComplete 1" | 
|  | 297         """ | 
|  | 298 | 
| 16 | 299 rule bw_zstd: | 
|  | 300     input: | 
|  | 301         config['temp'] + '/{prefix}.bw' | 
|  | 302     output: | 
|  | 303         config['output'] + '/{prefix}.bw.zst' | 
|  | 304     shell: | 
|  | 305         """ | 
|  | 306         zstd {input} -o {output} | 
|  | 307         size=$(wc -c < {output}) | 
|  | 308         echo "COUNT_BwBytes ${{size}}" | 
|  | 309         echo "COUNT_BwZstdComplete 1" | 
|  | 310         """ | 
|  | 311 | 
| 10 | 312 rule sort: | 
|  | 313     input: | 
|  | 314         config['temp'] + '/{quad}.bam' | 
|  | 315     wildcard_constraints: | 
|  | 316         quad="[^-]+" | 
|  | 317     output: | 
|  | 318         bam=temp(config['temp'] + '/{quad}-sorted.bam'), | 
|  | 319         bai=temp(config['temp'] + '/{quad}-sorted.bam.bai') | 
|  | 320     log: | 
|  | 321         config['output'] + '/{quad}.sort.log' | 
|  | 322     params: | 
|  | 323         srr=lambda wildcards: wildcards.quad.split('_')[0] | 
|  | 324     threads: 8 | 
|  | 325     shell: | 
|  | 326         """ | 
|  | 327         TMP="{config[temp]}/sort_temp.{params.srr}" | 
|  | 328         mkdir -p ${{TMP}} | 
|  | 329         time samtools sort \ | 
|  | 330             -T ${{TMP}}/samtools_temp \ | 
|  | 331             -@ {threads} \ | 
|  | 332             -m 64M \ | 
|  | 333             -o {output.bam} {input} 2>&1 | tee -a {log} | 
|  | 334         rm -rf ${{TMP}} | 
|  | 335         size=$(wc -c < {output.bam}) | 
|  | 336         echo "COUNT_SortedBAMBytes ${{size}}" | 
|  | 337 | 
|  | 338         time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log} | 
|  | 339         echo "COUNT_SortComplete 1" | 
|  | 340         """ | 
|  | 341 | 
| 16 | 342 rule salmon: | 
|  | 343     input: | 
|  | 344         reads0=config['temp'] + '/{quad}_0.fastq', | 
|  | 345         reads1=config['temp'] + '/{quad}_1.fastq', | 
|  | 346         reads2=config['temp'] + '/{quad}_2.fastq', | 
|  | 347         index1=lambda wildcards: '%s/%s/salmon_index/hash.bin' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 348         index2=lambda wildcards: '%s/%s/salmon_index/sa.bin' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 349     output: | 
|  | 350         config['output'] + '/{quad}.salmon.tsv.zst' | 
|  | 351     log: | 
|  | 352         config['output'] + '/{quad}.salmon.log' | 
|  | 353     params: | 
|  | 354         index_base=lambda wildcards: '%s/%s/salmon_index' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 355         salmon_args=config.get('salmon_args', '') | 
|  | 356     threads: 8 | 
|  | 357     shell: | 
|  | 358         """ | 
|  | 359         READ_FILES="-r {input.reads0}" | 
|  | 360         if [[ -s {input.reads2} ]] ; then | 
|  | 361             READ_FILES="-1 {input.reads1} -2 {input.reads2}" | 
|  | 362         fi | 
|  | 363         if set -o pipefail && time salmon quant \ | 
|  | 364             --libType U \ | 
|  | 365             --quiet \ | 
|  | 366             --validateMappings \ | 
|  | 367             -i {params.index_base} \ | 
|  | 368             -p {threads} \ | 
|  | 369             {params.salmon_args} \ | 
|  | 370             ${{READ_FILES}} \ | 
|  | 371             --output salmon_quant \ | 
|  | 372             --minAssignedFrags 1 \ | 
|  | 373             2>&1 | tee -a {log} | 
|  | 374         then | 
|  | 375             echo "COUNT_SalmonSuccess 1" | 
|  | 376 | 
|  | 377             time zstd salmon_quant/quant.sf -o {output} 2>&1 | tee -a {log} | 
|  | 378             size=$(wc -c < {output}) | 
|  | 379             echo "COUNT_SalmonQuantBytes ${{size}}" | 
|  | 380         else | 
|  | 381             touch {output} | 
|  | 382             echo "COUNT_SalmonFailure 1" | 
|  | 383         fi | 
|  | 384 | 
|  | 385         rm -rf salmon_quant | 
|  | 386         echo "COUNT_SalmonComplete 1" | 
|  | 387         """ | 
|  | 388 | 
|  | 389 rule align_unmapped: | 
|  | 390     input: | 
|  | 391         unmapped1=config['temp'] + '/{quad}_1.unmappedfastq', | 
|  | 392         unmapped2=config['temp'] + '/{quad}_2.unmappedfastq', | 
|  | 393         index=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome.1.ht2' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 394     output: | 
|  | 395         bam=config['output'] + '/{quad}.unmapped.bam', | 
|  | 396         sample=config['output'] + '/{quad}.unmapped.fastq.zst' | 
|  | 397     log: | 
|  | 398         config['output'] + '/{quad}.align_unmapped.log' | 
|  | 399     params: | 
|  | 400         index_base=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 401         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 402         hisat2_params=config.get('hisat2', ''), | 
|  | 403         max_unalign=config.get('max_unalign', 100000) | 
|  | 404     threads: 16 | 
|  | 405     shell: | 
|  | 406         """ | 
|  | 407         TMP="{config[temp]}/align_unmapped_temp.{params.srr}" | 
|  | 408         READ_FILES="-1 {input.unmapped1} -2 {input.unmapped2}" | 
|  | 409         if [[ ! -s {input.unmapped2} ]] ; then | 
|  | 410             READ_FILES="-U {input.unmapped1}" | 
|  | 411         fi | 
|  | 412         time hisat2 \ | 
|  | 413             $READ_FILES \ | 
|  | 414             -t --mm \ | 
|  | 415             -x {params.index_base} \ | 
|  | 416             --threads {threads} \ | 
|  | 417             {params.hisat2_params} \ | 
|  | 418             --un ${{TMP}}.un \ | 
|  | 419             --un-conc ${{TMP}}.un_conc \ | 
|  | 420             -S ${{TMP}}.sam \ | 
|  | 421             2>&1 | tee -a {log} | 
|  | 422 | 
|  | 423         # | 
|  | 424         # Make BAM file out of aligned reads | 
|  | 425         # | 
|  | 426         time samtools view \ | 
|  | 427             -b -F 4 \ | 
|  | 428             -o {output.bam} \ | 
|  | 429             ${{TMP}}.sam \ | 
|  | 430             2>&1 | tee -a {log} | 
|  | 431         rm -f ${{TMP}}.sam | 
|  | 432 | 
|  | 433         # | 
|  | 434         # Save a subset of the doubly unaligned reads | 
|  | 435         # | 
|  | 436         if [[ ! -s {input.unmapped2} ]] ; then | 
|  | 437             # unpaired | 
|  | 438             fastq-sample \ | 
|  | 439                 -n {params.max_unalign} \ | 
|  | 440                 -o ${{TMP}}.samp \ | 
|  | 441                 ${{TMP}}.un | 
|  | 442             rm -f ${{TMP}}.un | 
|  | 443         else | 
|  | 444             # paired-end | 
|  | 445             fastq-sample \ | 
|  | 446                 -n {params.max_unalign} \ | 
|  | 447                 -o ${{TMP}}.samp \ | 
|  | 448                 ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc | 
|  | 449             rm -f ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc | 
|  | 450 | 
|  | 451             # interleave the two output fastqs into single file | 
|  | 452             paste ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq \ | 
|  | 453                 | paste - - - - \ | 
|  | 454                 | awk -v OFS="\n" -v FS="\t" '{{print($1,$3,$5,$7,$2,$4,$6,$8)}}' \ | 
|  | 455                 > ${{TMP}}.samp.fastq | 
|  | 456             rm -f ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq | 
|  | 457         fi | 
|  | 458 | 
|  | 459         test -f ${{TMP}}.samp.fastq | 
|  | 460         time zstd ${{TMP}}.samp.fastq -o {output.sample} 2>&1 | tee -a {log} | 
|  | 461         size=$(wc -c < {output.sample}) | 
|  | 462         echo "COUNT_UnmappedSampleBytes ${{size}}" | 
|  | 463         rm -f ${{TMP}}.samp.fastq | 
|  | 464 | 
|  | 465         size=$(wc -c < {output.bam}) | 
|  | 466         echo "COUNT_UnmappedBamBytes ${{size}}" | 
|  | 467 | 
|  | 468         echo "COUNT_AlignUnmappedComplete 1" | 
|  | 469         """ | 
|  | 470 | 
|  | 471 rule extract_jx: | 
|  | 472     input: | 
|  | 473         bam=config['temp'] + '/{quad}-sorted.bam', | 
|  | 474         bamidx=config['temp'] + '/{quad}-sorted.bam.bai', | 
|  | 475         fa=lambda wildcards: '%s/%s/fasta/genome.fa' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 476         gtf=lambda wildcards: '%s/%s/gtf/genes.gtf' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 477     output: | 
|  | 478         config['output'] + '/{quad}.jx_bed.zst' | 
|  | 479     params: | 
|  | 480         srr=lambda wildcards: wildcards.quad.split('_')[0] | 
|  | 481     log: | 
|  | 482         config['output'] + '/{quad}.extract_jx.log' | 
|  | 483     shell: | 
|  | 484         """ | 
|  | 485         TMP="{config[temp]}/extract_jx.{params.srr}" | 
|  | 486         time regtools junctions extract \ | 
|  | 487             -i 20 -a 1 \ | 
|  | 488             -o ${{TMP}}.jx_tmp \ | 
|  | 489             {input.bam} 2>&1 | tee -a {log} | 
|  | 490         time zstd ${{TMP}}.jx_tmp -o {output} 2>&1 | tee -a {log} | 
|  | 491         rm -f ${{TMP}}.jx_tmp | 
|  | 492 | 
|  | 493         size=$(wc -c < {output}) | 
|  | 494         echo "COUNT_ExtractJxBytes ${{size}}" | 
|  | 495 | 
|  | 496         echo "COUNT_ExtractJxComplete 1" | 
|  | 497         """ | 
|  | 498 | 
| 10 | 499 rule align: | 
|  | 500     input: | 
|  | 501         reads0=config['temp'] + '/{quad}_0.fastq', | 
|  | 502         reads1=config['temp'] + '/{quad}_1.fastq', | 
|  | 503         reads2=config['temp'] + '/{quad}_2.fastq', | 
|  | 504         index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 505         index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2]) | 
|  | 506     wildcard_constraints: | 
|  | 507         quad="[^-]+" | 
|  | 508     output: | 
|  | 509         bam=temp(config['temp'] + '/{quad}.bam'), | 
|  | 510         jxs=config['output'] + '/{quad}.sjout.zst', | 
|  | 511         chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst', | 
|  | 512         unmapped1=config['temp'] + '/{quad}_1.unmappedfastq', | 
|  | 513         unmapped2=config['temp'] + '/{quad}_2.unmappedfastq' | 
|  | 514     log: | 
|  | 515         config['output'] + '/{quad}.align.log' | 
|  | 516     params: | 
|  | 517         index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]), | 
|  | 518         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 519         star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '') | 
|  | 520     threads: 16 | 
|  | 521     shell: | 
|  | 522         """ | 
|  | 523         READ_FILES="{input.reads0}" | 
|  | 524         if [[ -s {input.reads2} ]] ; then | 
|  | 525             READ_FILES="{input.reads1} {input.reads2}" | 
|  | 526         fi | 
|  | 527         TMP="{config[temp]}/align_temp.{params.srr}" | 
|  | 528         rm -rf ${{TMP}} | 
|  | 529         time STAR \ | 
|  | 530             {params.star_params} \ | 
|  | 531             --runMode alignReads \ | 
|  | 532             --runThreadN {threads} \ | 
|  | 533             --genomeDir {params.index_base} \ | 
|  | 534             --readFilesIn ${{READ_FILES}} \ | 
|  | 535             --twopassMode None \ | 
|  | 536             --outTmpDir ${{TMP}} \ | 
|  | 537             --outReadsUnmapped Fastx \ | 
|  | 538             --outMultimapperOrder Random \ | 
|  | 539             --outSAMreadID Number \ | 
|  | 540             --outSAMtype BAM Unsorted \ | 
|  | 541             --outSAMmode NoQS \ | 
|  | 542             --outSAMattributes NH MD \ | 
|  | 543             --chimOutType Junctions \ | 
|  | 544             --chimOutJunctionFormat 1 \ | 
|  | 545             --chimSegmentReadGapMax 3 \ | 
|  | 546             --chimJunctionOverhangMin 12 \ | 
|  | 547             --chimSegmentMin 12 2>&1 | tee -a {log} | 
|  | 548 | 
|  | 549         # Full set of output files: | 
|  | 550         # | 
|  | 551         # Aligned.out.bam | 
|  | 552         # Chimeric.out.junction | 
|  | 553         # Log.final.out | 
|  | 554         # Log.out | 
|  | 555         # Log.progress.out | 
|  | 556         # SJ.out.tab | 
|  | 557         # Unmapped.out.mate1 | 
|  | 558         # Unmapped.out.mate2 (if any reads were paired-end) | 
|  | 559 | 
|  | 560         # | 
|  | 561         # Logs | 
|  | 562         # | 
|  | 563         rm -rf ${{TMP}} | 
|  | 564         cat Log.out >> {log} | 
|  | 565         cat Log.final.out >> {log} | 
|  | 566         rm -f Log*.out | 
|  | 567 | 
|  | 568         # | 
|  | 569         # Junctions | 
|  | 570         # | 
|  | 571         test -f SJ.out.tab | 
|  | 572         time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log} | 
|  | 573         rm -f SJ.out.tab | 
|  | 574         size=$(wc -c < {output.jxs}) | 
|  | 575         echo "COUNT_CompressedJxBytes ${{size}}" | 
|  | 576 | 
|  | 577         # | 
|  | 578         # Chimerics | 
|  | 579         # | 
|  | 580         test -f Chimeric.out.junction | 
|  | 581         test -s Chimeric.out.junction | 
|  | 582         sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted | 
|  | 583         time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log} | 
|  | 584         rm -f Chimeric.out.junction Chimeric.out.junction.sorted | 
|  | 585         size=$(wc -c < {output.chimeric}) | 
|  | 586         echo "COUNT_ChimericBytes ${{size}}" | 
|  | 587 | 
|  | 588         # | 
|  | 589         # Unmapped | 
|  | 590         # | 
|  | 591         touch {output.unmapped2} | 
|  | 592         test -f Unmapped.out.mate1 | 
|  | 593         mv Unmapped.out.mate1 {output.unmapped1} | 
|  | 594         if [[ -f Unmapped.out.mate2 ]] ; then | 
|  | 595             mv Unmapped.out.mate2 {output.unmapped2} | 
|  | 596         fi | 
|  | 597 | 
|  | 598         # | 
|  | 599         # Alignments | 
|  | 600         # | 
|  | 601         size=$(wc -c < Aligned.out.bam) | 
|  | 602         echo "COUNT_BAMBytes ${{size}}" | 
|  | 603         mv Aligned.out.bam {output.bam} | 
| 16 | 604 | 
| 10 | 605         echo "COUNT_AlignComplete 1" | 
|  | 606         """ | 
| 16 | 607 | 
|  | 608 rule fastq_check: | 
|  | 609     input: | 
|  | 610         reads0=config['temp'] + '/{quad}_0.fastq', | 
|  | 611         reads1=config['temp'] + '/{quad}_1.fastq', | 
|  | 612         reads2=config['temp'] + '/{quad}_2.fastq' | 
|  | 613     output: | 
|  | 614         config['output'] + '/{quad}.fastq_check.tsv.zst' | 
|  | 615     log: | 
|  | 616         config['output'] + '/{quad}.fastq_check.log' | 
|  | 617     params: | 
|  | 618         srr=lambda wildcards: wildcards.quad.split('_')[0] | 
|  | 619     shell: | 
|  | 620         """ | 
|  | 621         TMP="{config[temp]}/fastq_check-{params.srr}.tsv" | 
|  | 622         touch ${{TMP}} | 
|  | 623         if [[ -s {input.reads0} ]] ; then | 
|  | 624             time seqtk fqchk -q0 {input.reads0} >>${{TMP}} 2>>{log} | 
|  | 625         fi | 
|  | 626         if [[ -s {input.reads1} ]] ; then | 
|  | 627             time seqtk fqchk -q0 {input.reads1} >>${{TMP}} 2>>{log} | 
|  | 628         fi | 
|  | 629         if [[ -s {input.reads2} ]] ; then | 
|  | 630             time seqtk fqchk -q0 {input.reads2} >>${{TMP}} 2>>{log} | 
|  | 631         fi | 
|  | 632         time zstd ${{TMP}} -o {output} 2>&1 | tee -a {log} | 
|  | 633         size=$(wc -c < {output}) | 
|  | 634         echo "COUNT_FastqCheckBytes ${{size}}" | 
|  | 635 | 
|  | 636         echo "COUNT_FastqCheckComplete 1" | 
|  | 637         """ | 
|  | 638 | 
|  | 639 rule download: | 
|  | 640     output: | 
|  | 641         temp(config['temp'] + '/{quad}_0.fastq'), | 
|  | 642         temp(config['temp'] + '/{quad}_1.fastq'), | 
|  | 643         temp(config['temp'] + '/{quad}_2.fastq') | 
|  | 644     log: | 
|  | 645         config['output'] + '/{quad}.download.log' | 
|  | 646     params: | 
|  | 647         srr=lambda wildcards: wildcards.quad.split('_')[0], | 
|  | 648         method=lambda wildcards: wildcards.quad.split('_')[3], | 
|  | 649         fd_params=config.get('fastq_dump_args', ''), | 
|  | 650         retries=config.get('fastq_dump_retries', '5') | 
|  | 651     threads: 4 | 
|  | 652     shell: | 
|  | 653         """ | 
|  | 654                 set -x | 
|  | 655         SUCCESS=0 | 
|  | 656         TIMEOUT=10 | 
|  | 657         PARAMS="" | 
|  | 658         TMP="{config[temp]}/dl-{params.srr}" | 
|  | 659         ! test -d ${{TMP}} | 
|  | 660 | 
|  | 661         if [[ {params.method} == "sra" ]] ; then | 
|  | 662             USE_FASTERQ=1 | 
|  | 663             for i in {{ 1..{params.retries} }} ; do | 
|  | 664                 if time prefetch -t fasp -O ${{TMP}} -L info {params.srr} 2>&1 >> {log} ; then | 
|  | 665                     SUCCESS=1 | 
|  | 666                     break | 
|  | 667                 else | 
|  | 668                     echo "COUNT_SraRetries 1" | 
|  | 669                     TIMEOUT=$((${{TIMEOUT}} * 2)) | 
|  | 670                     sleep ${{TIMEOUT}} | 
|  | 671                 fi | 
|  | 672             done | 
|  | 673             if (( $SUCCESS == 0 )) ; then | 
|  | 674                 echo "COUNT_SraFailures 1" | 
|  | 675                 exit 1 | 
|  | 676             fi | 
|  | 677             test -f ${{TMP}}/*.sra | 
|  | 678             size=$(cat ${{TMP}}/*.sra | wc -c) | 
|  | 679             echo "COUNT_SraBytesDownloaded ${{size}}" | 
|  | 680             if (( ${{USE_FASTERQ}} == 1 )) ; then | 
|  | 681                 time fasterq-dump ${{TMP}}/*.sra \ | 
|  | 682                     -e {threads} \ | 
|  | 683                     -t ${{TMP}} \ | 
|  | 684                     -L info \ | 
|  | 685                     --split-files \ | 
|  | 686                     --skip-technical \ | 
|  | 687                     -o {params.srr}.fastq \ | 
|  | 688                     2>&1 >> {log} | 
|  | 689                 test -f {params.srr}_2.fastq || mv {params.srr}.fastq {params.srr}_0.fastq | 
|  | 690             else | 
|  | 691                 time fastq-dump ${{TMP}}/*.sra \ | 
|  | 692                     -L info \ | 
|  | 693                     --split-files \ | 
|  | 694                     --skip-technical \ | 
|  | 695                     -O . \ | 
|  | 696                     2>&1 >> {log} | 
|  | 697                 test -f {params.srr}_2.fastq || mv {params.srr}_1.fastq {params.srr}_0.fastq | 
|  | 698             fi | 
|  | 699             rm -rf ${{TMP}} | 
|  | 700         elif [[ {params.method} == "gdc" ]] ; then | 
|  | 701             TOKEN=~/gdc/latest.txt | 
|  | 702             if [[ ! -f ${{TOKEN}} ]] ; then | 
|  | 703                 echo "ERROR: no GDC token file found at ${{TOKEN}}" | 
|  | 704                 exit 1 | 
|  | 705             fi | 
|  | 706             for i in {{ 1..{params.retries} }} ; do | 
|  | 707                 if time gdc-client download \ | 
|  | 708                     -t ${{TOKEN}} \ | 
|  | 709                     --log-file {{TMP}}/log.txt \ | 
|  | 710                     -d {{TEMP}} \ | 
|  | 711                     {params.srr} 2>&1 >> {log} | 
|  | 712                 then | 
|  | 713                     SUCCESS=1 | 
|  | 714                     break | 
|  | 715                 else | 
|  | 716                     echo "COUNT_GdcRetries 1" | 
|  | 717                     TIMEOUT=$((${{TIMEOUT}} * 2)) | 
|  | 718                     sleep ${{TIMEOUT}} | 
|  | 719                 fi | 
|  | 720             done | 
|  | 721             if (( $SUCCESS == 0 )) ; then | 
|  | 722                 echo "COUNT_GdcFailures 1" | 
|  | 723                 exit 1 | 
|  | 724             fi | 
|  | 725             test -d {{TEMP}}/{params.srr} | 
|  | 726             test -f {{TEMP}}/{params.srr}/*.tar.gz | 
|  | 727 | 
|  | 728             echo "=== gdc-client log.txt begin ===" >> {log} | 
|  | 729             cat {{TMP}}/log.txt >> {log} | 
|  | 730             echo "=== gdc-client log.txt end===" >> {log} | 
|  | 731 | 
|  | 732             size=$(cat {{TEMP}}/{params.srr}/*.tar.gz | wc -c) | 
|  | 733             echo "COUNT_GdcBytesDownloaded ${{size}}" | 
|  | 734 | 
|  | 735             tar zxvf {{TEMP}}/{params.srr}/*.tar.gz | 
|  | 736             rm -rf {{TEMP}} | 
|  | 737 | 
|  | 738             num_1s=$(ls -1 *_1.fastq | wc -l) | 
|  | 739             num_2s=$(ls -1 *_2.fastq | wc -l) | 
|  | 740             if (( ${{num_1s}} == 0 )) ; then | 
|  | 741                 echo "ERROR: No _1.fastq files output" | 
|  | 742                 exit 1 | 
|  | 743             fi | 
|  | 744             if (( ${{num_1s}} > 1 )) ; then | 
|  | 745                 echo "ERROR: More than one _1.fastq file found" | 
|  | 746                 exit 1 | 
|  | 747             fi | 
|  | 748             if (( ${{num_2s}} == 0 )) ; then | 
|  | 749                 # unpaired | 
|  | 750                 mv *_1.fastq {params.srr}_0.fastq | 
|  | 751             else | 
|  | 752                 # paired-end | 
|  | 753                 mv *_1.fastq {params.srr}_1.fastq | 
|  | 754                 mv *_2.fastq {params.srr}_2.fastq | 
|  | 755             fi | 
|  | 756         fi | 
|  | 757 | 
|  | 758         # Next chunk expects the FASTQ files to exist in the current directory | 
|  | 759         # named {{params.srr}}_{{0,1,2}}.fastq | 
|  | 760         size=0 | 
|  | 761         for i in {{0..2}} ; do | 
|  | 762             fn={params.srr}_${{i}}.fastq | 
|  | 763             if [[ -f ${{fn}} ]] ; then | 
|  | 764                 echo "COUNT_FilesDownloaded 1" | 
|  | 765             else | 
|  | 766                 touch ${{fn}} | 
|  | 767             fi | 
|  | 768             size=$((${{size}} + $(wc -c < ${{fn}}))) | 
|  | 769             mv ${{fn}} {config[temp]}/{wildcards.quad}_${{i}}.fastq | 
|  | 770         done | 
|  | 771         echo "COUNT_BytesDownloaded ${{size}}" | 
|  | 772         echo "COUNT_DownloadComplete 1" | 
|  | 773         """ |