| 
10
 | 
     1 """
 | 
| 
 | 
     2 Parameters:
 | 
| 
16
 | 
     3 - fastq_dump_args: arguments to pass to fastq dumping tool
 | 
| 
 | 
     4 - fastq_dump_retries: number of retry attempts before dying
 | 
| 
 | 
     5 - fastq_dump_tool: name of tool to run, e.g. 'fastq-dump' or 'fasterq-dump'
 | 
| 
10
 | 
     6 - star: arguments to pass to STAR aligner
 | 
| 
16
 | 
     7 - salmon_args: arguments to pass to salmon quant
 | 
| 
10
 | 
     8 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
 | 
| 
16
 | 
     9 - bw_bed: name of BED file to use with bwtool
 | 
| 
 | 
    10 - max_unalign: maximum number of unaligned reads to save per run accession
 | 
| 
10
 | 
    11 """
 | 
| 
 | 
    12 
 | 
| 
16
 | 
    13 STEPS = ['download', 'fastq_check', 'align', 'sort',
 | 
| 
 | 
    14          'bamcount',
 | 
| 
 | 
    15          'salmon',
 | 
| 
 | 
    16          'align_unmapped',
 | 
| 
 | 
    17          'extract_jx']
 | 
| 
10
 | 
    18 
 | 
| 
16
 | 
    19 FILES = ['sjout.zst', 'fastq_check.tsv.zst',
 | 
| 
 | 
    20          'unmapped.bam', 'unmapped.fastq.zst',
 | 
| 
10
 | 
    21          'bamcount_nonref.csv.zst',
 | 
| 
 | 
    22          'bamcount_auc.tsv',
 | 
| 
 | 
    23          'bamcount_frag.tsv',
 | 
| 
 | 
    24          'Chimeric.out.junction.zst',
 | 
| 
 | 
    25          'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
 | 
| 
16
 | 
    26          'all.bw.zst', 'unique.bw.zst',
 | 
| 
 | 
    27          'salmon.tsv.zst',
 | 
| 
 | 
    28          'jx_bed.zst',
 | 
| 
 | 
    29          'manifest'] + list(map(lambda x: x + '.log', STEPS))
 | 
| 
10
 | 
    30 
 | 
| 
14
 | 
    31 #get rid of the pesky warnings about current paths
 | 
| 
 | 
    32 #and make sure we're not assuming any wrong relative paths
 | 
| 
12
 | 
    33 import os
 | 
| 
 | 
    34 config['output']=os.path.abspath(config['output'])
 | 
| 
13
 | 
    35 config['temp']=os.path.abspath(config['temp'])
 | 
| 
14
 | 
    36 config['exon_bed']=os.path.abspath(config['exon_bed'])
 | 
| 
 | 
    37 config['ref']=os.path.abspath(config['ref'])
 | 
| 
12
 | 
    38 
 | 
| 
10
 | 
    39 import subprocess
 | 
| 
 | 
    40 def run_command(cmd_args):
 | 
| 
 | 
    41     cmd_args = ' '.join(cmd_args)
 | 
| 
 | 
    42     try:
 | 
| 
 | 
    43         subprocess.check_call(args=cmd_args, shell=True) 
 | 
| 
 | 
    44     except subprocess.CalledProcessError as cpe:
 | 
| 
 | 
    45         sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
 | 
| 
 | 
    46         raise cpe
 | 
| 
 | 
    47 
 | 
| 
16
 | 
    48 import re
 | 
| 
 | 
    49 STEPS_FILES_FILTER=re.compile(r'(unmapped)|(download)|(salmon)|(extract_jx)|(jx_bed)|(manifest)')
 | 
| 
 | 
    50 def remove_steps_files():
 | 
| 
 | 
    51     #modify STEP and FILES
 | 
| 
 | 
    52     #so we don't run download or unmapped steps
 | 
| 
 | 
    53     global FILES
 | 
| 
 | 
    54     global STEPS
 | 
| 
 | 
    55     for i in reversed(range(0,len(STEPS))):
 | 
| 
 | 
    56         f = STEPS[i]
 | 
| 
 | 
    57         if STEPS_FILES_FILTER.search(f):
 | 
| 
 | 
    58             del(STEPS[i])
 | 
| 
 | 
    59     for i in reversed(range(0,len(FILES))):
 | 
| 
 | 
    60         f = FILES[i]
 | 
| 
 | 
    61         if STEPS_FILES_FILTER.search(f):
 | 
| 
 | 
    62             del(FILES[i])
 | 
| 
10
 | 
    63 
 | 
| 
16
 | 
    64 
 | 
| 
11
 | 
    65 FASTQ_PATT=re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$')
 | 
| 
10
 | 
    66 def prep_for_galaxy_run():
 | 
| 
16
 | 
    67     remove_steps_files()
 | 
| 
10
 | 
    68     try:
 | 
| 
 | 
    69         os.mkdir(config['temp'])
 | 
| 
 | 
    70     except OSError as ose:
 | 
| 
 | 
    71         pass
 | 
| 
 | 
    72     fastqs = config['inputs'].split(',')
 | 
| 
 | 
    73     m = FASTQ_PATT.search(fastqs[0])
 | 
| 
11
 | 
    74     run_acc = 'sample'
 | 
| 
 | 
    75     if m is not None:
 | 
| 
 | 
    76         run_acc = m.group(1)
 | 
| 
12
 | 
    77     study_acc = 'study'
 | 
| 
10
 | 
    78     if 'study' in config:
 | 
| 
 | 
    79         study_acc = config['study']
 | 
| 
 | 
    80     genome = 'hg38'
 | 
| 
 | 
    81     if 'genome' in config:
 | 
| 
 | 
    82         genome = config['genome']
 | 
| 
 | 
    83     method = 'sra'
 | 
| 
 | 
    84     # SRR,SRP,genome
 | 
| 
 | 
    85     # e.g. SRR1557855,SRP045778,ce10
 | 
| 
 | 
    86     #create links which will be used in the align step
 | 
| 
 | 
    87     i = 1
 | 
| 
 | 
    88     for f in fastqs:
 | 
| 
 | 
    89         newf = '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, i)
 | 
| 
15
 | 
    90         if 'compressed' in config:
 | 
| 
 | 
    91             run_command(['zcat',f,'>',newf])
 | 
| 
 | 
    92         else:
 | 
| 
 | 
    93             os.symlink(os.path.abspath(f), newf)
 | 
| 
10
 | 
    94         #create fastq 0
 | 
| 
 | 
    95         if i == 1:
 | 
| 
15
 | 
    96             os.symlink(os.path.abspath(newf), '%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 0))
 | 
| 
10
 | 
    97         i += 1
 | 
| 
 | 
    98     #create fastq 2 if not paired
 | 
| 
 | 
    99     if i == 2:
 | 
| 
 | 
   100         open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close() 
 | 
| 
 | 
   101     #create expected file structure for annotated exon bed file & reference index
 | 
| 
 | 
   102     ref = config['ref']
 | 
| 
16
 | 
   103     config['ref'] = os.path.abspath('.')
 | 
| 
10
 | 
   104     os.makedirs('%s/%s' % (config['ref'], genome))
 | 
| 
 | 
   105     os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome))
 | 
| 
 | 
   106     os.makedirs('%s/%s/gtf' % (config['ref'], genome))
 | 
| 
 | 
   107     os.symlink(config['exon_bed'], 'exons.tmp')
 | 
| 
 | 
   108     os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed')))
 | 
| 
16
 | 
   109     #TODO may need to add additional symlinking for HISAT2 unmapped alignments
 | 
| 
10
 | 
   110     return([run_acc, study_acc, genome, method])
 | 
| 
 | 
   111 
 | 
| 
 | 
   112 
 | 
| 
 | 
   113 def get_accessions(wildcards):
 | 
| 
 | 
   114     """
 | 
| 
 | 
   115     Grouping of SRRs with the same SRP could happen here
 | 
| 
 | 
   116     """
 | 
| 
 | 
   117     #if running under galaxy where the user will input the 
 | 
| 
 | 
   118     #FASTQs, study, and genome directly
 | 
| 
 | 
   119     if 'inputs' in config:
 | 
| 
 | 
   120         (run_acc, study_acc, genome, method) = prep_for_galaxy_run()
 | 
| 
 | 
   121         for ext in FILES:
 | 
| 
 | 
   122             yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (run_acc, study_acc, genome, method, ext))
 | 
| 
 | 
   123         #here to get the make_galaxy_links rule to fire
 | 
| 
 | 
   124         yield os.path.join(config['output'], '%s_%s_%s_%s.done' % (run_acc, study_acc, genome, method))
 | 
| 
 | 
   125     #not running under galaxy, takes a list of accessions
 | 
| 
 | 
   126     else:
 | 
| 
 | 
   127         for fn in config['input'].split():
 | 
| 
 | 
   128             with open(fn, 'r') as fh:
 | 
| 
 | 
   129                 for ln in fh:
 | 
| 
 | 
   130                     if ln.count(',') < 2:
 | 
| 
 | 
   131                         continue
 | 
| 
 | 
   132                     toks = ln.rstrip().split(',')
 | 
| 
 | 
   133                     assert 3 <= len(toks) <= 4
 | 
| 
 | 
   134                     method = 'sra'
 | 
| 
 | 
   135                     if len(toks) == 4:
 | 
| 
 | 
   136                         method = toks[3]
 | 
| 
 | 
   137                     # SRR,SRP,genome
 | 
| 
 | 
   138                     # e.g. SRR1557855,SRP045778,ce10
 | 
| 
 | 
   139                     for ext in FILES:
 | 
| 
 | 
   140                         yield os.path.join(config['output'], '%s_%s_%s_%s.%s' % (toks[0], toks[1], toks[2], method, ext))
 | 
| 
 | 
   141 
 | 
| 
 | 
   142 rule all:
 | 
| 
 | 
   143     input:
 | 
| 
 | 
   144         get_accessions
 | 
| 
 | 
   145 
 | 
| 
 | 
   146 rule make_manifest:
 | 
| 
 | 
   147     input:
 | 
| 
16
 | 
   148         config['output'] + '/{quad}.salmon.tsv.zst',
 | 
| 
10
 | 
   149         config['output'] + '/{quad}.sjout.zst',
 | 
| 
16
 | 
   150         config['output'] + '/{quad}.jx_bed.zst',
 | 
| 
10
 | 
   151         config['output'] + '/{quad}.Chimeric.out.junction.zst',
 | 
| 
16
 | 
   152         config['output'] + '/{quad}.unmapped.bam',
 | 
| 
 | 
   153         config['output'] + '/{quad}.unmapped.fastq.zst',
 | 
| 
10
 | 
   154         config['output'] + '/{quad}.bamcount_nonref.csv.zst',
 | 
| 
 | 
   155         config['output'] + '/{quad}.bamcount_auc.tsv',
 | 
| 
 | 
   156         config['output'] + '/{quad}.bamcount_frag.tsv',
 | 
| 
16
 | 
   157         config['output'] + '/{quad}.fastq_check.tsv.zst',
 | 
| 
10
 | 
   158         config['output'] + '/{quad}.all.exon_bw_count.zst',
 | 
| 
 | 
   159         config['output'] + '/{quad}.unique.exon_bw_count.zst',
 | 
| 
16
 | 
   160         config['output'] + '/{quad}.all.bw.zst',
 | 
| 
 | 
   161         config['output'] + '/{quad}.unique.bw.zst',
 | 
| 
 | 
   162         config['output'] + '/{quad}.align.log',
 | 
| 
 | 
   163         config['output'] + '/{quad}.extract_jx.log',
 | 
| 
 | 
   164         config['output'] + '/{quad}.bamcount.log',
 | 
| 
 | 
   165         config['output'] + '/{quad}.align_unmapped.log',
 | 
| 
 | 
   166         config['output'] + '/{quad}.download.log',
 | 
| 
 | 
   167         config['output'] + '/{quad}.fastq_check.log',
 | 
| 
 | 
   168         config['output'] + '/{quad}.sort.log',
 | 
| 
 | 
   169         config['output'] + '/{quad}.salmon.log'
 | 
| 
10
 | 
   170     output:
 | 
| 
 | 
   171         config['output'] + '/{quad}.manifest'
 | 
| 
 | 
   172     params:
 | 
| 
 | 
   173         quad=lambda wildcards: wildcards.quad
 | 
| 
 | 
   174     run:
 | 
| 
 | 
   175         with open(output[0], 'wt') as fh:
 | 
| 
 | 
   176             for fn in FILES:
 | 
| 
 | 
   177                 fh.write(params.quad + "." + fn + '\n')
 | 
| 
 | 
   178 
 | 
| 
 | 
   179 def galaxy_link_files(op):
 | 
| 
16
 | 
   180     remove_steps_files()
 | 
| 
10
 | 
   181     a = [op + '/' + f for f in FILES]
 | 
| 
 | 
   182     return a
 | 
| 
 | 
   183 
 | 
| 
 | 
   184 rule make_galaxy_links:
 | 
| 
 | 
   185     input: 
 | 
| 
 | 
   186         config['output'] + '/{quad}.sjout.zst',
 | 
| 
16
 | 
   187         config['output'] + '/{quad}.fastq_check.tsv.zst',
 | 
| 
10
 | 
   188         config['output'] + '/{quad}.bamcount_nonref.csv.zst',
 | 
| 
 | 
   189         config['output'] + '/{quad}.bamcount_auc.tsv',
 | 
| 
 | 
   190         config['output'] + '/{quad}.bamcount_frag.tsv',
 | 
| 
 | 
   191         config['output'] + '/{quad}.Chimeric.out.junction.zst',
 | 
| 
 | 
   192         config['output'] + '/{quad}.all.exon_bw_count.zst',
 | 
| 
 | 
   193         config['output'] + '/{quad}.unique.exon_bw_count.zst',
 | 
| 
16
 | 
   194         config['output'] + '/{quad}.all.bw.zst',
 | 
| 
 | 
   195         config['output'] + '/{quad}.unique.bw.zst',
 | 
| 
 | 
   196         config['output'] + '/{quad}.fastq_check.log',
 | 
| 
 | 
   197         config['output'] + '/{quad}.align.log',
 | 
| 
 | 
   198         config['output'] + '/{quad}.sort.log',
 | 
| 
 | 
   199         config['output'] + '/{quad}.bamcount.log',
 | 
| 
10
 | 
   200     output:
 | 
| 
 | 
   201         config['output'] + '/{quad}.done'
 | 
| 
 | 
   202     params:
 | 
| 
 | 
   203         quad=lambda wildcards: wildcards.quad,
 | 
| 
 | 
   204         out=galaxy_link_files(config['output'])
 | 
| 
 | 
   205     run:
 | 
| 
 | 
   206         for (i,fn) in enumerate(input):
 | 
| 
 | 
   207             os.symlink(os.path.abspath(fn), params.out[i])
 | 
| 
16
 | 
   208         os.symlink(os.path.abspath(input[-1]), output[0])
 | 
| 
10
 | 
   209 
 | 
| 
 | 
   210 
 | 
| 
 | 
   211 rule bamcount:
 | 
| 
 | 
   212     input:
 | 
| 
 | 
   213         bam=config['temp'] + '/{quad}-sorted.bam',
 | 
| 
 | 
   214         bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
 | 
| 
 | 
   215         bed=lambda wildcards: '%s/%s/gtf/%s' % (config['ref'], wildcards.quad.split('_')[2], config.get('bw_bed', 'exons.bed'))
 | 
| 
 | 
   216     output:
 | 
| 
 | 
   217         nonref=config['output'] + '/{quad}.bamcount_nonref.csv.zst',
 | 
| 
 | 
   218         auc=config['output'] + '/{quad}.bamcount_auc.tsv',
 | 
| 
 | 
   219         frag=config['output'] + '/{quad}.bamcount_frag.tsv',
 | 
| 
 | 
   220         all_bw=config['output'] + '/{quad}.all.bw.zst',
 | 
| 
 | 
   221         unique_bw=config['output'] + '/{quad}.unique.bw.zst',
 | 
| 
 | 
   222         all_bw_count=config['output'] + '/{quad}.all.exon_bw_count.zst',
 | 
| 
 | 
   223         unique_bw_count=config['output'] + '/{quad}.unique.exon_bw_count.zst'
 | 
| 
 | 
   224     log:
 | 
| 
 | 
   225         config['output'] + '/{quad}.bamcount.log'
 | 
| 
 | 
   226     params:
 | 
| 
 | 
   227         srr=lambda wildcards: wildcards.quad.split('_')[0],
 | 
| 
 | 
   228         uniq_qual=config.get('unique_qual', 10)
 | 
| 
 | 
   229     threads: 4
 | 
| 
 | 
   230     shell:
 | 
| 
 | 
   231         """
 | 
| 
 | 
   232         TMP={config[temp]}/{params.srr}_bamcount
 | 
| 
14
 | 
   233         bamcount {input.bam} \
 | 
| 
10
 | 
   234             --threads {threads} \
 | 
| 
 | 
   235             --coverage \
 | 
| 
 | 
   236             --no-head \
 | 
| 
 | 
   237             --require-mdz \
 | 
| 
 | 
   238             --min-unique-qual {params.uniq_qual} \
 | 
| 
 | 
   239             --frag-dist ${{TMP}} \
 | 
| 
 | 
   240             --bigwig ${{TMP}} \
 | 
| 
 | 
   241             --annotation {input.bed} ${{TMP}} \
 | 
| 
 | 
   242             --auc ${{TMP}} \
 | 
| 
 | 
   243             --alts ${{TMP}} \
 | 
| 
 | 
   244             2>&1 | tee -a {log}
 | 
| 
 | 
   245 
 | 
| 
 | 
   246         #
 | 
| 
 | 
   247         # --alts
 | 
| 
 | 
   248         #
 | 
| 
 | 
   249 
 | 
| 
 | 
   250         (time zstd ${{TMP}}.alts.tsv -o {output.nonref}) 2>&1 | tee -a {log}
 | 
| 
 | 
   251         size=$(wc -c < {output.nonref})
 | 
| 
 | 
   252         echo "COUNT_NonrefSize ${{size}}"
 | 
| 
 | 
   253         rm -f ${{TMP}}.alts.tsv
 | 
| 
 | 
   254 
 | 
| 
 | 
   255         #
 | 
| 
 | 
   256         # --auc
 | 
| 
 | 
   257         #
 | 
| 
 | 
   258         mv ${{TMP}}.auc.tsv {output.auc}
 | 
| 
 | 
   259         size=$(wc -c < {output.auc})
 | 
| 
 | 
   260         echo "COUNT_AucSize ${{size}}"
 | 
| 
 | 
   261         rm -f ${{TMP}}.auc.tsv
 | 
| 
 | 
   262 
 | 
| 
 | 
   263         #
 | 
| 
 | 
   264         # --frag-dist
 | 
| 
 | 
   265         #
 | 
| 
 | 
   266         mv ${{TMP}}.frags.tsv {output.frag}
 | 
| 
 | 
   267         size=$(wc -c < {output.frag})
 | 
| 
 | 
   268         echo "COUNT_FragDistSize ${{size}}"
 | 
| 
 | 
   269         rm -f ${{TMP}}.frags.tsv
 | 
| 
 | 
   270 
 | 
| 
 | 
   271         #
 | 
| 
 | 
   272         # --bigwig
 | 
| 
 | 
   273         #
 | 
| 
 | 
   274 
 | 
| 
 | 
   275         (time zstd ${{TMP}}.all.bw -o {output.all_bw}) 2>&1 | tee -a {log}
 | 
| 
 | 
   276         size=$(wc -c < {output.all_bw})
 | 
| 
 | 
   277         echo "COUNT_BwSize ${{size}}"
 | 
| 
 | 
   278         rm -f ${{TMP}}.all.bw
 | 
| 
 | 
   279 
 | 
| 
 | 
   280         (time zstd ${{TMP}}.unique.bw -o {output.unique_bw}) 2>&1 | tee -a {log}
 | 
| 
 | 
   281         size=$(wc -c < {output.unique_bw})
 | 
| 
 | 
   282         echo "COUNT_BwSize ${{size}}"
 | 
| 
 | 
   283         rm -f ${{TMP}}.unique.bw
 | 
| 
 | 
   284 
 | 
| 
 | 
   285         #
 | 
| 
 | 
   286         # --annotation
 | 
| 
 | 
   287         #
 | 
| 
 | 
   288 
 | 
| 
 | 
   289         (time zstd ${{TMP}}.all.tsv -o {output.all_bw_count}) 2>&1 | tee -a {log}
 | 
| 
 | 
   290         size=$(wc -c < {output.all_bw_count})
 | 
| 
 | 
   291         echo "COUNT_BwQuantSize ${{size}}"
 | 
| 
 | 
   292         rm -f ${{TMP}}.all.tsv
 | 
| 
 | 
   293 
 | 
| 
 | 
   294         (time zstd ${{TMP}}.unique.tsv -o {output.unique_bw_count}) 2>&1 | tee -a {log}
 | 
| 
 | 
   295         size=$(wc -c < {output.unique_bw_count})
 | 
| 
 | 
   296         echo "COUNT_BwQuantSize ${{size}}"
 | 
| 
 | 
   297         rm -f ${{TMP}}.unique.tsv
 | 
| 
 | 
   298 
 | 
| 
 | 
   299         # Check that all temporaries were properly purged
 | 
| 
 | 
   300         set +o pipefail ; num_files=$(ls -d ${{TMP}}* 2>/dev/null | wc -l)
 | 
| 
 | 
   301         if (( $num_files > 0 )) ; then
 | 
| 
 | 
   302             echo "Failed to purge files (ignore . and ..): $(ls -ad ${{TMP}}*)"
 | 
| 
 | 
   303             exit 1
 | 
| 
 | 
   304         fi
 | 
| 
 | 
   305 
 | 
| 
 | 
   306         echo "COUNT_BamcountComplete 1"
 | 
| 
 | 
   307         """
 | 
| 
 | 
   308 
 | 
| 
16
 | 
   309 rule bw_zstd:
 | 
| 
 | 
   310     input:
 | 
| 
 | 
   311         config['temp'] + '/{prefix}.bw'
 | 
| 
 | 
   312     output:
 | 
| 
 | 
   313         config['output'] + '/{prefix}.bw.zst'
 | 
| 
 | 
   314     shell:
 | 
| 
 | 
   315         """
 | 
| 
 | 
   316         zstd {input} -o {output}
 | 
| 
 | 
   317         size=$(wc -c < {output})
 | 
| 
 | 
   318         echo "COUNT_BwBytes ${{size}}"
 | 
| 
 | 
   319         echo "COUNT_BwZstdComplete 1"
 | 
| 
 | 
   320         """
 | 
| 
 | 
   321 
 | 
| 
10
 | 
   322 rule sort:
 | 
| 
 | 
   323     input:
 | 
| 
 | 
   324         config['temp'] + '/{quad}.bam'
 | 
| 
 | 
   325     wildcard_constraints:
 | 
| 
 | 
   326         quad="[^-]+"
 | 
| 
 | 
   327     output:
 | 
| 
 | 
   328         bam=temp(config['temp'] + '/{quad}-sorted.bam'),
 | 
| 
 | 
   329         bai=temp(config['temp'] + '/{quad}-sorted.bam.bai')
 | 
| 
 | 
   330     log:
 | 
| 
 | 
   331         config['output'] + '/{quad}.sort.log'
 | 
| 
 | 
   332     params:
 | 
| 
 | 
   333         srr=lambda wildcards: wildcards.quad.split('_')[0]
 | 
| 
 | 
   334     threads: 8
 | 
| 
 | 
   335     shell:
 | 
| 
 | 
   336         """
 | 
| 
 | 
   337         TMP="{config[temp]}/sort_temp.{params.srr}"
 | 
| 
 | 
   338         mkdir -p ${{TMP}}
 | 
| 
 | 
   339         time samtools sort \
 | 
| 
 | 
   340             -T ${{TMP}}/samtools_temp \
 | 
| 
 | 
   341             -@ {threads} \
 | 
| 
 | 
   342             -m 64M \
 | 
| 
 | 
   343             -o {output.bam} {input} 2>&1 | tee -a {log}
 | 
| 
 | 
   344         rm -rf ${{TMP}}
 | 
| 
 | 
   345         size=$(wc -c < {output.bam})
 | 
| 
 | 
   346         echo "COUNT_SortedBAMBytes ${{size}}"
 | 
| 
 | 
   347 
 | 
| 
 | 
   348         time samtools index -@ {threads} {output.bam} 2>&1 | tee -a {log}
 | 
| 
 | 
   349         echo "COUNT_SortComplete 1"
 | 
| 
 | 
   350         """
 | 
| 
 | 
   351 
 | 
| 
16
 | 
   352 rule salmon:
 | 
| 
 | 
   353     input:
 | 
| 
 | 
   354         reads0=config['temp'] + '/{quad}_0.fastq',
 | 
| 
 | 
   355         reads1=config['temp'] + '/{quad}_1.fastq',
 | 
| 
 | 
   356         reads2=config['temp'] + '/{quad}_2.fastq',
 | 
| 
 | 
   357         index1=lambda wildcards: '%s/%s/salmon_index/hash.bin' % (config['ref'], wildcards.quad.split('_')[2]),
 | 
| 
 | 
   358         index2=lambda wildcards: '%s/%s/salmon_index/sa.bin' % (config['ref'], wildcards.quad.split('_')[2])
 | 
| 
 | 
   359     output:
 | 
| 
 | 
   360         config['output'] + '/{quad}.salmon.tsv.zst'
 | 
| 
 | 
   361     log:
 | 
| 
 | 
   362         config['output'] + '/{quad}.salmon.log'
 | 
| 
 | 
   363     params:
 | 
| 
 | 
   364         index_base=lambda wildcards: '%s/%s/salmon_index' % (config['ref'], wildcards.quad.split('_')[2]),
 | 
| 
 | 
   365         salmon_args=config.get('salmon_args', '')
 | 
| 
 | 
   366     threads: 8
 | 
| 
 | 
   367     shell:
 | 
| 
 | 
   368         """
 | 
| 
 | 
   369         READ_FILES="-r {input.reads0}"
 | 
| 
 | 
   370         if [[ -s {input.reads2} ]] ; then
 | 
| 
 | 
   371             READ_FILES="-1 {input.reads1} -2 {input.reads2}"
 | 
| 
 | 
   372         fi
 | 
| 
 | 
   373         if set -o pipefail && time salmon quant \
 | 
| 
 | 
   374             --libType U \
 | 
| 
 | 
   375             --quiet \
 | 
| 
 | 
   376             --validateMappings \
 | 
| 
 | 
   377             -i {params.index_base} \
 | 
| 
 | 
   378             -p {threads} \
 | 
| 
 | 
   379             {params.salmon_args} \
 | 
| 
 | 
   380             ${{READ_FILES}} \
 | 
| 
 | 
   381             --output salmon_quant \
 | 
| 
 | 
   382             --minAssignedFrags 1 \
 | 
| 
 | 
   383             2>&1 | tee -a {log}
 | 
| 
 | 
   384         then
 | 
| 
 | 
   385             echo "COUNT_SalmonSuccess 1"
 | 
| 
 | 
   386 
 | 
| 
 | 
   387             time zstd salmon_quant/quant.sf -o {output} 2>&1 | tee -a {log}
 | 
| 
 | 
   388             size=$(wc -c < {output})
 | 
| 
 | 
   389             echo "COUNT_SalmonQuantBytes ${{size}}"
 | 
| 
 | 
   390         else
 | 
| 
 | 
   391             touch {output}
 | 
| 
 | 
   392             echo "COUNT_SalmonFailure 1"
 | 
| 
 | 
   393         fi
 | 
| 
 | 
   394                 
 | 
| 
 | 
   395         rm -rf salmon_quant
 | 
| 
 | 
   396         echo "COUNT_SalmonComplete 1"
 | 
| 
 | 
   397         """
 | 
| 
 | 
   398 
 | 
| 
 | 
   399 rule align_unmapped:
 | 
| 
 | 
   400     input:
 | 
| 
 | 
   401         unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
 | 
| 
 | 
   402         unmapped2=config['temp'] + '/{quad}_2.unmappedfastq',
 | 
| 
 | 
   403         index=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome.1.ht2' % (config['ref'], wildcards.quad.split('_')[2])
 | 
| 
 | 
   404     output:
 | 
| 
 | 
   405         bam=config['output'] + '/{quad}.unmapped.bam',
 | 
| 
 | 
   406         sample=config['output'] + '/{quad}.unmapped.fastq.zst'
 | 
| 
 | 
   407     log:
 | 
| 
 | 
   408         config['output'] + '/{quad}.align_unmapped.log'
 | 
| 
 | 
   409     params:
 | 
| 
 | 
   410         index_base=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome' % (config['ref'], wildcards.quad.split('_')[2]),
 | 
| 
 | 
   411         srr=lambda wildcards: wildcards.quad.split('_')[0],
 | 
| 
 | 
   412         hisat2_params=config.get('hisat2', ''),
 | 
| 
 | 
   413         max_unalign=config.get('max_unalign', 100000)
 | 
| 
 | 
   414     threads: 16
 | 
| 
 | 
   415     shell:
 | 
| 
 | 
   416         """
 | 
| 
 | 
   417         TMP="{config[temp]}/align_unmapped_temp.{params.srr}"
 | 
| 
 | 
   418         READ_FILES="-1 {input.unmapped1} -2 {input.unmapped2}"
 | 
| 
 | 
   419         if [[ ! -s {input.unmapped2} ]] ; then
 | 
| 
 | 
   420             READ_FILES="-U {input.unmapped1}"
 | 
| 
 | 
   421         fi
 | 
| 
 | 
   422         time hisat2 \
 | 
| 
 | 
   423             $READ_FILES \
 | 
| 
 | 
   424             -t --mm \
 | 
| 
 | 
   425             -x {params.index_base} \
 | 
| 
 | 
   426             --threads {threads} \
 | 
| 
 | 
   427             {params.hisat2_params} \
 | 
| 
 | 
   428             --un ${{TMP}}.un \
 | 
| 
 | 
   429             --un-conc ${{TMP}}.un_conc \
 | 
| 
 | 
   430             -S ${{TMP}}.sam \
 | 
| 
 | 
   431             2>&1 | tee -a {log}
 | 
| 
 | 
   432 
 | 
| 
 | 
   433         #
 | 
| 
 | 
   434         # Make BAM file out of aligned reads
 | 
| 
 | 
   435         #
 | 
| 
 | 
   436         time samtools view \
 | 
| 
 | 
   437             -b -F 4 \
 | 
| 
 | 
   438             -o {output.bam} \
 | 
| 
 | 
   439             ${{TMP}}.sam \
 | 
| 
 | 
   440             2>&1 | tee -a {log}
 | 
| 
 | 
   441         rm -f ${{TMP}}.sam
 | 
| 
 | 
   442 
 | 
| 
 | 
   443         #
 | 
| 
 | 
   444         # Save a subset of the doubly unaligned reads
 | 
| 
 | 
   445         #
 | 
| 
 | 
   446         if [[ ! -s {input.unmapped2} ]] ; then
 | 
| 
 | 
   447             # unpaired
 | 
| 
 | 
   448             fastq-sample \
 | 
| 
 | 
   449                 -n {params.max_unalign} \
 | 
| 
 | 
   450                 -o ${{TMP}}.samp \
 | 
| 
 | 
   451                 ${{TMP}}.un
 | 
| 
 | 
   452             rm -f ${{TMP}}.un
 | 
| 
 | 
   453         else
 | 
| 
 | 
   454             # paired-end
 | 
| 
 | 
   455             fastq-sample \
 | 
| 
 | 
   456                 -n {params.max_unalign} \
 | 
| 
 | 
   457                 -o ${{TMP}}.samp \
 | 
| 
 | 
   458                 ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
 | 
| 
 | 
   459             rm -f ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
 | 
| 
 | 
   460             
 | 
| 
 | 
   461             # interleave the two output fastqs into single file
 | 
| 
 | 
   462             paste ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq \
 | 
| 
 | 
   463                 | paste - - - - \
 | 
| 
 | 
   464                 | awk -v OFS="\n" -v FS="\t" '{{print($1,$3,$5,$7,$2,$4,$6,$8)}}' \
 | 
| 
 | 
   465                 > ${{TMP}}.samp.fastq
 | 
| 
 | 
   466             rm -f ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq
 | 
| 
 | 
   467         fi
 | 
| 
 | 
   468 
 | 
| 
 | 
   469         test -f ${{TMP}}.samp.fastq
 | 
| 
 | 
   470         time zstd ${{TMP}}.samp.fastq -o {output.sample} 2>&1 | tee -a {log}
 | 
| 
 | 
   471         size=$(wc -c < {output.sample})
 | 
| 
 | 
   472         echo "COUNT_UnmappedSampleBytes ${{size}}"
 | 
| 
 | 
   473         rm -f ${{TMP}}.samp.fastq
 | 
| 
 | 
   474     
 | 
| 
 | 
   475         size=$(wc -c < {output.bam})
 | 
| 
 | 
   476         echo "COUNT_UnmappedBamBytes ${{size}}"
 | 
| 
 | 
   477 
 | 
| 
 | 
   478         echo "COUNT_AlignUnmappedComplete 1"
 | 
| 
 | 
   479         """
 | 
| 
 | 
   480 
 | 
| 
 | 
   481 rule extract_jx:
 | 
| 
 | 
   482     input:
 | 
| 
 | 
   483         bam=config['temp'] + '/{quad}-sorted.bam',
 | 
| 
 | 
   484         bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
 | 
| 
 | 
   485         fa=lambda wildcards: '%s/%s/fasta/genome.fa' % (config['ref'], wildcards.quad.split('_')[2]),
 | 
| 
 | 
   486         gtf=lambda wildcards: '%s/%s/gtf/genes.gtf' % (config['ref'], wildcards.quad.split('_')[2])
 | 
| 
 | 
   487     output:
 | 
| 
 | 
   488         config['output'] + '/{quad}.jx_bed.zst'
 | 
| 
 | 
   489     params:
 | 
| 
 | 
   490         srr=lambda wildcards: wildcards.quad.split('_')[0]
 | 
| 
 | 
   491     log:
 | 
| 
 | 
   492         config['output'] + '/{quad}.extract_jx.log'
 | 
| 
 | 
   493     shell:
 | 
| 
 | 
   494         """
 | 
| 
 | 
   495         TMP="{config[temp]}/extract_jx.{params.srr}"
 | 
| 
 | 
   496         time regtools junctions extract \
 | 
| 
 | 
   497             -i 20 -a 1 \
 | 
| 
 | 
   498             -o ${{TMP}}.jx_tmp \
 | 
| 
 | 
   499             {input.bam} 2>&1 | tee -a {log}
 | 
| 
 | 
   500         time zstd ${{TMP}}.jx_tmp -o {output} 2>&1 | tee -a {log}
 | 
| 
 | 
   501         rm -f ${{TMP}}.jx_tmp
 | 
| 
 | 
   502 
 | 
| 
 | 
   503         size=$(wc -c < {output})
 | 
| 
 | 
   504         echo "COUNT_ExtractJxBytes ${{size}}"
 | 
| 
 | 
   505 
 | 
| 
 | 
   506         echo "COUNT_ExtractJxComplete 1"
 | 
| 
 | 
   507         """
 | 
| 
 | 
   508 
 | 
| 
10
 | 
   509 rule align:
 | 
| 
 | 
   510     input:
 | 
| 
 | 
   511         reads0=config['temp'] + '/{quad}_0.fastq',
 | 
| 
 | 
   512         reads1=config['temp'] + '/{quad}_1.fastq',
 | 
| 
 | 
   513         reads2=config['temp'] + '/{quad}_2.fastq',
 | 
| 
 | 
   514         index1=lambda wildcards: '%s/%s/star_idx/SAindex' % (config['ref'], wildcards.quad.split('_')[2]),
 | 
| 
 | 
   515         index2=lambda wildcards: '%s/%s/star_idx/SA' % (config['ref'], wildcards.quad.split('_')[2])
 | 
| 
 | 
   516     wildcard_constraints:
 | 
| 
 | 
   517         quad="[^-]+"
 | 
| 
 | 
   518     output:
 | 
| 
 | 
   519         bam=temp(config['temp'] + '/{quad}.bam'),
 | 
| 
 | 
   520         jxs=config['output'] + '/{quad}.sjout.zst',
 | 
| 
 | 
   521         chimeric=config['output'] + '/{quad}.Chimeric.out.junction.zst',
 | 
| 
 | 
   522         unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
 | 
| 
 | 
   523         unmapped2=config['temp'] + '/{quad}_2.unmappedfastq'
 | 
| 
 | 
   524     log:
 | 
| 
 | 
   525         config['output'] + '/{quad}.align.log'
 | 
| 
 | 
   526     params:
 | 
| 
 | 
   527         index_base=lambda wildcards: '%s/%s/star_idx' % (config['ref'], wildcards.quad.split('_')[2]),
 | 
| 
 | 
   528         srr=lambda wildcards: wildcards.quad.split('_')[0],
 | 
| 
 | 
   529         star_params="%s %s" % (config.get('star', ''), '--genomeLoad LoadAndRemove' if 'inputs' not in config else '')
 | 
| 
 | 
   530     threads: 16
 | 
| 
 | 
   531     shell:
 | 
| 
 | 
   532         """
 | 
| 
 | 
   533         READ_FILES="{input.reads0}"
 | 
| 
 | 
   534         if [[ -s {input.reads2} ]] ; then
 | 
| 
 | 
   535             READ_FILES="{input.reads1} {input.reads2}"
 | 
| 
 | 
   536         fi
 | 
| 
 | 
   537         TMP="{config[temp]}/align_temp.{params.srr}"
 | 
| 
 | 
   538         rm -rf ${{TMP}}
 | 
| 
 | 
   539         time STAR \
 | 
| 
 | 
   540             {params.star_params} \
 | 
| 
 | 
   541             --runMode alignReads \
 | 
| 
 | 
   542             --runThreadN {threads} \
 | 
| 
 | 
   543             --genomeDir {params.index_base} \
 | 
| 
 | 
   544             --readFilesIn ${{READ_FILES}} \
 | 
| 
 | 
   545             --twopassMode None \
 | 
| 
 | 
   546             --outTmpDir ${{TMP}} \
 | 
| 
 | 
   547             --outReadsUnmapped Fastx \
 | 
| 
 | 
   548             --outMultimapperOrder Random \
 | 
| 
 | 
   549             --outSAMreadID Number \
 | 
| 
 | 
   550             --outSAMtype BAM Unsorted \
 | 
| 
 | 
   551             --outSAMmode NoQS \
 | 
| 
 | 
   552             --outSAMattributes NH MD \
 | 
| 
 | 
   553             --chimOutType Junctions \
 | 
| 
 | 
   554             --chimOutJunctionFormat 1 \
 | 
| 
 | 
   555             --chimSegmentReadGapMax 3 \
 | 
| 
 | 
   556             --chimJunctionOverhangMin 12 \
 | 
| 
 | 
   557             --chimSegmentMin 12 2>&1 | tee -a {log}
 | 
| 
 | 
   558    
 | 
| 
 | 
   559         # Full set of output files:
 | 
| 
 | 
   560         #
 | 
| 
 | 
   561         # Aligned.out.bam
 | 
| 
 | 
   562         # Chimeric.out.junction
 | 
| 
 | 
   563         # Log.final.out
 | 
| 
 | 
   564         # Log.out
 | 
| 
 | 
   565         # Log.progress.out
 | 
| 
 | 
   566         # SJ.out.tab
 | 
| 
 | 
   567         # Unmapped.out.mate1
 | 
| 
 | 
   568         # Unmapped.out.mate2 (if any reads were paired-end)
 | 
| 
 | 
   569 
 | 
| 
 | 
   570         #
 | 
| 
 | 
   571         # Logs
 | 
| 
 | 
   572         #
 | 
| 
 | 
   573         rm -rf ${{TMP}}
 | 
| 
 | 
   574         cat Log.out >> {log}
 | 
| 
 | 
   575         cat Log.final.out >> {log}
 | 
| 
 | 
   576         rm -f Log*.out
 | 
| 
 | 
   577 
 | 
| 
 | 
   578         #
 | 
| 
 | 
   579         # Junctions
 | 
| 
 | 
   580         #
 | 
| 
 | 
   581         test -f SJ.out.tab
 | 
| 
 | 
   582         time zstd SJ.out.tab -o {output.jxs} 2>&1 | tee -a {log}
 | 
| 
 | 
   583         rm -f SJ.out.tab
 | 
| 
 | 
   584         size=$(wc -c < {output.jxs})
 | 
| 
 | 
   585         echo "COUNT_CompressedJxBytes ${{size}}"
 | 
| 
 | 
   586 
 | 
| 
 | 
   587         #
 | 
| 
 | 
   588         # Chimerics
 | 
| 
 | 
   589         #
 | 
| 
 | 
   590         test -f Chimeric.out.junction
 | 
| 
 | 
   591         test -s Chimeric.out.junction
 | 
| 
 | 
   592         sort -k1,1 -n -k2,2 Chimeric.out.junction > Chimeric.out.junction.sorted
 | 
| 
 | 
   593         time zstd Chimeric.out.junction.sorted -o {output.chimeric} 2>&1 | tee -a {log}
 | 
| 
 | 
   594         rm -f Chimeric.out.junction Chimeric.out.junction.sorted
 | 
| 
 | 
   595         size=$(wc -c < {output.chimeric})
 | 
| 
 | 
   596         echo "COUNT_ChimericBytes ${{size}}"
 | 
| 
 | 
   597 
 | 
| 
 | 
   598         #
 | 
| 
 | 
   599         # Unmapped
 | 
| 
 | 
   600         #
 | 
| 
 | 
   601         touch {output.unmapped2}
 | 
| 
 | 
   602         test -f Unmapped.out.mate1
 | 
| 
 | 
   603         mv Unmapped.out.mate1 {output.unmapped1}
 | 
| 
 | 
   604         if [[ -f Unmapped.out.mate2 ]] ; then
 | 
| 
 | 
   605             mv Unmapped.out.mate2 {output.unmapped2}
 | 
| 
 | 
   606         fi
 | 
| 
 | 
   607 
 | 
| 
 | 
   608         #
 | 
| 
 | 
   609         # Alignments
 | 
| 
 | 
   610         #
 | 
| 
 | 
   611         size=$(wc -c < Aligned.out.bam)
 | 
| 
 | 
   612         echo "COUNT_BAMBytes ${{size}}"
 | 
| 
 | 
   613         mv Aligned.out.bam {output.bam}
 | 
| 
16
 | 
   614 
 | 
| 
10
 | 
   615         echo "COUNT_AlignComplete 1"
 | 
| 
 | 
   616         """
 | 
| 
16
 | 
   617 
 | 
| 
 | 
   618 rule fastq_check:
 | 
| 
 | 
   619     input:
 | 
| 
 | 
   620         reads0=config['temp'] + '/{quad}_0.fastq',
 | 
| 
 | 
   621         reads1=config['temp'] + '/{quad}_1.fastq',
 | 
| 
 | 
   622         reads2=config['temp'] + '/{quad}_2.fastq'
 | 
| 
 | 
   623     output:
 | 
| 
 | 
   624         config['output'] + '/{quad}.fastq_check.tsv.zst'
 | 
| 
 | 
   625     log:
 | 
| 
 | 
   626         config['output'] + '/{quad}.fastq_check.log'
 | 
| 
 | 
   627     params:
 | 
| 
 | 
   628         srr=lambda wildcards: wildcards.quad.split('_')[0]
 | 
| 
 | 
   629     shell:
 | 
| 
 | 
   630         """
 | 
| 
 | 
   631         TMP="{config[temp]}/fastq_check-{params.srr}.tsv"
 | 
| 
 | 
   632         touch ${{TMP}}
 | 
| 
 | 
   633         if [[ -s {input.reads0} ]] ; then
 | 
| 
 | 
   634             time seqtk fqchk -q0 {input.reads0} >>${{TMP}} 2>>{log}
 | 
| 
 | 
   635         fi
 | 
| 
 | 
   636         if [[ -s {input.reads1} ]] ; then
 | 
| 
 | 
   637             time seqtk fqchk -q0 {input.reads1} >>${{TMP}} 2>>{log}
 | 
| 
 | 
   638         fi
 | 
| 
 | 
   639         if [[ -s {input.reads2} ]] ; then
 | 
| 
 | 
   640             time seqtk fqchk -q0 {input.reads2} >>${{TMP}} 2>>{log}
 | 
| 
 | 
   641         fi
 | 
| 
 | 
   642         time zstd ${{TMP}} -o {output} 2>&1 | tee -a {log}
 | 
| 
 | 
   643         size=$(wc -c < {output})
 | 
| 
 | 
   644         echo "COUNT_FastqCheckBytes ${{size}}"
 | 
| 
 | 
   645 
 | 
| 
 | 
   646         echo "COUNT_FastqCheckComplete 1"
 | 
| 
 | 
   647         """
 | 
| 
 | 
   648 
 | 
| 
 | 
   649 rule download:
 | 
| 
 | 
   650     output:
 | 
| 
 | 
   651         temp(config['temp'] + '/{quad}_0.fastq'),
 | 
| 
 | 
   652         temp(config['temp'] + '/{quad}_1.fastq'),
 | 
| 
 | 
   653         temp(config['temp'] + '/{quad}_2.fastq')
 | 
| 
 | 
   654     log:
 | 
| 
 | 
   655         config['output'] + '/{quad}.download.log'
 | 
| 
 | 
   656     params:
 | 
| 
 | 
   657         srr=lambda wildcards: wildcards.quad.split('_')[0],
 | 
| 
 | 
   658         method=lambda wildcards: wildcards.quad.split('_')[3],
 | 
| 
 | 
   659         fd_params=config.get('fastq_dump_args', ''),
 | 
| 
 | 
   660         retries=config.get('fastq_dump_retries', '5')
 | 
| 
 | 
   661     threads: 4
 | 
| 
 | 
   662     shell:
 | 
| 
 | 
   663         """
 | 
| 
 | 
   664                 set -x
 | 
| 
 | 
   665         SUCCESS=0
 | 
| 
 | 
   666         TIMEOUT=10
 | 
| 
 | 
   667         PARAMS=""
 | 
| 
 | 
   668         TMP="{config[temp]}/dl-{params.srr}"
 | 
| 
 | 
   669         ! test -d ${{TMP}}
 | 
| 
 | 
   670         
 | 
| 
 | 
   671         if [[ {params.method} == "sra" ]] ; then
 | 
| 
 | 
   672             USE_FASTERQ=1
 | 
| 
 | 
   673             for i in {{ 1..{params.retries} }} ; do
 | 
| 
 | 
   674                 if time prefetch -t fasp -O ${{TMP}} -L info {params.srr} 2>&1 >> {log} ; then
 | 
| 
 | 
   675                     SUCCESS=1
 | 
| 
 | 
   676                     break
 | 
| 
 | 
   677                 else
 | 
| 
 | 
   678                     echo "COUNT_SraRetries 1"
 | 
| 
 | 
   679                     TIMEOUT=$((${{TIMEOUT}} * 2))
 | 
| 
 | 
   680                     sleep ${{TIMEOUT}}
 | 
| 
 | 
   681                 fi
 | 
| 
 | 
   682             done
 | 
| 
 | 
   683             if (( $SUCCESS == 0 )) ; then
 | 
| 
 | 
   684                 echo "COUNT_SraFailures 1"
 | 
| 
 | 
   685                 exit 1
 | 
| 
 | 
   686             fi
 | 
| 
 | 
   687             test -f ${{TMP}}/*.sra
 | 
| 
 | 
   688             size=$(cat ${{TMP}}/*.sra | wc -c)
 | 
| 
 | 
   689             echo "COUNT_SraBytesDownloaded ${{size}}"
 | 
| 
 | 
   690             if (( ${{USE_FASTERQ}} == 1 )) ; then
 | 
| 
 | 
   691                 time fasterq-dump ${{TMP}}/*.sra \
 | 
| 
 | 
   692                     -e {threads} \
 | 
| 
 | 
   693                     -t ${{TMP}} \
 | 
| 
 | 
   694                     -L info \
 | 
| 
 | 
   695                     --split-files \
 | 
| 
 | 
   696                     --skip-technical \
 | 
| 
 | 
   697                     -o {params.srr}.fastq \
 | 
| 
 | 
   698                     2>&1 >> {log}
 | 
| 
 | 
   699                 test -f {params.srr}_2.fastq || mv {params.srr}.fastq {params.srr}_0.fastq
 | 
| 
 | 
   700             else
 | 
| 
 | 
   701                 time fastq-dump ${{TMP}}/*.sra \
 | 
| 
 | 
   702                     -L info \
 | 
| 
 | 
   703                     --split-files \
 | 
| 
 | 
   704                     --skip-technical \
 | 
| 
 | 
   705                     -O . \
 | 
| 
 | 
   706                     2>&1 >> {log}
 | 
| 
 | 
   707                 test -f {params.srr}_2.fastq || mv {params.srr}_1.fastq {params.srr}_0.fastq
 | 
| 
 | 
   708             fi
 | 
| 
 | 
   709             rm -rf ${{TMP}}
 | 
| 
 | 
   710         elif [[ {params.method} == "gdc" ]] ; then
 | 
| 
 | 
   711             TOKEN=~/gdc/latest.txt
 | 
| 
 | 
   712             if [[ ! -f ${{TOKEN}} ]] ; then
 | 
| 
 | 
   713                 echo "ERROR: no GDC token file found at ${{TOKEN}}"
 | 
| 
 | 
   714                 exit 1
 | 
| 
 | 
   715             fi
 | 
| 
 | 
   716             for i in {{ 1..{params.retries} }} ; do
 | 
| 
 | 
   717                 if time gdc-client download \
 | 
| 
 | 
   718                     -t ${{TOKEN}} \
 | 
| 
 | 
   719                     --log-file {{TMP}}/log.txt \
 | 
| 
 | 
   720                     -d {{TEMP}} \
 | 
| 
 | 
   721                     {params.srr} 2>&1 >> {log}
 | 
| 
 | 
   722                 then
 | 
| 
 | 
   723                     SUCCESS=1
 | 
| 
 | 
   724                     break
 | 
| 
 | 
   725                 else
 | 
| 
 | 
   726                     echo "COUNT_GdcRetries 1"
 | 
| 
 | 
   727                     TIMEOUT=$((${{TIMEOUT}} * 2))
 | 
| 
 | 
   728                     sleep ${{TIMEOUT}}
 | 
| 
 | 
   729                 fi
 | 
| 
 | 
   730             done
 | 
| 
 | 
   731             if (( $SUCCESS == 0 )) ; then
 | 
| 
 | 
   732                 echo "COUNT_GdcFailures 1"
 | 
| 
 | 
   733                 exit 1
 | 
| 
 | 
   734             fi
 | 
| 
 | 
   735             test -d {{TEMP}}/{params.srr}
 | 
| 
 | 
   736             test -f {{TEMP}}/{params.srr}/*.tar.gz
 | 
| 
 | 
   737             
 | 
| 
 | 
   738             echo "=== gdc-client log.txt begin ===" >> {log}
 | 
| 
 | 
   739             cat {{TMP}}/log.txt >> {log}
 | 
| 
 | 
   740             echo "=== gdc-client log.txt end===" >> {log}
 | 
| 
 | 
   741             
 | 
| 
 | 
   742             size=$(cat {{TEMP}}/{params.srr}/*.tar.gz | wc -c)
 | 
| 
 | 
   743             echo "COUNT_GdcBytesDownloaded ${{size}}"
 | 
| 
 | 
   744 
 | 
| 
 | 
   745             tar zxvf {{TEMP}}/{params.srr}/*.tar.gz
 | 
| 
 | 
   746             rm -rf {{TEMP}}
 | 
| 
 | 
   747             
 | 
| 
 | 
   748             num_1s=$(ls -1 *_1.fastq | wc -l)
 | 
| 
 | 
   749             num_2s=$(ls -1 *_2.fastq | wc -l)
 | 
| 
 | 
   750             if (( ${{num_1s}} == 0 )) ; then
 | 
| 
 | 
   751                 echo "ERROR: No _1.fastq files output"
 | 
| 
 | 
   752                 exit 1
 | 
| 
 | 
   753             fi
 | 
| 
 | 
   754             if (( ${{num_1s}} > 1 )) ; then
 | 
| 
 | 
   755                 echo "ERROR: More than one _1.fastq file found"
 | 
| 
 | 
   756                 exit 1
 | 
| 
 | 
   757             fi
 | 
| 
 | 
   758             if (( ${{num_2s}} == 0 )) ; then
 | 
| 
 | 
   759                 # unpaired
 | 
| 
 | 
   760                 mv *_1.fastq {params.srr}_0.fastq
 | 
| 
 | 
   761             else
 | 
| 
 | 
   762                 # paired-end
 | 
| 
 | 
   763                 mv *_1.fastq {params.srr}_1.fastq
 | 
| 
 | 
   764                 mv *_2.fastq {params.srr}_2.fastq
 | 
| 
 | 
   765             fi
 | 
| 
 | 
   766         fi
 | 
| 
 | 
   767         
 | 
| 
 | 
   768         # Next chunk expects the FASTQ files to exist in the current directory
 | 
| 
 | 
   769         # named {{params.srr}}_{{0,1,2}}.fastq
 | 
| 
 | 
   770         size=0
 | 
| 
 | 
   771         for i in {{0..2}} ; do
 | 
| 
 | 
   772             fn={params.srr}_${{i}}.fastq
 | 
| 
 | 
   773             if [[ -f ${{fn}} ]] ; then
 | 
| 
 | 
   774                 echo "COUNT_FilesDownloaded 1"
 | 
| 
 | 
   775             else
 | 
| 
 | 
   776                 touch ${{fn}}
 | 
| 
 | 
   777             fi
 | 
| 
 | 
   778             size=$((${{size}} + $(wc -c < ${{fn}})))
 | 
| 
 | 
   779             mv ${{fn}} {config[temp]}/{wildcards.quad}_${{i}}.fastq
 | 
| 
 | 
   780         done
 | 
| 
 | 
   781         echo "COUNT_BytesDownloaded ${{size}}"
 | 
| 
 | 
   782         echo "COUNT_DownloadComplete 1"
 | 
| 
 | 
   783         """
 |