Mercurial > repos > chrisw > monorail_test

--- a/Snakefile	Wed Feb 13 17:03:56 2019 -0500
+++ b/Snakefile	Wed Feb 13 18:39:50 2019 -0500
@@ -1,18 +1,32 @@
 """
 Parameters:
+- fastq_dump_args: arguments to pass to fastq dumping tool
+- fastq_dump_retries: number of retry attempts before dying
+- fastq_dump_tool: name of tool to run, e.g. 'fastq-dump' or 'fasterq-dump'
 - star: arguments to pass to STAR aligner
+- salmon_args: arguments to pass to salmon quant
 - unique_qual: minimum MAPQ needed to be counted in unique BW [default: 10]
+- bw_bed: name of BED file to use with bwtool
+- max_unalign: maximum number of unaligned reads to save per run accession
 """

-STEPS = ['align', 'sort', 'bamcount']
+STEPS = ['download', 'fastq_check', 'align', 'sort',
+         'bamcount',
+         'salmon',
+         'align_unmapped',
+         'extract_jx']

-FILES = ['sjout.zst',
+FILES = ['sjout.zst', 'fastq_check.tsv.zst',
+         'unmapped.bam', 'unmapped.fastq.zst',
          'bamcount_nonref.csv.zst',
          'bamcount_auc.tsv',
          'bamcount_frag.tsv',
          'Chimeric.out.junction.zst',
          'all.exon_bw_count.zst', 'unique.exon_bw_count.zst',
-         'manifest']
+         'all.bw.zst', 'unique.bw.zst',
+         'salmon.tsv.zst',
+         'jx_bed.zst',
+         'manifest'] + list(map(lambda x: x + '.log', STEPS))

 #get rid of the pesky warnings about current paths
 #and make sure we're not assuming any wrong relative paths
@@ -31,10 +45,26 @@
         sys.stderr.write("error in run_command for command: %s\n" % cmd_args)
         raise cpe

+import re
+STEPS_FILES_FILTER=re.compile(r'(unmapped)|(download)|(salmon)|(extract_jx)|(jx_bed)|(manifest)')
+def remove_steps_files():
+    #modify STEP and FILES
+    #so we don't run download or unmapped steps
+    global FILES
+    global STEPS
+    for i in reversed(range(0,len(STEPS))):
+        f = STEPS[i]
+        if STEPS_FILES_FILTER.search(f):
+            del(STEPS[i])
+    for i in reversed(range(0,len(FILES))):
+        f = FILES[i]
+        if STEPS_FILES_FILTER.search(f):
+            del(FILES[i])

-import re
+
 FASTQ_PATT=re.compile(r'([^_\.]+)(_(\d+))?\.((fastq)|fq)(\.gz)?$')
 def prep_for_galaxy_run():
+    remove_steps_files()
     try:
         os.mkdir(config['temp'])
     except OSError as ose:
@@ -70,12 +100,13 @@
         open('%s/%s_%s_%s_%s_%d.fastq' % (config['temp'], run_acc, study_acc, genome, method, 2), "w").close()
     #create expected file structure for annotated exon bed file & reference index
     ref = config['ref']
-    config['ref'] = '.'
+    config['ref'] = os.path.abspath('.')
     os.makedirs('%s/%s' % (config['ref'], genome))
     os.symlink(ref, '%s/%s/star_idx' % (config['ref'], genome))
     os.makedirs('%s/%s/gtf' % (config['ref'], genome))
     os.symlink(config['exon_bed'], 'exons.tmp')
     os.symlink('../../exons.tmp', '%s/%s/gtf/%s' % (config['ref'], genome, config.get('bw_bed', 'exons.bed')))
+    #TODO may need to add additional symlinking for HISAT2 unmapped alignments
     return([run_acc, study_acc, genome, method])


@@ -112,16 +143,30 @@
     input:
         get_accessions

-
 rule make_manifest:
     input:
+        config['output'] + '/{quad}.salmon.tsv.zst',
         config['output'] + '/{quad}.sjout.zst',
+        config['output'] + '/{quad}.jx_bed.zst',
         config['output'] + '/{quad}.Chimeric.out.junction.zst',
+        config['output'] + '/{quad}.unmapped.bam',
+        config['output'] + '/{quad}.unmapped.fastq.zst',
         config['output'] + '/{quad}.bamcount_nonref.csv.zst',
         config['output'] + '/{quad}.bamcount_auc.tsv',
         config['output'] + '/{quad}.bamcount_frag.tsv',
+        config['output'] + '/{quad}.fastq_check.tsv.zst',
         config['output'] + '/{quad}.all.exon_bw_count.zst',
         config['output'] + '/{quad}.unique.exon_bw_count.zst',
+        config['output'] + '/{quad}.all.bw.zst',
+        config['output'] + '/{quad}.unique.bw.zst',
+        config['output'] + '/{quad}.align.log',
+        config['output'] + '/{quad}.extract_jx.log',
+        config['output'] + '/{quad}.bamcount.log',
+        config['output'] + '/{quad}.align_unmapped.log',
+        config['output'] + '/{quad}.download.log',
+        config['output'] + '/{quad}.fastq_check.log',
+        config['output'] + '/{quad}.sort.log',
+        config['output'] + '/{quad}.salmon.log'
     output:
         config['output'] + '/{quad}.manifest'
     params:
@@ -132,31 +177,35 @@
                 fh.write(params.quad + "." + fn + '\n')

 def galaxy_link_files(op):
+    remove_steps_files()
     a = [op + '/' + f for f in FILES]
-    a.extend([op + '/align.log', op + '/bamcount.log'])
     return a

 rule make_galaxy_links:
     input:
         config['output'] + '/{quad}.sjout.zst',
+        config['output'] + '/{quad}.fastq_check.tsv.zst',
         config['output'] + '/{quad}.bamcount_nonref.csv.zst',
         config['output'] + '/{quad}.bamcount_auc.tsv',
         config['output'] + '/{quad}.bamcount_frag.tsv',
         config['output'] + '/{quad}.Chimeric.out.junction.zst',
         config['output'] + '/{quad}.all.exon_bw_count.zst',
         config['output'] + '/{quad}.unique.exon_bw_count.zst',
-        config['output'] + '/{quad}.manifest'
+        config['output'] + '/{quad}.all.bw.zst',
+        config['output'] + '/{quad}.unique.bw.zst',
+        config['output'] + '/{quad}.fastq_check.log',
+        config['output'] + '/{quad}.align.log',
+        config['output'] + '/{quad}.sort.log',
+        config['output'] + '/{quad}.bamcount.log',
     output:
         config['output'] + '/{quad}.done'
     params:
         quad=lambda wildcards: wildcards.quad,
         out=galaxy_link_files(config['output'])
     run:
-        inputs = input.extend([config['output'] + '/' + params.quad + '.align.log',
-                               config['output'] + '/' + params.quad + '.bamcount.log'])
         for (i,fn) in enumerate(input):
             os.symlink(os.path.abspath(fn), params.out[i])
-        os.symlink(os.path.abspath(input[-3]), output[0])
+        os.symlink(os.path.abspath(input[-1]), output[0])


 rule bamcount:
@@ -257,6 +306,19 @@
         echo "COUNT_BamcountComplete 1"
         """

+rule bw_zstd:
+    input:
+        config['temp'] + '/{prefix}.bw'
+    output:
+        config['output'] + '/{prefix}.bw.zst'
+    shell:
+        """
+        zstd {input} -o {output}
+        size=$(wc -c < {output})
+        echo "COUNT_BwBytes ${{size}}"
+        echo "COUNT_BwZstdComplete 1"
+        """
+
 rule sort:
     input:
         config['temp'] + '/{quad}.bam'
@@ -287,6 +349,163 @@
         echo "COUNT_SortComplete 1"
         """

+rule salmon:
+    input:
+        reads0=config['temp'] + '/{quad}_0.fastq',
+        reads1=config['temp'] + '/{quad}_1.fastq',
+        reads2=config['temp'] + '/{quad}_2.fastq',
+        index1=lambda wildcards: '%s/%s/salmon_index/hash.bin' % (config['ref'], wildcards.quad.split('_')[2]),
+        index2=lambda wildcards: '%s/%s/salmon_index/sa.bin' % (config['ref'], wildcards.quad.split('_')[2])
+    output:
+        config['output'] + '/{quad}.salmon.tsv.zst'
+    log:
+        config['output'] + '/{quad}.salmon.log'
+    params:
+        index_base=lambda wildcards: '%s/%s/salmon_index' % (config['ref'], wildcards.quad.split('_')[2]),
+        salmon_args=config.get('salmon_args', '')
+    threads: 8
+    shell:
+        """
+        READ_FILES="-r {input.reads0}"
+        if [[ -s {input.reads2} ]] ; then
+            READ_FILES="-1 {input.reads1} -2 {input.reads2}"
+        fi
+        if set -o pipefail && time salmon quant \
+            --libType U \
+            --quiet \
+            --validateMappings \
+            -i {params.index_base} \
+            -p {threads} \
+            {params.salmon_args} \
+            ${{READ_FILES}} \
+            --output salmon_quant \
+            --minAssignedFrags 1 \
+            2>&1 | tee -a {log}
+        then
+            echo "COUNT_SalmonSuccess 1"
+
+            time zstd salmon_quant/quant.sf -o {output} 2>&1 | tee -a {log}
+            size=$(wc -c < {output})
+            echo "COUNT_SalmonQuantBytes ${{size}}"
+        else
+            touch {output}
+            echo "COUNT_SalmonFailure 1"
+        fi
+
+        rm -rf salmon_quant
+        echo "COUNT_SalmonComplete 1"
+        """
+
+rule align_unmapped:
+    input:
+        unmapped1=config['temp'] + '/{quad}_1.unmappedfastq',
+        unmapped2=config['temp'] + '/{quad}_2.unmappedfastq',
+        index=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome.1.ht2' % (config['ref'], wildcards.quad.split('_')[2])
+    output:
+        bam=config['output'] + '/{quad}.unmapped.bam',
+        sample=config['output'] + '/{quad}.unmapped.fastq.zst'
+    log:
+        config['output'] + '/{quad}.align_unmapped.log'
+    params:
+        index_base=lambda wildcards: '%s/%s/unmapped_hisat2_idx/genome' % (config['ref'], wildcards.quad.split('_')[2]),
+        srr=lambda wildcards: wildcards.quad.split('_')[0],
+        hisat2_params=config.get('hisat2', ''),
+        max_unalign=config.get('max_unalign', 100000)
+    threads: 16
+    shell:
+        """
+        TMP="{config[temp]}/align_unmapped_temp.{params.srr}"
+        READ_FILES="-1 {input.unmapped1} -2 {input.unmapped2}"
+        if [[ ! -s {input.unmapped2} ]] ; then
+            READ_FILES="-U {input.unmapped1}"
+        fi
+        time hisat2 \
+            $READ_FILES \
+            -t --mm \
+            -x {params.index_base} \
+            --threads {threads} \
+            {params.hisat2_params} \
+            --un ${{TMP}}.un \
+            --un-conc ${{TMP}}.un_conc \
+            -S ${{TMP}}.sam \
+            2>&1 | tee -a {log}
+
+        #
+        # Make BAM file out of aligned reads
+        #
+        time samtools view \
+            -b -F 4 \
+            -o {output.bam} \
+            ${{TMP}}.sam \
+            2>&1 | tee -a {log}
+        rm -f ${{TMP}}.sam
+
+        #
+        # Save a subset of the doubly unaligned reads
+        #
+        if [[ ! -s {input.unmapped2} ]] ; then
+            # unpaired
+            fastq-sample \
+                -n {params.max_unalign} \
+                -o ${{TMP}}.samp \
+                ${{TMP}}.un
+            rm -f ${{TMP}}.un
+        else
+            # paired-end
+            fastq-sample \
+                -n {params.max_unalign} \
+                -o ${{TMP}}.samp \
+                ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
+            rm -f ${{TMP}}.1.un_conc ${{TMP}}.2.un_conc
+
+            # interleave the two output fastqs into single file
+            paste ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq \
+                | paste - - - - \
+                | awk -v OFS="\n" -v FS="\t" '{{print($1,$3,$5,$7,$2,$4,$6,$8)}}' \
+                > ${{TMP}}.samp.fastq
+            rm -f ${{TMP}}.samp.1.fastq ${{TMP}}.samp.2.fastq
+        fi
+
+        test -f ${{TMP}}.samp.fastq
+        time zstd ${{TMP}}.samp.fastq -o {output.sample} 2>&1 | tee -a {log}
+        size=$(wc -c < {output.sample})
+        echo "COUNT_UnmappedSampleBytes ${{size}}"
+        rm -f ${{TMP}}.samp.fastq
+
+        size=$(wc -c < {output.bam})
+        echo "COUNT_UnmappedBamBytes ${{size}}"
+
+        echo "COUNT_AlignUnmappedComplete 1"
+        """
+
+rule extract_jx:
+    input:
+        bam=config['temp'] + '/{quad}-sorted.bam',
+        bamidx=config['temp'] + '/{quad}-sorted.bam.bai',
+        fa=lambda wildcards: '%s/%s/fasta/genome.fa' % (config['ref'], wildcards.quad.split('_')[2]),
+        gtf=lambda wildcards: '%s/%s/gtf/genes.gtf' % (config['ref'], wildcards.quad.split('_')[2])
+    output:
+        config['output'] + '/{quad}.jx_bed.zst'
+    params:
+        srr=lambda wildcards: wildcards.quad.split('_')[0]
+    log:
+        config['output'] + '/{quad}.extract_jx.log'
+    shell:
+        """
+        TMP="{config[temp]}/extract_jx.{params.srr}"
+        time regtools junctions extract \
+            -i 20 -a 1 \
+            -o ${{TMP}}.jx_tmp \
+            {input.bam} 2>&1 | tee -a {log}
+        time zstd ${{TMP}}.jx_tmp -o {output} 2>&1 | tee -a {log}
+        rm -f ${{TMP}}.jx_tmp
+
+        size=$(wc -c < {output})
+        echo "COUNT_ExtractJxBytes ${{size}}"
+
+        echo "COUNT_ExtractJxComplete 1"
+        """
+
 rule align:
     input:
         reads0=config['temp'] + '/{quad}_0.fastq',
@@ -392,5 +611,173 @@
         size=$(wc -c < Aligned.out.bam)
         echo "COUNT_BAMBytes ${{size}}"
         mv Aligned.out.bam {output.bam}
+
         echo "COUNT_AlignComplete 1"
         """
+
+rule fastq_check:
+    input:
+        reads0=config['temp'] + '/{quad}_0.fastq',
+        reads1=config['temp'] + '/{quad}_1.fastq',
+        reads2=config['temp'] + '/{quad}_2.fastq'
+    output:
+        config['output'] + '/{quad}.fastq_check.tsv.zst'
+    log:
+        config['output'] + '/{quad}.fastq_check.log'
+    params:
+        srr=lambda wildcards: wildcards.quad.split('_')[0]
+    shell:
+        """
+        TMP="{config[temp]}/fastq_check-{params.srr}.tsv"
+        touch ${{TMP}}
+        if [[ -s {input.reads0} ]] ; then
+            time seqtk fqchk -q0 {input.reads0} >>${{TMP}} 2>>{log}
+        fi
+        if [[ -s {input.reads1} ]] ; then
+            time seqtk fqchk -q0 {input.reads1} >>${{TMP}} 2>>{log}
+        fi
+        if [[ -s {input.reads2} ]] ; then
+            time seqtk fqchk -q0 {input.reads2} >>${{TMP}} 2>>{log}
+        fi
+        time zstd ${{TMP}} -o {output} 2>&1 | tee -a {log}
+        size=$(wc -c < {output})
+        echo "COUNT_FastqCheckBytes ${{size}}"
+
+        echo "COUNT_FastqCheckComplete 1"
+        """
+
+rule download:
+    output:
+        temp(config['temp'] + '/{quad}_0.fastq'),
+        temp(config['temp'] + '/{quad}_1.fastq'),
+        temp(config['temp'] + '/{quad}_2.fastq')
+    log:
+        config['output'] + '/{quad}.download.log'
+    params:
+        srr=lambda wildcards: wildcards.quad.split('_')[0],
+        method=lambda wildcards: wildcards.quad.split('_')[3],
+        fd_params=config.get('fastq_dump_args', ''),
+        retries=config.get('fastq_dump_retries', '5')
+    threads: 4
+    shell:
+        """
+                set -x
+        SUCCESS=0
+        TIMEOUT=10
+        PARAMS=""
+        TMP="{config[temp]}/dl-{params.srr}"
+        ! test -d ${{TMP}}
+
+        if [[ {params.method} == "sra" ]] ; then
+            USE_FASTERQ=1
+            for i in {{ 1..{params.retries} }} ; do
+                if time prefetch -t fasp -O ${{TMP}} -L info {params.srr} 2>&1 >> {log} ; then
+                    SUCCESS=1
+                    break
+                else
+                    echo "COUNT_SraRetries 1"
+                    TIMEOUT=$((${{TIMEOUT}} * 2))
+                    sleep ${{TIMEOUT}}
+                fi
+            done
+            if (( $SUCCESS == 0 )) ; then
+                echo "COUNT_SraFailures 1"
+                exit 1
+            fi
+            test -f ${{TMP}}/*.sra
+            size=$(cat ${{TMP}}/*.sra | wc -c)
+            echo "COUNT_SraBytesDownloaded ${{size}}"
+            if (( ${{USE_FASTERQ}} == 1 )) ; then
+                time fasterq-dump ${{TMP}}/*.sra \
+                    -e {threads} \
+                    -t ${{TMP}} \
+                    -L info \
+                    --split-files \
+                    --skip-technical \
+                    -o {params.srr}.fastq \
+                    2>&1 >> {log}
+                test -f {params.srr}_2.fastq || mv {params.srr}.fastq {params.srr}_0.fastq
+            else
+                time fastq-dump ${{TMP}}/*.sra \
+                    -L info \
+                    --split-files \
+                    --skip-technical \
+                    -O . \
+                    2>&1 >> {log}
+                test -f {params.srr}_2.fastq || mv {params.srr}_1.fastq {params.srr}_0.fastq
+            fi
+            rm -rf ${{TMP}}
+        elif [[ {params.method} == "gdc" ]] ; then
+            TOKEN=~/gdc/latest.txt
+            if [[ ! -f ${{TOKEN}} ]] ; then
+                echo "ERROR: no GDC token file found at ${{TOKEN}}"
+                exit 1
+            fi
+            for i in {{ 1..{params.retries} }} ; do
+                if time gdc-client download \
+                    -t ${{TOKEN}} \
+                    --log-file {{TMP}}/log.txt \
+                    -d {{TEMP}} \
+                    {params.srr} 2>&1 >> {log}
+                then
+                    SUCCESS=1
+                    break
+                else
+                    echo "COUNT_GdcRetries 1"
+                    TIMEOUT=$((${{TIMEOUT}} * 2))
+                    sleep ${{TIMEOUT}}
+                fi
+            done
+            if (( $SUCCESS == 0 )) ; then
+                echo "COUNT_GdcFailures 1"
+                exit 1
+            fi
+            test -d {{TEMP}}/{params.srr}
+            test -f {{TEMP}}/{params.srr}/*.tar.gz
+
+            echo "=== gdc-client log.txt begin ===" >> {log}
+            cat {{TMP}}/log.txt >> {log}
+            echo "=== gdc-client log.txt end===" >> {log}
+
+            size=$(cat {{TEMP}}/{params.srr}/*.tar.gz | wc -c)
+            echo "COUNT_GdcBytesDownloaded ${{size}}"
+
+            tar zxvf {{TEMP}}/{params.srr}/*.tar.gz
+            rm -rf {{TEMP}}
+
+            num_1s=$(ls -1 *_1.fastq | wc -l)
+            num_2s=$(ls -1 *_2.fastq | wc -l)
+            if (( ${{num_1s}} == 0 )) ; then
+                echo "ERROR: No _1.fastq files output"
+                exit 1
+            fi
+            if (( ${{num_1s}} > 1 )) ; then
+                echo "ERROR: More than one _1.fastq file found"
+                exit 1
+            fi
+            if (( ${{num_2s}} == 0 )) ; then
+                # unpaired
+                mv *_1.fastq {params.srr}_0.fastq
+            else
+                # paired-end
+                mv *_1.fastq {params.srr}_1.fastq
+                mv *_2.fastq {params.srr}_2.fastq
+            fi
+        fi
+
+        # Next chunk expects the FASTQ files to exist in the current directory
+        # named {{params.srr}}_{{0,1,2}}.fastq
+        size=0
+        for i in {{0..2}} ; do
+            fn={params.srr}_${{i}}.fastq
+            if [[ -f ${{fn}} ]] ; then
+                echo "COUNT_FilesDownloaded 1"
+            else
+                touch ${{fn}}
+            fi
+            size=$((${{size}} + $(wc -c < ${{fn}})))
+            mv ${{fn}} {config[temp]}/{wildcards.quad}_${{i}}.fastq
+        done
+        echo "COUNT_BytesDownloaded ${{size}}"
+        echo "COUNT_DownloadComplete 1"
+        """
--- a/monorail.xml	Wed Feb 13 17:03:56 2019 -0500
+++ b/monorail.xml	Wed Feb 13 18:39:50 2019 -0500
@@ -2,10 +2,18 @@
     <!-- much of this was based on https://github.com/galaxyproject/tools-iuc/blob/master/tools/rgrnastar/rg_rnaStar.xml -->
     <requirements>
         <requirement type="package" version="1.9">samtools</requirement>
-        <requirement type="package" version="2.6.0b">star</requirement>
+        <requirement type="package" version="2.6.1b">star</requirement>
         <requirement type="package" version="0.2.6">bamcount</requirement>
-        <requirement type="package" version="5.4.0">snakemake</requirement>
-        <requirement type="package" version="1.3.8">zstd</requirement>
+        <requirement type="package" version="5.4.0">snakemake-minimal</requirement>
+        <requirement type="package" version="1.3.3">zstd</requirement>
+        <requirement type="package" version="1.3">seqtk</requirement>
+<!--
+        <requirement type="package" version="2.1.0">hisat2</requirement>
+        <requirement type="package" version="0.8">fastq-tools</requirement>
+        <requirement type="package" version="0.12.0">salmon</requirement>
+        <requirement type="package" version="0.5.0">regtools</requirement>
+        <requirement type="package" version="2.9.1">sra-tools</requirement>
+-->
     </requirements>
         <!-- /bin/bash -x monorail.slim.sh ../ath10 4 10 ../ath10/gtf/exons.bed ./tmp2 ../fastqs/SRR8505407_1_100.fastq.gz ../fastqs/SRR8505407_2_100.fastq.gz -->
         <command detect_errors="aggressive"><![CDATA[