Mercurial > repos > nml > quasitools

diff hydra.xml @ 0:1f1214983a1c draft default tip
planemo upload for repository https://github.com/phac-nml/quasitools commit 5a9e4c9a582828654893166caf20576f5e0c418e
author: nml
date: Mon, 20 Jun 2022 20:05:57 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hydra.xml	Mon Jun 20 20:05:57 2022 +0000
@@ -0,0 +1,325 @@
+<tool id="hydra" name="Hydra pipeline" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
+    <description>Identifies drug resistance within an NGS dataset</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+
+        quasitools hydra
+
+        ## Preparing file input.
+        #if $data_type.type == "paired":
+
+            '$data_type.fastq_input1'
+            '$data_type.fastq_input2'
+
+        #elif $data_type.type == "collection":
+
+            '$data_type.fastq_input1.forward'
+            '$data_type.fastq_input1.reverse'
+
+        #elif $data_type.type == "single":
+
+            '$data_type.fastq_input1'
+
+        #end if
+
+        #if $mutation_db:
+            -m '$mutation_db'
+        #end if
+
+        #if $reporting_threshold:
+            -rt '$reporting_threshold'
+        #end if
+
+        #if $consensus_pct:
+            -cp '$consensus_pct'
+        #end if
+
+        #if $length_cutoff:
+            -lc '$length_cutoff'
+        #end if
+
+        #if $score_cutoff:
+            -sc '$score_cutoff'
+        #end if
+
+        #if $error_rate:
+            -e '$error_rate'
+        #end if
+
+        #if $min_read_qual:
+            -rq '$min_read_qual'
+        #end if
+
+        #if $min_variant_qual:
+            -vq '$min_variant_qual'
+        #end if
+
+        #if $min_depth:
+            -md '$min_depth'
+        #end if
+
+        #if $min_ac:
+            -ma '$min_ac'
+        #end if
+
+        #if $min_freq:
+            -mf '$min_freq'
+        #end if
+
+        #if $consensus.consensus_bool == "true_consensus":
+            --generate_consensus
+
+            #if $consensus.fasta_id.type == "default":
+                --id
+                #if $data_type.type == "paired":
+                    '${fastq_input1.element_identifier}'_'${fastq_input2.element_identifier}'
+                #elif $data_type.type == "single":
+                    '${fastq_input1.element_identifier}'
+                #end if
+            #elif $consensus.fasta_id.type == "custom":
+                --id '$consensus.fasta_id.custom_id'
+            #end if
+        #end if
+
+        #if $low_quality.qual_selector == "filter_ns":
+            --ns
+        #elif $low_quality.qual_selector == "mask_reads":
+            --mask_reads
+        #end if
+
+        #if $score_type.score_selector == "median":
+            --median
+        #elif $score_type.score_selector == "mean":
+            --mean
+        #end if
+
+        $trim_reads
+
+        -o output
+
+    ]]></command>
+    <inputs>
+        <conditional name="data_type">
+            <param name="type" type="select" label="Specify the read type.">
+                <option value="single">Single-end Data</option>
+                <option value="paired">Paired-end Data</option>
+                <option value="collection">Collection Paired-end Data</option>
+            </param>
+            <when value="single">
+                <param name="fastq_input1" type="data" format="fastq" label="Single end read file(s)"/>
+            </when>
+            <when value="paired">
+                <param name="fastq_input1" type="data" format="fastq" label="Forward paired-end read file"/>
+                <param name="fastq_input2" type="data" format="fastq" label="Reverse paired-end read file"/>
+            </when>
+            <when value="collection">
+                <param name="fastq_input1" type="data_collection" label="Paired-end reads collection" optional="false" format="fastq" collection_type="paired" />
+            </when>
+        </conditional>
+        <param name="mutation_db" type="data" format="tsv" optional="true" label="Mutation DB" help="Defaults to HIV mutation database." />
+        <param name="reporting_threshold" type="integer" optional="true" min="1" max="100" value="1" label="Reporting threshold. Defaults to 1." help="Minimum mutation frequency to report." />
+        <param name="consensus_pct" type="integer" optional="true" min="1" max="20" value="20" label="Consensus percentage" help="Minimum mutation frequency to report. Defaults to 20." />
+        <param name="length_cutoff" type="integer" optional="true" min="1" max="1000" label="Length cutoff" value="100" help="Reads which fall short of the specified length will be filtered out. Defaults to 100." />
+        <param name="score_cutoff" type="integer" optional="true" min="0" max="40" label="Score cutoff" value="30" help="Reads whose median or mean quality score (depending on the score type specified) is less than the specified score cutoff value will be filtered out. Defaults to 30." />
+        <param name="error_rate" type="float" optional="true" min="0" max="1" label="Error rate" value="0.0021" help="Estimated sequencing error rate. Defaults to 0.0021."/>
+        <param name="min_variant_qual" type="integer" optional="true" min="1" max="100" label="Minimum quality" value="30" help="Minimum required quality for variant to be considered later on in the pipeline. Defaults to 30." />
+        <param name="min_read_qual" type="integer" optional="true" min="1" max="100" label="Minimum quality" value="30" help="Minimum required quality for a position in a read not to be masked, is masking is enabled. Defaults to 30." />
+        <param name="min_depth" type="integer" optional="true" min="0" max="5000" label="Minimum depth" value="100" help="Minimum required depth for variant to be considered later on in the pipeline. Defaults to 100." />
+        <param name="min_ac" type="integer" optional="true" min="0" max="5000" label="Minimum allele count" value="5" help="Minimum required allele count for variant to be considered later on in the pipeline. Defaults to 5." />
+        <param name="min_freq" type="float" optional="true" min="0" max="1" label="Minimum frequency" value="0.01" help="Minimum required frequency for variant to be considered later on in the pipeline. Defaults to 0.01." />
+        <param name="trim_reads" type="boolean" optional="true" checked="false" truevalue="-tr" falsevalue="" label="Trim reads" help="Iteratively trim reads based on filter values if enabled." />
+        <conditional name="consensus">
+            <param name="consensus_bool" type="select" label="Generate consensus sequence." multiple="false" display="radio">
+                <option value="true_consensus">True</option>
+                <option selected="true" value="false_consensus">False</option>
+            </param>
+            <when value="true_consensus">
+                <conditional name="fasta_id">
+                    <param name="type" type="select" label="Specify consensus fasta identifier" multiple="false" display="radio">
+                        <option value="default" >Use fasta dataset name</option>
+                        <option value="custom">Use custom name</option>
+                    </param>
+                    <when value="default">
+                    </when>
+                    <when value="custom">
+                        <param name="custom_id" type="text" optional="false" value="custom_id" label="Fasta identifier" help="Type in a fasta identifier."/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="false_consensus">
+            </when>
+        </conditional>
+        <conditional name="low_quality">
+            <param name="qual_selector" type="select" label="Filter out regions masked, or mask low coverage regions with n's." multiple="false" display="radio">
+                <option value="filter_ns">Filter out regions with n's</option>
+                <option value="mask_reads">Mask low coverage regions with n's</option>
+                <option value="neither" selected="true">Do not filter or mask low coverage regions.</option>
+            </param>
+            <when value="filter_ns">
+            </when>
+            <when value="mask_reads">
+            </when>
+            <when value="neither">
+            </when>
+        </conditional>
+        <conditional name="score_type">
+            <param name="score_selector" type="select" label="Use either median score (default) or mean score for the score cutoff value." multiple="false" display="radio">
+                <option value="median" selected="true">Use median score</option>
+                <option value="mean">Use mean score</option>
+            </param>
+            <when value="median">
+            </when>
+            <when value="mean">
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data format="bam" label="HyDRA: alignment bam output" name="output_bam" from_work_dir="output/align.bam" />
+        <data format="csv" label="HyDRA: coverage output" name="output_coverage" from_work_dir="output/coverage_file.csv" />
+        <data format="csv" label="HyDRA: drug resistance output" name="output_dr" from_work_dir="output/dr_report.csv" />
+        <data format="fastq" label="HyDRA: filtered reads output" name="output_filtered" from_work_dir="output/filtered.fastq" />
+        <data format="vcf" label="HyDRA: variants output" name="output_hydra" from_work_dir="output/hydra.vcf" />
+        <data format="txt" label="HyDRA: aa mutations output" name="output_aa_mt" from_work_dir="output/mutation_report.aavf" />
+        <data format="txt" label="HyDRA: stats output" name="output_stats" from_work_dir="output/stats.txt" />
+        <data format="fasta" label="HyDRA: consensus output" name="output_consensus" from_work_dir="output/consensus.fasta" >
+            <filter>consensus['consensus_bool'] == "true_consensus"</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="type" value="single"/>
+            <param name="fastq_input1" value="forward.fastq" />
+            <param name="score_selector" value="mean" />
+            <output name="output_coverage">
+                <assert_contents>
+                    <has_text text="frame: 0" />
+                    <has_text text="1,0" />
+                    <has_text text="948,0" />
+                </assert_contents>
+            </output>
+            <output name="output_dr">
+                <assert_contents>
+                    <has_text text="Chromosome,Gene,Category,Surveillance,Wildtype,Position,Mutation,Mutation Frequency,Coverage" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,K,101,P,14.23,1574" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,K,103,N,5.49,1912" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,Y,181,C,24.07,4557" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,Y,181,I,18.04,4557" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,Y,181,V,20.08,4557" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,Y,188,C,2.81,3454" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,G,190,A,5.20,3233" />
+                    <has_text text="hxb2_pol,RT,NNRTI,Yes,G,190,S,6.68,3233" />
+                </assert_contents>
+            </output>
+            <output name="output_hydra">
+                <assert_contents>
+                    <has_text_matching expression="#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"/>
+                    <has_text_matching expression="hxb2_pol\s576\s.\sa\sg\s100\sPASS\sDP=805;AC=245;AF=0.3043" />
+                    <has_text_matching expression="hxb2_pol\s958\s.\sc\sa\s100\sPASS\sDP=2503;AC=28;AF=0.0112" />
+                </assert_contents>
+            </output>
+            <output name="output_aa_mt">
+                <assert_contents>
+                    <has_text_matching expression="#CHROM\tGENE\tPOS\tREF\tALT\tFILTER\tALT_FREQ\tCOVERAGE\tINFO"/>
+                    <has_text_matching expression="hxb2_pol\tRT\t101\tK\tP\tPASS\t0.1423\t1574\tRC=aaa;AC=CCa;ACF=0.1423;CAT=NNRTI;SRVL=Yes" />
+                    <has_text_matching expression="hxb2_pol\tRT\t101\tK\tT\tmf0.01\t0.0013\t1574\tRC=aaa;AC=aCa;ACF=0.0013;CAT=.;SRVL=." />
+                </assert_contents>
+            </output>
+           
+            <output name="output_stats">
+                <assert_contents>
+                    <has_text text="Input Size: 25000"/>
+                    <has_text text="Number of reads filtered due to length: 15074"/>
+                    <has_text text="Number of reads filtered due to average quality score: 501"/>
+                    <has_text text="Number of reads filtered due to presence of Ns: 0"/>
+                    <has_text text="Number of reads filtered due to excess coverage: 0"/>
+                    <has_text text="Number of reads filtered due to poor mapping: 12"/>
+                    <has_text text="Percentage of reads filtered: 62.35"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+HyDRA - HIV Drug Resistance Analyzer
+====================================
+
+The HyDRA pipeline provides a pipeline for identifying drug resistance within a Next Generation Sequencing dataset. The pipeline takes as input the raw reads produced by a Next Generation Sequencer and produces a report detailing found drug resistance per sample.
+
+Authors
+-------
+
+The HyDRA pipeline was developed by Eric Enns and David Peddle.
+
+Stages
+------
+
+The HyDRA pipleine proceeds through the following stages:
+
+1. Quality Control/Filtering
+2. Reference mapping using bowtie2.
+3. Variant Calling and filtering using a Poisson distribution.
+4. AA Mutation Calling and filtering.
+5. Drug Resistance report generation.
+
+Details
+-------
+
+The following is an example for running the pipeline, using our included test dataset:
+    * Output directory name: "/tmp/hydra_out"
+    * Forward reads: "reads_w_K103N.fastq"
+
+### Output ###
+
+The detailed output directory tree looks as follows:
+
+    /tmp/hydra_out/
+        * align.bam
+        * coverage_file.csv
+        * dr_report.csv
+        * filtered.fastq
+        * hydra.vcf
+        * mutation_report.aavf
+        * stats.txt
+
+The description of each of these directories/files are as follows:
+
+* __run.conf__: The configuration used when this output was produced.
+* __reads_w_K103N/__: The results directory for the input file reads_w_K103N.fastq
+    * __align.bam__: The alignment file in bam format.
+    * __coverage_file.csv__: A file with one entry per line with the AA position and the coverage at the position.
+    * __dr_report.csv__: A report detailing the drug resistant mutations found, above the reporting threshold (default: 1%).
+    * __filtered.fastq__: The reads remaining after the filtering stage.
+    * __hydra.vcf__: The variants found by the pipeline.
+    * __mutation_report.aavf__: The AA mutations found by the pipeline.
+    * __stats.txt__: A log file detailing size after filtering and major stages.
+
+The __dr_report.csv__ file lists all found drug resistant mutations (mutations included in the mutation database) which have frequency greater than the reporting threshold. An example of this file is given below.
+
+Example: __dr_report.csv__
+
+    Gene,Category,Surveillance,Wildtype,Position,Mutation,Mutation Frequency,Coverage
+    RT,NNRTI,Yes,K,103,N,9.03,155
+
+The __mutation_report.aavf__ file is in AAVF format (https://github.com/winhiv/aavf-spec), an amino acid variant format inspired by the VCF format. The __mutation_report.aavf__ file details all of the AA mutations found by the pipeline. An example if this file is given below.
+
+Example: __mutation_report.aavf__
+
+    ##fileformat=AAVFv1.0
+    ##fileDate=20220615
+    ##source=quasitools:hydra
+    ##reference=hxb2_pol.fas
+    ##INFO=<ID=RC,Number=1,Type=String,Description="Reference Codon">
+    ##INFO=<ID=AC,Number=.,Type=String,Description="Alternate Codon">
+    ##INFO=<ID=ACF,Number=.,Type=Float,Description="Alternate Codon Frequency,for each Alternate Codon,in the same order aslisted.">
+    ##INFO=<ID=CAT,Number=.,Type=String,Description="Drug Resistance Category">
+    ##INFO=<ID=SRVL,Number=.,Type=String,Description="Drug Resistance Surveillance">
+    ##FILTER=<ID=af0.01,Description="Set if True; alt_freq<0.01">
+    #CHROM	GENE	POS	REF	ALT	FILTER	ALT_FREQ	COVERAGE	INFO
+    hxb2_pol	RT	101	K	P	PASS	0.1423	1574	RC=aaa;AC=CCa;ACF=0.1423;CAT=NNRTI;SRVL=Yes
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>