view samtools_mpileup.xml @ 2:3aa48bcbc599 draft

Uploaded tarball for 0.0.3 version.
author devteam
date Wed, 12 Mar 2014 12:53:30 -0400
parents b47a418ccfdc
children da0203c3461a
line wrap: on
line source

<tool id="samtools_mpileup" name="MPileup" version="0.0.3">
    <!-- One-line summary of the tool. -->
    <description>SNP and indel caller</description>
    <!-- External dependency: the samtools package, pinned to version 0.1.19. -->
    <requirements>
        <requirement type="package" version="0.1.19">samtools</requirement>
    </requirements>
    <!--
        Cheetah template that assembles the "samtools mpileup" command line.
        Sections, in order: reference selection, input BAM files, optional
        advanced options, then either genotype likelihood options or the
        plain pileup output switches. Standard output is written to
        output_mpileup and standard error to output_log.
    -->
    <command><![CDATA[
    ## Reference handling: a history FASTA is symlinked into the working
    ## directory and indexed with "samtools faidx" before mpileup runs; a
    ## cached reference uses the path column of the selected data table entry.
    ## NOTE(review): in the history case -f still receives the original
    ## dataset path rather than the local symlink that was just indexed;
    ## confirm samtools picks up the intended index.
    #if $reference_source.reference_source_selector == "history":
        ln -s "${reference_source.ref_file}" && samtools faidx `basename "${reference_source.ref_file}"` && samtools mpileup
        -f "${reference_source.ref_file}"
    #else:
        samtools mpileup
        -f "${reference_source.ref_file.fields.path}"
    #end if
    ## One positional argument per BAM dataset in the repeat.
    #for $input_bam in $reference_source.input_bams:
        "${input_bam.input_bam}"
    #end for
    #if str( $advanced_options.advanced_options_selector ) == "advanced":
        #if str( $advanced_options.filter_by_flags.filter_flags ) == "filter":
            ## The selected flag values are summed into a single integer mask.
            #if $advanced_options.filter_by_flags.require_flags:
                --rf ${sum([int(flag) for flag in str($advanced_options.filter_by_flags.require_flags).split(',')])}
            #end if
            #if $advanced_options.filter_by_flags.exclude_flags:
                --ff ${sum([int(flag) for flag in str($advanced_options.filter_by_flags.exclude_flags).split(',')])}
            #end if
        #end if
        ## Region list: either the generated config file (pasted text) or a
        ## history dataset. Nested parameters must be addressed by full path.
        #if str( $advanced_options.limit_by_region.limit_by_regions ) == "paste":
            -l "$pasted_regions"
        #elif str( $advanced_options.limit_by_region.limit_by_regions ) == "history":
            -l "${advanced_options.limit_by_region.bed_regions}"
        #end if
        ## Read groups to exclude: same two sources as the region list.
        #if str( $advanced_options.exclude_read_group.exclude_read_groups ) == "paste":
            -G "$excluded_read_groups"
        #elif str( $advanced_options.exclude_read_group.exclude_read_groups ) == "history":
            -G "${advanced_options.exclude_read_group.read_groups}"
        #end if
        ${advanced_options.skip_anomalous_read_pairs}
        ${advanced_options.disable_probabilistic_realignment}
        -C "${advanced_options.coefficient_for_downgrading}"
        -d "${advanced_options.max_reads_per_bam}"
        ${advanced_options.extended_BAQ_computation}
        -q "${advanced_options.minimum_mapping_quality}"
        -Q "${advanced_options.minimum_base_quality}"
        ## -r is only emitted when a region string was entered.
        #if str( $advanced_options.region_string ):
            -r "${advanced_options.region_string}"
        #end if
        ${advanced_options.output_per_sample_read_depth}
        ${advanced_options.output_per_sample_strand_bias_p_value}
    #end if
    #if str( $genotype_likelihood_computation_type.genotype_likelihood_computation_type_selector ) == 'perform_genotype_likelihood_computation':
        ##-g or -u
        -g
        -e "${genotype_likelihood_computation_type.gap_extension_sequencing_error_probability}"
        -h "${genotype_likelihood_computation_type.coefficient_for_modeling_homopolymer_errors}"
        #if str( $genotype_likelihood_computation_type.perform_indel_calling.perform_indel_calling_selector ) == 'perform_indel_calling':
            -L "${genotype_likelihood_computation_type.perform_indel_calling.skip_indel_calling_above_sample_depth}"
            -m "${genotype_likelihood_computation_type.perform_indel_calling.minimum_gapped_reads_for_indel_candidates}"
            -F "${genotype_likelihood_computation_type.perform_indel_calling.minimum_gapped_read_fraction}"
            ${genotype_likelihood_computation_type.perform_indel_calling.gapped_read_per_sample}
        #else:
            -I
        #end if
        -o "${genotype_likelihood_computation_type.gap_open_sequencing_error_probability}"
        ## -P takes the repeat entries as one comma-separated list.
        #if len( $genotype_likelihood_computation_type.platform_list_repeat ):
            -P "${ ",".join( [ str( platform.platform_entry ) for platform in $genotype_likelihood_computation_type.platform_list_repeat ] ) }"
        #end if
    #else:
        ${genotype_likelihood_computation_type.base_position_on_reads}
        ${genotype_likelihood_computation_type.output_mapping_quality}
    #end if
    > "$output_mpileup" 2> "$output_log"
    ]]></command>
    <!-- Any exit code of 1 or higher marks the job as failed (level "fatal"). -->
    <stdio>
        <exit_code range="1:" level="fatal" description="Error" />
    </stdio>
    <inputs>
        <!--
            Reference genome source. Both branches define the same parameter
            names (input_bams / input_bam / ref_file) so the command template
            can address them uniformly; they differ in the type of ref_file
            and in the validators applied to each BAM.
        -->
        <conditional name="reference_source">
            <param name="reference_source_selector" type="select" label="Choose the source for the reference list">
                <option value="cached">Locally cached</option>
                <option value="history">History</option>
            </param>
            <!-- Cached: ref_file is a select filled from the fasta_indexes data table. -->
            <when value="cached">
                <repeat name="input_bams" title="BAM file" min="1">
                    <param name="input_bam" type="data" format="bam" label="BAM file">
                        <validator type="unspecified_build" />
                        <validator type="dataset_metadata_in_data_table" table_name="fasta_indexes" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." /> <!-- fixme!!! this needs to be a select -->
                    </param>
                </repeat>
                <param name="ref_file" type="select" label="Using reference genome">
                    <options from_data_table="fasta_indexes" />
                    <!-- <filter type="data_meta" ref="input_bam" key="dbkey" column="1" /> does not yet work in a repeat...-->
                </param>
            </when>
            <!-- History: ref_file is a FASTA dataset; each BAM must carry bam_index metadata. -->
            <when value="history">
                <repeat name="input_bams" title="BAM file" min="1">
                    <param name="input_bam" type="data" format="bam" label="BAM file">
                        <validator type="metadata" check="bam_index" message="Metadata missing, click the pencil icon in the history item and use the auto-detect feature to correct this issue." />
                    </param>
                </repeat>
                <param name="ref_file" type="data" format="fasta" label="Using reference file" />
            </when>
        </conditional>
        <!--
            Chooses between genotype likelihood computation and plain pileup
            output. The outputs section switches the result format to bcf when
            the first option is selected. Flag letters named in the comments
            below are the ones the command template pairs with each parameter.
        -->
        <conditional name="genotype_likelihood_computation_type">
            <param name="genotype_likelihood_computation_type_selector" type="select" label="Genotype Likelihood Computation">
                <option value="perform_genotype_likelihood_computation">Perform genotype likelihood computation</option>
                <option value="do_not_perform_genotype_likelihood_computation" selected="True">Do not perform genotype likelihood computation</option>
            </param>
            <when value="perform_genotype_likelihood_computation">
                <!-- Passed as -e and -h respectively. -->
                <param name="gap_extension_sequencing_error_probability" type="integer" value="20" label="Phred-scaled gap extension sequencing error probability" />
                <param name="coefficient_for_modeling_homopolymer_errors" type="integer" value="100" label="Coefficient for modeling homopolymer errors." />
                <!-- INDEL calling on: -L, -m, -F and optionally -p are emitted; off: -I is emitted. -->
                <conditional name="perform_indel_calling">
                    <param name="perform_indel_calling_selector" type="select" label="Perform INDEL calling">
                        <option value="perform_indel_calling" selected="True">Perform INDEL calling</option>
                        <option value="do_not_perform_indel_calling">Do not perform INDEL calling</option>
                    </param>
                    <when value="perform_indel_calling">
                        <param name="skip_indel_calling_above_sample_depth" type="integer" value="250" label="Skip INDEL calling if the average per-sample depth is above" />
                        <param name="minimum_gapped_reads_for_indel_candidates" type="integer" value="1" label="Minimum gapped reads for indel candidates" />
                        <param name="minimum_gapped_read_fraction" type="float" value="0.002" label="Minimum fraction of gapped reads for candidates" />
                        <param name="gapped_read_per_sample" type="boolean" truevalue="-p" falsevalue="" checked="False" label="Apply minimum values on a per-sample basis" />
                    </when>
                    <when value="do_not_perform_indel_calling" />
                </conditional>
                <!-- Passed as -o. -->
                <param name="gap_open_sequencing_error_probability" type="integer" value="40" label="Phred-scaled gap open sequencing error probability" />
                <!-- Entries are joined with commas and passed as -P; no entries means -P is omitted. -->
                <repeat name="platform_list_repeat" title="Platform for INDEL candidates">
                    <param name="platform_entry" type="text" value="" label="Platform to use for INDEL candidates" />
                </repeat>
            </when>
            <!-- Plain pileup: the two booleans expand to -O and -s when checked. -->
            <when value="do_not_perform_genotype_likelihood_computation">
                <param name="base_position_on_reads" type="boolean" truevalue="-O" falsevalue="" checked="False" label="Output base positions on reads" />
                <param name="output_mapping_quality" type="boolean" truevalue="-s" falsevalue="" checked="False" label="Output mapping quality" />
            </when>
        </conditional>
        <!--
            Optional advanced settings. With "basic" selected none of these
            parameters exist and the command template emits none of the
            corresponding options.
        -->
        <conditional name="advanced_options">
            <param name="advanced_options_selector" type="select" label="Set advanced options">
                <option value="basic" selected="True">Basic</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="advanced">
                <!--
                    Read filtering by flag bits. Option values are the numeric
                    bit values; the command template sums the checked values
                    of each list into a single mask.
                -->
                <conditional name="filter_by_flags">
                    <param name="filter_flags" type="select" label="Set filter by flags">
                        <option value="nofilter" selected="True">Do not filter</option>
                        <option value="filter">Filter by flags to exclude or require</option>
                    </param>
                    <when value="filter">
                        <param name="require_flags" type="select" display="checkboxes" multiple="True" label="Require">
                            <option value="1">Read is paired</option>
                            <option value="2">Read is mapped in a proper pair</option>
                            <option value="4">The read is unmapped</option>
                            <option value="8">The mate is unmapped</option>
                            <option value="16">Read strand</option>
                            <option value="32">Mate strand</option>
                            <option value="64">Read is the first in a pair</option>
                            <option value="128">Read is the second in a pair</option>
                            <option value="256">The alignment of this read is not primary</option>
                            <option value="512">The read fails platform/vendor quality checks</option>
                            <option value="1024">The read is a PCR or optical duplicate</option>
                        </param>
                        <param name="exclude_flags" type="select" display="checkboxes" multiple="True" label="Exclude">
                            <option value="1">Read is paired</option>
                            <option value="2">Read is mapped in a proper pair</option>
                            <option value="4">The read is unmapped</option>
                            <option value="8">The mate is unmapped</option>
                            <option value="16">Read strand</option>
                            <option value="32">Mate strand</option>
                            <option value="64">Read is the first in a pair</option>
                            <option value="128">Read is the second in a pair</option>
                            <option value="256">The alignment of this read is not primary</option>
                            <option value="512">The read fails platform/vendor quality checks</option>
                            <option value="1024">The read is a PCR or optical duplicate</option>
                        </param>
                    </when>
                    <when value="nofilter" />
                </conditional>
                <!-- Region list for -l: a BED dataset from the history, or pasted text rendered by the pasted_regions config file. -->
                <conditional name="limit_by_region">
                    <param name="limit_by_regions" type="select" label="Select regions to call">
                        <option value="no_limit" selected="True">Do not limit</option>
                        <option value="history">From an uploaded BED file</option>
                        <option value="paste">Paste a list of regions or BED</option>
                    </param>
                    <when value="history">
                        <param name="bed_regions" type="data" format="bed" label="BED file">
                            <validator type="dataset_ok_validator" />
                        </param>
                    </when>
                    <when value="paste">
                        <param name="region_paste" type="text" area="true" size="10x35" label="Regions" help="Paste a list of regions in BED format or as a list of chromosomes and positions"/>
                    </when>
                    <when value="no_limit" />
                </conditional>
                <!-- Read groups for -G: a text dataset from the history, or pasted text rendered by the excluded_read_groups config file. -->
                <conditional name="exclude_read_group">
                    <param name="exclude_read_groups" type="select" label="Select read groups to exclude">
                        <option value="no_limit" selected="True">Do not exclude</option>
                        <option value="history">From an uploaded text file</option>
                        <option value="paste">Paste a list of read groups</option>
                    </param>
                    <when value="history">
                        <param name="read_groups" type="data" format="txt" label="Text file">
                            <validator type="dataset_ok_validator" />
                        </param>
                    </when>
                    <when value="paste">
                        <param name="group_paste" type="text" area="true" size="10x35" label="Read groups" help="Paste a list of read groups"/>
                    </when>
                    <when value="no_limit" />
                </conditional>
                <!-- Booleans expand to their flag when checked and to nothing otherwise; the numeric and text values are passed as -C, -d, -q, -Q and -r. -->
                <param name="skip_anomalous_read_pairs" type="boolean" truevalue="-A" falsevalue="" checked="False" label="Do not skip anomalous read pairs in variant calling" />
                <param name="disable_probabilistic_realignment" type="boolean" truevalue="-B" falsevalue="" checked="False" label="Disable probabilistic realignment for the computation of base alignment quality (BAQ)" />
                <param name="coefficient_for_downgrading" type="integer" value="0" label="Coefficient for downgrading mapping quality for reads containing excessive mismatches" />
                <param name="max_reads_per_bam" type="integer" value="250" label="Max reads per BAM" />
                <param name="extended_BAQ_computation" type="boolean" truevalue="-E" falsevalue="" checked="False" label="Extended BAQ computation" />
                <param name="minimum_mapping_quality" type="integer" value="0" label="Minimum mapping quality for an alignment to be used" />
                <param name="minimum_base_quality" type="integer" value="13" label="Minimum base quality for a base to be considered" />
                <param name="region_string" type="text" value="" label="Only generate pileup in region" />
                <param name="output_per_sample_read_depth" type="boolean" truevalue="-D" falsevalue="" checked="False" label="Output per-sample read depth" />
                <param name="output_per_sample_strand_bias_p_value" type="boolean" truevalue="-S" falsevalue="" checked="False" label="Output per-sample Phred-scaled strand bias P-value" />
            </when>
            <when value="basic" />
        </conditional>
    </inputs>
    <configfiles>
        <!--
            File handed to -G when read groups are pasted. The pasted text is
            split on runs of whitespace and the pieces are joined with tabs;
            in every other case the file holds an empty value.
            NOTE(review): all entries end up on one tab-separated line;
            confirm this is the layout samtools expects for a read group list.
        -->
        <configfile name="excluded_read_groups">
<![CDATA[
<% 
import re
%>
#set pasted_data = ''
#if str( $advanced_options.advanced_options_selector ) == "advanced":
    #if str( $advanced_options.exclude_read_group.exclude_read_groups ) == "paste":
        #set regex=re.compile("\\s+")
        #set pasted_data = '\t'.join( regex.split( str( $advanced_options.exclude_read_group['group_paste'] ) ) )
    #end if
#end if
${pasted_data}
]]>  
        </configfile>
        <!--
            File handed to -l when regions are pasted. Built the same way as
            the read group file, from the region_paste text.
            NOTE(review): multi-line input is flattened onto a single
            tab-separated line; confirm that is intended for region lists.
        -->
        <configfile name="pasted_regions">
<![CDATA[
<% 
import re
%>
#set pasted_data = ''
#if str( $advanced_options.advanced_options_selector ) == "advanced":
    #if str( $advanced_options.limit_by_region.limit_by_regions ) == "paste":
        #set regex=re.compile("\\s+")
        #set pasted_data = '\t'.join( regex.split( str( $advanced_options.limit_by_region['region_paste'] ) ) )
    #end if
#end if
${pasted_data}
]]>    
        </configfile>
    </configfiles>
    <outputs>
        <!-- Main result (the command's standard output): pileup by default, bcf when genotype likelihood computation is selected. -->
        <data format="pileup" name="output_mpileup" label="${tool.name} on ${on_string}">
            <change_format>
                <when input="genotype_likelihood_computation_type.genotype_likelihood_computation_type_selector" value="perform_genotype_likelihood_computation" format="bcf" />
            </change_format>
        </data>
        <!-- Log dataset: the command redirects standard error into it. -->
        <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
    </outputs>
    <tests>
        <!-- Test 1: history reference, plain pileup output with base positions and mapping quality enabled. -->
        <test>
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" value="phiX.fasta" ftype="fasta" />
            <param name="input_bam" value="samtools_mpileup_in_1.bam" ftype="bam" />
            <param name="genotype_likelihood_computation_type_selector" value="do_not_perform_genotype_likelihood_computation" />
            <param name="advanced_options_selector" value="basic" />
            <param name="base_position_on_reads" value="true" />
            <param name="output_mapping_quality" value="true" />
            <output name="output_mpileup" file="samtools_mpileup_out_1.pileup" /> 
            <output name="output_log" file="samtools_mpileup_out_1.log" />
        </test>
        <!-- Test 2: history reference, genotype likelihood computation with INDEL calling (bcf output). -->
        <test>
            <param name="reference_source_selector" value="history" />
            <param name="ref_file" value="phiX.fasta" ftype="fasta" />
            <param name="input_bam" value="samtools_mpileup_in_1.bam" ftype="bam" />
            <param name="genotype_likelihood_computation_type_selector" value="perform_genotype_likelihood_computation" />
            <param name="gap_extension_sequencing_error_probability" value="20" />
            <param name="coefficient_for_modeling_homopolymer_errors" value="100" />
            <param name="perform_indel_calling_selector" value="perform_indel_calling" />
            <param name="skip_indel_calling_above_sample_depth" value="250" />
            <param name="gap_open_sequencing_error_probability" value="40" />
            <param name="platform_list_repeat" value="0" />
            <param name="advanced_options_selector" value="basic" />
            <output name="output_mpileup" ftype="bcf" file="samtools_mpileup_out_2.bcf" lines_diff="1" /> 
            <output name="output_log" file="samtools_mpileup_out_2.log" />
        </test>
        <!--
            Test 3: cached reference (dbkey phiX) with the same settings as test 2.
            NOTE(review): the input is samtools_mpileup_in_3.bam but the expected
            result is samtools_mpileup_out_2.bcf; confirm the reuse is intentional.
        -->
        <test>
            <param name="reference_source_selector" value="cached" />
            <param name="input_bam" value="samtools_mpileup_in_3.bam" ftype="bam" dbkey="phiX" />
            <param name="genotype_likelihood_computation_type_selector" value="perform_genotype_likelihood_computation" />
            <param name="gap_extension_sequencing_error_probability" value="20" />
            <param name="coefficient_for_modeling_homopolymer_errors" value="100" />
            <param name="perform_indel_calling_selector" value="perform_indel_calling" />
            <param name="skip_indel_calling_above_sample_depth" value="250" />
            <param name="gap_open_sequencing_error_probability" value="40" />
            <param name="platform_list_repeat" value="0" />
            <param name="advanced_options_selector" value="basic" />
            <output name="output_mpileup" ftype="bcf" file="samtools_mpileup_out_2.bcf" lines_diff="1" /> 
            <output name="output_log" file="samtools_mpileup_out_3.log" />
        </test>
        <!--
            Test 4: cached reference with advanced options set.
            NOTE(review): coefficient_for_downgrading is declared as an integer
            parameter but is given the value "true" here; confirm which integer
            the expected output was generated with.
        -->
        <test>
            <param name="reference_source_selector" value="cached" />
            <param name="input_bam" value="samtools_mpileup_in_1.bam" ftype="bam" dbkey="phiX" />
            <param name="genotype_likelihood_computation_type_selector" value="perform_genotype_likelihood_computation" />
            <param name="gap_extension_sequencing_error_probability" value="20" />
            <param name="coefficient_for_modeling_homopolymer_errors" value="100" />
            <param name="perform_indel_calling_selector" value="perform_indel_calling" />
            <param name="skip_indel_calling_above_sample_depth" value="250" />
            <param name="gap_open_sequencing_error_probability" value="40" />
            <param name="platform_list_repeat" value="0" />
            <param name="advanced_options_selector" value="advanced" />
            <param name="advanced_options|filter_by_flags|filter_flags" value="nofilter" />
            <param name="advanced_options|limit_by_region|limit_by_regions" value="no_limit" />
            <param name="advanced_options|coefficient_for_downgrading" value="true" />
            <param name="advanced_options|max_reads_per_bam" value="200" />
            <param name="advanced_options|extended_BAQ_computation" value="true" />
            <param name="advanced_options|minimum_mapping_quality" value="0" />
            <param name="advanced_options|minimum_base_quality" value="43" />
            <output name="output_mpileup" ftype="bcf" file="samtools_mpileup_out_4.bcf" lines_diff="1" /> 
            <output name="output_log" file="samtools_mpileup_out_4.log" />
        </test>
    </tests>
    <help>
**What it does**

Generate BCF or pileup for one or multiple BAM files. Alignment records are grouped by sample identifiers in @RG header lines. If sample identifiers are absent, each input file is regarded as one sample.

------

.. list-table:: **Input options**
   :widths: 5 5 40 10
   :header-rows: 1

   * - Flag
     - Type
     - Description
     - Default
   * - -6
     - *BOOLEAN*
     - assume the quality is in the Illumina-1.3+ encoding
     - off
   * - -A
     - *BOOLEAN*
     - count anomalous read pairs
     - off
   * - -B
     - *BOOLEAN*
     - disable BAQ computation
     - off
   * - -b
     - *FILE*
     - list of input BAM filenames, one per line
     - *null*
   * - -C
     - *INT*
     - parameter for adjusting mapQ; 0 to disable
     - 0
   * - -d
     - *INT*
     - max per-BAM depth to avoid excessive memory usage
     - 250
   * - -E
     - *BOOLEAN*
     - recalculate extended BAQ on the fly thus ignoring existing BQs
     - off
   * - -f
     - *FILE*
     - faidx indexed reference sequence file
     - *null*
   * - -G
     - *FILE*
     - exclude read groups listed in FILE
     - *null*
   * - -l
     - *FILE*
     - list of positions (chr pos) or regions (BED)
     - *null*
   * - -M
     - *INT*
     - cap mapping quality at INT
     - 60
   * - -r
     - *STR*
     - region in which pileup is generated
     - *null*
   * - -R
     - *BOOLEAN*
     - ignore RG tags
     - off
   * - -q
     - *INT*
     - skip alignments with mapQ smaller than INT
     - 0
   * - -Q
     - *INT*
     - skip bases with baseQ/BAQ smaller than INT
     - 13
   * - --rf
     - *INT*
     - required flags: skip reads with mask bits unset
     - 0
   * - --ff
     - *INT*
     - filter flags: skip reads with mask bits set
     - 0

------

.. list-table:: **Output options**
   :widths: 5 5 40 10
   :header-rows: 1

   * - Flag
     - Type
     - Description
     - Default
   * - -D
     - *BOOLEAN*
     - output per-sample DP in BCF (require -g/-u)
     - off
   * - -g
     - *BOOLEAN*
     - generate BCF output (genotype likelihoods)
     - off
   * - -O
     - *BOOLEAN*
     - output base positions on reads (disabled by -g/-u)
     - off
   * - -s
     - *BOOLEAN*
     - output mapping quality (disabled by -g/-u)
     - off
   * - -S
     - *BOOLEAN*
     - output per-sample strand bias P-value in BCF (require -g/-u)
     - off
   * - -u
     - *BOOLEAN*
     - generate uncompressed BCF output
     - off

------

.. list-table:: **SNP/INDEL genotype likelihoods options (effective with '-g' or '-u')**
   :widths: 5 5 40 10
   :header-rows: 1

   * - Flag
     - Type
     - Description
     - Default
   * - -e
     - *INT*
     - Phred-scaled gap extension seq error probability
     - 20
   * - -F
     - *FLOAT*
     - minimum fraction of gapped reads for candidates
     - 0.002
   * - -h
     - *INT*
     - coefficient for homopolymer errors
     - 100
   * - -I
     - *BOOLEAN*
     - do not perform indel calling
     - off
   * - -L
     - *INT*
     - max per-sample depth for INDEL calling
     - 250
   * - -m
     - *INT*
     - minimum gapped reads for indel candidates
     - 1
   * - -o
     - *INT*
     - Phred-scaled gap open sequencing error probability
     - 40
   * - -p
     - *BOOLEAN*
     - apply -m and -F per-sample to increase sensitivity
     - off
   * - -P
     - *STR*
     - comma separated list of platforms for indels
     - all

------

**Citation**

For the underlying tool, please cite `Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. &lt;http://www.ncbi.nlm.nih.gov/pubmed/19505943&gt;`_


If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
    </help>
</tool>