view cuffdiff_wrapper.xml @ 6:79c687de829b

Add more descriptive label
author Jim Johnson <jj@umn.edu>
date Tue, 09 Oct 2012 08:09:32 -0500
parents 9d25b0fd882b
children 56bdf6d4ac9c
line wrap: on
line source

<tool id="cuffdiff" name="Cuffdiff" version="0.0.6">
    <!-- Wrapper supports Cuffdiff versions v1.3.0-v2.0 -->
    <description>find significant changes in transcript expression, splicing, and promoter use</description>
    <!-- External package dependency declared for this tool: cufflinks. -->
    <requirements>
        <requirement type="package">cufflinks</requirement>
    </requirements>
    <command interpreter="python">
        ## Assemble the cuffdiff_wrapper.py invocation from the form values.
        ## sel_outputs holds the output names chosen in the output_sel checkbox list.
        #set sel_outputs = $output_sel.__str__.split(',')
        cuffdiff_wrapper.py
            --FDR=$fdr
            --num-threads="4"
            --min-alignment-count=$min_alignment_count

            ## The cuffdata directory is also requested when nothing at all is selected,
            ## mirroring the filter on the cuffdata output.
            #if 'cuffdata' in $sel_outputs or not $output_sel:
                --cuffdatadir=$cuffdata.extra_files_path
            #end if
            #if 'cummeRbund_db' in $sel_outputs:
                --cummeRbund_db=$cummeRbund_db
            #end if

            ## One option per selected history dataset. Each condition must use the same
            ## name as the matching output_sel option and the matching output element.
            ## NOTE(review): run_info, read_groups_info and the count_tracking outputs are
            ## selectable and declared as outputs, but no option here passes their paths to
            ## the wrapper - confirm how (or whether) those datasets get populated.
            #if 'isoforms_fpkm_tracking' in $sel_outputs:
                --isoforms_fpkm_tracking_output=$isoforms_fpkm_tracking
            #end if
            #if 'genes_fpkm_tracking' in $sel_outputs:
                --genes_fpkm_tracking_output=$genes_fpkm_tracking
            #end if
            #if 'cds_fpkm_tracking' in $sel_outputs:
                --cds_fpkm_tracking_output=$cds_fpkm_tracking
            #end if
            #if 'tss_groups_fpkm_tracking' in $sel_outputs:
                --tss_groups_fpkm_tracking_output=$tss_groups_fpkm_tracking
            #end if
            #if 'isoforms_exp_diff' in $sel_outputs:
                --isoforms_exp_output=$isoforms_exp_diff
            #end if
            #if 'genes_exp_diff' in $sel_outputs:
                --genes_exp_output=$genes_exp_diff
            #end if
            #if 'tss_groups_exp_diff' in $sel_outputs:
                --tss_groups_exp_output=$tss_groups_exp_diff
            #end if
            ## cds_exp.diff: the selectable option and the declared output are both named
            ## cds_exp_diff, so that is the name tested and passed here. The wrapper flag
            ## keeps its existing spelling.
            #if 'cds_exp_diff' in $sel_outputs:
                --cds_exp_fpkm_tracking_output=$cds_exp_diff
            #end if
            #if 'splicing_diff' in $sel_outputs:
                --splicing_diff_output=$splicing_diff
            #end if
            #if 'cds_diff' in $sel_outputs:
                --cds_diff_output=$cds_diff
            #end if
            #if 'promoters_diff' in $sel_outputs:
                --promoters_diff_output=$promoters_diff
            #end if
            #if 'cds_read_group_tracking' in $sel_outputs:
                --cds_read_group_tracking=$cds_read_group_tracking
            #end if
            #if 'tss_groups_read_group_tracking' in $sel_outputs:
                --tss_groups_read_group_tracking=$tss_groups_read_group_tracking
            #end if
            #if 'genes_read_group_tracking' in $sel_outputs:
                --genes_read_group_tracking=$genes_read_group_tracking
            #end if
            #if 'isoforms_read_group_tracking' in $sel_outputs:
                --isoforms_read_group_tracking=$isoforms_read_group_tracking
            #end if

            ## Set advanced data parameters?
            #if str($additional.sAdditional) == "Yes":
                -m $additional.frag_mean_len
                -s $additional.frag_len_std_dev
            #end if

            ## Normalization?
            #if str($do_normalization) == "Yes":
                -N
            #end if

            ## Multi-read correct?
            #if str($multiread_correct) == "Yes":
                -u
            #end if

            ## Bias correction?
            #if str($bias_correction.do_bias_correction) == "Yes":
                -b
                ## A reference FASTA from the history is passed by path; for the locally
                ## cached choice the literal string None is passed instead, together with
                ## the dbkey of the GTF input and the data index directory.
                #if str($bias_correction.seq_source.index_source) == "history":
                    --ref_file=$bias_correction.seq_source.ref_file
                #else:
                    --ref_file="None"
                #end if
                --dbkey=${gtf_input.metadata.dbkey}
                --index_dir=${GALAXY_DATA_INDEX_DIR}
            #end if

            ## Inputs.
            --inputA=$gtf_input
            #if str($group_analysis.do_groups) == "No":
                --input1=$aligned_reads1
                --input2=$aligned_reads2
            #else:
                ## Replicates: all group names follow the labels option, then the files of
                ## each group follow the files option, with a lone comma emitted after
                ## every group (including the last one).
                --labels
                #for $group in $group_analysis.groups
                    ${group.group}
                #end for
                --files
                #for $group in $group_analysis.groups
                    #for $file in $group.files:
                        ${file.file}
                    #end for
                    ,
                #end for
            #end if

    </command>
    <inputs>
        <!-- Transcript annotation that the aligned reads are quantified against. -->
        <param format="gtf" name="gtf_input" type="data" label="Transcripts" help="A transcript GTF file produced by cufflinks, cuffcompare, or other source."/>

        <!-- Either two single samples ("No") or named groups of replicate files ("Yes"). -->
        <conditional name="group_analysis">
            <param name="do_groups" type="select" label="Perform replicate analysis" help="Perform cuffdiff with replicates in each group.">
                <option value="No">No</option>
                <option value="Yes">Yes</option>
            </param>
            <when value="Yes">
                <repeat name="groups" title="Group">
                    <!-- The group name is placed unquoted on the command line, hence the restriction in the label. -->
                    <param name="group" type="text" label="Group name (no spaces or commas)"/>
                    <repeat name="files" title="Replicate">
                        <param name="file" label="Add file" type="data" format="sam,bam"/>
                    </repeat>
                </repeat>
            </when>
            <when value="No">
                <!-- The labels distinguish the two samples; they are passed to the wrapper as input1 and input2. -->
                <param format="sam,bam" name="aligned_reads1" type="data" label="SAM or BAM file of aligned RNA-Seq reads (sample 1)" help=""/>
                <param format="sam,bam" name="aligned_reads2" type="data" label="SAM or BAM file of aligned RNA-Seq reads (sample 2)" help=""/>
            </when>
        </conditional>

        <param name="fdr" type="float" value="0.05" label="False Discovery Rate" help="The allowed false discovery rate."/>

        <param name="min_alignment_count" type="integer" value="10" label="Min Alignment Count" help="The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples."/>

        <param name="do_normalization" type="select" label="Perform quartile normalization" help="Removes top 25% of genes from FPKM denominator to improve accuracy of differential expression calls for low abundance transcripts.">
            <option value="No">No</option>
            <option value="Yes">Yes</option>
        </param>

        <param name="multiread_correct" type="select" label="Use multi-read correct" help="Tells Cufflinks to do an initial estimation procedure to more accurately weight reads mapping to multiple locations in the genome.">
            <option value="No" selected="true">No</option>
            <option value="Yes">Yes</option>
        </param>

        <!-- Bias correction needs a reference sequence: a locally cached one or a FASTA from the history. -->
        <conditional name="bias_correction">
            <param name="do_bias_correction" type="select" label="Perform Bias Correction" help="Bias detection and correction can significantly improve accuracy of transcript abundance estimates.">
                <option value="No">No</option>
                <option value="Yes">Yes</option>
            </param>
            <when value="Yes">
                <conditional name="seq_source">
                  <param name="index_source" type="select" label="Reference sequence data">
                    <option value="cached">Locally cached</option>
                    <option value="history">History</option>
                  </param>
                  <when value="cached"></when>
                  <when value="history">
                      <param name="ref_file" type="data" format="fasta" label="Using reference file" />
                  </when>
                </conditional>
            </when>
            <when value="No"></when>
        </conditional>

        <!-- Optional fragment length settings, passed to the wrapper as -m and -s. -->
        <conditional name="additional">
            <param name="sAdditional" type="select" label="Set Additional Parameters? (not recommended)">
                <option value="No">No</option>
                <option value="Yes">Yes</option>
            </param>
            <when value="No"></when>
            <when value="Yes">
                <param name="frag_mean_len" type="integer" value="200" label="Average Fragment Length"/>
                <param name="frag_len_std_dev" type="integer" value="80" label="Fragment Length Standard Deviation"/>
            </when>
        </conditional>

        <!-- Each option value must match the name of an output element and, where wired, a check in the command template. -->
        <param name="output_sel" type="select" multiple="true" display="checkboxes" force_select="true" label="Select outputs for history datasets">
            <option value="cuffdata">cuffdata - html page with links to cuffdiff outputs</option>
            <option value="cummeRbund_db">cummeRbund database</option>
            <option value="run_info">run.info</option>
            <option value="read_groups_info">read_groups.info</option>
            <option value="splicing_diff">splicing.diff</option>
            <option value="promoters_diff">promoters.diff</option>
            <option value="genes_exp_diff">genes_exp.diff</option>
            <option value="genes_fpkm_tracking">genes.fpkm_tracking</option>
            <option value="genes_count_tracking">genes.count_tracking</option>
            <option value="genes_read_group_tracking">genes.read_group_tracking</option>
            <option value="isoforms_exp_diff">isoforms.exp_diff</option>
            <option value="isoforms_fpkm_tracking">isoforms.fpkm_tracking</option>
            <option value="isoforms_count_tracking">isoforms.count_tracking</option>
            <option value="isoforms_read_group_tracking">isoforms.read_group_tracking</option>
            <option value="cds_diff">cds.diff</option>
            <option value="cds_exp_diff">cds_exp.diff</option>
            <option value="cds_fpkm_tracking">cds.fpkm_tracking</option>
            <option value="cds_count_tracking">cds.count_tracking</option>
            <option value="cds_read_group_tracking">cds.read_group_tracking</option>
            <option value="tss_groups_exp_diff">tss_groups_exp.diff</option>
            <option value="tss_groups_fpkm_tracking">tss_groups.fpkm_tracking</option>
            <option value="tss_groups_count_tracking">tss_groups.count_tracking</option>
            <option value="tss_groups_read_group_tracking">tss_groups.read_group_tracking</option>
        </param>

    </inputs>

    <outputs>
        <!--
            One history dataset per selectable output. Every filter keeps its dataset only
            when the matching name is ticked in output_sel; cuffdata is additionally kept
            when nothing is selected at all.
        -->
        <!-- run.info is plain text; the Galaxy extension for that is "txt". -->
        <data format="txt" name="run_info" label="${tool.name} on ${on_string}: run.info">
            <filter>output_sel and 'run_info' in output_sel</filter>
        </data>
        <data format="tabular" name="read_groups_info" label="${tool.name} on ${on_string}: read_groups.info">
            <filter>output_sel and 'read_groups_info' in output_sel</filter>
        </data>
        <data format="tabular" name="splicing_diff" label="${tool.name} on ${on_string}: splicing differential expression testing">
            <filter>output_sel and 'splicing_diff' in output_sel</filter>
        </data>
        <data format="tabular" name="promoters_diff" label="${tool.name} on ${on_string}: promoters differential expression testing">
            <filter>output_sel and 'promoters_diff' in output_sel</filter>
        </data>
        <data format="tabular" name="cds_diff" label="${tool.name} on ${on_string}: CDS overloading differential expression testing">
            <filter>output_sel and 'cds_diff' in output_sel</filter>
        </data>
        <data format="tabular" name="cds_exp_diff" label="${tool.name} on ${on_string}: CDS differential expression testing">
            <filter>output_sel and 'cds_exp_diff' in output_sel</filter>
        </data>
        <data format="tabular" name="cds_fpkm_tracking" label="${tool.name} on ${on_string}: CDS FPKM tracking">
            <filter>output_sel and 'cds_fpkm_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="cds_count_tracking" label="${tool.name} on ${on_string}: CDS counts">
            <filter>output_sel and 'cds_count_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="cds_read_group_tracking" label="${tool.name} on ${on_string}: CDS Read Group tracking">
            <filter>output_sel and 'cds_read_group_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="tss_groups_exp_diff" label="${tool.name} on ${on_string}: TSS groups differential expression testing">
            <filter>output_sel and 'tss_groups_exp_diff' in output_sel</filter>
        </data>
        <data format="tabular" name="tss_groups_fpkm_tracking" label="${tool.name} on ${on_string}: TSS groups FPKM tracking">
            <filter>output_sel and 'tss_groups_fpkm_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="tss_groups_count_tracking" label="${tool.name} on ${on_string}: TSS groups counts">
            <filter>output_sel and 'tss_groups_count_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="tss_groups_read_group_tracking" label="${tool.name} on ${on_string}: TSS groups Read Group tracking">
            <filter>output_sel and 'tss_groups_read_group_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="isoforms_exp_diff" label="${tool.name} on ${on_string}: transcript differential expression testing">
            <filter>output_sel and 'isoforms_exp_diff' in output_sel</filter>
        </data>
        <data format="tabular" name="isoforms_fpkm_tracking" label="${tool.name} on ${on_string}: transcript FPKM tracking">
            <filter>output_sel and 'isoforms_fpkm_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="isoforms_count_tracking" label="${tool.name} on ${on_string}: transcript counts">
            <filter>output_sel and 'isoforms_count_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="isoforms_read_group_tracking" label="${tool.name} on ${on_string}: transcript Read Group tracking">
            <filter>output_sel and 'isoforms_read_group_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="genes_exp_diff" label="${tool.name} on ${on_string}: gene differential expression testing">
            <filter>output_sel and 'genes_exp_diff' in output_sel</filter>
        </data>
        <data format="tabular" name="genes_fpkm_tracking" label="${tool.name} on ${on_string}: gene FPKM tracking">
            <filter>output_sel and 'genes_fpkm_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="genes_count_tracking" label="${tool.name} on ${on_string}: gene counts">
            <filter>output_sel and 'genes_count_tracking' in output_sel</filter>
        </data>
        <data format="tabular" name="genes_read_group_tracking" label="${tool.name} on ${on_string}: gene Read Group tracking">
            <filter>output_sel and 'genes_read_group_tracking' in output_sel</filter>
        </data>
        <data format="cuffdata" name="cuffdata" label="${tool.name} on ${on_string}: cuffdata" >
            <filter>not output_sel or output_sel and 'cuffdata' in output_sel</filter>
        </data>
        <data format="cuffdatadb" name="cummeRbund_db" label="${tool.name} on ${on_string}: cummeRbund sqlite Database" >
            <filter>output_sel and 'cummeRbund_db' in output_sel</filter>
        </data>
    </outputs>
    <stdio>
        <!-- Any exit code of 1 or greater marks the job as a fatal error. -->
        <exit_code range="1:" level="fatal" description="Cufflinks Err"/>
    </stdio>


    <tests>
        <test>
            <!--
                cuffdiff cuffcompare_out5.gtf cuffdiff_in1.sam cuffdiff_in2.sam
            -->
            <param name="gtf_input" value="cuffcompare_out5.gtf" ftype="gtf" />
            <param name="do_groups" value="No" />
            <param name="aligned_reads1" value="cuffdiff_in1.sam" ftype="sam" />
            <param name="aligned_reads2" value="cuffdiff_in2.sam" ftype="sam" />
            <!-- Form defaults, except min_alignment_count, whose form default is 10. -->
            <param name="fdr" value="0.05" />
            <param name="min_alignment_count" value="0" />
            <param name="do_bias_correction" value="No" />
            <param name="do_normalization" value="No" />
            <param name="multiread_correct" value="No"/>
            <param name="sAdditional" value="No"/>
            <!--
                Outputs are only created when ticked in output_sel, so every dataset
                compared below has to be selected here.
            -->
            <param name="output_sel" value="splicing_diff,promoters_diff,cds_diff,cds_exp_diff,cds_fpkm_tracking,tss_groups_exp_diff,tss_groups_fpkm_tracking,genes_exp_diff,genes_fpkm_tracking,isoforms_exp_diff,isoforms_fpkm_tracking"/>
            <!--
                Line diffs are needed because cuffdiff does not produce deterministic output.
                TODO: can we find datasets that lead to deterministic behavior?
            -->
            <!-- Output names must match the name attributes of the declared output datasets. -->
            <output name="splicing_diff" file="cuffdiff_out9.txt"/>
            <output name="promoters_diff" file="cuffdiff_out10.txt"/>
            <output name="cds_diff" file="cuffdiff_out11.txt"/>
            <output name="cds_exp_diff" file="cuffdiff_out4.txt"/>
            <output name="cds_fpkm_tracking" file="cuffdiff_out8.txt"/>
            <output name="tss_groups_exp_diff" file="cuffdiff_out3.txt" lines_diff="200"/>
            <output name="tss_groups_fpkm_tracking" file="cuffdiff_out7.txt"/>
            <output name="genes_exp_diff" file="cuffdiff_out2.txt" lines_diff="200"/>
            <output name="genes_fpkm_tracking" file="cuffdiff_out6.txt" lines_diff="200"/>
            <output name="isoforms_exp_diff" file="cuffdiff_out1.txt" lines_diff="200"/>
            <output name="isoforms_fpkm_tracking" file="cuffdiff_out5.txt" lines_diff="200"/>
        </test>
    </tests>

    <help>
**Cuffdiff Overview**

Cuffdiff is part of Cufflinks_. Cuffdiff finds significant changes in transcript expression, splicing, and promoter use. Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621

.. _Cufflinks: http://cufflinks.cbcb.umd.edu/
        
------

**Know what you are doing**

.. class:: warningmark

There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.

.. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffdiff

------

**Input format**

Cuffdiff takes a Cufflinks or Cuffcompare GTF file as input, along with two or more SAM or BAM files containing the fragment alignments for two or more samples.

------

**Outputs**

Cuffdiff produces many output files:

1. Transcript FPKM expression tracking.
2. Gene FPKM expression tracking; tracks the summed FPKM of transcripts sharing each gene_id
3. Primary transcript FPKM tracking; tracks the summed FPKM of transcripts sharing each tss_id
4. Coding sequence FPKM tracking; tracks the summed FPKM of transcripts sharing each p_id, independent of tss_id
5. Transcript differential FPKM.
6. Gene differential FPKM. Tests differences in the summed FPKM of transcripts sharing each gene_id
7. Primary transcript differential FPKM. Tests differences in the summed FPKM of transcripts sharing each tss_id
8. Coding sequence differential FPKM. Tests differences in the summed FPKM of transcripts sharing each p_id independent of tss_id
9. Differential splicing tests: this tab delimited file lists, for each primary transcript, the amount of overloading detected among its isoforms, i.e. how much differential splicing exists between isoforms processed from a single primary transcript. Only primary transcripts from which two or more isoforms are spliced are listed in this file.
10. Differential promoter tests: this tab delimited file lists, for each gene, the amount of overloading detected among its primary transcripts, i.e. how much differential promoter use exists between samples. Only genes producing two or more distinct primary transcripts (i.e. multi-promoter genes) are listed here.
11. Differential CDS tests: this tab delimited file lists, for each gene, the amount of overloading detected among its coding sequences, i.e. how much differential CDS output exists between samples. Only genes producing two or more distinct CDS (i.e. multi-protein genes) are listed here.
    
-------

**Settings**

All of the options have a default value. You can change any of them. Most of the options in Cuffdiff have been implemented here.

------

**Cuffdiff parameter list**

This is a list of implemented Cuffdiff options::

  -m INT                         Average fragment length; default 200
  -s INT                         Fragment length standard deviation; default 80
  -c INT                         The minimum number of alignments in a locus needed to conduct significance testing on changes in that locus observed between samples. If no testing is performed, changes in the locus are deemed not significant, and the locus' observed changes don't contribute to correction for multiple testing. The default is 1,000 fragment alignments (up to 2,000 paired reads).
  --FDR FLOAT                    The allowed false discovery rate. The default is 0.05.
  --num-importance-samples INT   Sets the number of importance samples generated for each locus during abundance estimation. Default: 1000
  --max-mle-iterations INT       Sets the number of iterations allowed during maximum likelihood estimation of abundances. Default: 5000
  -N                             With this option, Cufflinks excludes the contribution of the top 25 percent most highly expressed genes from the number of mapped fragments used in the FPKM denominator. This can improve robustness of differential expression calls for less abundant genes and transcripts.
  
    </help>
</tool>