comparison cuffcompare_wrapper.xml @ 0:d0d26169cc2a draft

Uploaded
author devteam
date Wed, 26 Nov 2014 13:54:44 -0500
parents
children 6d8ab54229a0
comparison
equal deleted inserted replaced
-1:000000000000 0:d0d26169cc2a
1 <tool id="cuffcompare" name="Cuffcompare" version="2.2.1.0">
2 <description>compare assembled transcripts to a reference annotation and track Cufflinks transcripts across multiple experiments</description>
3 <expand macro="requirements" />
4 <expand macro="stdio" />
5 <macros>
6 <import>cuff_macros.xml</import>
7 </macros>
8 <version_command>cuffcompare 2>&amp;1 | head -n 1</version_command>
9 <command interpreter="python">
10 cuffcompare_wrapper.py
11 ## Use annotation reference?
12 #if $annotation.use_ref_annotation == "Yes":
13 -r $annotation.reference_annotation
14 #if $annotation.ignore_nonoverlapping_reference:
15 -R
16 #end if
17 #if $annotation.ignore_nonoverlapping_transfrags:
18 -Q
19 #end if
20
21 #end if
22
23 ## Use sequence data?
24 #if $seq_data.use_seq_data == "Yes":
25 -s
26 #if $seq_data.seq_source.index_source == "history":
27 --ref_file=$seq_data.seq_source.ref_file
28 #else:
29 --index=${seq_data.seq_source.index.fields.path}
30 #end if
31 #end if
32
33 $discard_single_exon
34
35 -e $max_dist_exon
36 -d $max_dist_group
37
38 #if $discard_intron_redundant_transfrags:
39 -F
40 #end if
41
42 ## Outputs.
43 --combined-transcripts=${transcripts_combined}
44
45 @CUFFLINKS_GTF_INPUTS@
46 </command>
47 <inputs>
48 <expand macro="cufflinks_gtf_inputs" />
49 <conditional name="annotation">
50 <param name="use_ref_annotation" type="select" label="Use Reference Annotation">
51 <option value="No">No</option>
52 <option value="Yes">Yes</option>
53 </param>
54 <when value="Yes">
55 <param format="gff3,gtf" name="reference_annotation" type="data" label="Reference Annotation" help="Requires an annotation file in GFF3 or GTF format."/>
56 <param name="ignore_nonoverlapping_reference" type="boolean" label="Ignore reference transcripts that are not overlapped by any input transfrags" help="consider only the reference transcripts that overlap any of the input transfrags (Sn correction)" />
57 <param name="ignore_nonoverlapping_transfrags" type="boolean" label="Ignore input transcripts that are not overlapped by any reference transcripts" help="consider only the input transcripts that overlap any of the reference transcripts (Sp correction). Warning: this will discard all 'novel' loci!" />
58 </when>
59 <when value="No">
60 </when>
61 </conditional>
62 <conditional name="seq_data">
63 <param name="use_seq_data" type="select" label="Use Sequence Data"
64 help="Use sequence data for some optional classification functions, including the addition of the p_id attribute required by Cuffdiff.">
65 <option value="Yes">Yes</option>
66 <option value="No">No</option>
67 </param>
68 <when value="No"></when>
69 <when value="Yes">
70 <conditional name="seq_source">
71 <param name="index_source" type="select" label="Choose the source for the reference list">
72 <option value="cached">Locally cached</option>
73 <option value="history">History</option>
74 </param>
75 <when value="cached">
76 <param name="index" type="select" label="Using reference genome">
77 <options from_data_table="fasta_indexes">
78 <filter type="data_meta" ref="inputs" key="dbkey" column="1" />
79 <validator type="no_options" message="No reference genome is available for the build associated with the selected input dataset" />
80 </options>
81 </param>
82 </when>
83 <when value="history">
84 <param name="ref_file" type="data" format="fasta" label="Using reference file" />
85 </when>
86 </conditional>
87 </when>
88 </conditional>
89 <param type="select" name="discard_single_exon" label="discard (ignore) single-exon transcripts">
90 <option value="" selected="True">No</option>
91 <option value="-M">Discard single-exon transfrags and reference transcripts</option>
92 <option value="-N">Discard single-exon reference transcripts</option>
93 </param>
94 <param type="integer" name="max_dist_exon" value="100" label="Max. Distance for assessing exon accuracy"
95 help="max. distance (range) allowed from free ends of terminal exons of reference transcripts when assessing exon accuracy. Default: 100" />
96 <param type="integer" name="max_dist_group" value="100" label="Max. Distance for transcript grouping"
97 help="max. distance (range) for grouping transcript start sites. Default: 100" />
98 <param type="boolean" name="discard_intron_redundant_transfrags" label="discard intron-redundant transfrags sharing 5'"
99 help="Discard intron-redundant transfrags if they share the 5' end (if they differ only at the 3' end)" />
100 </inputs>
101
102 <outputs>
103 <data format="txt" name="transcripts_accuracy" label="${tool.name} on ${on_string}: transcript accuracy"
104 from_work_dir="cc_output.stats" />
105 <data format="tabular" name="input1_tmap" label="${tool.name} on ${on_string}: data ${inputs[0].hid} tmap file"
106 from_work_dir="cc_output.input1.tmap" />
107 <data format="tabular" name="input1_refmap"
108 label="${tool.name} on ${on_string}: data ${inputs[0].hid} refmap file"
109 from_work_dir="cc_output.input1.refmap">
110 <filter>annotation['use_ref_annotation'] == 'Yes'</filter>
111 </data>
112 <data format="tabular" name="input2_tmap" label="${tool.name} on ${on_string}: data ${inputs[1].hid} tmap file" from_work_dir="cc_output.input2.tmap">
113 <filter>@HAS_MULTIPLE_INPUTS@</filter>
114 </data>
115 <data format="tabular" name="input2_refmap"
116 label="${tool.name} on ${on_string}: data ${inputs[1].hid} refmap file"
117 from_work_dir="cc_output.input2.refmap">
118 <filter>annotation['use_ref_annotation'] == 'Yes' and @HAS_MULTIPLE_INPUTS@</filter>
119 </data>
120 <data format="tabular" name="transcripts_tracking" label="${tool.name} on ${on_string}: transcript tracking" from_work_dir="cc_output.tracking">
121 <filter>@HAS_MULTIPLE_INPUTS@</filter>
122 </data>
123 <data format="gtf" name="transcripts_combined" label="${tool.name} on ${on_string}: combined transcripts"/>
124 </outputs>
125
126 <tests>
127 <!--
128 cuffcompare -r cuffcompare_in3.gtf -R cuffcompare_in1.gtf cuffcompare_in2.gtf
129 -->
130 <test>
131 <param name="inputs" value="cuffcompare_in1.gtf,cuffcompare_in2.gtf" ftype="gtf"/>
132 <param name="use_ref_annotation" value="Yes"/>
133 <param name="reference_annotation" value="cuffcompare_in3.gtf" ftype="gtf"/>
134 <param name="ignore_nonoverlapping_reference" value="Yes"/>
135 <param name="ignore_nonoverlapping_transfrags" value="No"/>
136 <param name="use_seq_data" value="No"/>
137 <param name="discard_single_exon" value="" />
138 <param name="max_dist_exon" value="100" />
139 <param name="max_dist_group" value="100" />
140 <param name="discard_intron_redundant_transfrags" value="No" />
141 <!-- Line diffs are the result of different locations for input files; this cannot be fixed as cuffcompare outputs
142 full input path for each input. -->
143 <output name="transcripts_accuracy" file="cuffcompare_out7.txt" lines_diff="2"/>
144 <output name="input1_tmap" file="cuffcompare_out1.tmap"/>
145 <output name="input1_refmap" file="cuffcompare_out2.refmap"/>
146 <output name="input2_tmap" file="cuffcompare_out3.tmap"/>
147 <output name="input2_refmap" file="cuffcompare_out4.refmap"/>
148 <output name="transcripts_tracking" file="cuffcompare_out6.tracking"/>
149 <output name="transcripts_combined" file="cuffcompare_out5.gtf"/>
150 </test>
151 </tests>
152
153 <help>
154 **Cuffcompare Overview**
155
156 Cuffcompare is part of Cufflinks_. Cuffcompare helps you: (a) compare your assembled transcripts to a reference annotation and (b) track Cufflinks transcripts across multiple experiments (e.g. across a time course). Please cite: Trapnell C, Williams BA, Pertea G, Mortazavi AM, Kwan G, van Baren MJ, Salzberg SL, Wold B, Pachter L. Transcript assembly and abundance estimation from RNA-Seq reveals thousands of new transcripts and switching among isoforms. Nature Biotechnology doi:10.1038/nbt.1621
157
158 .. _Cufflinks: http://cufflinks.cbcb.umd.edu/
159
160 ------
161
162 **Know what you are doing**
163
164 .. class:: warningmark
165
166 There is no such thing (yet) as an automated gearshift in expression analysis. It is all like stick-shift driving in San Francisco. In other words, running this tool with default parameters will probably not give you meaningful results. A way to deal with this is to **understand** the parameters by carefully reading the `documentation`__ and experimenting. Fortunately, Galaxy makes experimenting easy.
167
168 .. __: http://cufflinks.cbcb.umd.edu/manual.html#cuffcompare
169
170 ------
171
172 **Input format**
173
174 Cuffcompare takes Cufflinks' GTF output as input, and optionally can take a "reference" annotation (such as from Ensembl_)
175
176 .. _Ensembl: http://www.ensembl.org
177
178 ------
179
180 **Outputs**
181
182 Cuffcompare produces the following output files:
183
184 Transcripts Accuracy File:
185
186 Cuffcompare reports various statistics related to the "accuracy" of the transcripts in each sample when compared to the reference annotation data. The typical gene finding measures of "sensitivity" and "specificity" (as defined in Burset, M., Guigó, R. : Evaluation of gene structure prediction programs (1996) Genomics, 34 (3), pp. 353-367. doi: 10.1006/geno.1996.0298) are calculated at various levels (nucleotide, exon, intron, transcript, gene) for each input file and reported in this file. The Sn and Sp columns show sensitivity and specificity values at each level, while the fSn and fSp columns are "fuzzy" variants of these same accuracy calculations, allowing for a very small variation in exon boundaries to still be counted as a "match".
187
188 Transcripts Combined File:
189
190 Cuffcompare reports a GTF file containing the "union" of all transfrags in each sample. If a transfrag is present in both samples, it is thus reported once in the combined gtf.
191
192 Transcripts Tracking File:
193
194 This file matches transcripts up between samples. Each row contains a transcript structure that is present in one or more input GTF files. Because the transcripts will generally have different IDs (unless you assembled your RNA-Seq reads against a reference transcriptome), cuffcompare examines the structure of each of the transcripts, matching transcripts that agree on the coordinates and order of all of their introns, as well as strand. Matching transcripts are allowed to differ on the length of the first and last exons, since these lengths will naturally vary from sample to sample due to the random nature of sequencing.
195 If you ran cuffcompare with the -r option, the first and second columns contain the closest matching reference transcript to the one described by each row.
196
197 Here's an example of a line from the tracking file::
198
199 TCONS_00000045 XLOC_000023 Tcea|uc007afj.1 j \
200 q1:exp.115|exp.115.0|100|3.061355|0.350242|0.350207 \
201 q2:60hr.292|60hr.292.0|100|4.094084|0.000000|0.000000
202
203 In this example, a transcript present in the two input files, called exp.115.0 in the first and 60hr.292.0 in the second, doesn't match any reference transcript exactly, but shares exons with uc007afj.1, an isoform of the gene Tcea, as indicated by the class code j. The first five columns are as follows::
204
205 Column number Column name Example Description
206 -----------------------------------------------------------------------
207 1 Cufflinks transfrag id TCONS_00000045 A unique internal id for the transfrag
208 2 Cufflinks locus id XLOC_000023 A unique internal id for the locus
209 3 Reference gene id Tcea The gene_name attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript
210 4 Reference transcript id uc007afj.1 The transcript_id attribute of the reference GTF record for this transcript, or '-' if no reference transcript overlaps this Cufflinks transcript
211 5 Class code j The type of match between the Cufflinks transcripts in column 6 and the reference transcript. See class codes
212
213 Each of the columns after the fifth has the following format:
214 qJ:gene_id|transcript_id|FMI|FPKM|conf_lo|conf_hi
215
216 A transcript need not be present in all samples to be reported in the tracking file. A sample not containing a transcript will have a "-" in its entry in the row for that transcript.
217
218 Class Codes
219
220 If you ran cuffcompare with the -r option, tracking rows will contain the following values. If you did not use -r, the rows will all contain "-" in their class code column::
221
222 Priority Code Description
223 ---------------------------------
224 1 = Match
225 2 c Contained
226 3 j New isoform
227 4 e A single exon transcript overlapping a reference exon and at least 10 bp of a reference intron, indicating a possible pre-mRNA fragment.
228 5 i A single exon transcript falling entirely within a reference intron
229 6 r Repeat. Currently determined by looking at the reference sequence and applied to transcripts where at least 50% of the bases are lower case
230 7 p Possible polymerase run-on fragment
231 8 u Unknown, intergenic transcript
232 9 o Unknown, generic overlap with reference
233 10 . (.tracking file only, indicates multiple classifications)
234
235 -------
236
237 **Settings**
238
239 All of the options have a default value. You can change any of them. Most of the options in Cuffcompare have been implemented here.
240
241 ------
242
243 **Cuffcompare parameter list**
244
245 This is a list of implemented Cuffcompare options::
246
247 -r An optional "reference" annotation GTF. Each sample is matched against this file, and sample isoforms are tagged as overlapping, matching, or novel where appropriate. See the refmap and tmap output file descriptions below.
248 -R If -r was specified, this option causes cuffcompare to ignore reference transcripts that are not overlapped by any transcript in one of cuff1.gtf,...,cuffN.gtf. Useful for ignoring annotated transcripts that are not present in your RNA-Seq samples and thus adjusting the "sensitivity" calculation in the accuracy report written in the transcripts_accuracy file
249 </help>
250 <citations>
251 <citation type="doi">10.1038/nbt.1621</citation>
252 </citations>
253 </tool>