annotate cd_hit_est.xml @ 12:b1bf31be0d3c default tip

Fix cheetah tests of whether optional arguments are set
author Jim Johnson <jj@umn.edu>
date Thu, 29 Oct 2015 10:09:02 -0500
parents 75fde37f69e5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
1 <tool id="cd_hit_est" name="CD-HIT-EST" version="1.2">
0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
2 <description>Cluster a nucleotide dataset into representative sequences</description>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
3 <requirements>
2
c99f307f2bdb correct requirement - cd-hit
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
4 <requirement type="package" version="4.6.1">cd-hit</requirement>
0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
5 </requirements>
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
6 <macros>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
7 <import>cdhit_macros.xml</import>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
8 </macros>
0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
9 <command>
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
10 cd-hit-est -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize $strand
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
11 #include source=$common_cdhit_options#
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
12 #include source=$runtime_tuning#
0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
13 </command>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
14 <inputs>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
15 <param name="fasta_in" type="data" format="fasta" label="EST Sequences to cluster"/>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
16 <param name="similarity" type="float" value="0.9" label="similarity threshold: .75 - 1.0, default is .9">
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
17 <validator type="in_range" message="sequence similarity threshold should be .75 - 1.0" min=".75" max="1.0"/>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
18 </param>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
19 <param name="wordsize" type="integer" value="8" label="word size">
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
20 <help> Suggested word size:
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
21 8,9,10 for thresholds 0.90 ~ 1.0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
22 7 for thresholds 0.88 ~ 0.9
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
23 6 for thresholds 0.85 ~ 0.88
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
24 5 for thresholds 0.80 ~ 0.85
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
25 4 for thresholds 0.75 ~ 0.8
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
26 </help>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
27 <validator type="in_range" message="word size should be between 4 and 10" min="4" max="10"/>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
28 </param>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
29 <param name="strand" type="boolean" truevalue="-r 1" falsevalue="" checked="false" label="Compare both strands"/>
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
30 <expand macro="common_cdhit_options" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
31 <expand macro="runtime_tuning" />
0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
32 </inputs>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
33 <outputs>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
34 <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
35 <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
36 </outputs>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
37 <tests>
3
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
38 <test>
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
39 <!-- Expect 3 clusters: 0,1,2 -->
3
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
40 <param name="fasta_in" value="cd_hit_est_in.fa" />
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
41 <param name="similarity" value="0.9"/>
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
42 <param name="wordsize" value="8"/>
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
43 <param name="strand" value="true"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
44 <!-- conditionals in macros -->
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
45 <param name="settings" value="no"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
46 <param name="tuning" value="default"/>
3
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
47 <output name="clusters_out">
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
48 <assert_contents>
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
49 <has_text text=">Cluster 0" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
50 <!-- There should not be a Cluster 3 -->
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
51 <not_has_text text="Cluster 3" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
52 <has_text_matching expression="F12Fcsw_481739" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
53 </assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
54 </output>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
55 <output name="fasta_out">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
56 <assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
57 <has_text_matching expression="^>[MF]\d\dFcsw_\d*" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
58 </assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
59 </output>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
60 </test>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
61 <test>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
62 <!-- tighter constraints should yield more clusters -->
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
63 <param name="fasta_in" value="cd_hit_est_in.fa" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
64 <param name="similarity" value="0.95"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
65 <param name="wordsize" value="9"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
66 <param name="strand" value="true"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
67 <!-- conditionals in macros -->
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
68 <param name="settings" value="no"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
69 <param name="tuning" value="default"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
70 <output name="clusters_out">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
71 <assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents: 6
diff changeset
72 <has_text text=">Cluster 4" />
3
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
73 <has_text_matching expression=">F12Fcsw_481739" />
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
74 </assert_contents>
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
75 </output>
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
76 <output name="fasta_out">
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
77 <assert_contents>
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
78 <has_text_matching expression="^>[MF]\d\dFcsw_\d*" />
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
79 </assert_contents>
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
80 </output>
22a3fc9a14d7 Add functional test
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
81 </test>
0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
82 </tests>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
83 <help>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
84 **CD-HIT-EST**
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
85
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
86 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition, cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related sequence families from a given fasta sequence database.
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
87
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
88 .. _CD-HIT: http://www.bioinformatics.org/cd-hit/
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
89
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
90 ------
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
91
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
92 **Inputs**
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
93
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
94 cd-hit-est requires a fasta dataset as input.
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
95
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
96 ------
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
97
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
98 **Outputs**
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
99
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
100 A fasta dataset containing representative sequences.
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
101
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
102 A text file listing the mapping of sequences to the representative sequences::
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
103
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
104 >Cluster 0
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
105 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
106 >Cluster 1
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
107 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
108 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
109 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
110 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
111 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
112 >Cluster 2
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
113 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
114 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
115 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
116 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
117
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
118
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
119 </help>
13900a1ad862 Uploaded
jjohnson
parents:
diff changeset
120 </tool>