annotate cd_hit_protein.xml @ 12:b1bf31be0d3c default tip

Fix cheetah tests of whether optional arguments are set
author Jim Johnson <jj@umn.edu>
date Thu, 29 Oct 2015 10:09:02 -0500
parents 75fde37f69e5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
1 <tool id="cd_hit_protein" name="CD-HIT PROTEIN" version="1.2">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
2 <description>Cluster a protein dataset into representative sequences</description>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- Declares the tool's single external dependency: the cd-hit package, pinned to version 4.6.1. -->
3 <requirements>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
4 <requirement type="package" version="4.6.1">cd-hit</requirement>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
5 </requirements>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- Imports shared macro definitions from cdhit_macros.xml. That file is not shown here; it presumably
     defines the common_cdhit_options and runtime_tuning macros expanded in <inputs> (TODO confirm). -->
6 <macros>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
7 <import>cdhit_macros.xml</import>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
8 </macros>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- Command template. Runs cd-hit with:
       -i  the selected input dataset ($fasta_in, quoted)
       -o  the fixed output base name rep_seq
       -c  the similarity threshold parameter
       -n  the word size parameter
     The two #include directives append further option text taken from $common_cdhit_options and
     $runtime_tuning. Neither variable is defined in this file; presumably they are supplied by
     cdhit_macros.xml (TODO confirm). -->
9 <command>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
10 cd-hit -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
11 #include source=$common_cdhit_options#
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
12 #include source=$runtime_tuning#
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
13 </command>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- Tool parameters:
       fasta_in    input dataset in fasta format (the protein sequences to cluster)
       similarity  float, default 0.9, validated to the range .4 to 1.0
       wordsize    integer, default 5, validated to the range 2 to 5
     followed by whatever parameters the common_cdhit_options and runtime_tuning macros expand to. -->
14 <inputs>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
15 <param name="fasta_in" type="data" format="fasta" label="Protein Sequences to cluster"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
16 <param name="similarity" type="float" value="0.9" label="similarity threshold: .4 - 1.0 (default .9)">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
17 <validator type="in_range" message="sequence similarity threshold should be .4 - 1.0" min=".4" max="1.0"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
18 </param>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- NOTE(review): the help text below pairs each word size with a threshold range, but the two
     in_range validators are independent, so this pairing is advisory only and is not enforced here. -->
19 <param name="wordsize" type="integer" value="5" label="word size (default 5)">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
20 <help> Suggested word size:
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
21 5 for thresholds 0.7 ~ 1.0;
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
22 4 for thresholds 0.6 ~ 0.7;
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
23 3 for thresholds 0.5 ~ 0.6;
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
24 2 for thresholds 0.4 ~ 0.5;
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
25 </help>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
26 <validator type="in_range" message="word size should be between 2 and 5" min="2" max="5"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
27 </param>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
28 <expand macro="common_cdhit_options" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
29 <expand macro="runtime_tuning" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
30 </inputs>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- Two output datasets, each taken from a file in the job working directory (from_work_dir):
       clusters_out  txt    from rep_seq.clstr
       fasta_out     fasta  from rep_seq
     Both file names derive from the "-o rep_seq" base name used in <command>; the .clstr file is
     presumably written by cd-hit alongside the representatives file (TODO confirm). -->
31 <outputs>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
32 <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
33 <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
34 </outputs>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
35 <tests>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- Test 1: cd_hit_protein_in.fasta at similarity 0.9 and word size 5, with the macro conditionals
     "settings" and "tuning" set to "no" and "default".
     Passes when clusters_out contains the text "Cluster 0" and fasta_out matches the regular
     expression ">sp.P19858.LDHA_BOVIN". The dots in that expression are regex wildcards, presumably
     standing in for the separator characters of the FASTA header (TODO confirm against test data).
     A further clusters_out assertion is present but commented out. -->
36 <test>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
37 <param name="fasta_in" value="cd_hit_protein_in.fasta" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
38 <param name="similarity" value="0.9"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
39 <param name="wordsize" value="5"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
40 <!-- conditionals in macros -->
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
41 <param name="settings" value="no"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
42 <param name="tuning" value="default"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
43 <output name="clusters_out">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
44 <assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
45 <has_text text="Cluster 0" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
46 <!--
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
47 <has_text_matching expression=">sp.P00338-2.LDHA_HU" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
48 -->
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
49 </assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
50 </output>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
51 <output name="fasta_out">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
52 <assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
53 <has_text_matching expression=">sp.P19858.LDHA_BOVIN" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
54 </assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
55 </output>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
56 </test>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
<!-- Test 2: the same input file at the lower similarity threshold 0.8 (word size 5, macro
     conditionals "settings" and "tuning" again set to "no" and "default").
     Passes when clusters_out contains "Cluster 0" but not "Cluster 4", and fasta_out matches the
     regular expression ">sp.P00340.LDHA_CHICK". -->
57 <test>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
58 <param name="fasta_in" value="cd_hit_protein_in.fasta" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
59 <param name="similarity" value="0.8" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
60 <param name="wordsize" value="5" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
61 <!-- conditionals in macros -->
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
62 <param name="settings" value="no"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
63 <param name="tuning" value="default"/>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
64 <output name="clusters_out">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
65 <assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
66 <has_text text="Cluster 0" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
67 <not_has_text text="Cluster 4" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
68 </assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
69 </output>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
70 <output name="fasta_out">
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
71 <assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
72 <has_text_matching expression=">sp.P00340.LDHA_CHICK" />
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
73 </assert_contents>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
74 </output>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
75 </test>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
76 </tests>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
77
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
78 <help>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
79 **CD-HIT**
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
80
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
81 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition, cd-hit outputs a cluster file documenting the sequence 'groupies' (cluster members) for each nr representative sequence. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
82
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
83 .. _CD-HIT: http://www.bioinformatics.org/cd-hit/
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
84
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
85 ------
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
86
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
87 **Inputs**
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
88
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
89 cd-hit requires a protein fasta dataset as input.
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
90
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
91 ------
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
92
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
93 **Outputs**
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
94
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
95 A fasta dataset containing representative sequences.
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
96
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
97 A text file listing the mapping of sequences to the representative sequences::
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
98
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
99 >Cluster 0
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
100 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
101 >Cluster 1
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
102 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
103 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
104 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
105 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
106 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
107 >Cluster 2
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
108 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
109 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
110 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
111 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
112
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
113
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
114 </help>
75fde37f69e5 Add cd-hit to protein fastas
Jim Johnson <jj@umn.edu>
parents:
diff changeset
115 </tool>