Mercurial > repos > bebatut > cdhit

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Mon Apr 25 12:13:50 2016 -0400
@@ -0,0 +1,12 @@
+CD_HIT memory usage
+====================
+
+By default, a maximum of 4 GB of memory is allocated to CD_HIT.
+
+To change the maximum memory usage, you can edit the CDHIT_SITE_OPTIONS variable in the file:
+
+<tool_dependency_dir>/cd-hit/4.6.4/bebatut/cdhit/<hash_string>/env.sh
+
+For example, to increase it to 8 GB, you would write:
+
+CDHIT_SITE_OPTIONS='-M 8000'
\ No newline at end of file
--- a/cd_hit_est.xml	Thu Apr 14 09:24:49 2016 -0400
+++ b/cd_hit_est.xml	Mon Apr 25 12:13:50 2016 -0400
@@ -15,82 +15,41 @@
       -n $wordsize $strand

       #include source=$common_cdhit_options#
-      #include source=$runtime_tuning#
   ]]></command>

   <inputs>
-    <param name="fasta_in" type="data" format="fasta" label="EST Sequences to cluster"/>
-    <param name="similarity" type="float" value="0.9"  label="similarity threshold: .75 - 1.0, default is .9">
-      <validator type="in_range" message="sequence similarity threshold should be .75 - 1.0" min=".75" max="1.0"/>
-    </param>
-    <param name="wordsize" type="integer" value="8"  label="word size">
-      <help> Suggested word size:
-             8,9,10 for thresholds 0.90 ~ 1.0
-             7      for thresholds 0.88 ~ 0.9
-             6      for thresholds 0.85 ~ 0.88
-             5      for thresholds 0.80 ~ 0.85
-             4      for thresholds 0.75 ~ 0.8
-      </help>
-      <validator type="in_range" message="word size should be between 4 and 10" min="4" max="10"/>
-    </param>
-    <param name="strand" type="boolean" truevalue="-r 1" falsevalue="" checked="false" label="Compare both strands"/>
+    <param name="fasta_in" type="data" format="fasta" label="EST Sequences to cluster" help="(-i)"/>
+
+    <param name="similarity" type="float" value="0.9"  label="Similarity threshold" min=".75" max="1.0" help="Similarity threshold must be in [0.75;1.0] interval (-c)"/>
+
+    <param name="wordsize" type="integer" value="8" label="Word size" min="4" max="10" help="It is suggested to adjust word size in function of similarity threshold. 8,9 or 10  for threshold in [0.9;1.0] interval, 7 for [0.88;0.9], 6 for [0.85;0.88], 5 for [0.80;0.85], 4 for [0.75;0.8] (-n)"/>
+
+    <param name="strand" type="boolean" truevalue="-r 1" falsevalue="" checked="false" label="Compare both strands?" help="(-r)"/>
+
     <expand macro="common_cdhit_options" />
-    <expand macro="runtime_tuning" />
   </inputs>

   <outputs>
-    <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
-    <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
+    <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: Clusters" from_work_dir="rep_seq.clstr"/>
+
+    <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: Representative sequences" from_work_dir="rep_seq"/>
   </outputs>

   <tests>
-   <test>
-     <!-- Expect 3 clusters: 0,1,2 -->
-     <param name="fasta_in" value="cd_hit_est_in.fa" />
-     <param name="similarity" value="0.9"/>
-     <param name="wordsize" value="8"/>
-     <param name="strand" value="true"/>
-     <!-- conditionals in macros -->
-     <param name="settings" value="no"/>
-     <param name="tuning" value="default"/>
-     <output name="clusters_out">
-         <assert_contents>
-             <has_text text=">Cluster 0" />
-             <!-- There should not be a Cluster 3 -->
-             <not_has_text text="Cluster 3" />
-             <has_text_matching expression="F12Fcsw_481739" />
-         </assert_contents>
-     </output>
-     <output name="fasta_out">
-         <assert_contents>
-             <has_text_matching expression="^>[MF]\d\dFcsw_\d*" />
-         </assert_contents>
-     </output>
-   </test>
-   <test>
-     <!-- tighter constraints should yield more clusters -->
-     <param name="fasta_in" value="cd_hit_est_in.fa" />
-     <param name="similarity" value="0.95"/>
-     <param name="wordsize" value="9"/>
-     <param name="strand" value="true"/>
-     <!-- conditionals in macros -->
-     <param name="settings" value="no"/>
-     <param name="tuning" value="default"/>
-     <output name="clusters_out">
-         <assert_contents>
-             <has_text text=">Cluster 4" />
-             <has_text_matching expression=">F12Fcsw_481739" />
-         </assert_contents>
-     </output>
-     <output name="fasta_out">
-         <assert_contents>
-             <has_text_matching expression="^>[MF]\d\dFcsw_\d*" />
-         </assert_contents>
-     </output>
-   </test>
+      <test>
+        <param name="fasta_in" value="cd_hit_est_in.fa" />
+        <param name="similarity" value="0.9"/>
+        <param name="wordsize" value="8"/>
+        <param name="strand" value="false"/>
+        <param name="settings" value="no"/>
+        <!-- no "tuning" param: the runtime_tuning conditional was removed; threads come from GALAXY_SLOTS -->
+        <output name="clusters_out" file="est_clusters_output.txt"/>
+        <output name="fasta_out" file="est_fasta_output.fasta"/>
+      </test>
   </tests>

   <help><![CDATA[
+
 **What it does**

 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
@@ -101,15 +60,15 @@

 **Inputs**

-cd-hit-est requires a fasta dataset as input.
-
+cd-hit-est requires a fasta file as input.
+
 ------

 **Outputs**

-A fasta datasets containing representative sequences.
+The first output is a fasta file containing representative sequences.

-A text file listing the mapping of sequences to the representative sequences::
+The second output is a text file listing the mapping of sequences to the representative sequences::

 	>Cluster 0
 	0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
@@ -124,8 +83,6 @@
 	1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
 	2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
 	3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
-
-
   ]]></help>

   <citations>
--- a/cd_hit_protein.xml	Thu Apr 14 09:24:49 2016 -0400
+++ b/cd_hit_protein.xml	Mon Apr 25 12:13:50 2016 -0400
@@ -1,89 +1,51 @@
 <tool id="cd_hit_protein" name="CD-HIT PROTEIN" version="1.3">
   <description>Cluster a protein dataset into representative sequences</description>
-
-  <macros>
-    <import>cdhit_macros.xml</import>
-  </macros>
-
-  <expand macro="requirements"/>
+    <macros>
+      <import>cdhit_macros.xml</import>
+    </macros>

-  <command><![CDATA[
-    cd-hit
-      -i "$fasta_in"
-      -o rep_seq
-      -c $similarity
-      -n $wordsize
-      #include source=$common_cdhit_options#
-      #include source=$runtime_tuning#
-  ]]></command>
+    <expand macro="requirements"/>

-  <inputs>
-    <param name="fasta_in" type="data" format="fasta" label="Protein Sequences to cluster"/>
-    <param name="similarity" type="float" value="0.9"  label="similarity threshold: .4 - 1.0 (default .9)">
-      <validator type="in_range" message="sequence similarity threshold should be .4 - 1.0" min=".4" max="1.0"/>
-    </param>
-    <param name="wordsize" type="integer" value="5"  label="word size (default 5)">
-      <help> Suggested word size:
-           5    for thresholds 0.7 ~ 1.0;
-           4    for thresholds 0.6 ~ 0.7;
-           3    for thresholds 0.5 ~ 0.6;
-           2    for thresholds 0.4 ~ 0.5;
-      </help>
-      <validator type="in_range" message="word size should be between 2 and 5" min="2" max="5"/>
-    </param>
-    <expand macro="common_cdhit_options" />
-    <expand macro="runtime_tuning" />
-  </inputs>
+    <command><![CDATA[
+      cd-hit
+        -i "$fasta_in"
+        -o rep_seq
+        -c $similarity
+        -n $wordsize

-  <outputs>
-    <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
-    <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
-  </outputs>
+        #include source=$common_cdhit_options#
+    ]]></command>
+
+    <inputs>
+      <param name="fasta_in" type="data" format="fasta" label="Protein Sequences to cluster" help="(-i)"/>
+
+      <param name="similarity" type="float" value="0.9"  label="Similarity threshold" min=".4" max="1.0" help="Similarity threshold must be in [0.4;1.0] interval (-c)"/>
+
+      <param name="wordsize" type="integer" value="5"  label="Word size" min="2" max="5" help="It is suggested to adjust word size in function of similarity threshold. 5 for threshold in [0.7;1.0] interval, 4 for [0.6;0.7], 3 for [0.5;0.6], 2 for [0.4;0.5] (-n)"/>

-  <tests>
-    <test>
-       <param name="fasta_in" value="cd_hit_protein_in.fasta" />
-       <param name="similarity" value="0.9"/>
-       <param name="wordsize" value="5"/>
-       <!-- conditionals in macros -->
-       <param name="settings" value="no"/>
-       <param name="tuning" value="default"/>
-       <output name="clusters_out">
-           <assert_contents>
-               <has_text text="Cluster 0" />
-               <!--
-               <has_text_matching expression=">sp.P00338-2.LDHA_HU" />
-               -->
-           </assert_contents>
-       </output>
-       <output name="fasta_out">
-           <assert_contents>
-               <has_text_matching expression=">sp.P19858.LDHA_BOVIN" />
-           </assert_contents>
-       </output>
-    </test>
-    <test>
-      <param name="fasta_in" value="cd_hit_protein_in.fasta" />
-      <param name="similarity" value="0.8" />
-      <param name="wordsize" value="5" />
-     <!-- conditionals in macros -->
-      <param name="settings" value="no"/>
-      <param name="tuning" value="default"/>
-      <output name="clusters_out">
-        <assert_contents>
-      <has_text text="Cluster 0" />
-      <not_has_text text="Cluster 4" />
-        </assert_contents>
-      </output>
-      <output name="fasta_out">
-        <assert_contents>
-        <has_text_matching expression=">sp.P00340.LDHA_CHICK" />
-        </assert_contents>
-      </output>
-    </test>
-  </tests>
+      <expand macro="common_cdhit_options" />
+    </inputs>
+
+    <outputs>
+      <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: Clusters" from_work_dir="rep_seq.clstr"/>
+      <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: Representative sequences" from_work_dir="rep_seq"/>
+    </outputs>

-  <help><![CDATA[
+    <tests>
+      <test>
+        <param name="fasta_in" value="cd_hit_protein_in.fasta" />
+        <param name="similarity" value="0.9" />
+        <param name="wordsize" value="5" />
+        <param name="settings" value="no"/>
+        <param name="print_alignment" value="false"/>
+        <param name="cluster_type" value="false"/>
+        <!-- no "tuning" param: the runtime_tuning conditional was removed; threads come from GALAXY_SLOTS -->
+        <output name="clusters_out" file="protein_clusters_output.txt"/>
+        <output name="fasta_out" file="protein_fasta_output.fasta"/>
+      </test>
+    </tests>
+
+    <help><![CDATA[
 **What it does**

 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
@@ -94,35 +56,33 @@

 **Inputs**

-cd-hit requires a protein fasta dataset as input.
-
+cd-hit requires a protein fasta file as input.
+
 ------

 **Outputs**

-A fasta datasets containing representative sequences.
+The first output is a fasta file containing representative sequences.

-A text file listing the mapping of sequences to the representative sequences::
+The second output is a text file listing the mapping of sequences to the representative sequences::

-	>Cluster 0
-	0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
-	>Cluster 1
-	0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
-	1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
-	2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
-	3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
-	4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
-	>Cluster 2
-	0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
-	1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
-	2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
-	3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
+  >Cluster 0
+  0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
+  >Cluster 1
+  0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
+  1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
+  2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
+  3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
+  4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
+  >Cluster 2
+  0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
+  1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
+  2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
+  3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
+    ]]></help>

-
-  ]]></help>
-
-  <citations>
-    <citation type="doi">10.1093/bioinformatics/btl158</citation>
-    <citation type="doi">10.1093/bioinformatics/bts565</citation>
-  </citations>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btl158</citation>
+        <citation type="doi">10.1093/bioinformatics/bts565</citation>
+    </citations>
 </tool>
--- a/cdhit_macros.xml	Thu Apr 14 09:24:49 2016 -0400
+++ b/cdhit_macros.xml	Mon Apr 25 12:13:50 2016 -0400
@@ -5,170 +5,112 @@
       </requirements>
     </xml>

-  <template name="common_cdhit_options">
-    <!-- start common cdhit  options -->
+  <template name="common_cdhit_options">
+    #if $advanced.settings == 'yes':

-    <!-- start adv. settings -->
-    #if $advanced.settings == 'yes':
       #if str($advanced.band_width) != 'None':
         -b $advanced.band_width
       #end if
+
       #if str($advanced.throw_away_len) != 'None':
         -l $advanced.throw_away_len
       #end if
+
       #if str($advanced.description_len) != 'None':
         -d $advanced.description_len
       #end if
+
       #if str($advanced.cutoff_diff_len) != 'None':
         -s $advanced.cutoff_diff_len
       #end if
+
       #if str($advanced.aa_cutoff_diff_len) != 'None':
         -S $advanced.aa_cutoff_diff_len
       #end if
+
       #if $advanced.align.style == 'local':
         -G 0
-        #if str($advance.align.align_coverage_long) != 'None':
-          -aL $advance.align.align_coverage_long
+        #if str($advanced.align.align_coverage_long) != 'None':
+          -aL $advanced.align.align_coverage_long
         #end if
-        #if str($advance.align.aa_align_coverage_long) != 'None':
-          -AL $advance.align.aa_align_coverage_long
+
+        #if str($advanced.align.align_coverage_long_control) != 'None':
+          -AL $advanced.align.align_coverage_long_control
         #end if
-        #if str($advance.aling.align_coverage_short) != 'None':
-          -aS $advance.align.align_coverage_short
+
+        #if str($advanced.align.align_coverage_short) != 'None':
+          -aS $advanced.align.align_coverage_short
         #end if
-        #if str($advance.aling.aa_align_coverage_short) != 'None':
-          -AS $advance.align.aa_align_coverage_short
+
+        #if str($advanced.align.align_coverage_short_control) != 'None':
+          -AS $advanced.align.align_coverage_short_control
         #end if
-        #if str($advance.align.align_coverage_min) != 'None':
-          -A $advance.align.aling_coverage_min
+
+        #if str($advanced.align.align_coverage_min) != 'None':
+          -A $advanced.align.align_coverage_min
         #end if
       #end if
     #end if
-    <!-- end adv. settings -->
+
     #if $print_alignment:
       $print_alignment
     #end if
+
     #if $cluster_type:
       $cluster_type
     #end if
-  </template>

-  <template name="runtime_tuning">
-    #if $runtime.tuning == 'tune':
-      #if $runtime.threads_num:
-        -T $runtime.threads_num
-      #end if
-      #if $runtime.memory_limit:
-        -M $runtime.memory_limit
-      #end if
-      $runtime.in_ram
-    #else
-      \$CDHIT_SITE_OPTIONS
-    #end if
-    <!-- end runtime tuning options -->
+    \$CDHIT_SITE_OPTIONS
+
+    -T \${GALAXY_SLOTS:-1}
   </template>

   <macro name="common_cdhit_options">
-
     <conditional name="advanced">
-      <param name="settings" type="select" label="Use adavanced settings">
+      <param name="settings" type="select" label="Use advanced settings?">
         <option value="no" selected="true">No</option>
         <option value="yes">Yes</option>
       </param>
+
       <when value="no"/>
+
       <when value="yes">
-        <param name="band_width" type="integer" value="" optional="true" label="band_width of alignment (default 20)">
-          <validator type="in_range" message="alignment band_width must be greater than 0" min="1"/>
-        </param>
-        <param name="throw_away_len" type="integer" value="" optional="true" label="length of throw_away_sequences (default 10)">
-          <validator type="in_range" message="throw_away_sequences length must be greater than 0" min="1"/>
-        </param>
-        <param name="description_len" type="integer" value="" optional="true" label="length of description in .clstr file (default 20)">
-          <help>if set to 0, it takes the fasta defline and stops at first space</help>
-          <validator type="in_range" message="description length cannot be negative" min="0"/>
-        </param>
-        <param name="cutoff_diff_len" type="float" value="" optional="true" label="length difference cutoff (default 0.0)">
-          <help>if set to 0.9, the shorter sequences need to be at least 90% length of the representative of the cluster</help>
-          <validator type="in_range" message="length difference cutoff must be between 0.0 and 1.0" min="0.0" max="1.0"/>
-        </param>
-        <param name="aa_cutoff_diff_len" type="integer" value="" optional="true" label="length difference cutoff in amino acid (default 999999)">
-          <help>if set to 60, the length difference between the shorter sequences and the representative of the cluster can not be bigger than 60</help>
-          <validator type="in_range" message="length difference cutoff in amino acid  be greater than 0" min="0"/>
-        </param>
+        <param name="band_width" type="integer" value="20" optional="true" label="Alignment band width" min="1" help="(-b)"/>
+
+        <param name="throw_away_len" type="integer" value="10" optional="true" label="Length of throw away sequences" min="1" help="(-l)"/>
+
+        <param name="description_len" type="integer" value="20" optional="true" label="Length of the description" min="0" help="If set to 0, it takes the fasta defline and stops at first space (-d)"/>
+
+        <param name="cutoff_diff_len" type="float" value="0.0" optional="true" label="Length difference cutoff" min="0.0" max="1.0" help="If set to 0.9, the shorter sequences need to be at least 90% length of the representative of the cluster (-s)"/>
+
+        <param name="aa_cutoff_diff_len" type="integer" value="999999" optional="true" label="Length difference cutoff in amino acid" min="0" help="If set to 60, the length difference between the shorter sequences and the representative of the cluster can not be bigger than 60 (-S)"/>
+
         <conditional name="align">
-          <param name="style" type="select" label="global or local alignments">
-            <help>local sequence identity, calculated as : number of identical amino acids in alignment divided by the length of the alignment
-                  You must set alignment coverage by length or fraction.
-            </help>
+          <param name="style" type="select" label="global or local alignments" help="Local sequence identity, calculated as : number of identical amino acids in alignment divided by the length of the alignment. You must set alignment coverage by length or fraction.">
             <option value="global" selected="true">Global</option>
             <option value="local" >Local</option>
           </param>
+
           <when value="global"/>
+
           <when value="local">
-
-	    <param name="align_coverage_long" type="float" value="" optional="true" label="alignment coverage for the longer sequence (default 0.0)">
-              <help>if set to 0.9, the alignment must covers 90% of the sequence</help>
-	      <validator type="in_range" message="input must be between 0.0 and 1.0." min="0.0" max="1.0" />
-	    </param>
-	    <param name="aa_align_coverage_long" type="integer" value="" optional="true" label="alignment coverage control for the longer sequence (default 99999999)" >
-              <help>if set to 60, and the length of the sequence is 400,then the alignment must be at least 340 (400-60) residues</help>
-	      <validator type="in_range" message="input cannot be negative." min="0" />
-	    </param>
-	    <param name="align_coverage_short" type="float" value="" optional="true" label="alignment coverage for the shorter sequence (default 0.0)" >
-              <help>if set to 0.9, the alignment must covers 90% of the sequence</help>
-	      <validator type="in_range" message="input must be between 0.0 and 1.0." min="0.0" max="1.0" />
-	    </param>
-	    <param name="aa_align_coverage_short" type="integer" value="" optional="true" label="alignment coverage control for the shorter sequence (default 99999999)" >
-              <help>if set to 60, and the length of the sequence is 400, then the alignment must be at least 340 (400-60) residues</help>
-	      <validator type="in_range" message="input cannot be negative." min="0" />
-	    </param>
-	    <param name="align_coverage_min" type="integer" value="" optional="true" label="minimal alignment coverage control for the both sequences (default 0)" >
-              <help>alignment must cover at least this value for both sequences</help>
-	      <validator type="in_range" message="coverage must be at least 0." min="0"/>
-	    </param>
+            <param name="align_coverage_long" type="float" value="0.0" optional="true" label="Alignment coverage for the longer sequence" min="0.0" max="1.0" help="If set to 0.9, the alignment must cover 90% of the sequence (-aL)"/>
+
+            <param name="align_coverage_long_control" type="integer" value="99999999" optional="true" label="Alignment coverage control for the longer sequence" min="0" help="If set to 60, and the sequence's length is 400, then the alignment must be at least 340 (400-60) residues (-AL)"/>
+
+            <param name="align_coverage_short" type="float" value="0.0" optional="true" label="Alignment coverage for the shorter sequence" min="0.0" max="1.0" help="As for the longer (-aS)"/>
+
+            <param name="align_coverage_short_control" type="integer" value="99999999" optional="true" label="Alignment coverage control for the shorter sequence" min="0" help="As for the longer (-AS)"/>
+
+            <param name="align_coverage_min" type="integer" value="0" optional="true" label="Minimal alignment coverage control for both sequences" min="0" help="Alignment must cover at least this value for both sequences (-A)"/>
           </when>
         </conditional>
-
       </when>
     </conditional>

     <param name="print_alignment" type="boolean" truevalue="-p 1" falsevalue="" checked="false" label="Print alignment overlap in .clstr file"/>

-    <param name="cluster_type" type="boolean" truevalue="-g 1" falsevalue="" checked="false" label="Slow Cluster"
-           help="by cd-hit's default algorithm, a sequence is clustered to the first
-        cluster that meet the threshold (fast cluster). If set the program
-        will cluster it into the most similar cluster that meet the threshold
-        (accurate but slow mode). This won't change the representatives of final clusters"/>
-
+    <param name="cluster_type" type="boolean" truevalue="-g 1" falsevalue="" checked="false" label="Slow Cluster" help="In cd-hit's default algorithm, a sequence is clustered to the first cluster that meet the threshold (fast cluster). If set to slow, the program will cluster it into the most similar cluster that meet the threshold (accurate but slow mode). This won't change the representatives of final clusters (-g)"/>
   </macro>

-  <macro name="runtime_tuning">
-    <conditional name="runtime">
-      <param name="tuning" type="select" label="Runtime Memory and Threads">
-        <option value="default" selected="true">Use Default settings</option>
-        <option value="tune" >Set Runtime options</option>
-      </param>
-      <when value="default"/>
-      <when value="tune">
-        <param name="threads_num" type="integer" value="1" optional="true" label="number of threads; with 0, all CPUs will be used. (default 1)" >
-          <validator type="in_range" message="input cannot be negative." min="0" />
-        </param>
-        <param name="memory_limit" type="integer" value="800" optional="true" label="memory limit (in MB) for the program; 0 for unlimitted. (default 800)" >
-          <validator type="in_range" message="input cannot be negative." min="0" />
-        </param>
-        <param name="in_ram" type="boolean" truevalue="-B 1" falsevalue="" checked="false" label="Too big for in Memory calculation"
-               help="Use for huge databases"/>
-      </when>
-    </conditional>
-  </macro>
-
-  <token name="@CITATION_SECTION@">------
-
-**Citation**
-
-For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. &lt;http://www.ncbi.nlm.nih.gov/pubmed/21478889&gt;`_
-
-If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
-
-  </token>
 </macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_clusters_output.txt	Mon Apr 25 12:13:50 2016 -0400
@@ -0,0 +1,39 @@
+>Cluster 0
+0	239nt, >F12Fcsw_481739... at +/99.16%
+1	243nt, >F14Fcsw_133982... *
+2	242nt, >F14Fcsw_149685... at +/99.59%
+3	230nt, >F14Fcsw_175165... at +/96.96%
+4	239nt, >F14Fcsw_176364... at +/97.91%
+5	239nt, >F14Fcsw_224425... at +/99.16%
+6	240nt, >F14Fcsw_27361... at +/99.58%
+7	239nt, >F14Fcsw_2745... at +/99.58%
+8	238nt, >F14Fcsw_37069... at +/99.58%
+9	238nt, >F14Fcsw_38031... at +/99.16%
+10	239nt, >F14Fcsw_49588... at +/99.16%
+11	230nt, >F23Fcsw_160873... at +/96.52%
+12	183nt, >F31Fcsw_135439... at +/95.63%
+13	241nt, >F34Fcsw_50866... at +/91.29%
+14	230nt, >M12Fcsw_69587... at +/92.61%
+15	240nt, >M13Fcsw_127764... at +/97.92%
+16	222nt, >M13Fcsw_198303... at +/96.40%
+17	227nt, >M14Fcsw_117325... at +/96.92%
+18	241nt, >M14Fcsw_151062... at +/99.59%
+19	239nt, >M14Fcsw_181677... at +/97.07%
+20	240nt, >M14Fcsw_186607... at +/99.17%
+21	239nt, >M24Fcsw_136217... at +/94.56%
+22	239nt, >M41Fcsw_259146... at +/97.91%
+23	210nt, >M42Fcsw_137216... at +/99.05%
+24	239nt, >M42Fcsw_138199... at +/99.16%
+25	208nt, >M42Fcsw_263016... at +/98.56%
+>Cluster 1
+0	238nt, >F22Fcsw_400293... at +/91.18%
+1	243nt, >F23Fcsw_133990... *
+2	243nt, >F23Fcsw_86009... at +/90.95%
+3	205nt, >F23Fcsw_96640... at +/91.71%
+4	210nt, >F32Fcsw_322472... at +/90.95%
+5	242nt, >F33Fcsw_137774... at +/90.91%
+6	234nt, >M13Fcsw_128004... at +/90.17%
+7	218nt, >M42Fcsw_225418... at +/90.83%
+8	193nt, >M42Fcsw_334979... at +/90.16%
+9	216nt, >M43Fcsw_250770... at +/90.28%
+10	241nt, >M44Fcsw_200453... at +/90.04%
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_fasta_output.fasta	Mon Apr 25 12:13:50 2016 -0400
@@ -0,0 +1,4 @@
+>F14Fcsw_133982
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAA
+>F23Fcsw_133990
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCGACAACACTGGGATAGCCTTTCGAAAGAAAGATTAATACCGGATGGCATAGTTTTCCCGCATGGAAAAACTATTAAAGAATTTCGGTTATCGATGGGGATGCGTTCCATTAGGCAGTTGGCGGGGTAACGGCCCACCAAACCGACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_clusters_output.txt	Mon Apr 25 12:13:50 2016 -0400
@@ -0,0 +1,12 @@
+>Cluster 0
+0	375aa, >sp|P00325|ADH1B_HUM... *
+1	375aa, >tr|K7D361|K7D361_PA... at 99.73%
+>Cluster 1
+0	375aa, >sp|P00329|ADH1_MOUS... *
+>Cluster 2
+0	332aa, >sp|P00340|LDHA_CHIC... *
+>Cluster 3
+0	241aa, >sp|P00338-5|LDHA_HU... at 91.29%
+1	332aa, >sp|P19858|LDHA_BOVI... *
+>Cluster 4
+0	274aa, >sp|P00338-2|LDHA_HU... *
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/protein_fasta_output.fasta	Mon Apr 25 12:13:50 2016 -0400
@@ -0,0 +1,36 @@
+>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2
+MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT
+PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP
+RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG
+YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ
+NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF
+DLLHSGKSIRTVLTF
+>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2
+MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT
+PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP
+RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG
+YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE
+CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ
+NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF
+DLLRSGKSIRTVLTF
+>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA
+MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA
+AILKSSDVISFHCLGYNRILGGGCACCPFYLICD
+>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3
+MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG
+EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI
+IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI
+HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI
+TDVVKMILKPDEEEKIKKSADTLWGIQKELQF
+>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2
+MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG
+EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI
+IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV
+HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE
+VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI
+SDVVKVTLTHEEEACLKKSADTLWGIQKELQF
--- a/tool_dependencies.xml	Thu Apr 14 09:24:49 2016 -0400
+++ b/tool_dependencies.xml	Mon Apr 25 12:13:50 2016 -0400
@@ -15,7 +15,7 @@
                 </action>
                 <action type="set_environment">
                     <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR</environment_variable>
-                    <environment_variable name="CDHIT_SITE_OPTIONS" action="set_to">"-M 4000 -T 0"</environment_variable>
+                    <environment_variable name="CDHIT_SITE_OPTIONS" action="set_to">"-M 4000"</environment_variable>
                 </action>
             </actions>
         </install>
@@ -29,9 +29,7 @@
 https://code.google.com/p/cdhit/source/browse/README

 Change the CDHIT_SITE_OPTIONS variable in the installed env.sh file to adjust
-the maximum memory Mb (-M) or to limit the number of threads (-T)
-to match your site
+the maximum memory in MB (-M).
         </readme>
     </package>
-</tool_dependency>
-
+</tool_dependency>
\ No newline at end of file