Mercurial > repos > iuc > cd_hit

--- a/cd_hit.xml	Tue Aug 31 07:48:54 2021 +0000
+++ b/cd_hit.xml	Fri Nov 05 08:22:56 2021 +0000
@@ -1,11 +1,10 @@
-<tool id="cd_hit" name="cd-hit" version="4.6.8.1">
+<tool id="cd_hit" name="cd-hit" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
     <description>Cluster or compare biological sequence datasets</description>
-    <xrefs>
-        <xref type="bio.tools">cd-hit</xref>
-    </xrefs>
-    <requirements>
-        <requirement type="package" version="4.6.8">cd-hit</requirement>
-    </requirements>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements" />
     <version_command><![CDATA[
 cd-hit | grep "CD-HIT version" | cut -d" " -f 4
     ]]></version_command>
@@ -52,6 +51,8 @@
 -U $advanced.max_unmatched_len
 $advanced.accurate
 $advanced.inram
+$advanced.sort_cluster
+$advanced.sort_fasta
 #if $print_alnovl.print_alnovl_select == "yes":
     -p 1
     -d $print_alnovl.desclen
@@ -140,6 +141,8 @@
             <param name="max_unmatched_len" argument="-U" type="integer" min="0" value="99999999" label="Maximum unmatched length" help="If set to 10, the unmatched region (excluding leading and tailing gaps) must not be more than 10 bases"/>
             <param name="inram" argument="-B" type="boolean" truevalue="-B 0" falsevalue="-B 1" checked="true" label="Sequences are stored in RAM" help="If false: sequence are stored on hard drive - use for huge data sets"/>
             <param name="accurate" argument="-g" type="boolean" truevalue="-g 1" falsevalue="-g 0" checked="false" label="Accurate but slow mode" help="By cd-hit's default algorithm, a sequence is clustered to the first cluster that meet the threshold (fast cluster). If set to true, the program will cluster it into the most similar cluster that meet the threshold (accurate but slow mode)"/>
+            <param name="sort_cluster" argument="-sc" type="boolean" truevalue="-sc 1" falsevalue="-sc 0" checked="false" label="Sort clusters by size" help="When disabled, clusters are output by decreasing length; when enabled, clusters are output by decreasing size (number of sequences)"/>
+            <param name="sort_fasta" argument="-sf" type="boolean" truevalue="-sf 1" falsevalue="-sf 0" checked="false" label="Sort FASTA/FASTQ by cluster size" help="When disabled, output sequences are not sorted; when enabled, they are sorted by decreasing cluster size (number of sequences)"/>
         </section>

         <conditional name="print_alnovl">
@@ -229,6 +232,42 @@
             <output name="clusters_out" file="est-2d.txt.clstr"/>
             <output name="fasta_out" file="est-2d.txt"/>
         </test>
+        <!-- Test the sort_cluster parameter (-sc 1): the cluster listing is expected sorted by decreasing cluster size -->
+        <test>
+            <param name="fasta_in" value="cd_hit_est_in.fa" />
+            <conditional name="twod">
+                <param name="twod_select" value="" />
+            </conditional>
+            <conditional name="est">
+                <param name="est_select" value="-est" />
+                <param name="wordsize" value="8"/>
+                <param name="strand" value="false"/>
+            </conditional>
+            <param name="similarity" value="0.9"/>
+            <section name="advanced">
+                <param name="sort_cluster" value="true"/>
+            </section>
+            <output name="clusters_out" file="est_clusters_sorted.txt"/>
+            <output name="fasta_out" file="est_fasta_output.fasta"/>
+        </test>
+        <!-- Test the sort_fasta parameter (-sf 1): the FASTA output is expected sorted by decreasing cluster size -->
+        <test>
+            <param name="fasta_in" value="cd_hit_est_in.fa" />
+            <conditional name="twod">
+                <param name="twod_select" value="" />
+            </conditional>
+            <conditional name="est">
+                <param name="est_select" value="-est" />
+                <param name="wordsize" value="8"/>
+                <param name="strand" value="false"/>
+            </conditional>
+            <param name="similarity" value="0.9"/>
+            <section name="advanced">
+                <param name="sort_fasta" value="true"/>
+            </section>
+            <output name="clusters_out" file="est_clusters_output.txt"/>
+            <output name="fasta_out" file="est_fasta_sorted.fasta"/>
+        </test>
     </tests>
     <help><![CDATA[
 **What it does**
@@ -279,8 +318,5 @@

 2. The second output is a text file that lists similar sequences between db1 & db2
     ]]></help>
-    <citations>
-        <citation type="doi">10.1093/bioinformatics/btl158</citation>
-        <citation type="doi">10.1093/bioinformatics/bts565</citation>
-    </citations>
+    <expand macro="citations" />
 </tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Nov 05 08:22:56 2021 +0000
@@ -0,0 +1,20 @@
+<macros>
+    <token name="@TOOL_VERSION@">4.8.1</token> <!-- used as the cd-hit package requirement version below and as the leading part of the tool version in cd_hit.xml -->
+    <token name="@VERSION_SUFFIX@">0</token> <!-- appended after "+galaxy" in the tool version in cd_hit.xml -->
+    <xml name="xrefs">
+        <xrefs>
+            <xref type="bio.tools">cd-hit</xref>
+        </xrefs>
+    </xml>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">cd-hit</requirement>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1093/bioinformatics/btl158</citation>
+            <citation type="doi">10.1093/bioinformatics/bts565</citation>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_clusters_sorted.txt	Fri Nov 05 08:22:56 2021 +0000
@@ -0,0 +1,39 @@
+>Cluster 0
+0	239nt, >F12Fcsw_481739... at +/99.16%
+1	243nt, >F14Fcsw_133982... *
+2	242nt, >F14Fcsw_149685... at +/99.59%
+3	230nt, >F14Fcsw_175165... at +/96.96%
+4	239nt, >F14Fcsw_176364... at +/97.91%
+5	239nt, >F14Fcsw_224425... at +/99.16%
+6	240nt, >F14Fcsw_27361... at +/99.58%
+7	239nt, >F14Fcsw_2745... at +/99.58%
+8	238nt, >F14Fcsw_37069... at +/99.58%
+9	238nt, >F14Fcsw_38031... at +/99.16%
+10	239nt, >F14Fcsw_49588... at +/99.16%
+11	230nt, >F23Fcsw_160873... at +/96.52%
+12	183nt, >F31Fcsw_135439... at +/95.63%
+13	241nt, >F34Fcsw_50866... at +/91.29%
+14	230nt, >M12Fcsw_69587... at +/92.61%
+15	240nt, >M13Fcsw_127764... at +/97.92%
+16	222nt, >M13Fcsw_198303... at +/96.40%
+17	227nt, >M14Fcsw_117325... at +/96.92%
+18	241nt, >M14Fcsw_151062... at +/99.59%
+19	239nt, >M14Fcsw_181677... at +/97.07%
+20	240nt, >M14Fcsw_186607... at +/99.17%
+21	239nt, >M24Fcsw_136217... at +/94.56%
+22	239nt, >M41Fcsw_259146... at +/97.91%
+23	210nt, >M42Fcsw_137216... at +/99.05%
+24	239nt, >M42Fcsw_138199... at +/99.16%
+25	208nt, >M42Fcsw_263016... at +/98.56%
+>Cluster 1
+0	238nt, >F22Fcsw_400293... at +/91.18%
+1	243nt, >F23Fcsw_133990... *
+2	243nt, >F23Fcsw_86009... at +/90.95%
+3	205nt, >F23Fcsw_96640... at +/91.71%
+4	210nt, >F32Fcsw_322472... at +/90.95%
+5	242nt, >F33Fcsw_137774... at +/90.91%
+6	234nt, >M13Fcsw_128004... at +/90.17%
+7	218nt, >M42Fcsw_225418... at +/90.83%
+8	193nt, >M42Fcsw_334979... at +/90.16%
+9	216nt, >M43Fcsw_250770... at +/90.28%
+10	241nt, >M44Fcsw_200453... at +/90.04%
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/est_fasta_sorted.fasta	Fri Nov 05 08:22:56 2021 +0000
@@ -0,0 +1,4 @@
+>F14Fcsw_133982
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCTCATACTCGGGGATAGCCTTTCGAAAGAAAGATTAATATCCGATAGCATATATTTCCCGCATGGGTTTTATATTAAAGAAATTCGGTATGAGATGGGGATGCGTTCCATTAGTTTGTTGGCGGGGTAACGGCCCACCAAGACTACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGCCCAA
+>F23Fcsw_133990
+GGCGACCGGCGCACGGGTGAGTAACACGTATCCAACCTGCCGACAACACTGGGATAGCCTTTCGAAAGAAAGATTAATACCGGATGGCATAGTTTTCCCGCATGGAAAAACTATTAAAGAATTTCGGTTATCGATGGGGATGCGTTCCATTAGGCAGTTGGCGGGGTAACGGCCCACCAAACCGACGATGGATAGGGGTTCTGAGAGGAAGGTCCCCCACATTGGAACTGAGACACGGTCCAA