Mercurial > repos > jjohnson > cdhit
changeset 11:75fde37f69e5
Add cd-hit to protein fastas
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 27 Jun 2013 21:27:06 -0500 |
parents | 211ca88ce047 |
children | b1bf31be0d3c |
files | cd_hit_est.xml cd_hit_protein.xml cdhit_macros.xml test-data/cd_hit_protein_in.fasta |
diffstat | 4 files changed, 371 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/cd_hit_est.xml Thu Sep 13 20:15:09 2012 -0500 +++ b/cd_hit_est.xml Thu Jun 27 21:27:06 2013 -0500 @@ -1,10 +1,15 @@ -<tool id="cd_hit_est" name="CD-HIT-EST" version="1.1"> +<tool id="cd_hit_est" name="CD-HIT-EST" version="1.2"> <description>Cluster a nucleotide dataset into representative sequences</description> <requirements> <requirement type="package" version="4.6.1">cd-hit</requirement> </requirements> + <macros> + <import>cdhit_macros.xml</import> + </macros> <command> - cd-hit-est -i $fasta_in -o rep_seq -c $similarity -n $wordsize $strand + cd-hit-est -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize $strand + #include source=$common_cdhit_options# + #include source=$runtime_tuning# </command> <inputs> <param name="fasta_in" type="data" format="fasta" label="EST Sequences to cluster"/> @@ -22,6 +27,8 @@ <validator type="in_range" message="word size should be between 4 and 10" min="4" max="10"/> </param> <param name="strand" type="boolean" truevalue="-r 1" falsevalue="" checked="false" label="Compare both strands"/> + <expand macro="common_cdhit_options" /> + <expand macro="runtime_tuning" /> </inputs> <outputs> <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/> @@ -29,12 +36,40 @@ </outputs> <tests> <test> + <!-- Expect 3 clusters: 0,1,2 --> <param name="fasta_in" value="cd_hit_est_in.fa" /> <param name="similarity" value="0.9"/> <param name="wordsize" value="8"/> + <param name="strand" value="true"/> + <!-- conditionals in macros --> + <param name="settings" value="no"/> + <param name="tuning" value="default"/> <output name="clusters_out"> <assert_contents> - <has_text text=">Cluster" /> + <has_text text=">Cluster 0" /> + <!-- There should not be a Cluster 3 --> + <not_has_text text="Cluster 3" /> + <has_text_matching expression="F12Fcsw_481739" /> + </assert_contents> + </output> + <output name="fasta_out"> + <assert_contents> + <has_text_matching expression="^>[MF]\d\dFcsw_\d*" 
/> + </assert_contents> + </output> + </test> + <test> + <!-- tighter constraints should yield more clusters --> + <param name="fasta_in" value="cd_hit_est_in.fa" /> + <param name="similarity" value="0.95"/> + <param name="wordsize" value="9"/> + <param name="strand" value="true"/> + <!-- conditionals in macros --> + <param name="settings" value="no"/> + <param name="tuning" value="default"/> + <output name="clusters_out"> + <assert_contents> + <has_text text=">Cluster 4" /> <has_text_matching expression=">F12Fcsw_481739" /> </assert_contents> </output>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cd_hit_protein.xml Thu Jun 27 21:27:06 2013 -0500
@@ -0,0 +1,115 @@
+<tool id="cd_hit_protein" name="CD-HIT PROTEIN" version="1.2">
+  <description>Cluster a protein dataset into representative sequences</description>
+  <requirements>
+    <requirement type="package" version="4.6.1">cd-hit</requirement>
+  </requirements>
+  <macros> <!-- cdhit_macros.xml supplies the common_cdhit_options and runtime_tuning templates (command fragments) and macros (input params) used below -->
+    <import>cdhit_macros.xml</import>
+  </macros>
+  <command>
+    cd-hit -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize
+    #include source=$common_cdhit_options#
+    #include source=$runtime_tuning#
+  </command>
+  <inputs>
+    <param name="fasta_in" type="data" format="fasta" label="Protein Sequences to cluster"/>
+    <param name="similarity" type="float" value="0.9" label="similarity threshold: .4 - 1.0 (default .9)">
+      <validator type="in_range" message="sequence similarity threshold should be .4 - 1.0" min=".4" max="1.0"/>
+    </param>
+    <param name="wordsize" type="integer" value="5" label="word size (default 5)">
+      <help> Suggested word size:
+        5 for thresholds 0.7 ~ 1.0;
+        4 for thresholds 0.6 ~ 0.7;
+        3 for thresholds 0.5 ~ 0.6;
+        2 for thresholds 0.4 ~ 0.5;
+      </help>
+      <validator type="in_range" message="word size should be between 2 and 5" min="2" max="5"/>
+    </param>
+    <expand macro="common_cdhit_options" />
+    <expand macro="runtime_tuning" />
+  </inputs>
+  <outputs>
+    <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
+    <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="fasta_in" value="cd_hit_protein_in.fasta" />
+      <param name="similarity" value="0.9"/>
+      <param name="wordsize" value="5"/>
+      <!-- selectors of the "advanced" and "runtime" conditionals that come from the macros -->
+      <param name="settings" value="no"/>
+      <param name="tuning" value="default"/>
+      <output name="clusters_out">
+        <assert_contents>
+          <has_text text="Cluster 0" />
+          <!-- disabled assertion (reason not recorded here):
+          <has_text_matching expression=">sp.P00338-2.LDHA_HU" />
+          -->
+        </assert_contents>
+      </output>
+      <output name="fasta_out">
+        <assert_contents>
+          <has_text_matching expression=">sp.P19858.LDHA_BOVIN" />
+        </assert_contents>
+      </output>
+    </test>
+    <test>
+      <param name="fasta_in" value="cd_hit_protein_in.fasta" />
+      <param name="similarity" value="0.8" />
+      <param name="wordsize" value="5" />
+      <!-- selectors of the "advanced" and "runtime" conditionals that come from the macros -->
+      <param name="settings" value="no"/>
+      <param name="tuning" value="default"/>
+      <output name="clusters_out">
+        <assert_contents>
+          <has_text text="Cluster 0" />
+          <not_has_text text="Cluster 4" />
+        </assert_contents>
+      </output>
+      <output name="fasta_out">
+        <assert_contents>
+          <has_text_matching expression=">sp.P00340.LDHA_CHICK" />
+        </assert_contents>
+      </output>
+    </test>
+  </tests>
+
+  <help>
+**CD-HIT**
+
+CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
+
+.. _CD-HIT: http://www.bioinformatics.org/cd-hit/
+
+------
+
+**Inputs**
+
+cd-hit requires a protein fasta dataset as input.
+
+------
+
+**Outputs**
+
+A fasta dataset containing representative sequences.
+
+A text file listing the mapping of sequences to the representative sequences::
+
+    >Cluster 0
+    0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
+    >Cluster 1
+    0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
+    1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
+    2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
+    3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
+    4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
+    >Cluster 2
+    0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
+    1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
+    2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
+    3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
+
+
+  </help>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cdhit_macros.xml Thu Jun 27 21:27:06 2013 -0500
@@ -0,0 +1,168 @@
+<macros> <!-- Shared by the cd-hit tool wrappers: each template holds a command-line fragment, the macro of the same name holds the input params it reads -->
+  <template name="common_cdhit_options">
+    <!-- start common cdhit options: command-line fragment shared by the cd-hit tool wrappers -->
+
+    <!-- start adv. settings: optional numbers are tested as strings so that an explicit 0 still reaches cd-hit -->
+    #if $advanced.settings == 'yes':
+      #if str($advanced.band_width) not in ('', 'None'):
+        -b $advanced.band_width
+      #end if
+      #if str($advanced.throw_away_len) not in ('', 'None'):
+        -l $advanced.throw_away_len
+      #end if
+      #if str($advanced.description_len) not in ('', 'None'):
+        -d $advanced.description_len
+      #end if
+      #if str($advanced.cutoff_diff_len) not in ('', 'None'):
+        -s $advanced.cutoff_diff_len
+      #end if
+      #if str($advanced.aa_cutoff_diff_len) not in ('', 'None'):
+        -S $advanced.aa_cutoff_diff_len
+      #end if
+      #if $advanced.align.style == 'local':
+        -G 0
+        #if str($advanced.align.align_coverage_long) not in ('', 'None'):
+          -aL $advanced.align.align_coverage_long
+        #end if
+        #if str($advanced.align.aa_align_coverage_long) not in ('', 'None'):
+          -AL $advanced.align.aa_align_coverage_long
+        #end if
+        #if str($advanced.align.align_coverage_short) not in ('', 'None'):
+          -aS $advanced.align.align_coverage_short
+        #end if
+        #if str($advanced.align.aa_align_coverage_short) not in ('', 'None'):
+          -AS $advanced.align.aa_align_coverage_short
+        #end if
+        #if str($advanced.align.align_coverage_min) not in ('', 'None'):
+          -A $advanced.align.align_coverage_min
+        #end if
+      #end if
+    #end if
+    <!-- end adv. settings; the two boolean params below expand to their truevalue flag or to nothing -->
+    #if $print_alignment:
+      $print_alignment
+    #end if
+    #if $cluster_type:
+      $cluster_type
+    #end if
+  </template>
+
+  <template name="runtime_tuning">
+    #if $runtime.tuning == 'tune':
+      #if str($runtime.threads_num) not in ('', 'None'):
+        -T $runtime.threads_num
+      #end if
+      #if str($runtime.memory_limit) not in ('', 'None'):
+        -M $runtime.memory_limit
+      #end if
+      $runtime.in_ram
+    #else
+      \$CDHIT_SITE_OPTIONS
+    #end if
+    <!-- end runtime tuning options; with the default choice a literal CDHIT_SITE_OPTIONS variable reference is written into the command, presumably for the job shell to expand -->
+  </template>
+
+  <macro name="common_cdhit_options">
+
+    <conditional name="advanced">
+      <param name="settings" type="select" label="Use advanced settings">
+        <option value="no" selected="true">No</option>
+        <option value="yes">Yes</option>
+      </param>
+      <when value="no"/>
+      <when value="yes">
+        <param name="band_width" type="integer" value="" optional="true" label="band_width of alignment (default 20)">
+          <validator type="in_range" message="alignment band_width must be greater than 0" min="1"/>
+        </param>
+        <param name="throw_away_len" type="integer" value="" optional="true" label="length of throw_away_sequences (default 10)">
+          <validator type="in_range" message="throw_away_sequences length must be greater than 0" min="1"/>
+        </param>
+        <param name="description_len" type="integer" value="" optional="true" label="length of description in .clstr file (default 20)">
+          <help>if set to 0, it takes the fasta defline and stops at first space</help>
+          <validator type="in_range" message="description length cannot be negative" min="0"/>
+        </param>
+        <param name="cutoff_diff_len" type="float" value="" optional="true" label="length difference cutoff (default 0.0)">
+          <help>if set to 0.9, the shorter sequences need to be at least 90% length of the representative of the cluster</help>
+          <validator type="in_range" message="length difference cutoff must be between 0.0 and 1.0" min="0.0" max="1.0"/>
+        </param>
+        <param name="aa_cutoff_diff_len" type="integer" value="" optional="true" label="length difference cutoff in amino acid (default 999999)">
+          <help>if set to 60, the length difference between the shorter sequences and the representative of the cluster can not be bigger than 60</help>
+          <validator type="in_range" message="length difference cutoff in amino acid cannot be negative" min="0"/>
+        </param>
+        <conditional name="align">
+          <param name="style" type="select" label="global or local alignments">
+            <help>local sequence identity, calculated as : number of identical amino acids in alignment divided by the length of the alignment
+              You must set alignment coverage by length or fraction.
+            </help>
+            <option value="global" selected="true">Global</option>
+            <option value="local" >Local</option>
+          </param>
+          <when value="global"/>
+          <when value="local">
+
+            <param name="align_coverage_long" type="float" value="" optional="true" label="alignment coverage for the longer sequence (default 0.0)">
+              <help>if set to 0.9, the alignment must cover 90% of the sequence</help>
+              <validator type="in_range" message="input must be between 0.0 and 1.0." min="0.0" max="1.0" />
+            </param>
+            <param name="aa_align_coverage_long" type="integer" value="" optional="true" label="alignment coverage control for the longer sequence (default 99999999)" >
+              <help>if set to 60, and the length of the sequence is 400, then the alignment must be at least 340 (400-60) residues</help>
+              <validator type="in_range" message="input cannot be negative." min="0" />
+            </param>
+            <param name="align_coverage_short" type="float" value="" optional="true" label="alignment coverage for the shorter sequence (default 0.0)" >
+              <help>if set to 0.9, the alignment must cover 90% of the sequence</help>
+              <validator type="in_range" message="input must be between 0.0 and 1.0." min="0.0" max="1.0" />
+            </param>
+            <param name="aa_align_coverage_short" type="integer" value="" optional="true" label="alignment coverage control for the shorter sequence (default 99999999)" >
+              <help>if set to 60, and the length of the sequence is 400, then the alignment must be at least 340 (400-60) residues</help>
+              <validator type="in_range" message="input cannot be negative." min="0" />
+            </param>
+            <param name="align_coverage_min" type="integer" value="" optional="true" label="minimal alignment coverage control for the both sequences (default 0)" >
+              <help>alignment must cover at least this value for both sequences</help>
+              <validator type="in_range" message="coverage must be at least 0." min="0"/>
+            </param>
+          </when>
+        </conditional>
+
+      </when>
+    </conditional>
+
+    <param name="print_alignment" type="boolean" truevalue="-p 1" falsevalue="" checked="false" label="Print alignment overlap in .clstr file"/>
+
+    <param name="cluster_type" type="boolean" truevalue="-g 1" falsevalue="" checked="false" label="Slow Cluster"
+           help="by cd-hit's default algorithm, a sequence is clustered to the first
+           cluster that meet the threshold (fast cluster). If set the program
+           will cluster it into the most similar cluster that meet the threshold
+           (accurate but slow mode). This won't change the representatives of final clusters"/>
+
+  </macro>
+
+  <macro name="runtime_tuning">
+    <conditional name="runtime">
+      <param name="tuning" type="select" label="Runtime Memory and Threads">
+        <option value="default" selected="true">Use Default settings</option>
+        <option value="tune" >Set Runtime options</option>
+      </param>
+      <when value="default"/>
+      <when value="tune">
+        <param name="threads_num" type="integer" value="1" optional="true" label="number of threads; with 0, all CPUs will be used. (default 1)" >
+          <validator type="in_range" message="input cannot be negative." min="0" />
+        </param>
+        <param name="memory_limit" type="integer" value="800" optional="true" label="memory limit (in MB) for the program; 0 for unlimited. (default 800)" >
+          <validator type="in_range" message="input cannot be negative." min="0" />
+        </param>
+        <param name="in_ram" type="boolean" truevalue="-B 1" falsevalue="" checked="false" label="Too big for in Memory calculation"
+               help="Use for huge databases"/>
+      </when>
+    </conditional>
+  </macro>
+
+  <token name="@CITATION_SECTION@">------
+
+**Citation**
+
+For the underlying tool, please cite `Li W, Godzik A. Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences. Bioinformatics. 2006 Jul 1;22(13):1658-9. &lt;http://www.ncbi.nlm.nih.gov/pubmed/16731699&gt;`_
+
+If you use this tool in Galaxy, please cite Blankenberg D, et al. *In preparation.*
+
+  </token>
+</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cd_hit_protein_in.fasta Thu Jun 27 21:27:06 2013 -0500 @@ -0,0 +1,50 @@ +>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2 +MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT +PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP +RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG +YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ +NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF +DLLHSGKSIRTVLTF +>tr|K7D361|K7D361_PANTR Alcohol dehydrogenase 1B (Class I), beta polypeptide OS=Pan troglodytes GN=ADH1B PE=2 SV=1 +MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT +PLPAILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP +RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG +YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ +NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF +DLLHSGKSIRTVLTF +>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2 +MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT +PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP +RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG +YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ +NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF +DLLRSGKSIRTVLTF +>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA +MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV +HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA +AILKSSDVISFHCLGYNRILGGGCACCPFYLICD +>sp|P00338-5|LDHA_HUMAN Isoform 5 of 
L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA +MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV +HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKEVHKQVVERVFT +E +>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3 +MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG +EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI +HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE +VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI +TDVVKMILKPDEEEKIKKSADTLWGIQKELQF +>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2 +MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV +HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE +VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI +SDVVKVTLTHEEEACLKKSADTLWGIQKELQF