# HG changeset patch # User Jim Johnson # Date 1372386426 18000 # Node ID 75fde37f69e5c8c4411ce985cd5e39da7b43d45e # Parent 211ca88ce0476c15555e219d0627ec496f7058d7 Add cd-hit to protein fastas diff -r 211ca88ce047 -r 75fde37f69e5 cd_hit_est.xml --- a/cd_hit_est.xml Thu Sep 13 20:15:09 2012 -0500 +++ b/cd_hit_est.xml Thu Jun 27 21:27:06 2013 -0500 @@ -1,10 +1,15 @@ - + Cluster a nucleotide dataset into representative sequences cd-hit + + cdhit_macros.xml + - cd-hit-est -i $fasta_in -o rep_seq -c $similarity -n $wordsize $strand + cd-hit-est -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize $strand + #include source=$common_cdhit_options# + #include source=$runtime_tuning# @@ -22,6 +27,8 @@ + + @@ -29,12 +36,40 @@ + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + diff -r 211ca88ce047 -r 75fde37f69e5 cd_hit_protein.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cd_hit_protein.xml Thu Jun 27 21:27:06 2013 -0500 @@ -0,0 +1,115 @@ + + Cluster a protein dataset into representative sequences + + cd-hit + + + cdhit_macros.xml + + + cd-hit -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize + #include source=$common_cdhit_options# + #include source=$runtime_tuning# + + + + + + + + Suggested word size: + 5 for thresholds 0.7 ~ 1.0; + 4 for thresholds 0.6 ~ 0.7; + 3 for thresholds 0.5 ~ 0.6; + 2 for thresholds 0.4 ~ 0.5; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**CD-HIT** + +CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. 
This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database. + +.. _CD-HIT: http://www.bioinformatics.org/cd-hit/ + +------ + +**Inputs** + +cd-hit requires a protein fasta dataset as input. + +------ + +**Outputs** + +A fasta dataset containing representative sequences. + +A text file listing the mapping of sequences to the representative sequences:: + + >Cluster 0 + 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... * + >Cluster 1 + 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80% + 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84% + 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... * + 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84% + 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63% + >Cluster 2 + 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60% + 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... * + 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73% + 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69% + + + + diff -r 211ca88ce047 -r 75fde37f69e5 cdhit_macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cdhit_macros.xml Thu Jun 27 21:27:06 2013 -0500 @@ -0,0 +1,168 @@ + + + + + + + + + + + + + + + + + + + + + + if set to 0, it takes the fasta defline and stops at first space + + + + if set to 0.9, the shorter sequences need to be at least 90% length of the representative of the cluster + + + + if set to 60, the length difference between the shorter sequences and the representative of the cluster can not be bigger than 60 + + + + + local sequence identity, calculated as : number of identical amino acids in alignment divided by the length of the alignment + You must set alignment coverage by length or fraction. 
+ + + + + + + + + if set to 0.9, the alignment must cover 90% of the sequence + + + + if set to 60, and the length of the sequence is 400, then the alignment must be at least 340 (400-60) residues + + + + if set to 0.9, the alignment must cover 90% of the sequence + + + + if set to 60, and the length of the sequence is 400, then the alignment must be at least 340 (400-60) residues + + + + alignment must cover at least this value for both sequences + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ------ + +**Citation** + +For the underlying tool, please cite `DePristo MA, Banks E, Poplin R, Garimella KV, Maguire JR, Hartl C, Philippakis AA, del Angel G, Rivas MA, Hanna M, McKenna A, Fennell TJ, Kernytsky AM, Sivachenko AY, Cibulskis K, Gabriel SB, Altshuler D, Daly MJ. A framework for variation discovery and genotyping using next-generation DNA sequencing data. Nat Genet. 2011 May;43(5):491-8. <http://www.ncbi.nlm.nih.gov/pubmed/21478889>`_ + +If you use this tool in Galaxy, please cite Blankenberg D, et al. 
*In preparation.* + + + diff -r 211ca88ce047 -r 75fde37f69e5 test-data/cd_hit_protein_in.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/cd_hit_protein_in.fasta Thu Jun 27 21:27:06 2013 -0500 @@ -0,0 +1,50 @@ +>sp|P00325|ADH1B_HUMAN Alcohol dehydrogenase 1B OS=Homo sapiens GN=ADH1B PE=1 SV=2 +MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT +PLPVILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP +RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG +YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ +NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF +DLLHSGKSIRTVLTF +>tr|K7D361|K7D361_PANTR Alcohol dehydrogenase 1B (Class I), beta polypeptide OS=Pan troglodytes GN=ADH1B PE=2 SV=1 +MSTAGKVIKCKAAVLWEVKKPFSIEDVEVAPPKAYEVRIKMVAVGICRTDDHVVSGNLVT +PLPAILGHEAAGIVESVGEGVTTVKPGDKVIPLFTPQCGKCRVCKNPESNYCLKNDLGNP +RGTLQDGTRRFTCRGKPIHHFLGTSTFSQYTVVDENAVAKIDAASPLEKVCLIGCGFSTG +YGSAVNVAKVTPGSTCAVFGLGGVGLSAVMGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYKKPIQEVLKEMTDGGVDFSFEVIGRLDTMMASLLCCHEACGTSVIVGVPPASQ +NLSINPMLLLTGRTWKGAVYGGFKSKEGIPKLVADFMAKKFSLDALITHVLPFEKINEGF +DLLHSGKSIRTVLTF +>sp|P00329|ADH1_MOUSE Alcohol dehydrogenase 1 OS=Mus musculus GN=Adh1 PE=2 SV=2 +MSTAGKVIKCKAAVLWELHKPFTIEDIEVAPPKAHEVRIKMVATGVCRSDDHVVSGTLVT +PLPAVLGHEGAGIVESVGEGVTCVKPGDKVIPLFSPQCGECRICKHPESNFCSRSDLLMP +RGTLREGTSRFSCKGKQIHNFISTSTFSQYTVVDDIAVAKIDGASPLDKVCLIGCGFSTG +YGSAVKVAKVTPGSTCAVFGLGGVGLSVIIGCKAAGAARIIAVDINKDKFAKAKELGATE +CINPQDYSKPIQEVLQEMTDGGVDFSFEVIGRLDTMTSALLSCHAACGVSVVVGVPPNAQ +NLSMNPMLLLLGRTWKGAIFGGFKSKDSVPKLVADFMAKKFPLDPLITHVLPFEKINEAF +DLLRSGKSIRTVLTF +>sp|P00338-2|LDHA_HUMAN Isoform 2 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA +MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV 
+HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKECRYTLGDPKGA +AILKSSDVISFHCLGYNRILGGGCACCPFYLICD +>sp|P00338-5|LDHA_HUMAN Isoform 5 of L-lactate dehydrogenase A chain OS=Homo sapiens GN=LDHA +MATLKDQLIYNLLKEEQTPQNKITVVGVGAVGMACAISILMKDLADELALVDVIEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSKLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPNCKLLIVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV +HPLSCHGWVLGEHGDSSVPVWSGMNVAGVSLKTLHPDLGTDKDKEQWKEVHKQVVERVFT +E +>sp|P00340|LDHA_CHICK L-lactate dehydrogenase A chain OS=Gallus gallus GN=LDHA PE=1 SV=3 +MSLKDHLIHNVHKEEHAHAHNKISVVGVGAVGMACAISILMKDLADELTLVDVVEDKLKG +EMLDLQHGSLFLKTPKIISGKDYSVTAHSKLVIVTAGARQQEGESRLNLVQRNVNIFKFI +IPNVVKYSPDCKLLIVSNPVDILTYVAWKISGFPKHRVIGSGCNLDSARFRHLMGERLGI +HPLSCHGWIVGEHGDSSVPVWSGVNVAGVSLKALHPDMGTDADKEHWKEVHKQVVDSAYE +VIKLKGYTSWAIGLSVADLAETIMKNLRRVHPISTAVKGMHGIKDDVFLSVPCVLGSSGI +TDVVKMILKPDEEEKIKKSADTLWGIQKELQF +>sp|P19858|LDHA_BOVIN L-lactate dehydrogenase A chain OS=Bos taurus GN=LDHA PE=2 SV=2 +MATLKDQLIQNLLKEEHVPQNKITIVGVGAVGMACAISILMKDLADEVALVDVMEDKLKG +EMMDLQHGSLFLRTPKIVSGKDYNVTANSRLVIITAGARQQEGESRLNLVQRNVNIFKFI +IPNIVKYSPNCKLLVVSNPVDILTYVAWKISGFPKNRVIGSGCNLDSARFRYLMGERLGV +HPLSCHGWILGEHGDSSVPVWSGVNVAGVSLKNLHPELGTDADKEQWKAVHKQVVDSAYE +VIKLKGYTSWAIGLSVADLAESIMKNLRRVHPISTMIKGLYGIKEDVFLSVPCILGQNGI +SDVVKVTLTHEEEACLKKSADTLWGIQKELQF