Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/Tools/RestrictionEnzyme.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/RestrictionEnzyme.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,1247 @@ +#------------------------------------------------------------------ +# $Id: RestrictionEnzyme.pm,v 1.25.2.1 2003/06/29 00:53:20 jason Exp $ +# +# BioPerl module Bio::Tools::RestrictionEnzyme +# +# Cared for by Steve Chervitz <sac@bioperl.org> +# +# You may distribute this module under the same terms as perl itself +#------------------------------------------------------------------ + +## POD Documentation: + +=head1 NAME + +Bio::Tools::RestrictionEnzyme - Bioperl object for a restriction endonuclease +(cuts DNA at specific locations) + +=head1 SYNOPSIS + +=head2 Object Creation + + require Bio::Tools::RestrictionEnzyme; + + ## Create a new object by name. + + $re1 = new Bio::Tools::RestrictionEnzyme(-NAME =>'EcoRI'); + + ## Create a new object using special syntax + ## which specifies the enzyme name, recognition site, and cut position. + ## Used for enzymes not known to this module. + + $re2 = new Bio::Tools::RestrictionEnzyme(-NAME =>'EcoRV--GAT^ATC', + -MAKE =>'custom'); + + ## Get a list of the resulting fragments when a sequence is cut with + ## the given enzyme. The method expects a Bio::Seq object. + + @fragments = $re2->cut_seq($seqobj); + + ## Get a list of names of all available restriction enzymes + ## known to this module. + + @all = $re->available_list(); + + ## Get the names of restriction enzymes that have 6 bp + ## recognition sequences. + + @sixcutters = $re->available_list(6); + + +=head1 INSTALLATION + +This module is included with the central Bioperl distribution: + + http://bio.perl.org/Core/Latest + ftp://bio.perl.org/pub/DIST + +Follow the installation instructions included in the README file. + +=head1 DESCRIPTION + +The Bio::Tools::RestrictionEnzyme.pm module encapsulates generic data and +methods for using restriction endonucleases for in silico restriction +analysis of DNA sequences. 
+ +=head2 Considerations + +This module is a precursor for a more fully featured version that may do such +things as download data from online databases such as REBASE http://www.neb.com/rebase/. +Thus, there is currently no functionality for obtaining data about commercial +availability for a restriction enzyme. + +At some point in the future, it may make sense to derive RestrictionEnzymes +from a class such as Bio::Enzyme or Bio::Prot::Protein (neither of which now +exists) so that more data about the enzyme and related information can be +easily obtained. + +This module is currently in use at + + http://genome-www.stanford.edu/Sacch3D/analysis/ + + +=head2 Digesting on Runs of N + +To digest a sequence on runs of N's, here's what you can do: + + $re_n = new Bio::Tools::RestrictionEnzyme(-name=>'N--NNNNN', + -make=>'custom'); + +Specify the number of N's you want to match in the -name parameter. +So the above example will recognize and cut at runs of 5 N's. + If you wanted to cut at runs of 10 N's, you would use + + -name => 'N--NNNNNNNNNN' + +Note that you must use a specific number of N's; you cannot use a regexp to +digest at N+ for example, because the actual number of N's at each site is +not recorded when the sequence is analyzed. So cut_locations( ) wouldn't be +correct. + +=head1 EXAMPLES + +See the script examples/restriction.pl in the Bioperl distribution. + +=head1 DEPENDENCIES + +Bio::Tools::RestrictionEnzyme.pm is a concrete class that inherits from +B<Bio::Root::Root> and uses by delegation B<Bio::PrimarySeq>. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other Bioperl +modules. Send your comments and suggestions preferably to one of the Bioperl +mailing lists. Your participation is much appreciated. 
+ + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track of the bugs +and their resolution. Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz, E<lt>sac@bioperl.orgE<gt> + +=head1 COPYRIGHT + +Copyright (c) 1997-2002 Steve A. Chervitz. All Rights Reserved. +This module is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=head1 SEE ALSO + + Bio::Root::Root - Base class. + Bio::PrimarySeq - Lightweight sequence object. + + http://bio.perl.org/ - Bioperl Project Homepage + +=cut + +# +## +### +#### END of main POD documentation. +### +## +#' + + +=head1 APPENDIX + +Methods beginning with a leading underscore are considered private +and are intended for internal use by this module. They are +B<not> considered part of the public interface and are described here +for documentation purposes only. + +=cut + + +package Bio::Tools::RestrictionEnzyme; +use strict; + +use Bio::Root::Root; +use Exporter; + +use vars qw (@ISA @EXPORT_OK %EXPORT_TAGS $ID $version @RE_available $Revision); + +@ISA = qw(Bio::Root::Root Exporter); +@EXPORT_OK = qw(@RE_available); +%EXPORT_TAGS = ( std => [qw(@RE_available)] ); + +$ID = 'Bio::Tools::RestrictionEnzyme'; +$version = 0.04; +$Revision = '$Id: RestrictionEnzyme.pm,v 1.25.2.1 2003/06/29 00:53:20 jason Exp $'; #' + +# Generated from REBASE version 208 (strider format), dated Aug 1 2002 +# using scripts/contributed/rebase2list.pl +# Syntax: RE-name => 'SITE CUTS-AT' where SITE and CUTS-AT are separated +# by a space. 
+ +my %RE = ( + 'AasI' => 'GACNNNNNNGTC 7', + 'AatI' => 'AGGCCT 3', + 'AatII' => 'GACGTC 5', + 'AauI' => 'TGTACA 1', + 'AccI' => 'GTMKAC 2', + 'AccII' => 'CGCG 2', + 'AccIII' => 'TCCGGA 1', + 'Acc16I' => 'TGCGCA 3', + 'Acc65I' => 'GGTACC 1', + 'Acc113I' => 'AGTACT 3', + 'AccB1I' => 'GGYRCC 1', + 'AccB7I' => 'CCANNNNNTGG 7', + 'AclI' => 'AACGTT 2', + 'AcsI' => 'RAATTY 1', + 'AcvI' => 'CACGTG 3', + 'AcyI' => 'GRCGYC 2', + 'AdeI' => 'CACNNNGTG 6', + 'AfaI' => 'GTAC 2', + 'AfeI' => 'AGCGCT 3', + 'AflI' => 'GGWCC 1', + 'AflII' => 'CTTAAG 1', + 'AflIII' => 'ACRYGT 1', + 'AgeI' => 'ACCGGT 1', + 'AhaIII' => 'TTTAAA 3', + 'AhdI' => 'GACNNNNNGTC 6', + 'AhlI' => 'ACTAGT 1', + 'AleI' => 'CACNNNNGTG 5', + 'AluI' => 'AGCT 2', + 'Alw21I' => 'GWGCWC 5', + 'Alw44I' => 'GTGCAC 1', + 'AlwNI' => 'CAGNNNCTG 6', + 'Ama87I' => 'CYCGRG 1', + 'AocI' => 'CCTNAGG 2', + 'Aor51HI' => 'AGCGCT 3', + 'ApaI' => 'GGGCCC 5', + 'ApaBI' => 'GCANNNNNTGC 8', + 'ApaLI' => 'GTGCAC 1', + 'ApoI' => 'RAATTY 1', + 'AscI' => 'GGCGCGCC 2', + 'AseI' => 'ATTAAT 2', + 'AsiAI' => 'ACCGGT 1', + 'AsiSI' => 'GCGATCGC 5', + 'AsnI' => 'ATTAAT 2', + 'AspI' => 'GACNNNGTC 4', + 'Asp700I' => 'GAANNNNTTC 5', + 'Asp718I' => 'GGTACC 1', + 'AspEI' => 'GACNNNNNGTC 6', + 'AspHI' => 'GWGCWC 5', + 'AspLEI' => 'GCGC 3', + 'AspS9I' => 'GGNCC 1', + 'AsuI' => 'GGNCC 1', + 'AsuII' => 'TTCGAA 2', + 'AsuC2I' => 'CCSGG 2', + 'AsuNHI' => 'GCTAGC 1', + 'AvaI' => 'CYCGRG 1', + 'AvaII' => 'GGWCC 1', + 'AviII' => 'TGCGCA 3', + 'AvrII' => 'CCTAGG 1', + 'AxyI' => 'CCTNAGG 2', + 'BalI' => 'TGGCCA 3', + 'BamHI' => 'GGATCC 1', + 'BanI' => 'GGYRCC 1', + 'BanII' => 'GRGCYC 5', + 'BanIII' => 'ATCGAT 2', + 'BbeI' => 'GGCGCC 5', + 'BbrPI' => 'CACGTG 3', + 'BbuI' => 'GCATGC 5', + 'Bbv12I' => 'GWGCWC 5', + 'BclI' => 'TGATCA 1', + 'BcnI' => 'CCSGG 2', + 'BcoI' => 'CYCGRG 1', + 'BcuI' => 'ACTAGT 1', + 'BetI' => 'WCCGGW 1', + 'BfaI' => 'CTAG 1', + 'BfmI' => 'CTRYAG 1', + 'BfrI' => 'CTTAAG 1', + 'BfrBI' => 'ATGCAT 3', + 'BfuCI' => 'GATC 0', + 'BglI' => 
'GCCNNNNNGGC 7', + 'BglII' => 'AGATCT 1', + 'BlnI' => 'CCTAGG 1', + 'BloHII' => 'CTGCAG 5', + 'BlpI' => 'GCTNAGC 2', + 'Bme18I' => 'GGWCC 1', + 'Bme1390I' => 'CCNGG 2', + 'Bme1580I' => 'GKGCMC 5', + 'BmtI' => 'GCTAGC 5', + 'BmyI' => 'GDGCHC 5', + 'BoxI' => 'GACNNNNGTC 5', + 'Bpu14I' => 'TTCGAA 2', + 'Bpu1102I' => 'GCTNAGC 2', + 'Bsa29I' => 'ATCGAT 2', + 'BsaAI' => 'YACGTR 3', + 'BsaBI' => 'GATNNNNATC 5', + 'BsaHI' => 'GRCGYC 2', + 'BsaJI' => 'CCNNGG 1', + 'BsaOI' => 'CGRYCG 4', + 'BsaWI' => 'WCCGGW 1', + 'BscI' => 'ATCGAT 2', + 'Bsc4I' => 'CCNNNNNNNGG 7', + 'BscBI' => 'GGNNCC 3', + 'BscFI' => 'GATC 0', + 'Bse8I' => 'GATNNNNATC 5', + 'Bse21I' => 'CCTNAGG 2', + 'Bse118I' => 'RCCGGY 1', + 'BseAI' => 'TCCGGA 1', + 'BseBI' => 'CCWGG 2', + 'BseCI' => 'ATCGAT 2', + 'BseDI' => 'CCNNGG 1', + 'BseJI' => 'GATNNNNATC 5', + 'BseLI' => 'CCNNNNNNNGG 7', + 'BsePI' => 'GCGCGC 1', + 'BseSI' => 'GKGCMC 5', + 'BseX3I' => 'CGGCCG 1', + 'BshI' => 'GGCC 2', + 'Bsh1236I' => 'CGCG 2', + 'Bsh1285I' => 'CGRYCG 4', + 'BshFI' => 'GGCC 2', + 'BshNI' => 'GGYRCC 1', + 'BshTI' => 'ACCGGT 1', + 'BsiBI' => 'GATNNNNATC 5', + 'BsiCI' => 'TTCGAA 2', + 'BsiEI' => 'CGRYCG 4', + 'BsiHKAI' => 'GWGCWC 5', + 'BsiHKCI' => 'CYCGRG 1', + 'BsiLI' => 'CCWGG 2', + 'BsiMI' => 'TCCGGA 1', + 'BsiQI' => 'TGATCA 1', + 'BsiSI' => 'CCGG 1', + 'BsiWI' => 'CGTACG 1', + 'BsiXI' => 'ATCGAT 2', + 'BsiYI' => 'CCNNNNNNNGG 7', + 'BsiZI' => 'GGNCC 1', + 'BslI' => 'CCNNNNNNNGG 7', + 'BsoBI' => 'CYCGRG 1', + 'Bsp13I' => 'TCCGGA 1', + 'Bsp19I' => 'CCATGG 1', + 'Bsp68I' => 'TCGCGA 3', + 'Bsp106I' => 'ATCGAT 2', + 'Bsp119I' => 'TTCGAA 2', + 'Bsp120I' => 'GGGCCC 1', + 'Bsp143I' => 'GATC 0', + 'Bsp143II' => 'RGCGCY 5', + 'Bsp1286I' => 'GDGCHC 5', + 'Bsp1407I' => 'TGTACA 1', + 'Bsp1720I' => 'GCTNAGC 2', + 'BspA2I' => 'CCTAGG 1', + 'BspCI' => 'CGATCG 4', + 'BspDI' => 'ATCGAT 2', + 'BspEI' => 'TCCGGA 1', + 'BspHI' => 'TCATGA 1', + 'BspLI' => 'GGNNCC 3', + 'BspLU11I' => 'ACATGT 1', + 'BspMII' => 'TCCGGA 1', + 'BspTI' => 'CTTAAG 1', + 
'BspT104I' => 'TTCGAA 2', + 'BspT107I' => 'GGYRCC 1', + 'BspXI' => 'ATCGAT 2', + 'BsrBRI' => 'GATNNNNATC 5', + 'BsrFI' => 'RCCGGY 1', + 'BsrGI' => 'TGTACA 1', + 'BssAI' => 'RCCGGY 1', + 'BssECI' => 'CCNNGG 1', + 'BssHI' => 'CTCGAG 1', + 'BssHII' => 'GCGCGC 1', + 'BssKI' => 'CCNGG 0', + 'BssNAI' => 'GTATAC 3', + 'BssT1I' => 'CCWWGG 1', + 'Bst98I' => 'CTTAAG 1', + 'Bst1107I' => 'GTATAC 3', + 'BstACI' => 'GRCGYC 2', + 'BstAPI' => 'GCANNNNNTGC 7', + 'BstBI' => 'TTCGAA 2', + 'BstBAI' => 'YACGTR 3', + 'Bst4CI' => 'ACNGT 3', + 'BstC8I' => 'GCNNGC 3', + 'BstDEI' => 'CTNAG 1', + 'BstDSI' => 'CCRYGG 1', + 'BstEII' => 'GGTNACC 1', + 'BstENI' => 'CCTNNNNNAGG 5', + 'BstENII' => 'GATC 0', + 'BstFNI' => 'CGCG 2', + 'BstH2I' => 'RGCGCY 5', + 'BstHHI' => 'GCGC 3', + 'BstHPI' => 'GTTAAC 3', + 'BstKTI' => 'GATC 3', + 'BstMAI' => 'CTGCAG 5', + 'BstMCI' => 'CGRYCG 4', + 'BstMWI' => 'GCNNNNNNNGC 7', + 'BstNI' => 'CCWGG 2', + 'BstNSI' => 'RCATGY 5', + 'BstOI' => 'CCWGG 2', + 'BstPI' => 'GGTNACC 1', + 'BstPAI' => 'GACNNNNGTC 5', + 'BstSCI' => 'CCNGG 0', + 'BstSFI' => 'CTRYAG 1', + 'BstSNI' => 'TACGTA 3', + 'BstUI' => 'CGCG 2', + 'Bst2UI' => 'CCWGG 2', + 'BstXI' => 'CCANNNNNNTGG 8', + 'BstX2I' => 'RGATCY 1', + 'BstYI' => 'RGATCY 1', + 'BstZI' => 'CGGCCG 1', + 'BstZ17I' => 'GTATAC 3', + 'Bsu15I' => 'ATCGAT 2', + 'Bsu36I' => 'CCTNAGG 2', + 'BsuRI' => 'GGCC 2', + 'BsuTUI' => 'ATCGAT 2', + 'BtgI' => 'CCRYGG 1', + 'BthCI' => 'GCNGC 4', + 'Cac8I' => 'GCNNGC 3', + 'CaiI' => 'CAGNNNCTG 6', + 'CauII' => 'CCSGG 2', + 'CciNI' => 'GCGGCCGC 2', + 'CelII' => 'GCTNAGC 2', + 'CfoI' => 'GCGC 3', + 'CfrI' => 'YGGCCR 1', + 'Cfr9I' => 'CCCGGG 1', + 'Cfr10I' => 'RCCGGY 1', + 'Cfr13I' => 'GGNCC 1', + 'Cfr42I' => 'CCGCGG 4', + 'ChaI' => 'GATC 4', + 'ClaI' => 'ATCGAT 2', + 'CpoI' => 'CGGWCCG 2', + 'CspI' => 'CGGWCCG 2', + 'Csp6I' => 'GTAC 1', + 'Csp45I' => 'TTCGAA 2', + 'CspAI' => 'ACCGGT 1', + 'CviAII' => 'CATG 1', + 'CviJI' => 'RGCY 2', + 'CviRI' => 'TGCA 2', + 'CviTI' => 'RGCY 2', + 'CvnI' => 'CCTNAGG 2', + 
'DdeI' => 'CTNAG 1', + 'DpnI' => 'GATC 2', + 'DpnII' => 'GATC 0', + 'DraI' => 'TTTAAA 3', + 'DraII' => 'RGGNCCY 2', + 'DraIII' => 'CACNNNGTG 6', + 'DrdI' => 'GACNNNNNNGTC 7', + 'DsaI' => 'CCRYGG 1', + 'DseDI' => 'GACNNNNNNGTC 7', + 'EaeI' => 'YGGCCR 1', + 'EagI' => 'CGGCCG 1', + 'Eam1105I' => 'GACNNNNNGTC 6', + 'Ecl136II' => 'GAGCTC 3', + 'EclHKI' => 'GACNNNNNGTC 6', + 'EclXI' => 'CGGCCG 1', + 'Eco24I' => 'GRGCYC 5', + 'Eco32I' => 'GATATC 3', + 'Eco47I' => 'GGWCC 1', + 'Eco47III' => 'AGCGCT 3', + 'Eco52I' => 'CGGCCG 1', + 'Eco72I' => 'CACGTG 3', + 'Eco81I' => 'CCTNAGG 2', + 'Eco88I' => 'CYCGRG 1', + 'Eco91I' => 'GGTNACC 1', + 'Eco105I' => 'TACGTA 3', + 'Eco130I' => 'CCWWGG 1', + 'Eco147I' => 'AGGCCT 3', + 'EcoHI' => 'CCSGG 0', + 'EcoICRI' => 'GAGCTC 3', + 'EcoNI' => 'CCTNNNNNAGG 5', + 'EcoO65I' => 'GGTNACC 1', + 'EcoO109I' => 'RGGNCCY 2', + 'EcoRI' => 'GAATTC 1', + 'EcoRII' => 'CCWGG 0', + 'EcoRV' => 'GATATC 3', + 'EcoT14I' => 'CCWWGG 1', + 'EcoT22I' => 'ATGCAT 5', + 'EcoT38I' => 'GRGCYC 5', + 'EgeI' => 'GGCGCC 3', + 'EheI' => 'GGCGCC 3', + 'ErhI' => 'CCWWGG 1', + 'EsaBC3I' => 'TCGA 2', + 'EspI' => 'GCTNAGC 2', + 'FatI' => 'CATG 0', + 'FauNDI' => 'CATATG 2', + 'FbaI' => 'TGATCA 1', + 'FblI' => 'GTMKAC 2', + 'FmuI' => 'GGNCC 4', + 'FnuDII' => 'CGCG 2', + 'Fnu4HI' => 'GCNGC 2', + 'FriOI' => 'GRGCYC 5', + 'FseI' => 'GGCCGGCC 6', + 'FspI' => 'TGCGCA 3', + 'FspAI' => 'RTGCGCAY 4', + 'Fsp4HI' => 'GCNGC 2', + 'FunI' => 'AGCGCT 3', + 'FunII' => 'GAATTC 1', + 'HaeI' => 'WGGCCW 3', + 'HaeII' => 'RGCGCY 5', + 'HaeIII' => 'GGCC 2', + 'HapII' => 'CCGG 1', + 'HgiAI' => 'GWGCWC 5', + 'HgiCI' => 'GGYRCC 1', + 'HgiJII' => 'GRGCYC 5', + 'HhaI' => 'GCGC 3', + 'Hin1I' => 'GRCGYC 2', + 'Hin6I' => 'GCGC 1', + 'HinP1I' => 'GCGC 1', + 'HincII' => 'GTYRAC 3', + 'HindII' => 'GTYRAC 3', + 'HindIII' => 'AAGCTT 1', + 'HinfI' => 'GANTC 1', + 'HpaI' => 'GTTAAC 3', + 'HpaII' => 'CCGG 1', + 'Hpy8I' => 'GTNNAC 3', + 'Hpy99I' => 'CGWCG 5', + 'Hpy178III' => 'TCNNGA 2', + 'Hpy188I' => 'TCNGA 3', + 
'Hpy188III' => 'TCNNGA 2', + 'HpyCH4I' => 'CATG 3', + 'HpyCH4III' => 'ACNGT 3', + 'HpyCH4IV' => 'ACGT 1', + 'HpyCH4V' => 'TGCA 2', + 'HpyF10VI' => 'GCNNNNNNNGC 8', + 'Hsp92I' => 'GRCGYC 2', + 'Hsp92II' => 'CATG 4', + 'HspAI' => 'GCGC 1', + 'ItaI' => 'GCNGC 2', + 'KasI' => 'GGCGCC 1', + 'KpnI' => 'GGTACC 5', + 'Kpn2I' => 'TCCGGA 1', + 'KspI' => 'CCGCGG 4', + 'Ksp22I' => 'TGATCA 1', + 'KspAI' => 'GTTAAC 3', + 'Kzo9I' => 'GATC 0', + 'LpnI' => 'RGCGCY 3', + 'LspI' => 'TTCGAA 2', + 'MabI' => 'ACCWGGT 1', + 'MaeI' => 'CTAG 1', + 'MaeII' => 'ACGT 1', + 'MaeIII' => 'GTNAC 0', + 'MamI' => 'GATNNNNATC 5', + 'MboI' => 'GATC 0', + 'McrI' => 'CGRYCG 4', + 'MfeI' => 'CAATTG 1', + 'MflI' => 'RGATCY 1', + 'MhlI' => 'GDGCHC 5', + 'MlsI' => 'TGGCCA 3', + 'MluI' => 'ACGCGT 1', + 'MluNI' => 'TGGCCA 3', + 'Mly113I' => 'GGCGCC 2', + 'Mph1103I' => 'ATGCAT 5', + 'MroI' => 'TCCGGA 1', + 'MroNI' => 'GCCGGC 1', + 'MroXI' => 'GAANNNNTTC 5', + 'MscI' => 'TGGCCA 3', + 'MseI' => 'TTAA 1', + 'MslI' => 'CAYNNNNRTG 5', + 'MspI' => 'CCGG 1', + 'Msp20I' => 'TGGCCA 3', + 'MspA1I' => 'CMGCKG 3', + 'MspCI' => 'CTTAAG 1', + 'MspR9I' => 'CCNGG 2', + 'MssI' => 'GTTTAAAC 4', + 'MstI' => 'TGCGCA 3', + 'MunI' => 'CAATTG 1', + 'MvaI' => 'CCWGG 2', + 'MvnI' => 'CGCG 2', + 'MwoI' => 'GCNNNNNNNGC 7', + 'NaeI' => 'GCCGGC 3', + 'NarI' => 'GGCGCC 2', + 'NciI' => 'CCSGG 2', + 'NcoI' => 'CCATGG 1', + 'NdeI' => 'CATATG 2', + 'NdeII' => 'GATC 0', + 'NgoAIV' => 'GCCGGC 1', + 'NgoMIV' => 'GCCGGC 1', + 'NheI' => 'GCTAGC 1', + 'NlaIII' => 'CATG 4', + 'NlaIV' => 'GGNNCC 3', + 'Nli3877I' => 'CYCGRG 5', + 'NmuCI' => 'GTSAC 0', + 'NotI' => 'GCGGCCGC 2', + 'NruI' => 'TCGCGA 3', + 'NruGI' => 'GACNNNNNGTC 6', + 'NsbI' => 'TGCGCA 3', + 'NsiI' => 'ATGCAT 5', + 'NspI' => 'RCATGY 5', + 'NspIII' => 'CYCGRG 1', + 'NspV' => 'TTCGAA 2', + 'NspBII' => 'CMGCKG 3', + 'OliI' => 'CACNNNNGTG 5', + 'PacI' => 'TTAATTAA 5', + 'PaeI' => 'GCATGC 5', + 'PaeR7I' => 'CTCGAG 1', + 'PagI' => 'TCATGA 1', + 'PalI' => 'GGCC 2', + 'PauI' => 'GCGCGC 1', + 
'PceI' => 'AGGCCT 3', + 'PciI' => 'ACATGT 1', + 'PdiI' => 'GCCGGC 3', + 'PdmI' => 'GAANNNNTTC 5', + 'Pfl23II' => 'CGTACG 1', + 'PflBI' => 'CCANNNNNTGG 7', + 'PflFI' => 'GACNNNGTC 4', + 'PflMI' => 'CCANNNNNTGG 7', + 'PfoI' => 'TCCNGGA 1', + 'PinAI' => 'ACCGGT 1', + 'Ple19I' => 'CGATCG 4', + 'PmaCI' => 'CACGTG 3', + 'PmeI' => 'GTTTAAAC 4', + 'PmlI' => 'CACGTG 3', + 'Ppu10I' => 'ATGCAT 1', + 'PpuMI' => 'RGGWCCY 2', + 'PpuXI' => 'RGGWCCY 2', + 'PshAI' => 'GACNNNNGTC 5', + 'PshBI' => 'ATTAAT 2', + 'PsiI' => 'TTATAA 3', + 'Psp03I' => 'GGWCC 4', + 'Psp5II' => 'RGGWCCY 2', + 'Psp6I' => 'CCWGG 0', + 'Psp1406I' => 'AACGTT 2', + 'PspAI' => 'CCCGGG 1', + 'Psp124BI' => 'GAGCTC 5', + 'PspEI' => 'GGTNACC 1', + 'PspGI' => 'CCWGG 0', + 'PspLI' => 'CGTACG 1', + 'PspN4I' => 'GGNNCC 3', + 'PspOMI' => 'GGGCCC 1', + 'PspPI' => 'GGNCC 1', + 'PspPPI' => 'RGGWCCY 2', + 'PssI' => 'RGGNCCY 5', + 'PstI' => 'CTGCAG 5', + 'PsuI' => 'RGATCY 1', + 'PsyI' => 'GACNNNGTC 4', + 'PvuI' => 'CGATCG 4', + 'PvuII' => 'CAGCTG 3', + 'RcaI' => 'TCATGA 1', + 'RsaI' => 'GTAC 2', + 'RsrII' => 'CGGWCCG 2', + 'Rsr2I' => 'CGGWCCG 2', + 'SacI' => 'GAGCTC 5', + 'SacII' => 'CCGCGG 4', + 'SalI' => 'GTCGAC 1', + 'SanDI' => 'GGGWCCC 2', + 'SatI' => 'GCNGC 2', + 'SauI' => 'CCTNAGG 2', + 'Sau96I' => 'GGNCC 1', + 'Sau3AI' => 'GATC 0', + 'SbfI' => 'CCTGCAGG 6', + 'ScaI' => 'AGTACT 3', + 'SciI' => 'CTCGAG 3', + 'ScrFI' => 'CCNGG 2', + 'SdaI' => 'CCTGCAGG 6', + 'SduI' => 'GDGCHC 5', + 'SecI' => 'CCNNGG 1', + 'SelI' => 'CGCG 0', + 'SexAI' => 'ACCWGGT 1', + 'SfcI' => 'CTRYAG 1', + 'SfeI' => 'CTRYAG 1', + 'SfiI' => 'GGCCNNNNNGGCC 8', + 'SfoI' => 'GGCGCC 3', + 'Sfr274I' => 'CTCGAG 1', + 'Sfr303I' => 'CCGCGG 4', + 'SfuI' => 'TTCGAA 2', + 'SgfI' => 'GCGATCGC 5', + 'SgrAI' => 'CRCCGGYG 2', + 'SgrBI' => 'CCGCGG 4', + 'SinI' => 'GGWCC 1', + 'SlaI' => 'CTCGAG 1', + 'SmaI' => 'CCCGGG 3', + 'SmiI' => 'ATTTAAAT 4', + 'SmiMI' => 'CAYNNNNRTG 5', + 'SmlI' => 'CTYRAG 1', + 'SnaBI' => 'TACGTA 3', + 'SpaHI' => 'GCATGC 5', + 'SpeI' => 'ACTAGT 
1', + 'SphI' => 'GCATGC 5', + 'SplI' => 'CGTACG 1', + 'SrfI' => 'GCCCGGGC 4', + 'Sse9I' => 'AATT 0', + 'Sse232I' => 'CGCCGGCG 2', + 'Sse8387I' => 'CCTGCAGG 6', + 'Sse8647I' => 'AGGWCCT 2', + 'SseBI' => 'AGGCCT 3', + 'SspI' => 'AATATT 3', + 'SspBI' => 'TGTACA 1', + 'SstI' => 'GAGCTC 5', + 'SstII' => 'CCGCGG 4', + 'StuI' => 'AGGCCT 3', + 'StyI' => 'CCWWGG 1', + 'SunI' => 'CGTACG 1', + 'SwaI' => 'ATTTAAAT 4', + 'TaaI' => 'ACNGT 3', + 'TaiI' => 'ACGT 4', + 'TaqI' => 'TCGA 1', + 'TasI' => 'AATT 0', + 'TatI' => 'WGTACW 1', + 'TauI' => 'GCSGC 4', + 'TelI' => 'GACNNNGTC 4', + 'TfiI' => 'GAWTC 1', + 'ThaI' => 'CGCG 2', + 'TliI' => 'CTCGAG 1', + 'Tru1I' => 'TTAA 1', + 'Tru9I' => 'TTAA 1', + 'TscI' => 'ACGT 4', + 'TseI' => 'GCWGC 1', + 'Tsp45I' => 'GTSAC 0', + 'Tsp509I' => 'AATT 0', + 'Tsp4CI' => 'ACNGT 3', + 'TspEI' => 'AATT 0', + 'Tth111I' => 'GACNNNGTC 4', + 'TthHB8I' => 'TCGA 1', + 'UnbI' => 'GGNCC 0', + 'Van91I' => 'CCANNNNNTGG 7', + 'Vha464I' => 'CTTAAG 1', + 'VneI' => 'GTGCAC 1', + 'VpaK11AI' => 'GGWCC 0', + 'VpaK11BI' => 'GGWCC 1', + 'VspI' => 'ATTAAT 2', + 'XagI' => 'CCTNNNNNAGG 5', + 'XapI' => 'RAATTY 1', + 'XbaI' => 'TCTAGA 1', + 'XceI' => 'RCATGY 5', + 'XcmI' => 'CCANNNNNNNNNTGG 8', + 'XhoI' => 'CTCGAG 1', + 'XhoII' => 'RGATCY 1', + 'XmaI' => 'CCCGGG 1', + 'XmaIII' => 'CGGCCG 1', + 'XmaCI' => 'CCCGGG 1', + 'XmaJI' => 'CCTAGG 1', + 'XmiI' => 'GTMKAC 2', + 'XmnI' => 'GAANNNNTTC 5', + 'XspI' => 'CTAG 1', + 'ZhoI' => 'ATCGAT 2', + 'ZraI' => 'GACGTC 3', + 'Zsp2I' => 'ATGCAT 5', +); + +@RE_available = sort keys %RE; + + +=head1 new + + Title : new + Purpose : Initializes the RestrictionEnzyme object and calls + : superclass constructor last (Bio:Seq.pm). + Returns : n/a + Argument : Parameters passed to new() + Comments : A RestrictionEnzyme object manages its recognition sequence + : as a Bio::PrimarySeq object. 

=pod

See Also   : L<_make_custom>(), L<_make_standard>(), B<Bio::PrimarySeq.pm::_initialize()>

=cut

#---------------
sub new {
#---------------
    # Constructor. Named arguments read here:
    #   -NAME : enzyme name, or a 'Name--SITE' specification when -MAKE is
    #           'custom'.
    #   -MAKE : 'custom' to parse -NAME with _make_custom(); any other value
    #           (or none) looks -NAME up with _make_standard().
    # Returns the new object. The recognition site is stored as a
    # Bio::PrimarySeq object in $self->{'_seq'}.
    my ($class, @args) = @_;

    my $self = $class->SUPER::new(@args);
    my ($name, $make) = $self->_rearrange([qw(NAME MAKE)], @args);
    $name && $self->name($name);

    my %data = (defined $make && $make eq 'custom')
             ? $self->_make_custom($name)
             : $self->_make_standard($name);

    # The module preamble loads only Bio::Root::Root and Exporter, so make
    # sure the sequence class is loaded before it is instantiated.
    require Bio::PrimarySeq;

    # Direct method call instead of indirect-object syntax
    # ("new Bio::PrimarySeq(...)"), which Perl can mis-parse.
    $self->{'_seq'} = Bio::PrimarySeq->new(%data,
                                           -VERBOSE  => $self->verbose,
                                           -alphabet => 'dna',
                                          );
    return $self;
}


#=head1 _make_standard
#
# Title     : _make_standard
# Usage     : n/a; automatically called by new()
# Purpose   : Permits standard RE object construction from the name of an
#           : enzyme known to this module, such as 'EcoRI'.
# Returns   : Hash containing named parameters (-SEQ, -NAME, -ID) for the
#           : Bio::PrimarySeq.pm constructor.
# Argument  : String containing the enzyme name. Leading and trailing
#           : whitespace is ignored.
# Throws    : Exception if the requested enzyme name is unavailable
#           : or undefined.
#           : NOTE: Case sensitive.
# Comments  : Also stores the cut position in $self->{'_cuts_after'}.
#
#See Also   : L<Bio::PrimarySeq::_initialize()|Bio::PrimarySeq>, L<_make_custom()|_make_custom>
#
#=cut

#------------------
sub _make_standard {
#------------------
    my ($self, $name) = @_;

    # An undefined name is reported by the throw() below rather than
    # tripping over the substitution first.
    $name = '' unless defined $name;
    $name =~ s/^\s+|\s+$//g;

    $self->is_available($name) ||
        $self->throw("Unavailable or undefined enzyme: $name (Note: CASE SENSITIVE)\n" .
                     "Currently available enzymes: \n@RE_available\n");

    # %RE values have the form 'SITE CUTS-AT'.
    my ($site, $cuts_after) = split ' ', $RE{$name};
    $self->{'_cuts_after'} = $cuts_after;

    my %dat;
    $dat{-SEQ}  = $site;
    $dat{-NAME} = $dat{-ID} = $name;

    return %dat;
}


#=head1 _make_custom
#
# Title     : _make_custom
# Usage     : n/a; automatically called by _initialize()
# Purpose   : Permits custom RE object construction from strings
#           : such as 'EcoRI--G^AATTC' as the name of the enzyme.
# Returns   : Hash containing named parameters for Bio::PrimarySeq.pm constructor.
# Argument  : String containing string with special syntax.
# Throws    : Exception if the string has bad syntax.
#           : Warning if the string did not specify cut position.
#           : Places cut site after 5'-most position.
#
#See Also   : L<Bio::PrimarySeq::_initialize()|Bio::PrimarySeq>
#
#=cut

#'
#-----------------
sub _make_custom {
#-----------------
    my ($self, $name) = @_;

    # Treat a missing specification as empty so that the "undefined
    # recognition site" exception below is what the caller sees.
    $name = '' unless defined $name;
    $name =~ s/\s+//g;

    # Expected form: 'EnzymeName--SITE', with an optional '^' in SITE.
    my @parts = split '--', $name;
    my %dat;
    $dat{-NAME} = $dat{-ID} = $parts[0];
    $self->name($parts[0]);  ## Reset name

    # The usage hint used to be handed to throw() as a second positional
    # argument; it is joined into the message here so it is always shown.
    # NOTE(review): Bio::Root::Root::throw() is not visible in this file;
    # presumably it takes the text as its first argument -- confirm.
    my $label = defined $parts[0] ? $parts[0] : '';
    $parts[1] || return $self->throw("Undefined recognition site for $label.\n" .
                                     "Use this syntax: EcoRV--GAT^ATC");

    ## Determine the cuts_after point.
    my $cut_index = index $parts[1], '^';
    if ($cut_index < 0) {
        $cut_index = 0;
        $self->warn("Unknown cut position for $parts[0]. Assuming position 0\n" .
                    "Use carat to specify cut position (e.g., G^AATTC)");
    }
    $self->{'_cuts_after'} = $cut_index;

    ## Save the recognition sequence after removing the '^'
    $parts[1] =~ s/\^//g;
    $dat{-SEQ} = $parts[1];
    return %dat;
}


=head1 cuts_after

 Title     : cuts_after
 Usage     : $num = $re->cuts_after();
 Purpose   : Sets/Gets an integer indicating the position of cleavage
           : relative to the 5' end of the recognition sequence.
 Returns   : Integer
 Argument  : Integer (optional)
 Throws    : Exception if the argument is not an integer.
 Access    : Public
 Comments  : This method is only needed to change the cuts at
           : position. This data is automatically set during
           : construction.

See Also   : L<_make_standard()|_make_standard>, L<_make_custom()|_make_custom>

=cut

#'
#---------------
sub cuts_after {
#---------------
    my $self = shift;
    if (@_) {
        my $num = shift;

        # Accept only an optionally signed run of digits. The previous test
        # ($num == 0 and $num ne '0') let values such as '1abc' through and
        # rejected '00'.
        unless (defined $num && $num =~ /^\s*[-+]?\d+\s*$/) {
            my $shown = defined $num ? $num : 'undef';
            $self->throw("The cuts_after position must be an integer ($shown)");
        }
        $self->{'_cuts_after'} = $num + 0;
    }
    $self->{'_cuts_after'};
}



=head1 site

 Title     : site
 Usage     : $re->site();
 Purpose   : Gets the recognition sequence for the enzyme.
=pod

 Example   : $seq_string = $re->site();
 Returns   : String containing recognition sequence indicating
           : cleavage site as in 'G^AATTC'.
 Argument  : n/a
 Throws    : n/a
 Comments  : If you want a simple string representing the site without
             any '^', use the string() method.

See Also   : L<string()|string>

=cut

#---------
sub site {
#---------
    my ($self) = @_;

    my $recog = $self->seq;          # sequence object holding the site
    my $cut   = $self->cuts_after;

    # No positive cut position: return the bare recognition sequence.
    return $recog->seq unless $cut > 0;

    # Cut at or beyond the last base: the caret goes at the very end.
    my $site_len = $recog->length;
    return $recog->seq . '^' if $cut >= $site_len;

    # Otherwise the caret goes between base $cut and base $cut+1.
    my $left  = $recog->subseq(1, $cut);
    my $right = $recog->subseq($cut + 1, $site_len);
    return $left . '^' . $right;
}


=head1 seq

 Title     : seq
 Usage     : $re->seq();
 Purpose   : Get the Bio::PrimarySeq.pm-derived object representing
           : the recognition sequence
 Returns   : The sequence object stored at construction
           : (use string() for a plain string).
 Argument  : n/a
 Throws    : n/a

See Also   : L<string()|string>, L<revcom()|revcom>

=cut

#---------
sub seq {
#---------
    my ($self) = @_;
    return $self->{'_seq'};
}



=head1 string

 Title     : string
 Usage     : $re->string();
 Purpose   : Get a string representing the recognition sequence.
 Returns   : String. Does NOT contain a '^' representing the cut location
             as returned by the site() method
 Argument  : n/a
 Throws    : n/a
 Comments  : Delegates to the Bio::PrimarySeq-derived object.

See Also   : L<seq()|seq>, L<site()|site>, L<revcom()|revcom>

=cut

#-----------
sub string {
#-----------
    my ($self) = @_;
    return $self->{'_seq'}->seq;
}



=head1 revcom

 Title     : revcom
 Usage     : $re->revcom();
 Purpose   : Get a string representing the reverse complement of
           : the recognition sequence.
=pod

 Returns   : String
 Argument  : n/a
 Throws    : n/a
 Comments  : Delegates to the Bio::PrimarySeq.pm-derived object, but needs to
             get out the string from it, as now Bio::PrimarySeq->revcom makes a
             Bio::PrimarySeq object

See Also   : L<seq()|seq>, L<string()|string>

=cut

#-----------
sub revcom {
#-----------
    my ($self) = @_;
    return $self->{'_seq'}->revcom->seq();
}



=head1 cut_seq

 Title     : cut_seq
 Usage     : $re->cut_seq(<sequence object>);
 Purpose   : Conceptually cut or "digest" a DNA sequence with the given enzyme.
 Example   : @fragments = $re->cut_seq(<sequence object>);
 Returns   : List of strings containing the resulting fragments.
 Argument  : Reference to a Bio::PrimarySeqI-derived object.
 Throws    : Exception if the argument is not a Bio::PrimarySeqI object.
 Comments  : The recognition pattern (plus its reverse complement for
           : non-palindromic sites) is located with a case-insensitive
           : regexp and the input sequence is sliced at each cut point.
           : Every fragment is therefore a substring of the input, and
           : the fragments joined together give back the input.
           : A site touching the start or the end of the sequence can
           : yield an empty first or last fragment.
           : Only one cut position is stored per enzyme, so the same
           : offset from the start of the match is used for forward and
           : reverse-complement matches.
           :
           : There is currently no support for partial digestions.
           : There is currently no support for circular sequences.
           : (This should just involve merging the first and last frag
           : if $seqObj->is_circular returns true).

=cut

#'
#-------------
sub cut_seq {
#-------------
    my ($self, $seqObj) = @_;

    # The eval keeps isa() from dying on an unblessed reference.
    my $is_seq = ref($seqObj)
        && do { local $@; eval { $seqObj->isa('Bio::PrimarySeqI') } };
    if (!$is_seq) {
        my $shown = defined $seqObj ? $seqObj : 'undef';
        $self->throw("Can't cut sequence. Missing or invalid object. " .
                     "seqObj: $shown");
    }

    my $reSeq = $self->seq;

    # Offset of the cut from the start of a matched site; kept inside the
    # match, as site() does when drawing the caret.
    my $cut_offset = $self->{'_cuts_after'};
    $cut_offset = 0 if !defined $cut_offset || $cut_offset < 0;

    # Pattern for the site and, unless it reads the same on both strands
    # (or is the literal run-of-N enzyme), its reverse complement.
    my $pattern = uc $self->_expanded_string;
    if (!$self->palindromic and $self->name ne 'N') {
        my $revseq = $self->_expanded_string( $reSeq->revcom->seq );
        $pattern .= '|'.uc($revseq);
    }
    $self->debug(sprintf("$ID: site seq: %s\n\n", $pattern));
    $self->debug(sprintf("$ID: splitting %s\n\n", $reSeq->seq));

    my $target = $seqObj->seq;
    $target = '' unless defined $target;

    # Slice the input at each cut point. The earlier split()-based code
    # dropped a site at the end of the sequence and re-attached the site
    # template instead of the bases that were actually matched.
    my @re_frags;
    my $frag_start = 0;
    while ($target =~ /(?:$pattern)/ig) {
        my ($hit_start, $hit_end) = ($-[0], $+[0]);
        my $cut = $hit_start + $cut_offset;
        $cut = $hit_end if $cut > $hit_end;
        push @re_frags, substr($target, $frag_start, $cut - $frag_start);
        $frag_start = $cut;
    }
    push @re_frags, substr($target, $frag_start);

    $self->debug("$ID: cut_seq, ".scalar @re_frags. " fragments.\n");

    @re_frags;
}

=head1 cut_locations

 Title     : cut_locations
 Usage     : my $locations = $re->cut_locations(<sequence_object>);
 Purpose   : Report the location of the recognition site(s) within
           : an input sequence.
 Example   : my $locations = $re->cut_locations($seqObj);
 Returns   : Arrayref of 0-based offsets at which each recognition
           : site starts in the input sequence
 Argument  : Reference to a Bio::PrimarySeqI-derived sequence object.
=pod

 Throws    : n/a
 Comments  : Matching is case-insensitive. Only the site as written is
           : searched for, not its reverse complement.

=cut

#-----------------
sub cut_locations {
#-----------------
    my ($self, $seqobj) = @_;

    my $site = $self->_expanded_string;
    my $seq  = $seqobj->seq;

    my @locations;
    while ($seq =~ /(?:$site)/ig) {
        # $-[0] is the offset at which the match starts, i.e. the length
        # of the text preceding it (same value as length($`), without the
        # cost of using $`).
        push @locations, $-[0];
    }
    return \@locations;
}

# Purpose : Expand nucleotide ambiguity codes to their representative letters
#           (as regexp fragments: '.' or a character class).
# Argument: (optional) the string to be expanded. If not supplied, uses
#           the string returned by $self->string().
# Returns : String usable as a regexp. Returned unchanged when the enzyme
#           name is 'N' (the run-of-N digest, where N must match literally).
# Comments: Ambiguity codes are recognized in either case; the expansions
#           are upper case, and callers in this file match
#           case-insensitively.
sub _expanded_string {
    my ($self, $str) = @_;

    $str ||= $self->string;

    if ($self->name ne 'N') {
        my %expansion_for = (
            N => '.',      X => '.',
            R => '[AG]',   Y => '[CT]',
            S => '[GC]',   W => '[AT]',
            M => '[AC]',   K => '[TG]',
            B => '[CGT]',  D => '[AGT]',
            H => '[ACT]',  V => '[ACG]',
        );
        # Single pass, so text inserted by one expansion is never rescanned.
        $str =~ s/([NXRYSWMKBDHV])/$expansion_for{uc $1}/gi;
    }
    return $str;
}


=head1 annotate_seq

 Title     : annotate_seq
 Usage     : $re->annotate_seq(<sequence_object>);
 Purpose   : Identify the location of the recognition site(s) within
           : an input sequence. Uses HTML.
 Example   : $annot_seq = $re->annotate_seq($seqObj);
 Returns   : String containing the annotated sequence: each occurrence
           : of the site is wrapped in <b>...</b>.
 Argument  : Reference to a Bio::PrimarySeq.pm-derived sequence object.
 Throws    : n/a
 Comments  : The annotated sequence must be viewed with a web
           : browser to see the location(s) of the recognition site(s).
           : Matching is case-insensitive and the matched bases are
           : kept as they appear in the input.

=cut

#-----------------
sub annotate_seq {
#-----------------
    my ($self, $seqObj) = @_;

    my $site = $self->_expanded_string;
    my $seq  = $seqObj->seq;

    # Wrap the text that actually matched ($1). Substituting $site itself
    # would write the regexp (e.g. 'CC.GG') into the sequence for any
    # site containing ambiguity codes.
    $seq =~ s{($site)}{<b>$1</b>}gi;
    return $seq;
}


=head1 palindromic

 Title     : palindromic
 Usage     : $re->palindromic();
 Purpose   : Determines if the recognition sequence is palindromic
           : for the current restriction enzyme.
=pod

 Returns   : Boolean
 Argument  : n/a
 Throws    : n/a
 Access    : Public
 Comments  : A palindromic site (EcoRI): 5-GAATTC-3
           :                             3-CTTAAG-5

=cut

#----------------
sub palindromic {
#----------------
    my $self = shift;
    # Palindromic means the site reads the same as its reverse complement.
    $self->string eq $self->revcom;
}



=head1 is_available

 Title     : is_available
 Usage     : $re->is_available(<string containing name of enzyme>);
 Purpose   : Determine if an enzyme is available (to this module).
           : (see the package lexical %RE).
 Example   : $re->is_available('EcoRI');
           : &Bio::Tools::RestrictionEnzyme::is_available($object,'EcoRI');
 Returns   : Boolean (false when no name is given)
 Argument  : String
 Throws    : n/a
 Comments  : This method does NOT give information about
           : commercial availability (yet).
           : Enzyme names are CASE SENSITIVE.

See Also   : L<available_list()|available_list>

=cut

#----------------
sub is_available {
#----------------
    my ($self, $name) = @_;
    # An undefined name is simply "not available".
    defined $name && exists $RE{$name};
}

#--------------
sub available {
#--------------
    # Deprecated alias for is_available(); prints a notice on STDERR.
    my ($self, $name) = @_;
    # Braces are required: "$ID:: available" interpolates the (empty)
    # variable $ID:: instead of $ID, dropping the class name.
    print STDERR "\nDeprecated method: ${ID}::available(); ".
        "use is_available() instead.\n";
    $self->is_available($name);
}


=head2 name

 Title   : name
 Usage   : $obj->name($newval)
 Function: Get/set the name of the enzyme.
 Example : $name = $re->name();
 Returns : value of name
 Args    : newvalue (optional); an undefined value leaves the name unchanged


=cut

sub name{
    my ($obj, $value) = @_;
    if (defined $value) {
        $obj->{'name'} = $value;
    }
    return $obj->{'name'};

}

=head1 available_list

 Title     : available_list
 Usage     : $re->available_list([<integer>]);
 Purpose   : Retrieve a list of currently available enzymes.
 Example   : @all = $re->available_list();  ## All enzymes
           : @six_cutters = $re->available_list(6);  ## All 6-cutters
 Returns   : List of strings
 Argument  : Integer (optional)
 Throws    : n/a
 Comments  : This method may be more appropriate for a REData.pm class.

=pod

See Also   : L<is_available()|is_available>

=cut

#-------------------
sub available_list {
#-------------------
    my $self = shift;
    my $size = shift || 'all';    # no (or false) size means every enzyme

    return @RE_available if $size eq 'all';

    # %RE values are 'SITE CUTS-AT'; keep the enzymes whose SITE has the
    # requested number of characters.
    my @names = grep {
        my ($site) = split /\s/, $RE{$_};
        length($site) == $size;
    } @RE_available;

    return @names;
}

1;