Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $ | |
| 2 # | |
| 3 # bioperl module for Bio::LiveSeq::IO::SRS | |
| 4 # | |
| 5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net> | |
| 6 # | |
| 7 # Copyright Joseph Insana | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 # | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 my $db="EMBL"; | |
| 20 my $acc_id="M20132"; | |
| 21 my $query="embl-acc:$acc_id"; | |
| 22 | |
| 23 my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query"); | |
| 24 | |
| 25 my @translationobjects=$loader->entry2liveseq(); | |
| 26 | |
| 27 my $gene="AR"; | |
| 28 my $gene=$loader->gene2liveseq("gene"); | |
| 29 | |
| 30 NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL. | |
| 31 | |
| 32 =head1 DESCRIPTION | |
| 33 | |
| 34 This package uses the SRS (Sequence Retrieval System) to fetch a sequence | |
| 35 database entry, analyse it and create LiveSeq objects out of it. | |
| 36 | |
| 37 An embl-acc ID has to be passed to this package which will return references | |
| 38 to all translation objects created from the EMBL entry. | |
| 39 References to Transcription, DNA and Exon objects can all be retrieved departing | |
| 40 from these. | |
| 41 | |
| 42 Alternatively, a specific "gene" name can be specified, together with the | |
| 43 embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene | |
| 44 features attached/created. | |
| 45 | |
| 46 =head1 AUTHOR - Joseph A.L. Insana | |
| 47 | |
| 48 Email: Insana@ebi.ac.uk, jinsana@gmx.net | |
| 49 | |
| 50 Address: | |
| 51 | |
| 52 EMBL Outstation, European Bioinformatics Institute | |
| 53 Wellcome Trust Genome Campus, Hinxton | |
| 54 Cambs. CB10 1SD, United Kingdom | |
| 55 | |
| 56 =head1 APPENDIX | |
| 57 | |
| 58 The rest of the documentation details each of the object | |
| 59 methods. Internal methods are usually preceded with a _ | |
| 60 | |
| 61 =cut | |
| 62 | |
| 63 # Let the code begin... | |
| 64 | |
| 65 package Bio::LiveSeq::IO::SRS; | |
| 66 $VERSION=2.4; | |
| 67 | |
| 68 # Version history: | |
| 69 # Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm | |
| 70 # Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created | |
| 71 # Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash | |
| 72 # Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range | |
| 73 # Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division | |
| 74 # Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash | |
| 75 # Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names | |
| 76 # Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features | |
| 77 # Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!) | |
| 78 # Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq | |
| 79 # Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names | |
| 80 # Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added | |
| 81 | |
| 82 use strict; | |
| 83 use Carp qw(cluck croak carp); | |
| 84 use vars qw($VERSION @ISA); | |
| 85 use lib $ENV{SRSEXE}; | |
| 86 use srsperl; | |
| 87 use Bio::Tools::CodonTable; # for novelaasequence2gene | |
| 88 | |
| 89 use Bio::LiveSeq::IO::Loader 2.2; | |
| 90 | |
| 91 @ISA=qw(Bio::LiveSeq::IO::Loader); | |
| 92 | |
| 93 # This package can in the future host other databases loading subroutines. | |
| 94 # e.g. ensembl2hash | |
| 95 | |
| 96 =head2 load | |
| 97 | |
| 98 Title : load | |
| 99 Usage : my $acc_id="M20132"; | |
| 100 my $query="embl-acc:$acc_id"; | |
| 101 $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query"); | |
| 102 | |
| 103 Function: loads an entry with SRS from a database into a hash | |
| 104 Returns : reference to a new object of class IO::SRS holding an entry | |
| 105 Errorcode 0 | |
| 106 Args : an SRS query resulting in one fetched EMBL (by default) entry | |
| 107 | |
| 108 =cut | |
| 109 | |
| 110 sub load { | |
| 111 my ($thing, %args) = @_; | |
| 112 my $class = ref($thing) || $thing; | |
| 113 my ($obj,%loader); | |
| 114 | |
| 115 my ($db,$query)=($args{-db},$args{-query}); | |
| 116 | |
| 117 if (defined($db)) { | |
| 118 unless ($db eq "EMBL") { | |
| 119 carp "Note: only EMBL now supported!"; | |
| 120 return(0); | |
| 121 } | |
| 122 } else { | |
| 123 $db="EMBL"; | |
| 124 } | |
| 125 | |
| 126 my $hashref; | |
| 127 if ($db eq "EMBL") { | |
| 128 my $test_transl=0; # change to 0 to avoid comparison of translation | |
| 129 | |
| 130 # these can be changed for future needs | |
| 131 my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA); | |
| 132 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table); | |
| 133 | |
| 134 # dunno yet how to implement test_transl again.... | |
| 135 # probably on a one-on-one basis with each translation? | |
| 136 if ($test_transl) { | |
| 137 push (@embl_valid_qual_names,"translation"); # needed for test_transl | |
| 138 } | |
| 139 | |
| 140 # not to have the whole program die because of SRS death | |
| 141 eval { | |
| 142 $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names); | |
| 143 }; | |
| 144 my $errormsg; | |
| 145 if ($@) { | |
| 146 foreach $errormsg (split(/\n/,$@)) { | |
| 147 if (index($errormsg,"in cleanup") == -1) { | |
| 148 carp "SRS EMBL Loader: $@"; | |
| 149 } | |
| 150 } | |
| 151 } | |
| 152 } | |
| 153 unless ($hashref) { return (0); } | |
| 154 | |
| 155 %loader = (db => $db, query => $query, hash => $hashref); | |
| 156 $obj = \%loader; | |
| 157 $obj = bless $obj, $class; | |
| 158 return $obj; | |
| 159 } | |
| 160 | |
| 161 =head2 embl2hash | |
| 162 | |
| 163 Title : embl2hash | |
| 164 Function: retrieves with SRS an EMBL entry, parses it and creates | |
| 165 a hash that contains all the information. | |
| 166 Returns : a reference to a hash | |
| 167 Errorcode: 0 | |
| 168 Args : an SRS query resulting in one fetched EMBL entry | |
| 169 i.e. a string in a form like "embl-acc:accession_number" | |
| 170 two array references to skip features and qualifiers (for | |
| 171 performance) | |
| 172 Example: @valid_features=qw(CDS exon prim_transcript mRNA); | |
| 173 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family); | |
| 174 $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers); | |
| 175 | |
| 176 =cut | |
| 177 | |
| 178 # this has to be defined here as it is the only thing really proper to | |
| 179 # the /SRS/ loader. Every loader has to sport his own "entry2hash" function | |
| 180 # the main functions will be in the Loader.pm | |
| 181 # arguments: embl SRS query resulting in one fetched EMBL entry | |
| 182 # to skip features and qualifiers (for performance), two array | |
| 183 # references must be passed (this can change into string arguments to | |
| 184 # be passed....) | |
| 185 # returns: a reference to a hash containing the important features requested | |
| 186 sub embl2hash { | |
| 187 my ($i,$j); | |
| 188 my $query=$_[0]; | |
| 189 my %valid_features; my %valid_names; | |
| 190 if ($_[1]) { | |
| 191 %valid_features = map {$_, 1} @{$_[1]}; # to skip features | |
| 192 } | |
| 193 if ($_[2]) { | |
| 194 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers | |
| 195 } | |
| 196 # SRS API used to fetch all relevant fields | |
| 197 my $sess = new Session; | |
| 198 my $set = $sess->query("[$query]", ""); | |
| 199 my $numEntries=$set->size(); | |
| 200 if ($numEntries < 1) { | |
| 201 carp "No sequence entry found for the query $query"; | |
| 202 return (0); | |
| 203 } elsif ($numEntries > 1) { | |
| 204 carp "Too many entries found for the input given"; | |
| 205 return (0); | |
| 206 } else { | |
| 207 my $entry = $set->getEntry(0); | |
| 208 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division); | |
| 209 # Fetch what we can fetch without the loader | |
| 210 $entry_ID = $entry->fieldToString("id","","",""); | |
| 211 $entry_AccNumber = $entry->fieldToString("acc","","",""); | |
| 212 $entry_Molecule = $entry->fieldToString("mol","","",""); | |
| 213 $entry_Division = $entry->fieldToString("div","","",""); | |
| 214 $entry_Description = $entry->fieldToString("des","","",""); | |
| 215 $entry_Description =~ s/\n/ /g; | |
| 216 $entry_Organism = $entry->fieldToString("org","","",""); | |
| 217 $entry_SeqLength = $entry->fieldToString("sl","","",""); | |
| 218 # Now use the loader | |
| 219 my $loadedentry = $entry->load("EMBL"); | |
| 220 # Fetch the rest via the loader | |
| 221 $entry_Sequence = $loadedentry->attrStr("sequence"); | |
| 222 $entry_Sequence =~ s/\n//g; # from plain format to raw string | |
| 223 | |
| 224 # put into the hash | |
| 225 my %entryhash; | |
| 226 $entryhash{ID}=$entry_ID; | |
| 227 $entryhash{AccNumber}=$entry_AccNumber; | |
| 228 $entryhash{Molecule}=$entry_Molecule; | |
| 229 $entryhash{Division}=$entry_Division; | |
| 230 $entryhash{Description}=$entry_Description; | |
| 231 $entryhash{Organism}=$entry_Organism; | |
| 232 $entryhash{Sequence}=$entry_Sequence; | |
| 233 $entryhash{SeqLength}=$entry_SeqLength; | |
| 234 | |
| 235 # create features array | |
| 236 my $features = $loadedentry->attrObjList("features"); | |
| 237 my $featuresnumber= $features->size(); | |
| 238 $entryhash{FeaturesNumber}=$featuresnumber; | |
| 239 my ($feature,$feature_name,$feature_location); | |
| 240 my ($feature_qual_names,$feature_qual_values); | |
| 241 my ($feature_qual_name,$feature_qual_value); | |
| 242 my ($feature_qual_number,$feature_qual_number2); | |
| 243 my @features; | |
| 244 for $i (0..$featuresnumber-1) { | |
| 245 my %feature; | |
| 246 $feature = $features->get($i); | |
| 247 $feature_name = $feature->attrStr("FtKey"); | |
| 248 | |
| 249 #unless ($feature_name eq "CDS") { | |
| 250 unless ($valid_features{$feature_name}) { | |
| 251 #print "not valid feature: $feature_name\n"; | |
| 252 next; | |
| 253 } | |
| 254 #print "now processing feature: $feature_name\n"; | |
| 255 $feature_location = $feature->attrStr("FtLocation"); | |
| 256 $feature_location =~ s/\n//g; | |
| 257 $feature_qual_names= $feature->attrStrList("FtQualNames"); | |
| 258 $feature_qual_values= $feature->attrStrList("FtQualValues"); | |
| 259 $feature_qual_number= $feature_qual_names->size(); | |
| 260 $feature_qual_number2= $feature_qual_values->size(); # paranoia check | |
| 261 if ($feature_qual_number > $feature_qual_number2) { | |
| 262 carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values."); | |
| 263 # this can happen, e.g. "/partial", let's move on, just issue a warning | |
| 264 #return (0); | |
| 265 } elsif ($feature_qual_number < $feature_qual_number2) { | |
| 266 carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped"); | |
| 267 return (0); | |
| 268 } | |
| 269 #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG | |
| 270 | |
| 271 # Put things inside hash | |
| 272 $feature{position}=$i; | |
| 273 $feature{name}=$feature_name; | |
| 274 | |
| 275 # new range field in hash | |
| 276 my @range; | |
| 277 if ($feature_name eq "CDS") { | |
| 278 @range=cdslocation2transcript($feature_location); | |
| 279 $feature{locationtype}="joined"; | |
| 280 } else { | |
| 281 if (index($feature_location,"join") == -1) { | |
| 282 @range=location2range($feature_location); | |
| 283 } else { # complex location in feature different than CDS | |
| 284 @range=joinedlocation2range($feature_location); | |
| 285 $feature{locationtype}="joined"; | |
| 286 } | |
| 287 } | |
| 288 $feature{range}=\@range; | |
| 289 $feature{location}="deprecated"; | |
| 290 my %feature_qualifiers; | |
| 291 for $j (0..$feature_qual_number-1) { | |
| 292 $feature_qual_name=$feature_qual_names->get($j); | |
| 293 $feature_qual_name =~ s/^\///; # eliminates heading "/" | |
| 294 | |
| 295 # skip all not interesting (for now) qualifiers | |
| 296 unless ($valid_names{$feature_qual_name}) { | |
| 297 #print "not valid name: $feature_qual_name\n"; | |
| 298 next; | |
| 299 } | |
| 300 #print "now processing: $feature_qual_name\n"; | |
| 301 $feature_qual_value=$feature_qual_values->get($j); | |
| 302 $feature_qual_value =~ s/^"//; # eliminates heading " | |
| 303 $feature_qual_value =~ s/"$//; # eliminates trailing " | |
| 304 $feature_qual_value =~ s/\n//g; # eliminates all new lines | |
| 305 $feature_qualifiers{$feature_qual_name}=$feature_qual_value; | |
| 306 } | |
| 307 $feature{qualifiers}=\%feature_qualifiers; | |
| 308 push (@features,\%feature); # array of features | |
| 309 } | |
| 310 $entryhash{Features}=\@features; # put this also into the hash | |
| 311 my @cds; # array just of CDSs | |
| 312 for $i (0..$#features) { | |
| 313 if ($features[$i]->{'name'} eq "CDS") { | |
| 314 push(@cds,$features[$i]); | |
| 315 } | |
| 316 } | |
| 317 $entryhash{CDS}=\@cds; # put this also into the hash | |
| 318 return (\%entryhash); | |
| 319 } | |
| 320 } | |
| 321 | |
| 322 # argument: location field of an EMBL feature | |
| 323 # returns: array with correct $start $end and $strand to create LiveSeq obj | |
| 324 sub location2range { | |
| 325 my ($location)=@_; | |
| 326 my ($start,$end,$strand); | |
| 327 if (index($location,"complement") == -1) { # forward strand | |
| 328 $strand=1; | |
| 329 } else { # reverse strand | |
| 330 $location =~ s/complement\(//g; | |
| 331 $location =~ s/\)//g; | |
| 332 $strand=-1; | |
| 333 } | |
| 334 $location =~ s/\<//g; | |
| 335 $location =~ s/\>//g; | |
| 336 my @range=split(/\.\./,$location); | |
| 337 if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features | |
| 338 $start=$end=$range[0]; | |
| 339 } else { | |
| 340 if ($strand == 1) { | |
| 341 ($start,$end)=@range; | |
| 342 } else { # reverse strand | |
| 343 ($end,$start)=@range; | |
| 344 } | |
| 345 } | |
| 346 return ($start,$end,$strand); | |
| 347 } | |
| 348 | |
| 349 # argument: location field of a CDS feature | |
| 350 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj | |
| 351 sub cdslocation2transcript { | |
| 352 my ($location)=@_; | |
| 353 my @exonlocs; | |
| 354 my $exonloc; | |
| 355 my @exon; | |
| 356 my @transcript=(); | |
| 357 $location =~ s/^join\(//; | |
| 358 $location =~ s/\)$//; | |
| 359 @exonlocs = split (/\,/,$location); | |
| 360 for $exonloc (@exonlocs) { | |
| 361 my @exon=location2range($exonloc); | |
| 362 push (@transcript,\@exon); | |
| 363 } | |
| 364 return (@transcript); | |
| 365 } | |
| 366 | |
| 367 # argument: location field of a CDS feature | |
| 368 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj | |
| 369 sub joinedlocation2range { | |
| 370 &cdslocation2transcript; | |
| 371 } | |
| 372 | |
| 373 | |
| 374 =head2 get_swisshash | |
| 375 | |
| 376 Title : get_swisshash | |
| 377 Usage : $loader->get_swisshash(); | |
| 378 Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275") | |
| 379 Function: retrieves with SRS a SwissProt entry, parses it and creates | |
| 380 a hash that contains all the information. | |
| 381 Returns : a reference to a hash | |
| 382 Errorcode: 0 | |
| 383 Args : the db_xref qualifier's value from an EMBL CDS Feature | |
| 384 i.e. a string in the form "SWISS-PROT:accession_number" | |
| 385 Note : this can be modified (adding a second argument) to retrieve | |
| 386 and parse SWTREMBL, SWALL... entries | |
| 387 | |
| 388 =cut | |
| 389 | |
| 390 # argument: db_xref qualifier's value from EMBL CDS | |
| 391 # errorcode: 0 | |
| 392 # returns hashref | |
| 393 sub get_swisshash { | |
| 394 my ($self,$query)=@_; | |
| 395 if (index($query,"SWISS-PROT") == -1) { | |
| 396 return (0); | |
| 397 } | |
| 398 $query =~ s/SWISS-PROT/swissprot-acc/; | |
| 399 my $hashref; | |
| 400 eval { | |
| 401 $hashref=&swissprot2hash("$query"); | |
| 402 }; | |
| 403 my $errormsg; | |
| 404 if ($@) { | |
| 405 foreach $errormsg (split(/\n/,$@)) { | |
| 406 if (index($errormsg,"in cleanup") == -1) { | |
| 407 carp "SRS Swissprot Loader: $@"; | |
| 408 } | |
| 409 } | |
| 410 } | |
| 411 unless ($hashref) { | |
| 412 return (0); | |
| 413 } else { | |
| 414 return ($hashref); | |
| 415 } | |
| 416 } | |
| 417 | |
| 418 =head2 swissprot2hash | |
| 419 | |
| 420 Title : swissprot2hash | |
| 421 Usage : $loader->swissprot2hash(); | |
| 422 Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275") | |
| 423 Function: retrieves with SRS a SwissProt entry, parses it and creates | |
| 424 a hash that contains all the information. | |
| 425 Returns : a reference to a hash | |
| 426 Errorcode: 0 | |
| 427 Args : an SRS query resulting in one entry from SwissProt database | |
| 428 i.e. a string in the form "swissprot-acc:accession_number" | |
| 429 Note : this can be modified (adding a second argument) to retrieve | |
| 430 and parse SWTREMBL, SWALL... entries | |
| 431 | |
| 432 =cut | |
| 433 | |
| 434 # arguments: swissprot SRS query resulting in one fetched swissprot entry | |
| 435 # returns: a reference to a hash containing the important features requested | |
| 436 sub swissprot2hash { | |
| 437 my ($i,$j); | |
| 438 my $query=$_[0]; | |
| 439 # SRS API used to fetch all relevant fields | |
| 440 my $sess = new Session; | |
| 441 my $set = $sess->query("[$query]", ""); | |
| 442 my $numEntries = $set->size(); | |
| 443 if ($numEntries < 1) { | |
| 444 carp "No sequence entry found for the query $query"; | |
| 445 return (0); | |
| 446 } elsif ($numEntries > 1) { | |
| 447 carp "Too many entries found for the input given"; | |
| 448 return (0); | |
| 449 } else { | |
| 450 my $entry = $set->getEntry(0); | |
| 451 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene); | |
| 452 # Fetch what we can fetch without the loader | |
| 453 $entry_ID = $entry->fieldToString("id","","",""); | |
| 454 $entry_AccNumber = $entry->fieldToString("acc","","",""); | |
| 455 $entry_Gene = $entry->fieldToString("gen","","",""); | |
| 456 $entry_Gene =~ s/\n/ /g; | |
| 457 $entry_Description = $entry->fieldToString("des","","",""); | |
| 458 $entry_Description =~ s/\n/ /g; | |
| 459 $entry_Organism = $entry->fieldToString("org","","",""); | |
| 460 chop $entry_Organism; | |
| 461 $entry_SeqLength = $entry->fieldToString("sl","","",""); | |
| 462 # Now use the loader | |
| 463 my $loadedentry = $entry->load("Swissprot"); | |
| 464 # Fetch the rest via the loader | |
| 465 $entry_Sequence = $loadedentry->attrStr("Sequence"); | |
| 466 $entry_Sequence =~ s/\n//g; # from plain format to raw string | |
| 467 | |
| 468 # put into the hash | |
| 469 my %entryhash; | |
| 470 $entryhash{ID}=$entry_ID; | |
| 471 $entryhash{AccNumber}=$entry_AccNumber; | |
| 472 $entryhash{Molecule}=$entry_Molecule; | |
| 473 $entryhash{Gene}=$entry_Gene; | |
| 474 $entryhash{Description}=$entry_Description; | |
| 475 $entryhash{Organism}=$entry_Organism; | |
| 476 $entryhash{Sequence}=$entry_Sequence; | |
| 477 $entryhash{SeqLength}=$entry_SeqLength; | |
| 478 | |
| 479 # create features array | |
| 480 my $features = $loadedentry->attrObjList("Features"); | |
| 481 my $featuresnumber= $features->size(); | |
| 482 $entryhash{FeaturesNumber}=$featuresnumber; | |
| 483 my ($feature,$feature_name,$feature_description,$feature_location); | |
| 484 my @features; | |
| 485 for $i (0..$featuresnumber-1) { | |
| 486 my %feature; | |
| 487 $feature = $features->get($i); | |
| 488 $feature_name = $feature->attrStr("FtKey"); | |
| 489 $feature_location = $feature->attrStr("FtLocation"); | |
| 490 $feature_location =~ s/ +/ /g; | |
| 491 $feature_description = $feature->attrStr("FtDescription"); | |
| 492 chop $feature_description; | |
| 493 $feature_description =~ s/\nFT / /g; | |
| 494 | |
| 495 # Put things inside hash | |
| 496 $feature{position}=$i; | |
| 497 $feature{name}=$feature_name; | |
| 498 $feature{location}=$feature_location; | |
| 499 $feature{description}=$feature_description; | |
| 500 | |
| 501 push (@features,\%feature); # array of features | |
| 502 } | |
| 503 $entryhash{Features}=\@features; # put this also into the hash | |
| 504 return (\%entryhash); | |
| 505 } | |
| 506 } | |
| 507 | |
| 508 =head2 novelaasequence2gene | |
| 509 | |
| 510 Title : novelaasequence2gene | |
| 511 Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); | |
| 512 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*", | |
| 513 -genome => "Homo sapiens"); | |
| 514 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*", | |
| 515 -genome => "Mitochondrion Homo sapiens", | |
| 516 -gene_name => "tyr-kinase"); | |
| 517 | |
| 518 Function: creates LiveSeq objects from a novel amino acid sequence, | |
| 519 using codon usage database to choose codons according to | |
| 520 relative frequencies. | |
| 521 If a genome latin name is not specified, the default is to use | |
| 522 'Homo sapiens' (taxonomy ID 9606). | |
| 523 Returns : reference to a Gene object containing references to LiveSeq objects | |
| 524 Errorcode 0 | |
| 525 Args : string containing an amino acid sequence | |
| 526 string (optional) with a species/genome latin name | |
| 527 string specifying a gene name | |
| 528 Note : SRS access to TAXON and CODONUSAGE databases is required | |
| 529 | |
| 530 =cut | |
| 531 | |
| 532 sub novelaasequence2gene { | |
| 533 my ($self, %args) = @_; | |
| 534 my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence}); | |
| 535 unless ($aasequence) { | |
| 536 carp "aasequence not given"; | |
| 537 return (0); | |
| 538 } | |
| 539 unless ($gene_name) { | |
| 540 $gene_name="Novel Unknown"; | |
| 541 } | |
| 542 unless ($species_name) { | |
| 543 $species_name="Homo sapiens"; | |
| 544 } | |
| 545 | |
| 546 my $sess = new Session; | |
| 547 my ($e,$numEntries,$set); | |
| 548 | |
| 549 # codonusage lookup | |
| 550 my $codonusage_usage; | |
| 551 my @species_codon_usage; | |
| 552 $set = $sess->query("[codonusage:'$species_name']", ""); | |
| 553 $numEntries = $set->size(); | |
| 554 if ($numEntries > 0) { | |
| 555 $e = $set->getEntry(0); | |
| 556 $species_name = $e->fieldToString("id","","",""); | |
| 557 $codonusage_usage = $e->fieldToString("usage","","",""); | |
| 558 @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs | |
| 559 if ($numEntries > 1) { | |
| 560 carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name"; | |
| 561 } | |
| 562 } else { | |
| 563 carp "Genome not found in codon usage DB."; | |
| 564 return (0); | |
| 565 } | |
| 566 | |
| 567 # taxonomy lookup | |
| 568 my $mito_flag = 0; | |
| 569 my $species_origin; | |
| 570 if ($species_name =~ /^Mitochondrion /) { | |
| 571 $mito_flag = 1; | |
| 572 $species_origin = $species_name; | |
| 573 $species_origin =~ s/^Mitochondrion //; | |
| 574 $set = $sess->query("[taxonomy-species:'$species_origin']", ""); | |
| 575 } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) { | |
| 576 $species_origin = $species_name; | |
| 577 $species_origin =~ s/^Chromoplast //; | |
| 578 $species_origin =~ s/^Kinetoplast //; | |
| 579 $species_origin =~ s/^Chloroplast //; | |
| 580 $set = $sess->query("[taxonomy-species:'$species_origin']", ""); | |
| 581 } else { | |
| 582 $set = $sess->query("[taxonomy-species:'$species_name']", ""); | |
| 583 } | |
| 584 $numEntries = $set->size(); | |
| 585 my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name); | |
| 586 if ($numEntries > 0) { | |
| 587 $e = $set->getEntry(0); | |
| 588 $taxonomy_id = $e->fieldToString("id","","",""); | |
| 589 $taxonomy_name = $e->fieldToString("species","","",""); | |
| 590 $taxonomy_gc = $e->fieldToString("gc","","",""); | |
| 591 $taxonomy_mgc = $e->fieldToString("mgc","","",""); | |
| 592 if ($numEntries > 1) { | |
| 593 carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)"; | |
| 594 } | |
| 595 } else { | |
| 596 carp "Genome not found in taxonomy DB."; | |
| 597 return (0); | |
| 598 } | |
| 599 # Lookup appropriate translation table | |
| 600 my $ttabid; | |
| 601 if ($mito_flag) { | |
| 602 $ttabid = $taxonomy_mgc; | |
| 603 } else { | |
| 604 $ttabid = $taxonomy_gc; | |
| 605 } | |
| 606 | |
| 607 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name); | |
| 608 return ($gene); | |
| 609 } | |
| 610 | |
| 611 1; |
