Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $ | |
2 # | |
3 # bioperl module for Bio::LiveSeq::IO::SRS | |
4 # | |
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net> | |
6 # | |
7 # Copyright Joseph Insana | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 # | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 my $db="EMBL"; | |
20 my $acc_id="M20132"; | |
21 my $query="embl-acc:$acc_id"; | |
22 | |
23 my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query"); | |
24 | |
25 my @translationobjects=$loader->entry2liveseq(); | |
26 | |
27 my $gene="AR"; | |
28 my $gene=$loader->gene2liveseq("gene"); | |
29 | |
30 NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL. | |
31 | |
32 =head1 DESCRIPTION | |
33 | |
34 This package uses the SRS (Sequence Retrieval System) to fetch a sequence | |
35 database entry, analyse it and create LiveSeq objects out of it. | |
36 | |
37 An embl-acc ID has to be passed to this package which will return references | |
38 to all translation objects created from the EMBL entry. | |
39 References to Transcription, DNA and Exon objects can all be retrieved departing | |
40 from these. | |
41 | |
42 Alternatively, a specific "gene" name can be specified, together with the | |
43 embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene | |
44 features attached/created. | |
45 | |
46 =head1 AUTHOR - Joseph A.L. Insana | |
47 | |
48 Email: Insana@ebi.ac.uk, jinsana@gmx.net | |
49 | |
50 Address: | |
51 | |
52 EMBL Outstation, European Bioinformatics Institute | |
53 Wellcome Trust Genome Campus, Hinxton | |
54 Cambs. CB10 1SD, United Kingdom | |
55 | |
56 =head1 APPENDIX | |
57 | |
58 The rest of the documentation details each of the object | |
59 methods. Internal methods are usually preceded with a _ | |
60 | |
61 =cut | |
62 | |
63 # Let the code begin... | |
64 | |
65 package Bio::LiveSeq::IO::SRS; | |
66 $VERSION=2.4; | |
67 | |
68 # Version history: | |
69 # Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm | |
70 # Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created | |
71 # Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash | |
72 # Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range | |
73 # Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division | |
74 # Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash | |
75 # Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names | |
76 # Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features | |
77 # Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!) | |
78 # Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq | |
79 # Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names | |
80 # Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added | |
81 | |
82 use strict; | |
83 use Carp qw(cluck croak carp); | |
84 use vars qw($VERSION @ISA); | |
85 use lib $ENV{SRSEXE}; | |
86 use srsperl; | |
87 use Bio::Tools::CodonTable; # for novelaasequence2gene | |
88 | |
89 use Bio::LiveSeq::IO::Loader 2.2; | |
90 | |
91 @ISA=qw(Bio::LiveSeq::IO::Loader); | |
92 | |
93 # This package can in the future host other databases loading subroutines. | |
94 # e.g. ensembl2hash | |
95 | |
96 =head2 load | |
97 | |
98 Title : load | |
99 Usage : my $acc_id="M20132"; | |
100 my $query="embl-acc:$acc_id"; | |
101 $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query"); | |
102 | |
103 Function: loads an entry with SRS from a database into a hash | |
104 Returns : reference to a new object of class IO::SRS holding an entry | |
105 Errorcode 0 | |
106 Args : an SRS query resulting in one fetched EMBL (by default) entry | |
107 | |
108 =cut | |
109 | |
110 sub load { | |
111 my ($thing, %args) = @_; | |
112 my $class = ref($thing) || $thing; | |
113 my ($obj,%loader); | |
114 | |
115 my ($db,$query)=($args{-db},$args{-query}); | |
116 | |
117 if (defined($db)) { | |
118 unless ($db eq "EMBL") { | |
119 carp "Note: only EMBL now supported!"; | |
120 return(0); | |
121 } | |
122 } else { | |
123 $db="EMBL"; | |
124 } | |
125 | |
126 my $hashref; | |
127 if ($db eq "EMBL") { | |
128 my $test_transl=0; # change to 0 to avoid comparison of translation | |
129 | |
130 # these can be changed for future needs | |
131 my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA); | |
132 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table); | |
133 | |
134 # dunno yet how to implement test_transl again.... | |
135 # probably on a one-on-one basis with each translation? | |
136 if ($test_transl) { | |
137 push (@embl_valid_qual_names,"translation"); # needed for test_transl | |
138 } | |
139 | |
140 # not to have the whole program die because of SRS death | |
141 eval { | |
142 $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names); | |
143 }; | |
144 my $errormsg; | |
145 if ($@) { | |
146 foreach $errormsg (split(/\n/,$@)) { | |
147 if (index($errormsg,"in cleanup") == -1) { | |
148 carp "SRS EMBL Loader: $@"; | |
149 } | |
150 } | |
151 } | |
152 } | |
153 unless ($hashref) { return (0); } | |
154 | |
155 %loader = (db => $db, query => $query, hash => $hashref); | |
156 $obj = \%loader; | |
157 $obj = bless $obj, $class; | |
158 return $obj; | |
159 } | |
160 | |
161 =head2 embl2hash | |
162 | |
163 Title : embl2hash | |
164 Function: retrieves with SRS an EMBL entry, parses it and creates | |
165 a hash that contains all the information. | |
166 Returns : a reference to a hash | |
167 Errorcode: 0 | |
168 Args : an SRS query resulting in one fetched EMBL entry | |
169 i.e. a string in a form like "embl-acc:accession_number" | |
170 two array references to skip features and qualifiers (for | |
171 performance) | |
172 Example: @valid_features=qw(CDS exon prim_transcript mRNA); | |
173 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family); | |
174 $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers); | |
175 | |
176 =cut | |
177 | |
178 # this has to be defined here as it is the only thing really proper to | |
179 # the /SRS/ loader. Every loader has to sport his own "entry2hash" function | |
180 # the main functions will be in the Loader.pm | |
181 # arguments: embl SRS query resulting in one fetched EMBL entry | |
182 # to skip features and qualifiers (for performance), two array | |
183 # references must be passed (this can change into string arguments to | |
184 # be passed....) | |
185 # returns: a reference to a hash containing the important features requested | |
186 sub embl2hash { | |
187 my ($i,$j); | |
188 my $query=$_[0]; | |
189 my %valid_features; my %valid_names; | |
190 if ($_[1]) { | |
191 %valid_features = map {$_, 1} @{$_[1]}; # to skip features | |
192 } | |
193 if ($_[2]) { | |
194 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers | |
195 } | |
196 # SRS API used to fetch all relevant fields | |
197 my $sess = new Session; | |
198 my $set = $sess->query("[$query]", ""); | |
199 my $numEntries=$set->size(); | |
200 if ($numEntries < 1) { | |
201 carp "No sequence entry found for the query $query"; | |
202 return (0); | |
203 } elsif ($numEntries > 1) { | |
204 carp "Too many entries found for the input given"; | |
205 return (0); | |
206 } else { | |
207 my $entry = $set->getEntry(0); | |
208 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division); | |
209 # Fetch what we can fetch without the loader | |
210 $entry_ID = $entry->fieldToString("id","","",""); | |
211 $entry_AccNumber = $entry->fieldToString("acc","","",""); | |
212 $entry_Molecule = $entry->fieldToString("mol","","",""); | |
213 $entry_Division = $entry->fieldToString("div","","",""); | |
214 $entry_Description = $entry->fieldToString("des","","",""); | |
215 $entry_Description =~ s/\n/ /g; | |
216 $entry_Organism = $entry->fieldToString("org","","",""); | |
217 $entry_SeqLength = $entry->fieldToString("sl","","",""); | |
218 # Now use the loader | |
219 my $loadedentry = $entry->load("EMBL"); | |
220 # Fetch the rest via the loader | |
221 $entry_Sequence = $loadedentry->attrStr("sequence"); | |
222 $entry_Sequence =~ s/\n//g; # from plain format to raw string | |
223 | |
224 # put into the hash | |
225 my %entryhash; | |
226 $entryhash{ID}=$entry_ID; | |
227 $entryhash{AccNumber}=$entry_AccNumber; | |
228 $entryhash{Molecule}=$entry_Molecule; | |
229 $entryhash{Division}=$entry_Division; | |
230 $entryhash{Description}=$entry_Description; | |
231 $entryhash{Organism}=$entry_Organism; | |
232 $entryhash{Sequence}=$entry_Sequence; | |
233 $entryhash{SeqLength}=$entry_SeqLength; | |
234 | |
235 # create features array | |
236 my $features = $loadedentry->attrObjList("features"); | |
237 my $featuresnumber= $features->size(); | |
238 $entryhash{FeaturesNumber}=$featuresnumber; | |
239 my ($feature,$feature_name,$feature_location); | |
240 my ($feature_qual_names,$feature_qual_values); | |
241 my ($feature_qual_name,$feature_qual_value); | |
242 my ($feature_qual_number,$feature_qual_number2); | |
243 my @features; | |
244 for $i (0..$featuresnumber-1) { | |
245 my %feature; | |
246 $feature = $features->get($i); | |
247 $feature_name = $feature->attrStr("FtKey"); | |
248 | |
249 #unless ($feature_name eq "CDS") { | |
250 unless ($valid_features{$feature_name}) { | |
251 #print "not valid feature: $feature_name\n"; | |
252 next; | |
253 } | |
254 #print "now processing feature: $feature_name\n"; | |
255 $feature_location = $feature->attrStr("FtLocation"); | |
256 $feature_location =~ s/\n//g; | |
257 $feature_qual_names= $feature->attrStrList("FtQualNames"); | |
258 $feature_qual_values= $feature->attrStrList("FtQualValues"); | |
259 $feature_qual_number= $feature_qual_names->size(); | |
260 $feature_qual_number2= $feature_qual_values->size(); # paranoia check | |
261 if ($feature_qual_number > $feature_qual_number2) { | |
262 carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values."); | |
263 # this can happen, e.g. "/partial", let's move on, just issue a warning | |
264 #return (0); | |
265 } elsif ($feature_qual_number < $feature_qual_number2) { | |
266 carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped"); | |
267 return (0); | |
268 } | |
269 #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG | |
270 | |
271 # Put things inside hash | |
272 $feature{position}=$i; | |
273 $feature{name}=$feature_name; | |
274 | |
275 # new range field in hash | |
276 my @range; | |
277 if ($feature_name eq "CDS") { | |
278 @range=cdslocation2transcript($feature_location); | |
279 $feature{locationtype}="joined"; | |
280 } else { | |
281 if (index($feature_location,"join") == -1) { | |
282 @range=location2range($feature_location); | |
283 } else { # complex location in feature different than CDS | |
284 @range=joinedlocation2range($feature_location); | |
285 $feature{locationtype}="joined"; | |
286 } | |
287 } | |
288 $feature{range}=\@range; | |
289 $feature{location}="deprecated"; | |
290 my %feature_qualifiers; | |
291 for $j (0..$feature_qual_number-1) { | |
292 $feature_qual_name=$feature_qual_names->get($j); | |
293 $feature_qual_name =~ s/^\///; # eliminates heading "/" | |
294 | |
295 # skip all not interesting (for now) qualifiers | |
296 unless ($valid_names{$feature_qual_name}) { | |
297 #print "not valid name: $feature_qual_name\n"; | |
298 next; | |
299 } | |
300 #print "now processing: $feature_qual_name\n"; | |
301 $feature_qual_value=$feature_qual_values->get($j); | |
302 $feature_qual_value =~ s/^"//; # eliminates heading " | |
303 $feature_qual_value =~ s/"$//; # eliminates trailing " | |
304 $feature_qual_value =~ s/\n//g; # eliminates all new lines | |
305 $feature_qualifiers{$feature_qual_name}=$feature_qual_value; | |
306 } | |
307 $feature{qualifiers}=\%feature_qualifiers; | |
308 push (@features,\%feature); # array of features | |
309 } | |
310 $entryhash{Features}=\@features; # put this also into the hash | |
311 my @cds; # array just of CDSs | |
312 for $i (0..$#features) { | |
313 if ($features[$i]->{'name'} eq "CDS") { | |
314 push(@cds,$features[$i]); | |
315 } | |
316 } | |
317 $entryhash{CDS}=\@cds; # put this also into the hash | |
318 return (\%entryhash); | |
319 } | |
320 } | |
321 | |
322 # argument: location field of an EMBL feature | |
323 # returns: array with correct $start $end and $strand to create LiveSeq obj | |
324 sub location2range { | |
325 my ($location)=@_; | |
326 my ($start,$end,$strand); | |
327 if (index($location,"complement") == -1) { # forward strand | |
328 $strand=1; | |
329 } else { # reverse strand | |
330 $location =~ s/complement\(//g; | |
331 $location =~ s/\)//g; | |
332 $strand=-1; | |
333 } | |
334 $location =~ s/\<//g; | |
335 $location =~ s/\>//g; | |
336 my @range=split(/\.\./,$location); | |
337 if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features | |
338 $start=$end=$range[0]; | |
339 } else { | |
340 if ($strand == 1) { | |
341 ($start,$end)=@range; | |
342 } else { # reverse strand | |
343 ($end,$start)=@range; | |
344 } | |
345 } | |
346 return ($start,$end,$strand); | |
347 } | |
348 | |
349 # argument: location field of a CDS feature | |
350 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj | |
351 sub cdslocation2transcript { | |
352 my ($location)=@_; | |
353 my @exonlocs; | |
354 my $exonloc; | |
355 my @exon; | |
356 my @transcript=(); | |
357 $location =~ s/^join\(//; | |
358 $location =~ s/\)$//; | |
359 @exonlocs = split (/\,/,$location); | |
360 for $exonloc (@exonlocs) { | |
361 my @exon=location2range($exonloc); | |
362 push (@transcript,\@exon); | |
363 } | |
364 return (@transcript); | |
365 } | |
366 | |
367 # argument: location field of a CDS feature | |
368 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj | |
369 sub joinedlocation2range { | |
370 &cdslocation2transcript; | |
371 } | |
372 | |
373 | |
374 =head2 get_swisshash | |
375 | |
376 Title : get_swisshash | |
377 Usage : $loader->get_swisshash(); | |
378 Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275") | |
379 Function: retrieves with SRS a SwissProt entry, parses it and creates | |
380 a hash that contains all the information. | |
381 Returns : a reference to a hash | |
382 Errorcode: 0 | |
383 Args : the db_xref qualifier's value from an EMBL CDS Feature | |
384 i.e. a string in the form "SWISS-PROT:accession_number" | |
385 Note : this can be modified (adding a second argument) to retrieve | |
386 and parse SWTREMBL, SWALL... entries | |
387 | |
388 =cut | |
389 | |
390 # argument: db_xref qualifier's value from EMBL CDS | |
391 # errorcode: 0 | |
392 # returns hashref | |
393 sub get_swisshash { | |
394 my ($self,$query)=@_; | |
395 if (index($query,"SWISS-PROT") == -1) { | |
396 return (0); | |
397 } | |
398 $query =~ s/SWISS-PROT/swissprot-acc/; | |
399 my $hashref; | |
400 eval { | |
401 $hashref=&swissprot2hash("$query"); | |
402 }; | |
403 my $errormsg; | |
404 if ($@) { | |
405 foreach $errormsg (split(/\n/,$@)) { | |
406 if (index($errormsg,"in cleanup") == -1) { | |
407 carp "SRS Swissprot Loader: $@"; | |
408 } | |
409 } | |
410 } | |
411 unless ($hashref) { | |
412 return (0); | |
413 } else { | |
414 return ($hashref); | |
415 } | |
416 } | |
417 | |
418 =head2 swissprot2hash | |
419 | |
420 Title : swissprot2hash | |
421 Usage : $loader->swissprot2hash(); | |
422 Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275") | |
423 Function: retrieves with SRS a SwissProt entry, parses it and creates | |
424 a hash that contains all the information. | |
425 Returns : a reference to a hash | |
426 Errorcode: 0 | |
427 Args : an SRS query resulting in one entry from SwissProt database | |
428 i.e. a string in the form "swissprot-acc:accession_number" | |
429 Note : this can be modified (adding a second argument) to retrieve | |
430 and parse SWTREMBL, SWALL... entries | |
431 | |
432 =cut | |
433 | |
434 # arguments: swissprot SRS query resulting in one fetched swissprot entry | |
435 # returns: a reference to a hash containing the important features requested | |
436 sub swissprot2hash { | |
437 my ($i,$j); | |
438 my $query=$_[0]; | |
439 # SRS API used to fetch all relevant fields | |
440 my $sess = new Session; | |
441 my $set = $sess->query("[$query]", ""); | |
442 my $numEntries = $set->size(); | |
443 if ($numEntries < 1) { | |
444 carp "No sequence entry found for the query $query"; | |
445 return (0); | |
446 } elsif ($numEntries > 1) { | |
447 carp "Too many entries found for the input given"; | |
448 return (0); | |
449 } else { | |
450 my $entry = $set->getEntry(0); | |
451 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene); | |
452 # Fetch what we can fetch without the loader | |
453 $entry_ID = $entry->fieldToString("id","","",""); | |
454 $entry_AccNumber = $entry->fieldToString("acc","","",""); | |
455 $entry_Gene = $entry->fieldToString("gen","","",""); | |
456 $entry_Gene =~ s/\n/ /g; | |
457 $entry_Description = $entry->fieldToString("des","","",""); | |
458 $entry_Description =~ s/\n/ /g; | |
459 $entry_Organism = $entry->fieldToString("org","","",""); | |
460 chop $entry_Organism; | |
461 $entry_SeqLength = $entry->fieldToString("sl","","",""); | |
462 # Now use the loader | |
463 my $loadedentry = $entry->load("Swissprot"); | |
464 # Fetch the rest via the loader | |
465 $entry_Sequence = $loadedentry->attrStr("Sequence"); | |
466 $entry_Sequence =~ s/\n//g; # from plain format to raw string | |
467 | |
468 # put into the hash | |
469 my %entryhash; | |
470 $entryhash{ID}=$entry_ID; | |
471 $entryhash{AccNumber}=$entry_AccNumber; | |
472 $entryhash{Molecule}=$entry_Molecule; | |
473 $entryhash{Gene}=$entry_Gene; | |
474 $entryhash{Description}=$entry_Description; | |
475 $entryhash{Organism}=$entry_Organism; | |
476 $entryhash{Sequence}=$entry_Sequence; | |
477 $entryhash{SeqLength}=$entry_SeqLength; | |
478 | |
479 # create features array | |
480 my $features = $loadedentry->attrObjList("Features"); | |
481 my $featuresnumber= $features->size(); | |
482 $entryhash{FeaturesNumber}=$featuresnumber; | |
483 my ($feature,$feature_name,$feature_description,$feature_location); | |
484 my @features; | |
485 for $i (0..$featuresnumber-1) { | |
486 my %feature; | |
487 $feature = $features->get($i); | |
488 $feature_name = $feature->attrStr("FtKey"); | |
489 $feature_location = $feature->attrStr("FtLocation"); | |
490 $feature_location =~ s/ +/ /g; | |
491 $feature_description = $feature->attrStr("FtDescription"); | |
492 chop $feature_description; | |
493 $feature_description =~ s/\nFT / /g; | |
494 | |
495 # Put things inside hash | |
496 $feature{position}=$i; | |
497 $feature{name}=$feature_name; | |
498 $feature{location}=$feature_location; | |
499 $feature{description}=$feature_description; | |
500 | |
501 push (@features,\%feature); # array of features | |
502 } | |
503 $entryhash{Features}=\@features; # put this also into the hash | |
504 return (\%entryhash); | |
505 } | |
506 } | |
507 | |
508 =head2 novelaasequence2gene | |
509 | |
510 Title : novelaasequence2gene | |
511 Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); | |
512 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*", | |
513 -genome => "Homo sapiens"); | |
514 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*", | |
515 -genome => "Mitochondrion Homo sapiens", | |
516 -gene_name => "tyr-kinase"); | |
517 | |
518 Function: creates LiveSeq objects from a novel amino acid sequence, | |
519 using codon usage database to choose codons according to | |
520 relative frequencies. | |
521 If a genome latin name is not specified, the default is to use | |
522 'Homo sapiens' (taxonomy ID 9606). | |
523 Returns : reference to a Gene object containing references to LiveSeq objects | |
524 Errorcode 0 | |
525 Args : string containing an amino acid sequence | |
526 string (optional) with a species/genome latin name | |
527 string specifying a gene name | |
528 Note : SRS access to TAXON and CODONUSAGE databases is required | |
529 | |
530 =cut | |
531 | |
532 sub novelaasequence2gene { | |
533 my ($self, %args) = @_; | |
534 my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence}); | |
535 unless ($aasequence) { | |
536 carp "aasequence not given"; | |
537 return (0); | |
538 } | |
539 unless ($gene_name) { | |
540 $gene_name="Novel Unknown"; | |
541 } | |
542 unless ($species_name) { | |
543 $species_name="Homo sapiens"; | |
544 } | |
545 | |
546 my $sess = new Session; | |
547 my ($e,$numEntries,$set); | |
548 | |
549 # codonusage lookup | |
550 my $codonusage_usage; | |
551 my @species_codon_usage; | |
552 $set = $sess->query("[codonusage:'$species_name']", ""); | |
553 $numEntries = $set->size(); | |
554 if ($numEntries > 0) { | |
555 $e = $set->getEntry(0); | |
556 $species_name = $e->fieldToString("id","","",""); | |
557 $codonusage_usage = $e->fieldToString("usage","","",""); | |
558 @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs | |
559 if ($numEntries > 1) { | |
560 carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name"; | |
561 } | |
562 } else { | |
563 carp "Genome not found in codon usage DB."; | |
564 return (0); | |
565 } | |
566 | |
567 # taxonomy lookup | |
568 my $mito_flag = 0; | |
569 my $species_origin; | |
570 if ($species_name =~ /^Mitochondrion /) { | |
571 $mito_flag = 1; | |
572 $species_origin = $species_name; | |
573 $species_origin =~ s/^Mitochondrion //; | |
574 $set = $sess->query("[taxonomy-species:'$species_origin']", ""); | |
575 } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) { | |
576 $species_origin = $species_name; | |
577 $species_origin =~ s/^Chromoplast //; | |
578 $species_origin =~ s/^Kinetoplast //; | |
579 $species_origin =~ s/^Chloroplast //; | |
580 $set = $sess->query("[taxonomy-species:'$species_origin']", ""); | |
581 } else { | |
582 $set = $sess->query("[taxonomy-species:'$species_name']", ""); | |
583 } | |
584 $numEntries = $set->size(); | |
585 my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name); | |
586 if ($numEntries > 0) { | |
587 $e = $set->getEntry(0); | |
588 $taxonomy_id = $e->fieldToString("id","","",""); | |
589 $taxonomy_name = $e->fieldToString("species","","",""); | |
590 $taxonomy_gc = $e->fieldToString("gc","","",""); | |
591 $taxonomy_mgc = $e->fieldToString("mgc","","",""); | |
592 if ($numEntries > 1) { | |
593 carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)"; | |
594 } | |
595 } else { | |
596 carp "Genome not found in taxonomy DB."; | |
597 return (0); | |
598 } | |
599 # Lookup appropriate translation table | |
600 my $ttabid; | |
601 if ($mito_flag) { | |
602 $ttabid = $taxonomy_mgc; | |
603 } else { | |
604 $ttabid = $taxonomy_gc; | |
605 } | |
606 | |
607 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name); | |
608 return ($gene); | |
609 } | |
610 | |
611 1; |