0
|
1 # $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $
|
|
2 #
|
|
3 # bioperl module for Bio::LiveSeq::IO::SRS
|
|
4 #
|
|
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
|
|
6 #
|
|
7 # Copyright Joseph Insana
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 my $db="EMBL";
|
|
20 my $acc_id="M20132";
|
|
21 my $query="embl-acc:$acc_id";
|
|
22
|
|
23 my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
|
|
24
|
|
25 my @translationobjects=$loader->entry2liveseq();
|
|
26
|
|
27 my $gene="AR";
|
|
28 my $gene=$loader->gene2liveseq("gene");
|
|
29
|
|
30 NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL.
|
|
31
|
|
32 =head1 DESCRIPTION
|
|
33
|
|
34 This package uses the SRS (Sequence Retrieval System) to fetch a sequence
|
|
35 database entry, analyse it and create LiveSeq objects out of it.
|
|
36
|
|
37 An embl-acc ID has to be passed to this package which will return references
|
|
38 to all translation objects created from the EMBL entry.
|
|
39 References to Transcription, DNA and Exon objects can all be retrieved departing
|
|
40 from these.
|
|
41
|
|
42 Alternatively, a specific "gene" name can be specified, together with the
|
|
43 embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene
|
|
44 features attached/created.
|
|
45
|
|
46 =head1 AUTHOR - Joseph A.L. Insana
|
|
47
|
|
48 Email: Insana@ebi.ac.uk, jinsana@gmx.net
|
|
49
|
|
50 Address:
|
|
51
|
|
52 EMBL Outstation, European Bioinformatics Institute
|
|
53 Wellcome Trust Genome Campus, Hinxton
|
|
54 Cambs. CB10 1SD, United Kingdom
|
|
55
|
|
56 =head1 APPENDIX
|
|
57
|
|
58 The rest of the documentation details each of the object
|
|
59 methods. Internal methods are usually preceded with a _
|
|
60
|
|
61 =cut
|
|
62
|
|
63 # Let the code begin...
|
|
64
|
|
65 package Bio::LiveSeq::IO::SRS;
|
|
66 $VERSION=2.4;
|
|
67
|
|
68 # Version history:
|
|
69 # Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm
|
|
70 # Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created
|
|
71 # Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash
|
|
72 # Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range
|
|
73 # Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division
|
|
74 # Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash
|
|
75 # Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names
|
|
76 # Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features
|
|
77 # Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!)
|
|
78 # Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq
|
|
79 # Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names
|
|
80 # Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added
|
|
81
|
|
82 use strict;
|
|
83 use Carp qw(cluck croak carp);
|
|
84 use vars qw($VERSION @ISA);
|
|
85 use lib $ENV{SRSEXE};
|
|
86 use srsperl;
|
|
87 use Bio::Tools::CodonTable; # for novelaasequence2gene
|
|
88
|
|
89 use Bio::LiveSeq::IO::Loader 2.2;
|
|
90
|
|
91 @ISA=qw(Bio::LiveSeq::IO::Loader);
|
|
92
|
|
93 # This package can in the future host other databases loading subroutines.
|
|
94 # e.g. ensembl2hash
|
|
95
|
|
96 =head2 load
|
|
97
|
|
98 Title : load
|
|
99 Usage : my $acc_id="M20132";
|
|
100 my $query="embl-acc:$acc_id";
|
|
101 $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
|
|
102
|
|
103 Function: loads an entry with SRS from a database into a hash
|
|
104 Returns : reference to a new object of class IO::SRS holding an entry
|
|
105 Errorcode 0
|
|
106 Args : an SRS query resulting in one fetched EMBL (by default) entry
|
|
107
|
|
108 =cut
|
|
109
|
|
110 sub load {
|
|
111 my ($thing, %args) = @_;
|
|
112 my $class = ref($thing) || $thing;
|
|
113 my ($obj,%loader);
|
|
114
|
|
115 my ($db,$query)=($args{-db},$args{-query});
|
|
116
|
|
117 if (defined($db)) {
|
|
118 unless ($db eq "EMBL") {
|
|
119 carp "Note: only EMBL now supported!";
|
|
120 return(0);
|
|
121 }
|
|
122 } else {
|
|
123 $db="EMBL";
|
|
124 }
|
|
125
|
|
126 my $hashref;
|
|
127 if ($db eq "EMBL") {
|
|
128 my $test_transl=0; # change to 0 to avoid comparison of translation
|
|
129
|
|
130 # these can be changed for future needs
|
|
131 my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA);
|
|
132 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
|
|
133
|
|
134 # dunno yet how to implement test_transl again....
|
|
135 # probably on a one-on-one basis with each translation?
|
|
136 if ($test_transl) {
|
|
137 push (@embl_valid_qual_names,"translation"); # needed for test_transl
|
|
138 }
|
|
139
|
|
140 # not to have the whole program die because of SRS death
|
|
141 eval {
|
|
142 $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names);
|
|
143 };
|
|
144 my $errormsg;
|
|
145 if ($@) {
|
|
146 foreach $errormsg (split(/\n/,$@)) {
|
|
147 if (index($errormsg,"in cleanup") == -1) {
|
|
148 carp "SRS EMBL Loader: $@";
|
|
149 }
|
|
150 }
|
|
151 }
|
|
152 }
|
|
153 unless ($hashref) { return (0); }
|
|
154
|
|
155 %loader = (db => $db, query => $query, hash => $hashref);
|
|
156 $obj = \%loader;
|
|
157 $obj = bless $obj, $class;
|
|
158 return $obj;
|
|
159 }
|
|
160
|
|
161 =head2 embl2hash
|
|
162
|
|
163 Title : embl2hash
|
|
164 Function: retrieves with SRS an EMBL entry, parses it and creates
|
|
165 a hash that contains all the information.
|
|
166 Returns : a reference to a hash
|
|
167 Errorcode: 0
|
|
168 Args : an SRS query resulting in one fetched EMBL entry
|
|
169 i.e. a string in a form like "embl-acc:accession_number"
|
|
170 two array references to skip features and qualifiers (for
|
|
171 performance)
|
|
172 Example: @valid_features=qw(CDS exon prim_transcript mRNA);
|
|
173 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
|
|
174 $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers);
|
|
175
|
|
176 =cut
|
|
177
|
|
178 # this has to be defined here as it is the only thing really proper to
|
|
179 # the /SRS/ loader. Every loader has to sport his own "entry2hash" function
|
|
180 # the main functions will be in the Loader.pm
|
|
181 # arguments: embl SRS query resulting in one fetched EMBL entry
|
|
182 # to skip features and qualifiers (for performance), two array
|
|
183 # references must be passed (this can change into string arguments to
|
|
184 # be passed....)
|
|
185 # returns: a reference to a hash containing the important features requested
|
|
186 sub embl2hash {
|
|
187 my ($i,$j);
|
|
188 my $query=$_[0];
|
|
189 my %valid_features; my %valid_names;
|
|
190 if ($_[1]) {
|
|
191 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
|
|
192 }
|
|
193 if ($_[2]) {
|
|
194 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
|
|
195 }
|
|
196 # SRS API used to fetch all relevant fields
|
|
197 my $sess = new Session;
|
|
198 my $set = $sess->query("[$query]", "");
|
|
199 my $numEntries=$set->size();
|
|
200 if ($numEntries < 1) {
|
|
201 carp "No sequence entry found for the query $query";
|
|
202 return (0);
|
|
203 } elsif ($numEntries > 1) {
|
|
204 carp "Too many entries found for the input given";
|
|
205 return (0);
|
|
206 } else {
|
|
207 my $entry = $set->getEntry(0);
|
|
208 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division);
|
|
209 # Fetch what we can fetch without the loader
|
|
210 $entry_ID = $entry->fieldToString("id","","","");
|
|
211 $entry_AccNumber = $entry->fieldToString("acc","","","");
|
|
212 $entry_Molecule = $entry->fieldToString("mol","","","");
|
|
213 $entry_Division = $entry->fieldToString("div","","","");
|
|
214 $entry_Description = $entry->fieldToString("des","","","");
|
|
215 $entry_Description =~ s/\n/ /g;
|
|
216 $entry_Organism = $entry->fieldToString("org","","","");
|
|
217 $entry_SeqLength = $entry->fieldToString("sl","","","");
|
|
218 # Now use the loader
|
|
219 my $loadedentry = $entry->load("EMBL");
|
|
220 # Fetch the rest via the loader
|
|
221 $entry_Sequence = $loadedentry->attrStr("sequence");
|
|
222 $entry_Sequence =~ s/\n//g; # from plain format to raw string
|
|
223
|
|
224 # put into the hash
|
|
225 my %entryhash;
|
|
226 $entryhash{ID}=$entry_ID;
|
|
227 $entryhash{AccNumber}=$entry_AccNumber;
|
|
228 $entryhash{Molecule}=$entry_Molecule;
|
|
229 $entryhash{Division}=$entry_Division;
|
|
230 $entryhash{Description}=$entry_Description;
|
|
231 $entryhash{Organism}=$entry_Organism;
|
|
232 $entryhash{Sequence}=$entry_Sequence;
|
|
233 $entryhash{SeqLength}=$entry_SeqLength;
|
|
234
|
|
235 # create features array
|
|
236 my $features = $loadedentry->attrObjList("features");
|
|
237 my $featuresnumber= $features->size();
|
|
238 $entryhash{FeaturesNumber}=$featuresnumber;
|
|
239 my ($feature,$feature_name,$feature_location);
|
|
240 my ($feature_qual_names,$feature_qual_values);
|
|
241 my ($feature_qual_name,$feature_qual_value);
|
|
242 my ($feature_qual_number,$feature_qual_number2);
|
|
243 my @features;
|
|
244 for $i (0..$featuresnumber-1) {
|
|
245 my %feature;
|
|
246 $feature = $features->get($i);
|
|
247 $feature_name = $feature->attrStr("FtKey");
|
|
248
|
|
249 #unless ($feature_name eq "CDS") {
|
|
250 unless ($valid_features{$feature_name}) {
|
|
251 #print "not valid feature: $feature_name\n";
|
|
252 next;
|
|
253 }
|
|
254 #print "now processing feature: $feature_name\n";
|
|
255 $feature_location = $feature->attrStr("FtLocation");
|
|
256 $feature_location =~ s/\n//g;
|
|
257 $feature_qual_names= $feature->attrStrList("FtQualNames");
|
|
258 $feature_qual_values= $feature->attrStrList("FtQualValues");
|
|
259 $feature_qual_number= $feature_qual_names->size();
|
|
260 $feature_qual_number2= $feature_qual_values->size(); # paranoia check
|
|
261 if ($feature_qual_number > $feature_qual_number2) {
|
|
262 carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values.");
|
|
263 # this can happen, e.g. "/partial", let's move on, just issue a warning
|
|
264 #return (0);
|
|
265 } elsif ($feature_qual_number < $feature_qual_number2) {
|
|
266 carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped");
|
|
267 return (0);
|
|
268 }
|
|
269 #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG
|
|
270
|
|
271 # Put things inside hash
|
|
272 $feature{position}=$i;
|
|
273 $feature{name}=$feature_name;
|
|
274
|
|
275 # new range field in hash
|
|
276 my @range;
|
|
277 if ($feature_name eq "CDS") {
|
|
278 @range=cdslocation2transcript($feature_location);
|
|
279 $feature{locationtype}="joined";
|
|
280 } else {
|
|
281 if (index($feature_location,"join") == -1) {
|
|
282 @range=location2range($feature_location);
|
|
283 } else { # complex location in feature different than CDS
|
|
284 @range=joinedlocation2range($feature_location);
|
|
285 $feature{locationtype}="joined";
|
|
286 }
|
|
287 }
|
|
288 $feature{range}=\@range;
|
|
289 $feature{location}="deprecated";
|
|
290 my %feature_qualifiers;
|
|
291 for $j (0..$feature_qual_number-1) {
|
|
292 $feature_qual_name=$feature_qual_names->get($j);
|
|
293 $feature_qual_name =~ s/^\///; # eliminates heading "/"
|
|
294
|
|
295 # skip all not interesting (for now) qualifiers
|
|
296 unless ($valid_names{$feature_qual_name}) {
|
|
297 #print "not valid name: $feature_qual_name\n";
|
|
298 next;
|
|
299 }
|
|
300 #print "now processing: $feature_qual_name\n";
|
|
301 $feature_qual_value=$feature_qual_values->get($j);
|
|
302 $feature_qual_value =~ s/^"//; # eliminates heading "
|
|
303 $feature_qual_value =~ s/"$//; # eliminates trailing "
|
|
304 $feature_qual_value =~ s/\n//g; # eliminates all new lines
|
|
305 $feature_qualifiers{$feature_qual_name}=$feature_qual_value;
|
|
306 }
|
|
307 $feature{qualifiers}=\%feature_qualifiers;
|
|
308 push (@features,\%feature); # array of features
|
|
309 }
|
|
310 $entryhash{Features}=\@features; # put this also into the hash
|
|
311 my @cds; # array just of CDSs
|
|
312 for $i (0..$#features) {
|
|
313 if ($features[$i]->{'name'} eq "CDS") {
|
|
314 push(@cds,$features[$i]);
|
|
315 }
|
|
316 }
|
|
317 $entryhash{CDS}=\@cds; # put this also into the hash
|
|
318 return (\%entryhash);
|
|
319 }
|
|
320 }
|
|
321
|
|
322 # argument: location field of an EMBL feature
|
|
323 # returns: array with correct $start $end and $strand to create LiveSeq obj
|
|
324 sub location2range {
|
|
325 my ($location)=@_;
|
|
326 my ($start,$end,$strand);
|
|
327 if (index($location,"complement") == -1) { # forward strand
|
|
328 $strand=1;
|
|
329 } else { # reverse strand
|
|
330 $location =~ s/complement\(//g;
|
|
331 $location =~ s/\)//g;
|
|
332 $strand=-1;
|
|
333 }
|
|
334 $location =~ s/\<//g;
|
|
335 $location =~ s/\>//g;
|
|
336 my @range=split(/\.\./,$location);
|
|
337 if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features
|
|
338 $start=$end=$range[0];
|
|
339 } else {
|
|
340 if ($strand == 1) {
|
|
341 ($start,$end)=@range;
|
|
342 } else { # reverse strand
|
|
343 ($end,$start)=@range;
|
|
344 }
|
|
345 }
|
|
346 return ($start,$end,$strand);
|
|
347 }
|
|
348
|
|
349 # argument: location field of a CDS feature
|
|
350 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
|
|
351 sub cdslocation2transcript {
|
|
352 my ($location)=@_;
|
|
353 my @exonlocs;
|
|
354 my $exonloc;
|
|
355 my @exon;
|
|
356 my @transcript=();
|
|
357 $location =~ s/^join\(//;
|
|
358 $location =~ s/\)$//;
|
|
359 @exonlocs = split (/\,/,$location);
|
|
360 for $exonloc (@exonlocs) {
|
|
361 my @exon=location2range($exonloc);
|
|
362 push (@transcript,\@exon);
|
|
363 }
|
|
364 return (@transcript);
|
|
365 }
|
|
366
|
|
367 # argument: location field of a CDS feature
|
|
368 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
|
|
369 sub joinedlocation2range {
|
|
370 &cdslocation2transcript;
|
|
371 }
|
|
372
|
|
373
|
|
374 =head2 get_swisshash
|
|
375
|
|
376 Title : get_swisshash
|
|
377 Usage : $loader->get_swisshash();
|
|
378 Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275")
|
|
379 Function: retrieves with SRS a SwissProt entry, parses it and creates
|
|
380 a hash that contains all the information.
|
|
381 Returns : a reference to a hash
|
|
382 Errorcode: 0
|
|
383 Args : the db_xref qualifier's value from an EMBL CDS Feature
|
|
384 i.e. a string in the form "SWISS-PROT:accession_number"
|
|
385 Note : this can be modified (adding a second argument) to retrieve
|
|
386 and parse SWTREMBL, SWALL... entries
|
|
387
|
|
388 =cut
|
|
389
|
|
390 # argument: db_xref qualifier's value from EMBL CDS
|
|
391 # errorcode: 0
|
|
392 # returns hashref
|
|
393 sub get_swisshash {
|
|
394 my ($self,$query)=@_;
|
|
395 if (index($query,"SWISS-PROT") == -1) {
|
|
396 return (0);
|
|
397 }
|
|
398 $query =~ s/SWISS-PROT/swissprot-acc/;
|
|
399 my $hashref;
|
|
400 eval {
|
|
401 $hashref=&swissprot2hash("$query");
|
|
402 };
|
|
403 my $errormsg;
|
|
404 if ($@) {
|
|
405 foreach $errormsg (split(/\n/,$@)) {
|
|
406 if (index($errormsg,"in cleanup") == -1) {
|
|
407 carp "SRS Swissprot Loader: $@";
|
|
408 }
|
|
409 }
|
|
410 }
|
|
411 unless ($hashref) {
|
|
412 return (0);
|
|
413 } else {
|
|
414 return ($hashref);
|
|
415 }
|
|
416 }
|
|
417
|
|
418 =head2 swissprot2hash
|
|
419
|
|
420 Title : swissprot2hash
|
|
421 Usage : $loader->swissprot2hash();
|
|
422 Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275")
|
|
423 Function: retrieves with SRS a SwissProt entry, parses it and creates
|
|
424 a hash that contains all the information.
|
|
425 Returns : a reference to a hash
|
|
426 Errorcode: 0
|
|
427 Args : an SRS query resulting in one entry from SwissProt database
|
|
428 i.e. a string in the form "swissprot-acc:accession_number"
|
|
429 Note : this can be modified (adding a second argument) to retrieve
|
|
430 and parse SWTREMBL, SWALL... entries
|
|
431
|
|
432 =cut
|
|
433
|
|
434 # arguments: swissprot SRS query resulting in one fetched swissprot entry
|
|
435 # returns: a reference to a hash containing the important features requested
|
|
436 sub swissprot2hash {
|
|
437 my ($i,$j);
|
|
438 my $query=$_[0];
|
|
439 # SRS API used to fetch all relevant fields
|
|
440 my $sess = new Session;
|
|
441 my $set = $sess->query("[$query]", "");
|
|
442 my $numEntries = $set->size();
|
|
443 if ($numEntries < 1) {
|
|
444 carp "No sequence entry found for the query $query";
|
|
445 return (0);
|
|
446 } elsif ($numEntries > 1) {
|
|
447 carp "Too many entries found for the input given";
|
|
448 return (0);
|
|
449 } else {
|
|
450 my $entry = $set->getEntry(0);
|
|
451 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene);
|
|
452 # Fetch what we can fetch without the loader
|
|
453 $entry_ID = $entry->fieldToString("id","","","");
|
|
454 $entry_AccNumber = $entry->fieldToString("acc","","","");
|
|
455 $entry_Gene = $entry->fieldToString("gen","","","");
|
|
456 $entry_Gene =~ s/\n/ /g;
|
|
457 $entry_Description = $entry->fieldToString("des","","","");
|
|
458 $entry_Description =~ s/\n/ /g;
|
|
459 $entry_Organism = $entry->fieldToString("org","","","");
|
|
460 chop $entry_Organism;
|
|
461 $entry_SeqLength = $entry->fieldToString("sl","","","");
|
|
462 # Now use the loader
|
|
463 my $loadedentry = $entry->load("Swissprot");
|
|
464 # Fetch the rest via the loader
|
|
465 $entry_Sequence = $loadedentry->attrStr("Sequence");
|
|
466 $entry_Sequence =~ s/\n//g; # from plain format to raw string
|
|
467
|
|
468 # put into the hash
|
|
469 my %entryhash;
|
|
470 $entryhash{ID}=$entry_ID;
|
|
471 $entryhash{AccNumber}=$entry_AccNumber;
|
|
472 $entryhash{Molecule}=$entry_Molecule;
|
|
473 $entryhash{Gene}=$entry_Gene;
|
|
474 $entryhash{Description}=$entry_Description;
|
|
475 $entryhash{Organism}=$entry_Organism;
|
|
476 $entryhash{Sequence}=$entry_Sequence;
|
|
477 $entryhash{SeqLength}=$entry_SeqLength;
|
|
478
|
|
479 # create features array
|
|
480 my $features = $loadedentry->attrObjList("Features");
|
|
481 my $featuresnumber= $features->size();
|
|
482 $entryhash{FeaturesNumber}=$featuresnumber;
|
|
483 my ($feature,$feature_name,$feature_description,$feature_location);
|
|
484 my @features;
|
|
485 for $i (0..$featuresnumber-1) {
|
|
486 my %feature;
|
|
487 $feature = $features->get($i);
|
|
488 $feature_name = $feature->attrStr("FtKey");
|
|
489 $feature_location = $feature->attrStr("FtLocation");
|
|
490 $feature_location =~ s/ +/ /g;
|
|
491 $feature_description = $feature->attrStr("FtDescription");
|
|
492 chop $feature_description;
|
|
493 $feature_description =~ s/\nFT / /g;
|
|
494
|
|
495 # Put things inside hash
|
|
496 $feature{position}=$i;
|
|
497 $feature{name}=$feature_name;
|
|
498 $feature{location}=$feature_location;
|
|
499 $feature{description}=$feature_description;
|
|
500
|
|
501 push (@features,\%feature); # array of features
|
|
502 }
|
|
503 $entryhash{Features}=\@features; # put this also into the hash
|
|
504 return (\%entryhash);
|
|
505 }
|
|
506 }
|
|
507
|
|
508 =head2 novelaasequence2gene
|
|
509
|
|
510 Title : novelaasequence2gene
|
|
511 Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
|
|
512 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
|
|
513 -genome => "Homo sapiens");
|
|
514 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
|
|
515 -genome => "Mitochondrion Homo sapiens",
|
|
516 -gene_name => "tyr-kinase");
|
|
517
|
|
518 Function: creates LiveSeq objects from a novel amino acid sequence,
|
|
519 using codon usage database to choose codons according to
|
|
520 relative frequencies.
|
|
521 If a genome latin name is not specified, the default is to use
|
|
522 'Homo sapiens' (taxonomy ID 9606).
|
|
523 Returns : reference to a Gene object containing references to LiveSeq objects
|
|
524 Errorcode 0
|
|
525 Args : string containing an amino acid sequence
|
|
526 string (optional) with a species/genome latin name
|
|
527 string specifying a gene name
|
|
528 Note : SRS access to TAXON and CODONUSAGE databases is required
|
|
529
|
|
530 =cut
|
|
531
|
|
532 sub novelaasequence2gene {
|
|
533 my ($self, %args) = @_;
|
|
534 my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence});
|
|
535 unless ($aasequence) {
|
|
536 carp "aasequence not given";
|
|
537 return (0);
|
|
538 }
|
|
539 unless ($gene_name) {
|
|
540 $gene_name="Novel Unknown";
|
|
541 }
|
|
542 unless ($species_name) {
|
|
543 $species_name="Homo sapiens";
|
|
544 }
|
|
545
|
|
546 my $sess = new Session;
|
|
547 my ($e,$numEntries,$set);
|
|
548
|
|
549 # codonusage lookup
|
|
550 my $codonusage_usage;
|
|
551 my @species_codon_usage;
|
|
552 $set = $sess->query("[codonusage:'$species_name']", "");
|
|
553 $numEntries = $set->size();
|
|
554 if ($numEntries > 0) {
|
|
555 $e = $set->getEntry(0);
|
|
556 $species_name = $e->fieldToString("id","","","");
|
|
557 $codonusage_usage = $e->fieldToString("usage","","","");
|
|
558 @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs
|
|
559 if ($numEntries > 1) {
|
|
560 carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name";
|
|
561 }
|
|
562 } else {
|
|
563 carp "Genome not found in codon usage DB.";
|
|
564 return (0);
|
|
565 }
|
|
566
|
|
567 # taxonomy lookup
|
|
568 my $mito_flag = 0;
|
|
569 my $species_origin;
|
|
570 if ($species_name =~ /^Mitochondrion /) {
|
|
571 $mito_flag = 1;
|
|
572 $species_origin = $species_name;
|
|
573 $species_origin =~ s/^Mitochondrion //;
|
|
574 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
|
|
575 } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) {
|
|
576 $species_origin = $species_name;
|
|
577 $species_origin =~ s/^Chromoplast //;
|
|
578 $species_origin =~ s/^Kinetoplast //;
|
|
579 $species_origin =~ s/^Chloroplast //;
|
|
580 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
|
|
581 } else {
|
|
582 $set = $sess->query("[taxonomy-species:'$species_name']", "");
|
|
583 }
|
|
584 $numEntries = $set->size();
|
|
585 my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name);
|
|
586 if ($numEntries > 0) {
|
|
587 $e = $set->getEntry(0);
|
|
588 $taxonomy_id = $e->fieldToString("id","","","");
|
|
589 $taxonomy_name = $e->fieldToString("species","","","");
|
|
590 $taxonomy_gc = $e->fieldToString("gc","","","");
|
|
591 $taxonomy_mgc = $e->fieldToString("mgc","","","");
|
|
592 if ($numEntries > 1) {
|
|
593 carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)";
|
|
594 }
|
|
595 } else {
|
|
596 carp "Genome not found in taxonomy DB.";
|
|
597 return (0);
|
|
598 }
|
|
599 # Lookup appropriate translation table
|
|
600 my $ttabid;
|
|
601 if ($mito_flag) {
|
|
602 $ttabid = $taxonomy_mgc;
|
|
603 } else {
|
|
604 $ttabid = $taxonomy_gc;
|
|
605 }
|
|
606
|
|
607 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
|
|
608 return ($gene);
|
|
609 }
|
|
610
|
|
611 1;
|