comparison variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $
2 #
3 # bioperl module for Bio::LiveSeq::IO::SRS
4 #
5 # Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
6 #
7 # Copyright Joseph Insana
8 #
9 # You may distribute this module under the same terms as perl itself
10 #
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS
16
17 =head1 SYNOPSIS
18
19 my $db="EMBL";
20 my $acc_id="M20132";
21 my $query="embl-acc:$acc_id";
22
23 my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
24
25 my @translationobjects=$loader->entry2liveseq();
26
27 my $gene="AR";
28 my $gene=$loader->gene2liveseq("gene");
29
30 NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL.
31
32 =head1 DESCRIPTION
33
34 This package uses the SRS (Sequence Retrieval System) to fetch a sequence
35 database entry, analyse it and create LiveSeq objects out of it.
36
37 An embl-acc ID has to be passed to this package which will return references
38 to all translation objects created from the EMBL entry.
39 References to Transcription, DNA and Exon objects can all be retrieved departing
40 from these.
41
42 Alternatively, a specific "gene" name can be specified, together with the
43 embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene
44 features attached/created.
45
46 =head1 AUTHOR - Joseph A.L. Insana
47
48 Email: Insana@ebi.ac.uk, jinsana@gmx.net
49
50 Address:
51
52 EMBL Outstation, European Bioinformatics Institute
53 Wellcome Trust Genome Campus, Hinxton
54 Cambs. CB10 1SD, United Kingdom
55
56 =head1 APPENDIX
57
58 The rest of the documentation details each of the object
59 methods. Internal methods are usually preceded with a _
60
61 =cut
62
63 # Let the code begin...
64
65 package Bio::LiveSeq::IO::SRS;
66 $VERSION=2.4;
67
68 # Version history:
69 # Wed Apr 5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm
70 # Wed Apr 5 15:57:22 BST 2000 v 1.1 load() created
71 # Thu Apr 6 02:52:11 BST 2000 v 1.2 new field "range" in hash
72 # Thu Apr 6 03:11:29 BST 2000 v 1.22 transition from $location to @range
73 # Thu Apr 6 03:40:04 BST 2000 v 1.25 added Division
74 # Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash
75 # Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names
76 # Wed Jun 7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features
77 # Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!)
78 # Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq
79 # Tue Jul 4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names
80 # Mon Sep 4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added
81
82 use strict;
83 use Carp qw(cluck croak carp);
84 use vars qw($VERSION @ISA);
85 use lib $ENV{SRSEXE};
86 use srsperl;
87 use Bio::Tools::CodonTable; # for novelaasequence2gene
88
89 use Bio::LiveSeq::IO::Loader 2.2;
90
91 @ISA=qw(Bio::LiveSeq::IO::Loader);
92
93 # This package can in the future host other databases loading subroutines.
94 # e.g. ensembl2hash
95
96 =head2 load
97
98 Title : load
99 Usage : my $acc_id="M20132";
100 my $query="embl-acc:$acc_id";
101 $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
102
103 Function: loads an entry with SRS from a database into a hash
104 Returns : reference to a new object of class IO::SRS holding an entry
105 Errorcode 0
106 Args : an SRS query resulting in one fetched EMBL (by default) entry
107
108 =cut
109
110 sub load {
111 my ($thing, %args) = @_;
112 my $class = ref($thing) || $thing;
113 my ($obj,%loader);
114
115 my ($db,$query)=($args{-db},$args{-query});
116
117 if (defined($db)) {
118 unless ($db eq "EMBL") {
119 carp "Note: only EMBL now supported!";
120 return(0);
121 }
122 } else {
123 $db="EMBL";
124 }
125
126 my $hashref;
127 if ($db eq "EMBL") {
128 my $test_transl=0; # change to 0 to avoid comparison of translation
129
130 # these can be changed for future needs
131 my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA);
132 my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
133
134 # dunno yet how to implement test_transl again....
135 # probably on a one-on-one basis with each translation?
136 if ($test_transl) {
137 push (@embl_valid_qual_names,"translation"); # needed for test_transl
138 }
139
140 # not to have the whole program die because of SRS death
141 eval {
142 $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names);
143 };
144 my $errormsg;
145 if ($@) {
146 foreach $errormsg (split(/\n/,$@)) {
147 if (index($errormsg,"in cleanup") == -1) {
148 carp "SRS EMBL Loader: $@";
149 }
150 }
151 }
152 }
153 unless ($hashref) { return (0); }
154
155 %loader = (db => $db, query => $query, hash => $hashref);
156 $obj = \%loader;
157 $obj = bless $obj, $class;
158 return $obj;
159 }
160
161 =head2 embl2hash
162
163 Title : embl2hash
164 Function: retrieves with SRS an EMBL entry, parses it and creates
165 a hash that contains all the information.
166 Returns : a reference to a hash
167 Errorcode: 0
168 Args : an SRS query resulting in one fetched EMBL entry
169 i.e. a string in a form like "embl-acc:accession_number"
170 two array references to skip features and qualifiers (for
171 performance)
172 Example: @valid_features=qw(CDS exon prim_transcript mRNA);
173 @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
174 $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers);
175
176 =cut
177
178 # this has to be defined here as it is the only thing really proper to
179 # the /SRS/ loader. Every loader has to sport his own "entry2hash" function
180 # the main functions will be in the Loader.pm
181 # arguments: embl SRS query resulting in one fetched EMBL entry
182 # to skip features and qualifiers (for performance), two array
183 # references must be passed (this can change into string arguments to
184 # be passed....)
185 # returns: a reference to a hash containing the important features requested
186 sub embl2hash {
187 my ($i,$j);
188 my $query=$_[0];
189 my %valid_features; my %valid_names;
190 if ($_[1]) {
191 %valid_features = map {$_, 1} @{$_[1]}; # to skip features
192 }
193 if ($_[2]) {
194 %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
195 }
196 # SRS API used to fetch all relevant fields
197 my $sess = new Session;
198 my $set = $sess->query("[$query]", "");
199 my $numEntries=$set->size();
200 if ($numEntries < 1) {
201 carp "No sequence entry found for the query $query";
202 return (0);
203 } elsif ($numEntries > 1) {
204 carp "Too many entries found for the input given";
205 return (0);
206 } else {
207 my $entry = $set->getEntry(0);
208 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division);
209 # Fetch what we can fetch without the loader
210 $entry_ID = $entry->fieldToString("id","","","");
211 $entry_AccNumber = $entry->fieldToString("acc","","","");
212 $entry_Molecule = $entry->fieldToString("mol","","","");
213 $entry_Division = $entry->fieldToString("div","","","");
214 $entry_Description = $entry->fieldToString("des","","","");
215 $entry_Description =~ s/\n/ /g;
216 $entry_Organism = $entry->fieldToString("org","","","");
217 $entry_SeqLength = $entry->fieldToString("sl","","","");
218 # Now use the loader
219 my $loadedentry = $entry->load("EMBL");
220 # Fetch the rest via the loader
221 $entry_Sequence = $loadedentry->attrStr("sequence");
222 $entry_Sequence =~ s/\n//g; # from plain format to raw string
223
224 # put into the hash
225 my %entryhash;
226 $entryhash{ID}=$entry_ID;
227 $entryhash{AccNumber}=$entry_AccNumber;
228 $entryhash{Molecule}=$entry_Molecule;
229 $entryhash{Division}=$entry_Division;
230 $entryhash{Description}=$entry_Description;
231 $entryhash{Organism}=$entry_Organism;
232 $entryhash{Sequence}=$entry_Sequence;
233 $entryhash{SeqLength}=$entry_SeqLength;
234
235 # create features array
236 my $features = $loadedentry->attrObjList("features");
237 my $featuresnumber= $features->size();
238 $entryhash{FeaturesNumber}=$featuresnumber;
239 my ($feature,$feature_name,$feature_location);
240 my ($feature_qual_names,$feature_qual_values);
241 my ($feature_qual_name,$feature_qual_value);
242 my ($feature_qual_number,$feature_qual_number2);
243 my @features;
244 for $i (0..$featuresnumber-1) {
245 my %feature;
246 $feature = $features->get($i);
247 $feature_name = $feature->attrStr("FtKey");
248
249 #unless ($feature_name eq "CDS") {
250 unless ($valid_features{$feature_name}) {
251 #print "not valid feature: $feature_name\n";
252 next;
253 }
254 #print "now processing feature: $feature_name\n";
255 $feature_location = $feature->attrStr("FtLocation");
256 $feature_location =~ s/\n//g;
257 $feature_qual_names= $feature->attrStrList("FtQualNames");
258 $feature_qual_values= $feature->attrStrList("FtQualValues");
259 $feature_qual_number= $feature_qual_names->size();
260 $feature_qual_number2= $feature_qual_values->size(); # paranoia check
261 if ($feature_qual_number > $feature_qual_number2) {
262 carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values.");
263 # this can happen, e.g. "/partial", let's move on, just issue a warning
264 #return (0);
265 } elsif ($feature_qual_number < $feature_qual_number2) {
266 carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped");
267 return (0);
268 }
269 #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG
270
271 # Put things inside hash
272 $feature{position}=$i;
273 $feature{name}=$feature_name;
274
275 # new range field in hash
276 my @range;
277 if ($feature_name eq "CDS") {
278 @range=cdslocation2transcript($feature_location);
279 $feature{locationtype}="joined";
280 } else {
281 if (index($feature_location,"join") == -1) {
282 @range=location2range($feature_location);
283 } else { # complex location in feature different than CDS
284 @range=joinedlocation2range($feature_location);
285 $feature{locationtype}="joined";
286 }
287 }
288 $feature{range}=\@range;
289 $feature{location}="deprecated";
290 my %feature_qualifiers;
291 for $j (0..$feature_qual_number-1) {
292 $feature_qual_name=$feature_qual_names->get($j);
293 $feature_qual_name =~ s/^\///; # eliminates heading "/"
294
295 # skip all not interesting (for now) qualifiers
296 unless ($valid_names{$feature_qual_name}) {
297 #print "not valid name: $feature_qual_name\n";
298 next;
299 }
300 #print "now processing: $feature_qual_name\n";
301 $feature_qual_value=$feature_qual_values->get($j);
302 $feature_qual_value =~ s/^"//; # eliminates heading "
303 $feature_qual_value =~ s/"$//; # eliminates trailing "
304 $feature_qual_value =~ s/\n//g; # eliminates all new lines
305 $feature_qualifiers{$feature_qual_name}=$feature_qual_value;
306 }
307 $feature{qualifiers}=\%feature_qualifiers;
308 push (@features,\%feature); # array of features
309 }
310 $entryhash{Features}=\@features; # put this also into the hash
311 my @cds; # array just of CDSs
312 for $i (0..$#features) {
313 if ($features[$i]->{'name'} eq "CDS") {
314 push(@cds,$features[$i]);
315 }
316 }
317 $entryhash{CDS}=\@cds; # put this also into the hash
318 return (\%entryhash);
319 }
320 }
321
322 # argument: location field of an EMBL feature
323 # returns: array with correct $start $end and $strand to create LiveSeq obj
324 sub location2range {
325 my ($location)=@_;
326 my ($start,$end,$strand);
327 if (index($location,"complement") == -1) { # forward strand
328 $strand=1;
329 } else { # reverse strand
330 $location =~ s/complement\(//g;
331 $location =~ s/\)//g;
332 $strand=-1;
333 }
334 $location =~ s/\<//g;
335 $location =~ s/\>//g;
336 my @range=split(/\.\./,$location);
337 if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features
338 $start=$end=$range[0];
339 } else {
340 if ($strand == 1) {
341 ($start,$end)=@range;
342 } else { # reverse strand
343 ($end,$start)=@range;
344 }
345 }
346 return ($start,$end,$strand);
347 }
348
349 # argument: location field of a CDS feature
350 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
351 sub cdslocation2transcript {
352 my ($location)=@_;
353 my @exonlocs;
354 my $exonloc;
355 my @exon;
356 my @transcript=();
357 $location =~ s/^join\(//;
358 $location =~ s/\)$//;
359 @exonlocs = split (/\,/,$location);
360 for $exonloc (@exonlocs) {
361 my @exon=location2range($exonloc);
362 push (@transcript,\@exon);
363 }
364 return (@transcript);
365 }
366
367 # argument: location field of a CDS feature
368 # returns: array of exons arrayref, useful to create LiveSeq Transcript obj
369 sub joinedlocation2range {
370 &cdslocation2transcript;
371 }
372
373
374 =head2 get_swisshash
375
376 Title : get_swisshash
377 Usage : $loader->get_swisshash();
378 Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275")
379 Function: retrieves with SRS a SwissProt entry, parses it and creates
380 a hash that contains all the information.
381 Returns : a reference to a hash
382 Errorcode: 0
383 Args : the db_xref qualifier's value from an EMBL CDS Feature
384 i.e. a string in the form "SWISS-PROT:accession_number"
385 Note : this can be modified (adding a second argument) to retrieve
386 and parse SWTREMBL, SWALL... entries
387
388 =cut
389
390 # argument: db_xref qualifier's value from EMBL CDS
391 # errorcode: 0
392 # returns hashref
393 sub get_swisshash {
394 my ($self,$query)=@_;
395 if (index($query,"SWISS-PROT") == -1) {
396 return (0);
397 }
398 $query =~ s/SWISS-PROT/swissprot-acc/;
399 my $hashref;
400 eval {
401 $hashref=&swissprot2hash("$query");
402 };
403 my $errormsg;
404 if ($@) {
405 foreach $errormsg (split(/\n/,$@)) {
406 if (index($errormsg,"in cleanup") == -1) {
407 carp "SRS Swissprot Loader: $@";
408 }
409 }
410 }
411 unless ($hashref) {
412 return (0);
413 } else {
414 return ($hashref);
415 }
416 }
417
418 =head2 swissprot2hash
419
420 Title : swissprot2hash
421 Usage : $loader->swissprot2hash();
422 Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275")
423 Function: retrieves with SRS a SwissProt entry, parses it and creates
424 a hash that contains all the information.
425 Returns : a reference to a hash
426 Errorcode: 0
427 Args : an SRS query resulting in one entry from SwissProt database
428 i.e. a string in the form "swissprot-acc:accession_number"
429 Note : this can be modified (adding a second argument) to retrieve
430 and parse SWTREMBL, SWALL... entries
431
432 =cut
433
434 # arguments: swissprot SRS query resulting in one fetched swissprot entry
435 # returns: a reference to a hash containing the important features requested
436 sub swissprot2hash {
437 my ($i,$j);
438 my $query=$_[0];
439 # SRS API used to fetch all relevant fields
440 my $sess = new Session;
441 my $set = $sess->query("[$query]", "");
442 my $numEntries = $set->size();
443 if ($numEntries < 1) {
444 carp "No sequence entry found for the query $query";
445 return (0);
446 } elsif ($numEntries > 1) {
447 carp "Too many entries found for the input given";
448 return (0);
449 } else {
450 my $entry = $set->getEntry(0);
451 my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene);
452 # Fetch what we can fetch without the loader
453 $entry_ID = $entry->fieldToString("id","","","");
454 $entry_AccNumber = $entry->fieldToString("acc","","","");
455 $entry_Gene = $entry->fieldToString("gen","","","");
456 $entry_Gene =~ s/\n/ /g;
457 $entry_Description = $entry->fieldToString("des","","","");
458 $entry_Description =~ s/\n/ /g;
459 $entry_Organism = $entry->fieldToString("org","","","");
460 chop $entry_Organism;
461 $entry_SeqLength = $entry->fieldToString("sl","","","");
462 # Now use the loader
463 my $loadedentry = $entry->load("Swissprot");
464 # Fetch the rest via the loader
465 $entry_Sequence = $loadedentry->attrStr("Sequence");
466 $entry_Sequence =~ s/\n//g; # from plain format to raw string
467
468 # put into the hash
469 my %entryhash;
470 $entryhash{ID}=$entry_ID;
471 $entryhash{AccNumber}=$entry_AccNumber;
472 $entryhash{Molecule}=$entry_Molecule;
473 $entryhash{Gene}=$entry_Gene;
474 $entryhash{Description}=$entry_Description;
475 $entryhash{Organism}=$entry_Organism;
476 $entryhash{Sequence}=$entry_Sequence;
477 $entryhash{SeqLength}=$entry_SeqLength;
478
479 # create features array
480 my $features = $loadedentry->attrObjList("Features");
481 my $featuresnumber= $features->size();
482 $entryhash{FeaturesNumber}=$featuresnumber;
483 my ($feature,$feature_name,$feature_description,$feature_location);
484 my @features;
485 for $i (0..$featuresnumber-1) {
486 my %feature;
487 $feature = $features->get($i);
488 $feature_name = $feature->attrStr("FtKey");
489 $feature_location = $feature->attrStr("FtLocation");
490 $feature_location =~ s/ +/ /g;
491 $feature_description = $feature->attrStr("FtDescription");
492 chop $feature_description;
493 $feature_description =~ s/\nFT / /g;
494
495 # Put things inside hash
496 $feature{position}=$i;
497 $feature{name}=$feature_name;
498 $feature{location}=$feature_location;
499 $feature{description}=$feature_description;
500
501 push (@features,\%feature); # array of features
502 }
503 $entryhash{Features}=\@features; # put this also into the hash
504 return (\%entryhash);
505 }
506 }
507
508 =head2 novelaasequence2gene
509
510 Title : novelaasequence2gene
511 Usage : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
512 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
513 -genome => "Homo sapiens");
514 : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
515 -genome => "Mitochondrion Homo sapiens",
516 -gene_name => "tyr-kinase");
517
518 Function: creates LiveSeq objects from a novel amino acid sequence,
519 using codon usage database to choose codons according to
520 relative frequencies.
521 If a genome latin name is not specified, the default is to use
522 'Homo sapiens' (taxonomy ID 9606).
523 Returns : reference to a Gene object containing references to LiveSeq objects
524 Errorcode 0
525 Args : string containing an amino acid sequence
526 string (optional) with a species/genome latin name
527 string specifying a gene name
528 Note : SRS access to TAXON and CODONUSAGE databases is required
529
530 =cut
531
532 sub novelaasequence2gene {
533 my ($self, %args) = @_;
534 my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence});
535 unless ($aasequence) {
536 carp "aasequence not given";
537 return (0);
538 }
539 unless ($gene_name) {
540 $gene_name="Novel Unknown";
541 }
542 unless ($species_name) {
543 $species_name="Homo sapiens";
544 }
545
546 my $sess = new Session;
547 my ($e,$numEntries,$set);
548
549 # codonusage lookup
550 my $codonusage_usage;
551 my @species_codon_usage;
552 $set = $sess->query("[codonusage:'$species_name']", "");
553 $numEntries = $set->size();
554 if ($numEntries > 0) {
555 $e = $set->getEntry(0);
556 $species_name = $e->fieldToString("id","","","");
557 $codonusage_usage = $e->fieldToString("usage","","","");
558 @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs
559 if ($numEntries > 1) {
560 carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name";
561 }
562 } else {
563 carp "Genome not found in codon usage DB.";
564 return (0);
565 }
566
567 # taxonomy lookup
568 my $mito_flag = 0;
569 my $species_origin;
570 if ($species_name =~ /^Mitochondrion /) {
571 $mito_flag = 1;
572 $species_origin = $species_name;
573 $species_origin =~ s/^Mitochondrion //;
574 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
575 } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) {
576 $species_origin = $species_name;
577 $species_origin =~ s/^Chromoplast //;
578 $species_origin =~ s/^Kinetoplast //;
579 $species_origin =~ s/^Chloroplast //;
580 $set = $sess->query("[taxonomy-species:'$species_origin']", "");
581 } else {
582 $set = $sess->query("[taxonomy-species:'$species_name']", "");
583 }
584 $numEntries = $set->size();
585 my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name);
586 if ($numEntries > 0) {
587 $e = $set->getEntry(0);
588 $taxonomy_id = $e->fieldToString("id","","","");
589 $taxonomy_name = $e->fieldToString("species","","","");
590 $taxonomy_gc = $e->fieldToString("gc","","","");
591 $taxonomy_mgc = $e->fieldToString("mgc","","","");
592 if ($numEntries > 1) {
593 carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)";
594 }
595 } else {
596 carp "Genome not found in taxonomy DB.";
597 return (0);
598 }
599 # Lookup appropriate translation table
600 my $ttabid;
601 if ($mito_flag) {
602 $ttabid = $taxonomy_mgc;
603 } else {
604 $ttabid = $taxonomy_gc;
605 }
606
607 my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
608 return ($gene);
609 }
610
611 1;