Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/LiveSeq/IO/SRS.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,611 @@
+# $Id: SRS.pm,v 1.7 2001/06/18 08:27:55 heikki Exp $
+#
+# bioperl module for Bio::LiveSeq::IO::SRS
+#
+# Cared for by Joseph Insana <insana@ebi.ac.uk> <jinsana@gmx.net>
+#
+# Copyright Joseph Insana
+#
+# You may distribute this module under the same terms as perl itself
+#
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::LiveSeq::IO::SRS - Loader for LiveSeq from EMBL entries with SRS
+
+=head1 SYNOPSIS
+
+  my $db="EMBL";
+  my $acc_id="M20132";
+  my $query="embl-acc:$acc_id";
+
+  my $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
+
+  my @translationobjects=$loader->entry2liveseq();
+
+  my $gene="AR";
+  my $gene=$loader->gene2liveseq("gene");
+
+  NOTE: The only -db now supported is EMBL. Hence it defaults to EMBL.
+
+=head1 DESCRIPTION
+
+This package uses the SRS (Sequence Retrieval System) to fetch a sequence
+database entry, analyse it and create LiveSeq objects out of it.
+
+An embl-acc ID has to be passed to this package which will return references
+to all translation objects created from the EMBL entry.
+References to Transcription, DNA and Exon objects can all be retrieved departing
+from these.
+
+Alternatively, a specific "gene" name can be specified, together with the
+embl-acc ID. This will create a LiveSeq::Gene object with all relevant gene
+features attached/created.
+
+=head1 AUTHOR - Joseph A.L. Insana
+
+Email:  Insana@ebi.ac.uk, jinsana@gmx.net
+
+Address: 
+
+     EMBL Outstation, European Bioinformatics Institute
+     Wellcome Trust Genome Campus, Hinxton
+     Cambs. CB10 1SD, United Kingdom 
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+# Let the code begin...
+
+package Bio::LiveSeq::IO::SRS;
+$VERSION=2.4;
+
+# Version history:
+# Wed Apr  5 13:06:43 BST 2000 v 1.0 restarted as a child of Loader.pm
+# Wed Apr  5 15:57:22 BST 2000 v 1.1 load() created
+# Thu Apr  6 02:52:11 BST 2000 v 1.2 new field "range" in hash
+# Thu Apr  6 03:11:29 BST 2000 v 1.22 transition from $location to @range
+# Thu Apr  6 03:40:04 BST 2000 v 1.25 added Division
+# Tue Apr 18 17:15:26 BST 2000 v 2.0 started coding swissprot2hash
+# Thu Apr 20 02:17:28 BST 2000 v 2.1 mRNA added to valid_feature_names
+# Wed Jun  7 02:08:57 BST 2000 v 2.2 added support for joinedlocation features
+# Thu Jun 29 19:07:59 BST 2000 v 2.22 Gene stripped of possible newlines (horrible characteristic of some entries!!!!)
+# Fri Jun 30 14:08:21 BST 2000 v 2.3 SRS querying routines now conveniently wrapped in eval{} blocks to avoid SRS crash the whole LiveSeq
+# Tue Jul  4 14:07:52 BST 2000 v 2.31 note&number added in val_qual_names
+# Mon Sep  4 17:46:42 BST 2000 v 2.4 novelaasequence2gene() added
+
+use strict;
+use Carp qw(cluck croak carp);
+use vars qw($VERSION @ISA);
+use lib $ENV{SRSEXE};
+use srsperl;
+use Bio::Tools::CodonTable; # for novelaasequence2gene
+
+use Bio::LiveSeq::IO::Loader 2.2;
+
+@ISA=qw(Bio::LiveSeq::IO::Loader);
+
+# This package can in the future host other databases loading subroutines.
+# e.g. ensembl2hash
+
+=head2 load
+
+  Title   : load
+  Usage   : my $acc_id="M20132";
+            my $query="embl-acc:$acc_id";
+            $loader=Bio::LiveSeq::IO::SRS->load(-db=>"EMBL", -query=>"$query");
+
+  Function: loads an entry with SRS from a database into a hash
+  Returns : reference to a new object of class IO::SRS holding an entry
+  Errorcode 0
+  Args    : an SRS query resulting in one fetched EMBL (by default) entry
+
+=cut
+
+sub load {
+  my ($thing, %args) = @_;
+  my $class = ref($thing) || $thing;
+  my ($obj,%loader);
+
+  my ($db,$query)=($args{-db},$args{-query});
+
+  if (defined($db)) {
+    unless ($db eq "EMBL") {
+      carp "Note: only EMBL now supported!";
+      return(0);
+    }
+  } else {
+    $db="EMBL";
+  }
+
+  my $hashref;
+  if ($db eq "EMBL") {
+    my $test_transl=0; # change to 0 to avoid comparison of translation
+
+    # these can be changed for future needs
+    my @embl_valid_feature_names=qw(CDS exon prim_transcript intron repeat_unit repeat_region mRNA);
+    my @embl_valid_qual_names=qw(gene codon_start db_xref product note number rpt_family transl_table);
+
+    # dunno yet how to implement test_transl again....
+    # probably on a one-on-one basis with each translation?
+    if ($test_transl) {
+      push (@embl_valid_qual_names,"translation"); # needed for test_transl
+    }
+
+    # not to have the whole program die because of SRS death
+    eval {
+      $hashref=&embl2hash("$query",\@embl_valid_feature_names,\@embl_valid_qual_names);
+    };
+    my $errormsg;
+    if ($@) {
+      foreach $errormsg (split(/\n/,$@)) {
+	if (index($errormsg,"in cleanup") == -1) {
+	  carp "SRS EMBL Loader: $@";
+	}
+      }
+    }
+  }
+  unless ($hashref) { return (0); }
+
+  %loader = (db => $db, query => $query, hash => $hashref);
+  $obj = \%loader;
+  $obj = bless $obj, $class;
+  return $obj;
+}
+
+=head2 embl2hash
+
+  Title   : embl2hash
+  Function: retrieves with SRS an EMBL entry, parses it and creates
+            a hash that contains all the information.
+  Returns : a reference to a hash
+  Errorcode: 0
+  Args    : an SRS query resulting in one fetched EMBL entry
+              i.e. a string in a form like "embl-acc:accession_number"
+	    two array references to skip features and qualifiers (for
+	    performance)
+  Example: @valid_features=qw(CDS exon prim_transcript mRNA);
+           @valid_qualifiers=qw(gene codon_start db_xref product rpt_family);
+           $hashref=&embl2hash("$query",\@valid_features,\@valid_qualifiers);
+
+=cut
+
+# this has to be defined here as it is the only thing really proper to
+# the /SRS/ loader. Every loader has to sport his own "entry2hash" function
+# the main functions will be in the Loader.pm
+# arguments: embl SRS query resulting in one fetched EMBL entry
+# to skip features and qualifiers (for performance), two array
+# references must be passed (this can change into string arguments to
+# be passed....)
+# returns: a reference to a hash containing the important features requested
+sub embl2hash {
+  my ($i,$j);
+  my $query=$_[0];
+  my %valid_features; my %valid_names;
+  if ($_[1]) {
+    %valid_features = map {$_, 1} @{$_[1]}; # to skip features
+  }
+  if ($_[2]) {
+    %valid_names = map {$_, 1} @{$_[2]}; # to skip qualifiers
+  }
+  # SRS API used to fetch all relevant fields
+  my $sess = new Session;
+  my $set = $sess->query("[$query]", "");
+  my $numEntries=$set->size();
+  if ($numEntries < 1) {
+    carp "No sequence entry found for the query $query";
+    return (0);
+  } elsif ($numEntries > 1) {
+    carp "Too many entries found for the input given";
+    return (0);
+  } else {
+    my $entry = $set->getEntry(0);
+    my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Division);
+    # Fetch what we can fetch without the loader
+    $entry_ID = $entry->fieldToString("id","","","");
+    $entry_AccNumber = $entry->fieldToString("acc","","","");
+    $entry_Molecule = $entry->fieldToString("mol","","","");
+    $entry_Division = $entry->fieldToString("div","","","");
+    $entry_Description = $entry->fieldToString("des","","","");
+    $entry_Description =~ s/\n/ /g;
+    $entry_Organism = $entry->fieldToString("org","","","");
+    $entry_SeqLength = $entry->fieldToString("sl","","","");
+    # Now use the loader
+    my $loadedentry = $entry->load("EMBL");
+    # Fetch the rest via the loader
+    $entry_Sequence = $loadedentry->attrStr("sequence");
+    $entry_Sequence =~ s/\n//g; # from plain format to raw string
+
+    # put into the hash
+    my %entryhash;
+    $entryhash{ID}=$entry_ID;
+    $entryhash{AccNumber}=$entry_AccNumber;
+    $entryhash{Molecule}=$entry_Molecule;
+    $entryhash{Division}=$entry_Division;
+    $entryhash{Description}=$entry_Description;
+    $entryhash{Organism}=$entry_Organism;
+    $entryhash{Sequence}=$entry_Sequence;
+    $entryhash{SeqLength}=$entry_SeqLength;
+
+    # create features array
+    my $features = $loadedentry->attrObjList("features");
+    my $featuresnumber= $features->size();
+    $entryhash{FeaturesNumber}=$featuresnumber;
+    my ($feature,$feature_name,$feature_location);
+    my ($feature_qual_names,$feature_qual_values);
+    my ($feature_qual_name,$feature_qual_value);
+    my ($feature_qual_number,$feature_qual_number2);
+    my @features;
+    for $i (0..$featuresnumber-1) {
+      my %feature;
+      $feature = $features->get($i);
+      $feature_name = $feature->attrStr("FtKey");
+
+      #unless ($feature_name eq "CDS") {
+      unless ($valid_features{$feature_name}) {
+	#print "not valid feature: $feature_name\n";
+	next;
+      }
+      #print "now processing feature: $feature_name\n";
+      $feature_location = $feature->attrStr("FtLocation");
+      $feature_location =~ s/\n//g;
+      $feature_qual_names= $feature->attrStrList("FtQualNames");
+      $feature_qual_values= $feature->attrStrList("FtQualValues");
+      $feature_qual_number= $feature_qual_names->size();
+      $feature_qual_number2= $feature_qual_values->size(); # paranoia check
+      if ($feature_qual_number > $feature_qual_number2) {
+	carp ("Warning with Feature at position $i ($feature_name): There are MORE qualifier names than qualifier values.");
+	# this can happen, e.g. "/partial", let's move on, just issue a warning
+	#return (0);
+      } elsif ($feature_qual_number < $feature_qual_number2) {
+	carp ("Error with Feature at position $i ($feature_name): There are LESS qualifier names than qualifier values. Stopped");
+	return (0);
+      }
+      #} else {print "NUMBER OF QUALIFIERS: $feature_qual_number \n";} # DEBUG
+
+      # Put things inside hash
+      $feature{position}=$i;
+      $feature{name}=$feature_name;
+
+      # new range field in hash
+      my @range;
+      if ($feature_name eq "CDS") {
+	@range=cdslocation2transcript($feature_location);
+	$feature{locationtype}="joined";
+      } else {
+	if (index($feature_location,"join") == -1) {
+	  @range=location2range($feature_location);
+	} else { # complex location in feature different than CDS
+	  @range=joinedlocation2range($feature_location);
+	  $feature{locationtype}="joined";
+	}
+      }
+      $feature{range}=\@range;
+      $feature{location}="deprecated";
+      my %feature_qualifiers;
+      for $j (0..$feature_qual_number-1) {
+	$feature_qual_name=$feature_qual_names->get($j);
+	$feature_qual_name =~ s/^\///; # eliminates heading "/"
+
+	# skip all not interesting (for now) qualifiers
+        unless ($valid_names{$feature_qual_name}) {
+	  #print "not valid name: $feature_qual_name\n";
+	  next;
+	}
+	#print "now processing: $feature_qual_name\n";
+	$feature_qual_value=$feature_qual_values->get($j);
+	$feature_qual_value =~ s/^"//; # eliminates heading "
+	$feature_qual_value =~ s/"$//; # eliminates trailing "
+	$feature_qual_value =~ s/\n//g; # eliminates all new lines
+	$feature_qualifiers{$feature_qual_name}=$feature_qual_value;
+      }
+      $feature{qualifiers}=\%feature_qualifiers;
+      push (@features,\%feature); # array of features
+    }
+    $entryhash{Features}=\@features; # put this also into the hash
+    my @cds; # array just of CDSs
+    for $i (0..$#features) {
+      if ($features[$i]->{'name'} eq "CDS") {
+	push(@cds,$features[$i]);
+      }
+    }
+    $entryhash{CDS}=\@cds; # put this also into the hash
+    return (\%entryhash);
+  }
+}
+
+# argument: location field of an EMBL feature
+# returns: array with correct $start $end and $strand to create LiveSeq obj
+sub location2range {
+  my ($location)=@_;
+  my ($start,$end,$strand);
+  if (index($location,"complement") == -1) { # forward strand
+    $strand=1;
+  } else { # reverse strand
+    $location =~ s/complement\(//g;
+    $location =~ s/\)//g;
+    $strand=-1;
+  }
+  $location =~ s/\<//g;
+  $location =~ s/\>//g;
+  my @range=split(/\.\./,$location);
+  if (scalar(@range) == 1) { # special case of range with just one position (e.g. polyA_site EMBL features
+    $start=$end=$range[0];
+  } else {
+    if ($strand == 1) {
+      ($start,$end)=@range;
+    } else { # reverse strand
+      ($end,$start)=@range;
+    }
+  }
+  return ($start,$end,$strand);
+}
+
+# argument: location field of a CDS feature
+# returns: array of exons arrayref, useful to create LiveSeq Transcript obj
+sub cdslocation2transcript {
+  my ($location)=@_;
+  my @exonlocs;
+  my $exonloc;
+  my @exon;
+  my @transcript=();
+  $location =~ s/^join\(//;
+  $location =~ s/\)$//;
+  @exonlocs = split (/\,/,$location);
+  for $exonloc (@exonlocs) {
+    my @exon=location2range($exonloc);
+    push (@transcript,\@exon);
+  }
+  return (@transcript);
+}
+
+# argument: location field of a CDS feature
+# returns: array of exons arrayref, useful to create LiveSeq Transcript obj
+sub joinedlocation2range {
+  &cdslocation2transcript;
+}
+
+
+=head2 get_swisshash
+
+  Title   : get_swisshash
+  Usage   : $loader->get_swisshash();
+  Example : $swisshash=$loader->swissprot2hash("SWISS-PROT:P10275")
+  Function: retrieves with SRS a SwissProt entry, parses it and creates
+            a hash that contains all the information.
+  Returns : a reference to a hash
+  Errorcode: 0
+  Args    : the db_xref qualifier's value from an EMBL CDS Feature
+            i.e. a string in the form "SWISS-PROT:accession_number"
+  Note    : this can be modified (adding a second argument) to retrieve
+            and parse SWTREMBL, SWALL... entries
+
+=cut
+
+# argument: db_xref qualifier's value from EMBL CDS
+# errorcode: 0
+# returns hashref
+sub get_swisshash {
+  my ($self,$query)=@_;
+  if (index($query,"SWISS-PROT") == -1) {
+    return (0);
+  }
+  $query =~ s/SWISS-PROT/swissprot-acc/;
+  my $hashref;
+  eval {
+    $hashref=&swissprot2hash("$query");
+  };
+  my $errormsg;
+  if ($@) {
+    foreach $errormsg (split(/\n/,$@)) {
+      if (index($errormsg,"in cleanup") == -1) {
+	carp "SRS Swissprot Loader: $@";
+      }
+    }
+  }
+  unless ($hashref) {
+    return (0);
+  } else {
+    return ($hashref);
+  }
+}
+
+=head2 swissprot2hash
+
+  Title   : swissprot2hash
+  Usage   : $loader->swissprot2hash();
+  Example : $swisshash=$loader->swissprot2hash("swissprot-acc:P10275")
+  Function: retrieves with SRS a SwissProt entry, parses it and creates
+            a hash that contains all the information.
+  Returns : a reference to a hash
+  Errorcode: 0
+  Args    : an SRS query resulting in one entry from SwissProt database
+            i.e. a string in the form "swissprot-acc:accession_number"
+  Note    : this can be modified (adding a second argument) to retrieve
+            and parse SWTREMBL, SWALL... entries
+
+=cut
+
+# arguments: swissprot SRS query resulting in one fetched swissprot entry
+# returns: a reference to a hash containing the important features requested
+sub swissprot2hash {
+  my ($i,$j);
+  my $query=$_[0];
+  # SRS API used to fetch all relevant fields
+  my $sess = new Session;
+  my $set = $sess->query("[$query]", "");
+  my $numEntries = $set->size();
+  if ($numEntries < 1) {
+    carp "No sequence entry found for the query $query";
+    return (0);
+  } elsif ($numEntries > 1) {
+    carp "Too many entries found for the input given";
+    return (0);
+  } else {
+    my $entry = $set->getEntry(0);
+    my ($entry_ID,$entry_AccNumber,$entry_Molecule,$entry_Description,$entry_Organism,$entry_SeqLength,$entry_Sequence,$entry_Gene);
+    # Fetch what we can fetch without the loader
+    $entry_ID = $entry->fieldToString("id","","","");
+    $entry_AccNumber = $entry->fieldToString("acc","","","");
+    $entry_Gene = $entry->fieldToString("gen","","","");
+    $entry_Gene =~ s/\n/ /g;
+    $entry_Description = $entry->fieldToString("des","","","");
+    $entry_Description =~ s/\n/ /g;
+    $entry_Organism = $entry->fieldToString("org","","","");
+    chop $entry_Organism;
+    $entry_SeqLength = $entry->fieldToString("sl","","","");
+    # Now use the loader
+    my $loadedentry = $entry->load("Swissprot");
+    # Fetch the rest via the loader
+    $entry_Sequence = $loadedentry->attrStr("Sequence");
+    $entry_Sequence =~ s/\n//g; # from plain format to raw string
+
+    # put into the hash
+    my %entryhash;
+    $entryhash{ID}=$entry_ID;
+    $entryhash{AccNumber}=$entry_AccNumber;
+    $entryhash{Molecule}=$entry_Molecule;
+    $entryhash{Gene}=$entry_Gene;
+    $entryhash{Description}=$entry_Description;
+    $entryhash{Organism}=$entry_Organism;
+    $entryhash{Sequence}=$entry_Sequence;
+    $entryhash{SeqLength}=$entry_SeqLength;
+
+    # create features array
+    my $features = $loadedentry->attrObjList("Features");
+    my $featuresnumber= $features->size();
+    $entryhash{FeaturesNumber}=$featuresnumber;
+    my ($feature,$feature_name,$feature_description,$feature_location);
+    my @features;
+    for $i (0..$featuresnumber-1) {
+      my %feature;
+      $feature = $features->get($i);
+      $feature_name = $feature->attrStr("FtKey");
+      $feature_location = $feature->attrStr("FtLocation");
+      $feature_location =~ s/ +/ /g;
+      $feature_description = $feature->attrStr("FtDescription");
+      chop $feature_description;
+      $feature_description =~ s/\nFT                                / /g;
+
+      # Put things inside hash
+      $feature{position}=$i;
+      $feature{name}=$feature_name;
+      $feature{location}=$feature_location;
+      $feature{description}=$feature_description;
+
+      push (@features,\%feature); # array of features
+    }
+    $entryhash{Features}=\@features; # put this also into the hash
+    return (\%entryhash);
+  }
+}
+
+=head2 novelaasequence2gene
+
+  Title   : novelaasequence2gene
+  Usage   : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
+          : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
+                                             -genome => "Homo sapiens");
+          : $gene=Bio::LiveSeq::IO::SRS->novelaasequence2gene(-aasequence => "MGLAAPTRS*",
+                                             -genome => "Mitochondrion Homo sapiens",
+                                             -gene_name => "tyr-kinase");
+
+  Function: creates LiveSeq objects from a novel amino acid sequence,
+            using codon usage database to choose codons according to
+            relative frequencies.
+            If a genome latin name is not specified, the default is to use
+            'Homo sapiens' (taxonomy ID 9606).
+  Returns : reference to a Gene object containing references to LiveSeq objects
+  Errorcode 0
+  Args    : string containing an amino acid sequence
+            string (optional) with a species/genome latin name
+            string specifying a gene name
+  Note    : SRS access to TAXON and CODONUSAGE databases is required
+
+=cut
+
+sub novelaasequence2gene {
+  my ($self, %args) = @_;
+  my ($gene_name,$species_name,$aasequence)=($args{-gene_name},$args{-genome},$args{-aasequence});
+  unless ($aasequence) {
+    carp "aasequence not given";
+    return (0);
+  }
+  unless ($gene_name) {
+    $gene_name="Novel Unknown";
+  }
+  unless ($species_name) {
+    $species_name="Homo sapiens";
+  }
+  
+  my $sess = new Session;
+  my ($e,$numEntries,$set);
+
+  # codonusage lookup
+  my $codonusage_usage;
+  my @species_codon_usage;
+  $set = $sess->query("[codonusage:'$species_name']", "");
+  $numEntries = $set->size();
+  if ($numEntries > 0) {
+    $e = $set->getEntry(0);
+    $species_name = $e->fieldToString("id","","","");
+    $codonusage_usage = $e->fieldToString("usage","","","");
+    @species_codon_usage=split(/\s/,$codonusage_usage); # spaces or tabs
+    if ($numEntries > 1) {
+      carp "Search in Codon Usage DB resulted in $numEntries results. Taking the first one: $species_name";
+    }
+  } else {
+      carp "Genome not found in codon usage DB.";
+      return (0);
+  }  
+
+  # taxonomy lookup
+  my $mito_flag = 0;
+  my $species_origin;
+  if ($species_name =~ /^Mitochondrion /) {
+    $mito_flag = 1;
+    $species_origin = $species_name;
+    $species_origin =~ s/^Mitochondrion //;
+    $set = $sess->query("[taxonomy-species:'$species_origin']", "");
+  } elsif ($species_name =~ /^Chloroplast |^Kinetoplast |^Chromoplast /) {
+    $species_origin = $species_name;
+    $species_origin =~ s/^Chromoplast //;
+    $species_origin =~ s/^Kinetoplast //;
+    $species_origin =~ s/^Chloroplast //;
+    $set = $sess->query("[taxonomy-species:'$species_origin']", "");
+  } else {
+    $set = $sess->query("[taxonomy-species:'$species_name']", "");
+  }
+  $numEntries = $set->size();
+  my ($taxonomy_id,$taxonomy_gc,$taxonomy_mgc,$taxonomy_name);
+  if ($numEntries > 0) {
+    $e = $set->getEntry(0);
+    $taxonomy_id = $e->fieldToString("id","","","");
+    $taxonomy_name = $e->fieldToString("species","","","");
+    $taxonomy_gc = $e->fieldToString("gc","","","");
+    $taxonomy_mgc = $e->fieldToString("mgc","","","");
+    if ($numEntries > 1) {
+      carp "Note! More than one entry found in taxonomy DB for the genome query given. Using the first one: $taxonomy_name ($taxonomy_id)";
+    }
+  } else {
+    carp "Genome not found in taxonomy DB.";
+    return (0);
+  }
+  # Lookup appropriate translation table
+  my $ttabid;
+  if ($mito_flag) {
+    $ttabid = $taxonomy_mgc;
+  } else {
+    $ttabid = $taxonomy_gc;
+  }
+
+  my $gene=Bio::LiveSeq::IO::Loader::_common_novelaasequence2gene(\@species_codon_usage,$ttabid,$aasequence,$gene_name);
+  return ($gene);
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children