diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/CoordSystem.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/CoordSystem.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,566 @@
+ #
+# EnsEMBL module for Bio::EnsEMBL::Funcgen::CoordSystem
+#
+
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::CoordSystem
+
+=head1 SYNOPSIS
+
+  my $db = Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new(...);
+
+  my $csa = $db->get_CoordSystemAdaptor();
+
+  #
+  # Get default chromosome coord system for the 39_36a DB:
+  #
+  my $cs = $csa->fetch_by_name_schema_build_version('chromosome', '39_36a');
+  my $str = join ':', $cs->name(),$cs->version(),$cs->dbID();
+  print "$str\n";
+
+
+=head1 DESCRIPTION
+
+This has been adapted from the core CoordSystem object to accomodate the multi-assembly
+aspects of the eFG schema, namely hadnling the schema_build of the referenced core DB.
+
+This is a simple object which contains a few coordinate system attributes:
+name, internal identifier, version and schema_build.  A coordinate system is 
+uniquely defined by its name and version and which DB it came from i.e. schema_build.  
+A version of a coordinate system applies to all sequences within a coordinate system.  
+This should not be confused with individual sequence versions.
+
+Take for example the Human assembly.  The version 'NCBI33' applies to
+to all chromosomes in the NCBI33 assembly (that is the entire 'chromosome'
+coordinate system).  The 'clone' coordinate system in the same database would
+have no version however.  Although the clone sequences have their own sequence
+versions, there is no version which applies to the entire set of clones.
+
+Coordinate system objects are immutable. Their name and version, and other
+attributes may not be altered after they are created.
+
+=cut
+
+
+use strict;
+use warnings;
+
+package Bio::EnsEMBL::Funcgen::CoordSystem;
+
+use Bio::EnsEMBL::Storable;
+
+use Bio::EnsEMBL::Utils::Argument  qw(rearrange);
+use Bio::EnsEMBL::Utils::Exception qw(throw);
+
+use vars qw(@ISA);
+
+@ISA = qw(Bio::EnsEMBL::Storable);
+
+my %warnings;
+
+
+=head2 new
+
+  Arg [..]   : List of named arguments:
+               -NAME      - The name of the coordinate system
+               -VERSION   - (optional) The version of the coordinate system.
+                            Note that if the version passed in is undefined,
+                            it will be set to the empty string in the
+                            resulting CoordSystem object.
+               -RANK      - The rank of the coordinate system. The highest
+                            level coordinate system should have rank 1, the
+                            second highest rank 2 and so on.  An example of
+                            a high level coordinate system is 'chromosome' an
+                            example of a lower level coordinate system is
+                            'clone'.
+               -SCHEMA_BUILD - The schema and data build version of the DB of
+                               origin.
+               -TOP_LEVEL - (optional) Sets whether this is a top-level coord
+                            system. Default = 0. This should only be set to
+                            true if you are creating an artificial toplevel
+                            coordsystem by the name of 'toplevel'
+               -SEQUENCE_LEVEL - (optional) Sets whether this is a sequence
+                                 level coordinate system. Default = 0
+               -DEFAULT      - (optional)
+                               Whether this is the default version of the 
+                               coordinate systems of this name. Default = 0
+               -DBID         - (optional) The internal identifier of this
+                                coordinate system
+               -ADAPTOR      - (optional) The adaptor which provides database
+                               interaction for this object
+  Example    : $cs = Bio::EnsEMBL::CoordSystem->new(-NAME    => 'chromosome',
+                                                    -VERSION => 'NCBI33',
+                                                    -RANK    => 1,
+                                                    -DBID    => 1,
+                                                    -SCHEMA_BUILD => '39_36a',
+                                                    -ADAPTOR => adaptor,
+                                                    -DEFAULT => 1,
+                                                    -SEQUENCE_LEVEL => 0);
+  Description: Creates a new CoordSystem object representing a coordinate
+               system.
+  Returntype : Bio::EnsEMBL::Funcgen::CoordSystem
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+
+  my $self = $class->SUPER::new(@_);
+
+
+  #Can we just hadnle schema_build here and call super->new for the rest.
+  #We will also have to handle the top/default levels issues with multiple DBs
+
+
+  #my ($name, $version, $sbuild, $top_level, $sequence_level, $default, $rank) =
+  #  rearrange(['NAME','VERSION', 'SCHEMA_BUILD','TOP_LEVEL', 'SEQUENCE_LEVEL',
+  #             'DEFAULT', 'RANK'], @_);
+
+  my ($name, $version) =  rearrange(['NAME','VERSION'], @_);
+
+
+  throw('A name argument is required') if(! $name);
+  
+
+  $version = '' if(!defined($version));
+  
+
+  #$top_level       = ($top_level)      ? 1 : 0;
+  #$sequence_level  = ($sequence_level) ? 1 : 0;
+  #$default         = ($default)        ? 1 : 0;
+  #$rank ||= 0;
+
+  #if($top_level) {
+  #  if($rank) {
+  #    throw('RANK argument must be 0 if TOP_LEVEL is 1');
+  #  }
+
+  #  if($name) {
+  #    if($name ne 'toplevel') {
+  #      throw('The NAME argument must be "toplevel" if TOP_LEVEL is 1')
+  #    }
+  #  } else {
+  #    $name = 'toplevel';
+  #  }
+
+  #  if($sequence_level) {
+  #    throw("SEQUENCE_LEVEL argument must be 0 if TOP_LEVEL is 1");
+  #  }
+
+  #  $default = 0;
+
+#  } else {
+#    if(!$rank) {
+#      throw("RANK argument must be non-zero if not toplevel CoordSystem");
+#    }
+#    if($name eq 'toplevel') {
+#      throw("Cannot name coord system 'toplevel' unless TOP_LEVEL is 1");
+#    }
+#  }
+
+#  if($rank !~ /^\d+$/) {
+#    throw('The RANK argument must be a positive integer');
+#  }
+
+
+  $self->{'core_cache'} = {};
+  $self->{'version'} = $version;
+  $self->{'name'} = $name;
+  #$self->{'schema_build'} = $sbuild;
+  #$self->{'top_level'} = $top_level;
+  #$self->{'sequence_level'} = $sequence_level;
+  #$self->{'default'} = $default;
+  #$self->{'rank'}    = $rank;
+
+
+
+
+  return $self;
+}
+
+
+=head2 add_core_coord_system_info
+
+  Arg [1]    : mandatory hash:
+                              
+							 	-RANK                 => $rank,
+								-SEQUENCE_LEVEL              => $seq_lvl,
+								-DEFAULT          => $default,
+								-SCHEMA_BUILD         => $sbuild,
+								-CORE_COORD_SYSTEM_ID => $ccs_id,
+								-IS_STORED            => $stored_status,
+							  
+  Example    : $cs->add_core_coord_system_info(
+									-RANK                 => $rank,
+									-SEQUENCE_LEVEL              => $seq_lvl,
+									-DEFAULT          => $default,
+									-SCHEMA_BUILD         => $sbuild,
+									-CORE_COORD_SYSTEM_ID => $ccs_id,
+									-IS_STORED            => 1,
+									);
+
+  Description: Setter for core coord system information
+  Returntype : none
+  Exceptions : throws if:
+               rank not 0 when toplevel
+               name not 'TOPLEVEL" when toplevel
+               sequence level and top level
+               no schema_build defined
+               no rank
+               rank 0 when not toplevel
+               name 'TOPLEVEL' when not toplevel
+               
+  Caller     : Bio::EnsEMBL::Funcgen::DBSQL::CoordSystemAdaptor and ?
+  Status     : at risk - replace with add_core_CoordSystem? implement top level?
+
+#this does not check name and version!
+
+
+=cut
+
+sub add_core_coord_system_info {
+  my ($self) = shift;
+
+  my ($sbuild, $top_level, $sequence_level, $default, $rank, $stored, $ccs_id) =
+    rearrange(['SCHEMA_BUILD','TOP_LEVEL', 'SEQUENCE_LEVEL',
+               'DEFAULT', 'RANK', 'IS_STORED', 'CORE_COORD_SYSTEM_ID'], @_);
+
+
+  throw('Must provide a schema_build') if ! $sbuild;
+  throw('Must provide a core_coord_system_id') if ! $ccs_id;
+
+  
+ #$top_level       = ($top_level)      ? 1 : 0;
+  $sequence_level  = ($sequence_level) ? 1 : 0;
+  $default         = ($default)        ? 1 : 0;
+  $stored ||=0;
+
+  $rank ||= 0;
+
+  if($top_level) {
+    if($rank) {
+      throw('RANK argument must be 0 if TOP_LEVEL is 1');
+    }
+
+    if($self->name()) {
+      if($self->name() ne 'toplevel') {
+        throw('The NAME argument must be "toplevel" if TOP_LEVEL is 1')
+      }
+    } else {
+	  throw('toplevel not yet implemented');
+      #$name = 'toplevel';
+    }
+
+    if($sequence_level) {
+      throw("SEQUENCE_LEVEL argument must be 0 if TOP_LEVEL is 1");
+    }
+
+    $default = 0;
+
+  } else {
+    if(!$rank) {
+      throw("RANK argument must be non-zero if not toplevel CoordSystem");
+    }
+    if($self->name() eq 'toplevel') {
+      throw("Cannot name coord system 'toplevel' unless TOP_LEVEL is 1");
+    }
+  }
+
+  if($rank !~ /^\d+$/) {
+    throw('The RANK argument must be a positive integer');
+  }
+
+
+  #We can add unstored coord systems here
+  #But will these ever have valid entries in the seq_region cache
+  #Initialising this cache key turning off the warning in equals about
+  #Using the nearest coord_system
+
+  $self->{'core_cache'}{$sbuild} = {(
+									 RANK                 => $rank,
+									 SEQUENCE_LEVEL       => $sequence_level,
+									 DEFAULT              => $default,
+									 CORE_COORD_SYSTEM_ID => $ccs_id,
+									 IS_STORED            => $stored,
+									)};
+
+
+
+
+  return;
+}
+
+
+#remove all but schema_buil and equals?
+#depends on how we handle levels
+
+=head2 name
+
+  Arg [1]    : (optional) string $name
+  Example    : print $coord_system->name();
+  Description: Getter for the name of this coordinate system
+  Returntype : string
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub name {
+  my $self = shift;
+  return $self->{'name'};
+}
+
+
+=head2 get_latest_schema_build
+
+  Example    : my $db_schema_build = $coord_system->get_latest_schema_build();
+  Description: Getter for the most recent schema_build of this coordinate system
+  Returntype : string
+  Exceptions : none
+  Caller     : general
+  Status     : at risk
+
+=cut
+
+sub get_latest_schema_build {
+  my $self = shift;
+
+  return (sort (keys %{$self->{'core_cache'}}))[0];
+}
+
+
+=head2 contains_schema_build
+
+  Example    : if ($coord_system->contains_schema_build('43_36e')){..do some coord system things ..};
+  Description: Returns true is the CoordSystem maps to the corresponding core CoordSystem
+  Returntype : Boolean
+  Exceptions : throws if schema_build not defined
+  Caller     : general
+  Status     : at risk
+
+=cut
+
+sub contains_schema_build {
+  my ($self, $schema_build) = @_;
+  
+  throw('Must pass a schema_build') if ! $schema_build;
+
+  return (exists $self->{'core_cache'}{$schema_build}) ? 1 : 0;
+}
+
+=head2 version
+
+  Arg [1]    : none
+  Example    : print $coord->version();
+  Description: Getter/Setter for the version of this coordinate system.  This
+               will return an empty string if no version is defined for this
+               coordinate system.
+  Returntype : string
+  Exceptions : none
+  Caller     : general
+  Status     : Stable
+
+=cut
+
+sub version {
+  my $self = shift;
+
+  return $self->{'version'};
+}
+
+
+
+
+=head2 equals
+
+  Arg [1]    : Bio::EnsEMBL::Funcgen::CoordSystem $cs
+               The coord system to compare to for equality.
+  Example    : if($coord_sys->equals($other_coord_sys)) { ... }
+  Description: Compares 2 coordinate systems and returns true if they are
+               equivalent.  The definition of equivalent is sharing the same
+               name and version.
+  Returntype : string
+  Exceptions : none
+  Caller     : general
+  Status     : At risk
+
+=cut
+
+sub equals {
+  my $self = shift;
+  my $cs = shift;
+
+  if(!$cs || !ref($cs) || 
+	 (! $cs->isa('Bio::EnsEMBL::Funcgen::CoordSystem')  && 
+	  ! $cs->isa('Bio::EnsEMBL::CoordSystem'))){
+	throw('Argument must be a Bio::EnsEMBL::Funcgen::CoordSystem');
+  }
+  
+  
+  #need to add check on schema_build here
+  #all schema_builds should have been added by BaseFeatureAdaptor during import
+  #fails if we are using two different versions with the same cs's
+  
+  if(($self->version() eq $cs->version()) &&
+	 ($self->name() eq $cs->name())){
+
+	#we need to make sure these are default CS, otherwise we can get into trouble with
+	#re-used or mismatched seq_region_ids between DB wih different default assemblies
+
+	if (! $self->contains_schema_build($self->adaptor->db->_get_schema_build($cs->adaptor()))) {
+
+	  #Only warn first time this is seen
+	  my $warning_key = $self->adaptor->db->_get_schema_build($cs->adaptor()).':'.$self->name().':'.$self->version;
+
+	  if(! exists $warnings{$warning_key}){
+		warn 'You are using a schema_build('.$self->adaptor->db->_get_schema_build($cs->adaptor()).') which has no CoordSystem stored for '.$cs->version.". Defaulting to closest name version match.\n";
+		$warnings{$warning_key} = 1;
+	  }
+	}
+    return 1;
+  }
+
+  return 0;
+}
+
+
+
+
+=head2 is_top_level
+
+  Arg [1]    : none
+  Example    : if($coord_sys->is_top_level()) { ... }
+  Description: Returns true if this is the toplevel pseudo coordinate system.
+               The toplevel coordinate system is not a real coordinate system
+               which is stored in the database, but it is a placeholder that
+               can be used to request transformations or retrievals to/from
+               the highest defined coordinate system in a given region.
+  Returntype : 0 or 1
+  Exceptions : none
+  Caller     : general
+  Status     : at risk - not implemented yet
+
+=cut
+
+sub is_top_level {
+  my $self = shift;
+
+  throw('Not yet implmented, need to test against the core cache using dnadb/schema_build');
+
+  return $self->{'top_level'};
+}
+
+
+#These attribute methods are largely redundant
+#is_default is used by Feature Adaptors to restrict features to
+#current default assembly for non slice based methods
+#Especially redundant now we have implemented this in fetch_all
+
+=head2 is_sequence_level
+
+  Arg [1]    : Bio::EnsEMBL::DBSQL::DBAdaptor
+  Example    : if($coord_sys->is_sequence_level($dnadb)) { ... }
+  Description: Returns true if this is a sequence level coordinate system
+               for a given dnadb
+  Returntype : 0 or 1
+  Exceptions : none
+  Caller     : general
+  Status     : at risk
+
+=cut
+
+sub is_sequence_level {
+  my ($self, $dnadb) = @_;
+
+  return $self->get_coord_system_attribute('sequence_level', $dnadb);
+}
+
+
+=head2 is_default
+
+  Arg [1]    : Bio::EnsEMBL::DBSQL::DBAdaptor
+  Example    : if($coord_sys->is_default($dnadb)) { ... }
+  Description: Returns true if this coordinate system is the default
+               version of the coordinate system of this name for a given dnadb.
+  Returntype : 0 or 1
+  Exceptions : none
+  Caller     : general - Used 
+  Status     : at risk
+
+=cut
+
+sub is_default {
+  my ($self, $dnadb) = @_;
+
+  return $self->get_coord_system_attribute('default', $dnadb);
+}
+
+sub get_coord_system_attribute{
+  my($self, $attr_name, $dnadb) = @_;
+  
+  if(! ($dnadb && ref($dnadb) && $dnadb->isa('Bio::EnsEMBL::DBSQL::DBAdaptor'))){
+	throw("You must pass a dnadb to access the CoordSystem attribute:\t $attr_name");
+  }
+  
+  my $schema_build = $self->adaptor->db->_get_schema_build($dnadb);
+  
+  if(! $self->contains_schema_build($schema_build)){
+	throw("CoordSystem does not contain the schema_build:\t$schema_build");
+  }
+
+  return $self->{'core_cache'}{$schema_build}{uc($attr_name)};
+
+}
+
+
+=head2 rank
+
+  Arg [1]    : Bio::EnsEMBL::DBSQL::DBAdaptor
+  Example    : if($cs1->rank($dnadb) < $cs2->rank($dnadb)) {
+                 print $cs1->name(), " is a higher level coord system than",
+                       $cs2->name(), "\n";
+               }
+  Description: Returns the rank of this coordinate system for a given dnadb. 
+               A lower number is a higher coordinate system.  The highest level coordinate
+               system has a rank of 1 (e.g. 'chromosome').  The toplevel
+               pseudo coordinate system has a rank of 0.
+  Returntype : int
+  Exceptions : none
+  Caller     : general
+  Status     : at risk - not yet implemented
+
+=cut
+
+sub rank {
+  my ($self, $dnadb) = @_;
+  return $self->get_coord_system_attribute('rank', $dnadb);
+
+}
+
+1;