Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,2272 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SliceAdaptor - A database aware adaptor responsible for +the creation of Slice objects. + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Slice qw(split_Slices); + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $slice_adaptor = + Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "slice" ); + + # get a slice on the entire chromosome X + $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X' ); + + # get a slice for each clone in the database + foreach $cln_slice ( @{ $slice_adaptor->fetch_all('clone') } ) { + # do something with clone + } + + # get a slice which is part of NT_004321 + $spctg_slice = + $slice_adaptor->fetch_by_region( 'supercontig', 'NT_004321', + 200_000, 600_000 ); + + # get all non-redundant slices from the highest possible coordinate + # systems + $slices = $slice_adaptor->fetch_all('toplevel'); + + # include non-reference regions + $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 1 ); + + # include non-duplicate regions + $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 0, 1 ); + + # split up a list of slices into smaller slices + $overlap = 1000; + $max_length = 1e6; + $slices = split_Slices( $slices, $max_length, $overlap ); + + # store a list of slice names in a file + open( FILE, ">$filename" ) or die("Could not open file $filename"); + foreach my $slice (@$slices) { + print FILE $slice->name(), "\n"; + } + close FILE; + + # retreive a list of slices from a file + open( FILE, $filename ) or die("Could not open file $filename"); + while ( $name = <FILE> ) { + chomp($name); + $slice = $slice_adaptor->fetch_by_name($name); + # do something with slice + } + +=head1 DESCRIPTION + +This module is responsible for fetching Slices representing genomic +regions from a database. A Details on how slices can be used are in the +Bio::EnsEMBL::Slice module. + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::DBSQL::SliceAdaptor; +use vars qw(@ISA); +use strict; + + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::CircularSlice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::LRGSlice; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); +use Scalar::Util qw/looks_like_number/; + +@ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + # use a cache which is shared and also used by the assembly + # mapper adaptor + + my $seq_region_cache = $self->db->get_SeqRegionCache(); + + $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'}; + $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'}; + + $self->{'lrg_region_test'} = undef; + my $meta_container = $self->db->get_MetaContainer(); + my @values = $meta_container->list_value_by_key("LRG"); + if(scalar(@values) and $values[0]->[0]){ + $self->{'lrg_region_test'} = $values[0]->[0]; + } + return $self; +} + + +=head2 fetch_by_region + + Arg [1] : string $coord_system_name (optional) + The name of the coordinate system of the slice to be created + This may be a name of an actual coordinate system or an alias + to a coordinate system. Valid aliases are 'seqlevel' or + 'toplevel'. + Arg [2] : string $seq_region_name + The name of the sequence region that the slice will be + created on. + Arg [3] : int $start (optional, default = 1) + The start of the slice on the sequence region + Arg [4] : int $end (optional, default = seq_region length) + The end of the slice on the sequence region + Arg [5] : int $strand (optional, default = 1) + The orientation of the slice on the sequence region + Arg [6] : string $version (optional, default = default version) + The version of the coordinate system to use (e.g. NCBI33) + Arg [7] : boolean $no_fuzz (optional, default = undef (false)) + If true (non-zero), do not use "fuzzy matching" (see below). + Example : $slice = $slice_adaptor->fetch_by_region('chromosome', 'X'); + $slice = $slice_adaptor->fetch_by_region('clone', 'AC008066.4'); + Description: Retrieves a slice on the requested region. At a minimum the + name the name of the seq_region to fetch must be provided. + + If no coordinate system name is provided than a slice on the + highest ranked coordinate system with a matching + seq_region_name will be returned. If a version but no + coordinate system name is provided, the same behaviour will + apply, but only coordinate systems of the appropriate version + are considered. The same applies if the 'toplevel' coordinate + system is specified, however in this case the version is + ignored. The coordinate system should always be specified if + it is known, since this is unambiguous and faster. + + Some fuzzy matching is performed if no exact match for + the provided name is found. This allows clones to be + fetched even when their version is not known. For + example fetch_by_region('clone', 'AC008066') will + retrieve the sequence_region with name 'AC008066.4'. + + The fuzzy matching can be turned off by setting the + $no_fuzz argument to a true value. + + If the requested seq_region is not found in the database undef + is returned. + + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : throw if no seq_region_name is provided + throw if invalid coord_system_name is provided + throw if start > end is provided + Caller : general + Status : Stable + +=cut + + +# +# ARNE: This subroutine needs simplification!! +# +sub fetch_by_region { + my ( $self, $coord_system_name, $seq_region_name, $start, $end, + $strand, $version, $no_fuzz ) + = @_; + + if ( !defined($start) ) { $start = 1 } + if ( !defined($strand) ) { $strand = 1 } + + if ( !defined($seq_region_name) ) { + throw('seq_region_name argument is required'); + } + + my $cs; + my $csa = $self->db->get_CoordSystemAdaptor(); + + if ( defined($coord_system_name) ) { + $cs = $csa->fetch_by_name( $coord_system_name, $version ); + + ## REMOVE THESE THREE LINES WHEN STICKLEBACK DB IS FIXED! + ## Anne/ap5 (2007-10-09): + # The problem was that the stickleback genebuild called the + # chromosomes 'groups', which meant they weren't being picked out by + # the karyotype drawing code. Apparently they are usually called + # 'groups' in the stickleback community, even though they really are + # chromosomes! + + if ( !defined($cs) && $coord_system_name eq 'chromosome' ) { + $cs = $csa->fetch_by_name( 'group', $version ); + } + + if ( !defined($cs) ) { + throw( sprintf( "Unknown coordinate system:\n" + . "name='%s' version='%s'\n", + $coord_system_name, $version ) ); + } + + # fetching by toplevel is same as fetching w/o name or version + if ( $cs->is_top_level() ) { + $cs = undef; + $version = undef; + } + + } ## end if ( defined($coord_system_name...)) + + my $constraint; + my $sql; + my @bind_params; + my $key; + + if ( defined($cs) ) { + $sql = sprintf( "SELECT sr.name, sr.seq_region_id, sr.length, %d " + . "FROM seq_region sr ", + $cs->dbID() ); + + $constraint = "AND sr.coord_system_id = ?"; + push( @bind_params, [ $cs->dbID(), SQL_INTEGER ] ); + + $key = "$seq_region_name:" . $cs->dbID(); + } else { + $sql = + "SELECT sr.name, sr.seq_region_id, sr.length, cs.coord_system_id " + . "FROM seq_region sr, coord_system cs "; + + $constraint = "AND sr.coord_system_id = cs.coord_system_id " + . "AND cs.species_id = ? "; + push( @bind_params, [ $self->species_id(), SQL_INTEGER ] ); + + if ( defined($version) ) { + $constraint .= "AND cs.version = ? "; + push( @bind_params, [ $version, SQL_VARCHAR ] ); + } + + $constraint .= "ORDER BY cs.rank ASC"; + } + + # check the cache so we only go to the db if necessary + my $length; + my $arr; + + if ( defined($key) ) { $arr = $self->{'sr_name_cache'}->{$key} } + + if ( defined($arr) ) { + $length = $arr->[3]; + } else { + my $sth = + $self->prepare( $sql . "WHERE sr.name = ? " . $constraint ); + + unshift( @bind_params, [ $seq_region_name, SQL_VARCHAR ] ); + + my $pos = 0; + foreach my $param (@bind_params) { + $sth->bind_param( ++$pos, $param->[0], $param->[1] ); + } + + $sth->execute(); + + if ( $sth->rows() == 0 ) { + $sth->finish(); + + + # try synonyms + my $syn_sql_sth = $self->prepare("select s.name from seq_region s, seq_region_synonym ss where s.seq_region_id = ss.seq_region_id and ss.synonym = ?"); + $syn_sql_sth->bind_param(1, $seq_region_name, SQL_VARCHAR); + $syn_sql_sth->execute(); + my $new_name; + $syn_sql_sth->bind_columns( \$new_name); + if($syn_sql_sth->fetch){ + $syn_sql_sth->finish; + return $self->fetch_by_region($coord_system_name, $new_name, $start, $end, $strand, $version, $no_fuzz); + } + $syn_sql_sth->finish; + + + if ($no_fuzz) { return undef } + + # Do fuzzy matching, assuming that we are just missing a version + # on the end of the seq_region name. + + $sth = + $self->prepare( $sql . " WHERE sr.name LIKE ? " . $constraint ); + + $bind_params[0] = + [ sprintf( '%s.%%', $seq_region_name ), SQL_VARCHAR ]; + + $pos = 0; + foreach my $param (@bind_params) { + $sth->bind_param( ++$pos, $param->[0], $param->[1] ); + } + + $sth->execute(); + + my $prefix_len = length($seq_region_name) + 1; + my $high_ver = undef; + my $high_cs = $cs; + + # Find the fuzzy-matched seq_region with the highest postfix + # (which ought to be a version). + + my ( $tmp_name, $id, $tmp_length, $cs_id ); + $sth->bind_columns( \( $tmp_name, $id, $tmp_length, $cs_id ) ); + + my $i = 0; + + while ( $sth->fetch ) { + my $tmp_cs = + ( defined($cs) ? $cs : $csa->fetch_by_dbID($cs_id) ); + + # cache values for future reference + my $arr = [ $id, $tmp_name, $cs_id, $tmp_length ]; + $self->{'sr_name_cache'}->{"$tmp_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$id"} = $arr; + + my $tmp_ver = substr( $tmp_name, $prefix_len ); + + # skip versions which are non-numeric and apparently not + # versions + if ( $tmp_ver !~ /^\d+$/ ) { next } + + # take version with highest num, if two versions match take one + # with highest ranked coord system (lowest num) + if ( !defined($high_ver) + || $tmp_ver > $high_ver + || ( $tmp_ver == $high_ver && $tmp_cs->rank < $high_cs->rank ) + ) + { + $seq_region_name = $tmp_name; + $length = $tmp_length; + $high_ver = $tmp_ver; + $high_cs = $tmp_cs; + } + + $i++; + } ## end while ( $sth->fetch ) + $sth->finish(); + + # warn if fuzzy matching found more than one result + if ( $i > 1 ) { + warning( + sprintf( + "Fuzzy matching of seq_region_name " + . "returned more than one result.\n" + . "You might want to check whether the returned seq_region\n" + . "(%s:%s) is the one you intended to fetch.\n", + $high_cs->name(), $seq_region_name ) ); + } + + $cs = $high_cs; + + # return if we did not find any appropriate match: + if ( !defined($high_ver) ) { return undef } + + } else { + + my ( $id, $cs_id ); + ( $seq_region_name, $id, $length, $cs_id ) = + $sth->fetchrow_array(); + $sth->finish(); + + # cache to speed up for future queries + my $arr = [ $id, $seq_region_name, $cs_id, $length ]; + $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$id"} = $arr; + $cs = $csa->fetch_by_dbID($cs_id); + } + } ## end else [ if ( defined($arr) ) ] + + if ( !defined($end) ) { $end = $length } + + #If this was given then check if we've got a circular seq region otherwise + #let it fall through to the normal Slice method + if ( $end + 1 < $start ) { + my $cs_id = $cs->dbID(); + my $seq_region_id = $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"}->[0]; + if($self->is_circular($seq_region_id)) { + my $new_sl = + Bio::EnsEMBL::CircularSlice->new( + -COORD_SYSTEM => $cs, + -SEQ_REGION_NAME => $seq_region_name, + -SEQ_REGION_LENGTH => $length, + -START => $start, + -END => $end, + -STRAND => 1, + -ADAPTOR => $self ); + + return $new_sl; + } + } + + if ( defined( $self->{'lrg_region_test'} ) + and substr( $cs->name, 0, 3 ) eq $self->{'lrg_region_test'} ) + { + return + Bio::EnsEMBL::LRGSlice->new( -COORD_SYSTEM => $cs, + -SEQ_REGION_NAME => $seq_region_name, + -SEQ_REGION_LENGTH => $length, + -START => $start, + -END => $end, + -STRAND => $strand, + -ADAPTOR => $self ); + } else { + return + Bio::EnsEMBL::Slice->new_fast( { + 'coord_system' => $cs, + 'seq_region_name' => $seq_region_name, + 'seq_region_length' => $length, + 'start' => $start, + 'end' => $end, + 'strand' => $strand, + 'adaptor' => $self } ); + } +} ## end sub fetch_by_region + +=head2 fetch_by_toplevel_location + + Arg [1] : string $location + Ensembl formatted location. Can be a format like + C<name:start-end>, C<name:start..end>, C<name:start:end>, + C<name:start>, C<name>. We can also support strand + specification as a +/- or 1/-1. + + Location names must be separated by a C<:>. All others can be + separated by C<..>, C<:> or C<->. + Arg[2] : boolean $no_warnings + Suppress warnings from this method + Example : my $slice = $sa->fetch_by_toplevel_location('X:1-10000') + my $slice = $sa->fetch_by_toplevel_location('X:1-10000:-1') + Description : Converts an Ensembl location/region into the sequence region + name, start and end and passes them onto C<fetch_by_region()>. + The code assumes that the required slice is on the top level + coordinate system. The code assumes that location formatting + is not perfect and will perform basic cleanup before parsing. + Returntype : Bio::EnsEMBL::Slice + Exceptions : If $location is false otherwise see C<fetch_by_region()> + Caller : General + Status : Beta + +=cut + +sub fetch_by_toplevel_location { + my ($self, $location, $no_warnings) = @_; + + my ($seq_region_name, $start, $end, $strand) = $self->parse_location_to_values($location, $no_warnings); + + if(! $seq_region_name) { + return; + } + + if(defined $start && defined $end && $start > $end) { + throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end"; + } + + my $coord_system_name = 'toplevel'; + my $slice = $self->fetch_by_region($coord_system_name, $seq_region_name, $start, $end, $strand, undef, 0); + return unless $slice; + + my $srl = $slice->seq_region_length(); + my $name = $slice->seq_region_name(); + if(defined $start && $start > $srl) { + throw "Cannot request a slice whose start ($start) is greater than $srl for $name."; + } + if(defined $end && $end > $srl) { + warning "Requested end ($end) is greater than $srl for $name. Resetting to $srl" if ! $no_warnings; + $slice->{end} = $srl; + } + + return $slice; +} + +=head2 parse_location_to_values + + Arg [1] : string $location + Ensembl formatted location. Can be a format like + C<name:start-end>, C<name:start..end>, C<name:start:end>, + C<name:start>, C<name>. We can also support strand + specification as a +/- or 1/-1. + + Location names must be separated by a C<:>. All others can be + separated by C<..>, C<:> or C<->. + Arg[2] : boolean $no_warnings + Suppress warnings from this method + Example : my ($name, $start, $end, $strand) = $sa->parse_location_to_values('X:1..100:1); + Description : Takes in an Ensembl location String and returns the parsed + values + Returntype : List. Contains name, start, end and strand + +=cut + + +sub parse_location_to_values { + my ($self, $location, $no_warnings, $no_errors) = @_; + + throw 'You must specify a location' if ! $location; + + #cleanup any nomenclature like 1_000 or 1 000 or 1,000 + my $number_seps_regex = qr/\s+|,|_/; + my $separator_regex = qr/(?:-|[.]{2}|\:)?/; + my $number_regex = qr/[0-9,_ E]+/xms; + my $strand_regex = qr/[+-1]|-1/xms; + + my $regex = qr/^(\w+) \s* :? \s* ($number_regex)? $separator_regex ($number_regex)? $separator_regex ($strand_regex)? $/xms; + my ($seq_region_name, $start, $end, $strand); + if(($seq_region_name, $start, $end, $strand) = $location =~ $regex) { + + if(defined $strand) { + if(!looks_like_number($strand)) { + $strand = ($strand eq '+') ? 1 : -1; + } + } + + if(defined $start) { + $start =~ s/$number_seps_regex//g; + if($start < 1) { + warning "Start was less than 1 (${start}) which is not allowed. Resetting to 1" if ! $no_warnings; + $start = 1; + } + } + if(defined $end) { + $end =~ s/$number_seps_regex//g; + if($end < 1) { + throw "Cannot request negative or 0 end indexes through this interface. Given $end but expected something greater than 0" unless $no_errors; + } + } + + if(defined $start && defined $end && $start > $end) { + throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end" unless $no_errors; + } + } + + return ($seq_region_name, $start, $end, $strand); +} + +=head2 fetch_by_region_unique + + Arg [1] : string $coord_system_name (optional) + The name of the coordinate system of the slice to be created + This may be a name of an actual coordinate system or an alias + to a coordinate system. Valid aliases are 'seqlevel' or + 'toplevel'. + Arg [2] : string $seq_region_name + The name of the sequence region that the slice will be + created on. + Arg [3] : int $start (optional, default = 1) + The start of the slice on the sequence region + Arg [4] : int $end (optional, default = seq_region length) + The end of the slice on the sequence region + Arg [5] : int $strand (optional, default = 1) + The orientation of the slice on the sequence region + Arg [6] : string $version (optional, default = default version) + The version of the coordinate system to use (e.g. NCBI33) + Arg [7] : boolean $no_fuzz (optional, default = undef (false)) + If true (non-zero), do not use "fuzzy matching" (see below). + Example : $slice = $slice_adaptor->fetch_by_region_unique('chromosome', 'HSCHR6_MHC_COX'); + Description: Retrieves a slice on the requested region but returns only the unique + parts of the slice. At a minimum the + name the name of the seq_region to fetch must be provided. + + If no coordinate system name is provided than a slice on the + highest ranked coordinate system with a matching + seq_region_name will be returned. If a version but no + coordinate system name is provided, the same behaviour will + apply, but only coordinate systems of the appropriate version + are considered. The same applies if the 'toplevel' coordinate + system is specified, however in this case the version is + ignored. The coordinate system should always be specified if + it is known, since this is unambiguous and faster. + + Some fuzzy matching is performed if no exact match for + the provided name is found. This allows clones to be + fetched even when their version is not known. For + example fetch_by_region('clone', 'AC008066') will + retrieve the sequence_region with name 'AC008066.4'. + + The fuzzy matching can be turned off by setting the + $no_fuzz argument to a true value. + + If the requested seq_region is not found in the database undef + is returned. + + Returntype : listref Bio::EnsEMBL::Slice + Exceptions : throw if no seq_region_name is provided + throw if invalid coord_system_name is provided + throw if start > end is provided + Caller : general + Status : Stable + +=cut + +sub fetch_by_region_unique { + my $self = shift; + + my @out = (); + my $slice = $self->fetch_by_region(@_); + + + if ( !exists( $self->{'asm_exc_cache'} ) ) { + $self->_build_exception_cache(); + } + + if ( exists( + $self->{'asm_exc_cache'}->{ $self->get_seq_region_id($slice) } + ) ) + { + # Dereference symlinked assembly regions. Take out any regions + # which are symlinked because these are duplicates. + my @projection = + @{ $self->fetch_normalized_slice_projection($slice) }; + + foreach my $segment (@projection) { + if ( $segment->[2]->seq_region_name() eq $slice->seq_region_name() + && $segment->[2]->coord_system->equals( $slice->coord_system ) ) + { + push( @out, $segment->[2] ); + } + } + } + + return \@out; +} ## end sub fetch_by_region_unique + +=head2 fetch_by_name + + Arg [1] : string $name + Example : $name = 'chromosome:NCBI34:X:1000000:2000000:1'; + $slice = $slice_adaptor->fetch_by_name($name); + $slice2 = $slice_adaptor->fetch_by_name($slice3->name()); + Description: Fetches a slice using a slice name (i.e. the value returned by + the Slice::name method). This is useful if you wish to + store a unique identifier for a slice in a file or database or + pass a slice over a network. + Slice::name allows you to serialise/marshall a slice and this + method allows you to deserialise/unmarshal it. + + Returns undef if no seq_region with the provided name exists in + the database. + + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : throw if incorrent arg provided + Caller : Pipeline + Status : Stable + +=cut + +sub fetch_by_name { + my $self = shift; + my $name = shift; + + if(!$name) { + throw("name argument is required"); + } + + my @array = split(/:/,$name); + + if(scalar(@array) < 3 || scalar(@array) > 6) { + throw("Malformed slice name [$name]. Format is " . + "coord_system:version:name:start:end:strand"); + } + + # Rearrange arguments to suit fetch_by_region + + my @targetarray; + + $targetarray[0]=$array[0]; + $targetarray[5]=(($array[1]&&$array[1] ne "")?$array[1]:undef); + $targetarray[1]=(($array[2]&&$array[2] ne "")?$array[2]:undef); + $targetarray[2]=(($array[3]&&$array[3] ne "")?$array[3]:undef); + $targetarray[3]=(($array[4]&&$array[4] ne "")?$array[4]:undef); + $targetarray[4]=(($array[5]&&$array[5] ne "")?$array[5]:undef); + return $self->fetch_by_region(@targetarray); +} + + + +=head2 fetch_by_seq_region_id + + Arg [1] : string $seq_region_id + The internal identifier of the seq_region to create this slice + on + Arg [2] : optional start + Arg [3] : optional end + Arg [4] : optional strand + Example : $slice = $slice_adaptor->fetch_by_seq_region_id(34413); + Description: Creates a slice object of an entire seq_region using the + seq_region internal identifier to resolve the seq_region. + Returns undef if no such slice exists. + Returntype : Bio::EnsEMBL::Slice or undef + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_by_seq_region_id { + my ( $self, $seq_region_id, $start, $end, $strand ) = @_; + + my $arr = $self->{'sr_id_cache'}->{$seq_region_id}; + my ( $name, $length, $cs, $cs_id ); + + + if ( $arr && defined( $arr->[2] ) ) { + ( $name, $cs_id, $length ) = ( $arr->[1], $arr->[2], $arr->[3] ); + $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); + } else { + my $sth = + $self->prepare( "SELECT sr.name, sr.coord_system_id, sr.length " + . "FROM seq_region sr " + . "WHERE sr.seq_region_id = ? " ); + + $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $sth->execute(); + + if ( $sth->rows() == 0 ) { return undef } + + ( $name, $cs_id, $length ) = $sth->fetchrow_array(); + $sth->finish(); + + $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); + + #cache results to speed up repeated queries + my $arr = [ $seq_region_id, $name, $cs_id, $length ]; + + $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; + } + + return + Bio::EnsEMBL::Slice->new_fast({ + 'coord_system' => $cs, + 'seq_region_name' => $name, + 'seq_region_length'=> $length, + 'start' => $start || 1, + 'end' => $end || $length, + 'strand' => $strand || 1, + 'adaptor' => $self} ); +} ## end sub fetch_by_seq_region_id + + + +=head2 get_seq_region_id + + Arg [1] : Bio::EnsEMBL::Slice $slice + The slice to fetch a seq_region_id for + Example : $srid = $slice_adaptor->get_seq_region_id($slice); + Description: Retrieves the seq_region id (in this database) given a slice + Seq region ids are not stored on the slices themselves + because they are intended to be somewhat database independant + and seq_region_ids vary accross databases. + Returntype : int + Exceptions : throw if the seq_region of the slice is not in the db + throw if incorrect arg provided + Caller : BaseFeatureAdaptor + Status : Stable + +=cut + +sub get_seq_region_id { + my $self = shift; + my $slice = shift; + + if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw('Slice argument is required'); + } + + my $seq_region_name = $slice->seq_region_name(); + my $key = $seq_region_name.":".$slice->coord_system->dbID(); + my $arr = $self->{'sr_name_cache'}->{"$key"}; + + if( $arr ) { + return $arr->[0]; + } + + my $cs_id = $slice->coord_system->dbID(); + + my $sth = $self->prepare("SELECT seq_region_id, length " . + "FROM seq_region " . + "WHERE name = ? AND coord_system_id = ?"); + + #force seq_region_name cast to string so mysql cannot treat as int + $sth->bind_param(1,"$seq_region_name",SQL_VARCHAR); + $sth->bind_param(2,$cs_id,SQL_INTEGER); + $sth->execute(); + + if($sth->rows() != 1) { + throw("Non existant or ambigous seq_region:\n" . + " coord_system=[$cs_id],\n" . + " name=[$seq_region_name],\n"); + + } + + my($seq_region_id, $length) = $sth->fetchrow_array(); + $sth->finish(); + + #cache information for future requests + $arr = [ $seq_region_id, $seq_region_name, $cs_id, $length ]; + + $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; + + return $seq_region_id; +} + + + +=head2 fetch_all + + Arg [1] : string $coord_system_name + The name of the coordinate system to retrieve slices of. + This may be a name of an acutal coordinate system or an alias + to a coordinate system. Valid aliases are 'seqlevel' or + 'toplevel'. + Arg [2] : string $coord_system_version (optional) + The version of the coordinate system to retrieve slices of + Arg [3] : bool $include_non_reference (optional) + If this argument is not provided then only reference slices + will be returned. If set, both reference and non refeference + slices will be rerurned. + Arg [4] : int $include_duplicates (optional) + If set duplicate regions will be returned. + + NOTE: if you do not use this option and you have a PAR + (pseudo-autosomal region) at the beginning of your seq_region + then your slice will not start at position 1, so coordinates + retrieved from this slice might not be what you expected. + + Arg[5] : bool $include_lrg (optional) (default 0) + If set lrg regions will be returned aswell. + + + Example : @chromos = @{$slice_adaptor->fetch_all('chromosome','NCBI33')}; + @contigs = @{$slice_adaptor->fetch_all('contig')}; + + # get even non-reference regions + @slices = @{$slice_adaptor->fetch_all('toplevel',undef,1)}; + + # include duplicate regions (such as pseudo autosomal regions) + @slices = @{$slice_adaptor->fetch_all('toplevel', undef,0,1)}; + + Description: Retrieves slices of all seq_regions for a given coordinate + system. This is analagous to the methods fetch_all which were + formerly on the ChromosomeAdaptor, RawContigAdaptor and + CloneAdaptor classes. Slices fetched span the entire + seq_regions and are on the forward strand. + If the coordinate system with the provided name and version + does not exist an empty list is returned. + If the coordinate system name provided is 'toplevel', all + non-redundant toplevel slices are returned (note that any + coord_system_version argument is ignored in that case). + + Retrieved slices can be broken into smaller slices using the + Bio::EnsEMBL::Utils::Slice module. + + Returntype : listref of Bio::EnsEMBL::Slices + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub fetch_all { + my $self = shift; + my $cs_name = shift; + my $cs_version = shift || ''; + + my ($include_non_reference, $include_duplicates, $include_lrg) = @_; + + # + # verify existance of requested coord system and get its id + # + my $csa = $self->db->get_CoordSystemAdaptor(); + my $orig_cs = $csa->fetch_by_name($cs_name, $cs_version); + + return [] if ( !$orig_cs ); + + my %bad_vals=(); + + + # + # Get a hash of non reference seq regions + # + if ( !$include_non_reference ) { + my $sth = + $self->prepare( 'SELECT sr.seq_region_id ' + . 'FROM seq_region sr, seq_region_attrib sra, ' + . 'attrib_type at, coord_system cs ' + . 'WHERE at.code = "non_ref" ' + . 'AND sra.seq_region_id = sr.seq_region_id ' + . 'AND at.attrib_type_id = sra.attrib_type_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my ($seq_region_id); + $sth->bind_columns( \$seq_region_id ); + + while ( $sth->fetch() ) { + $bad_vals{$seq_region_id} = 1; + } + } + + # + # if we do not want lrg's then add them to the bad list; + # + if ( !$include_lrg ) { + my $sth = + $self->prepare( 'SELECT sr.seq_region_id ' + . 'FROM seq_region sr, seq_region_attrib sra, ' + . 'attrib_type at, coord_system cs ' + . 'WHERE at.code = "LRG" ' + . 'AND sra.seq_region_id = sr.seq_region_id ' + . 'AND at.attrib_type_id = sra.attrib_type_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my ($seq_region_id); + $sth->bind_columns( \$seq_region_id ); + + while ( $sth->fetch() ) { + $bad_vals{$seq_region_id} = 1; + } + } + + # + # Retrieve the seq_regions from the database + # + + my $sth; + if ( $orig_cs->is_top_level() ) { + $sth = + $self->prepare( 'SELECT sr.seq_region_id, sr.name, ' + . 'sr.length, sr.coord_system_id ' + . 'FROM seq_region sr, seq_region_attrib sra, ' + . 'attrib_type at, coord_system cs ' + . 'WHERE at.code = "toplevel" ' + . 'AND at.attrib_type_id = sra.attrib_type_id ' + . 'AND sra.seq_region_id = sr.seq_region_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + } else { + $sth = + $self->prepare( 'SELECT sr.seq_region_id, sr.name, ' + . 'sr.length, sr.coord_system_id ' + . 'FROM seq_region sr ' + . 'WHERE sr.coord_system_id = ?' ); + + $sth->bind_param( 1, $orig_cs->dbID, SQL_INTEGER ); + $sth->execute(); + } + + my ( $seq_region_id, $name, $length, $cs_id ); + $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id ) ); + + my $cache_count = 0; + + my @out; + while($sth->fetch()) { + if(!defined($bad_vals{$seq_region_id})){ + my $cs = $csa->fetch_by_dbID($cs_id); + + if(!$cs) { + throw("seq_region $name references non-existent coord_system $cs_id."); + } + + #cache values for future reference, but stop adding to the cache once we + #we know we have filled it up + if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) { + my $arr = [ $seq_region_id, $name, $cs_id, $length ]; + + $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; + + $cache_count++; + } + + my $slice = Bio::EnsEMBL::Slice->new_fast({ + 'start' => 1, + 'end' => $length, + 'strand' => 1, + 'seq_region_name' => $name, + 'seq_region_length'=> $length, + 'coord_system' => $cs, + 'adaptor' => $self}); + + if(!defined($include_duplicates) or !$include_duplicates){ + # test if this slice *could* have a duplicate (exception) region + $self->_build_exception_cache() if(!exists $self->{'asm_exc_cache'}); + if(exists $self->{asm_exc_cache}->{$seq_region_id}) { + + # Dereference symlinked assembly regions. Take out + # any regions which are symlinked because these are duplicates + my @projection = @{$self->fetch_normalized_slice_projection($slice)}; + foreach my $segment ( @projection) { + if($segment->[2]->seq_region_name() eq $slice->seq_region_name() && + $segment->[2]->coord_system->equals($slice->coord_system)) { + push @out, $segment->[2]; + } + } + } else { + # no duplicate regions + push @out, $slice; + } + } else { + # we want duplicates anyway so do not do any checks + push @out, $slice; + } + } + } + + return \@out; +} + +=head2 is_toplevel + Arg : int seq_region_id + Example : my $top = $slice_adptor->is_toplevel($seq_region_id) + Description: Returns 1 if slice is a toplevel slice else 0 + Returntype : int + Caller : Slice method is_toplevel + Status : At Risk + +=cut + +sub is_toplevel { + my $self = shift; + my $id = shift; + + my $sth = $self->prepare( + "SELECT at.code from seq_region_attrib sra, attrib_type at " + . "WHERE sra.seq_region_id = ? " + . "AND at.attrib_type_id = sra.attrib_type_id " + . "AND at.code = 'toplevel'" ); + + $sth->bind_param( 1, $id, SQL_INTEGER ); + $sth->execute(); + + my $code; + $sth->bind_columns( \$code ); + + while ( $sth->fetch ) { + $sth->finish; + return 1; + } + + $sth->finish; + return 0; +} + +=head2 is_reference + Arg : int seq_region_id + Example : my $reference = $slice_adptor->is_reference($seq_region_id) + Description: Returns 1 if slice is a reference slice else 0 + Returntype : int + Caller : Slice method is_reference + Status : At Risk + +=cut + +sub is_reference { + my $self = shift; + my $id = shift; + + my $sth = $self->prepare( + "SELECT at.code from seq_region_attrib sra, attrib_type at " + . "WHERE sra.seq_region_id = ? " + . "AND at.attrib_type_id = sra.attrib_type_id " + . "AND at.code = 'non_ref'" ); + + $sth->bind_param( 1, $id, SQL_INTEGER ); + $sth->execute(); + + my $code; + $sth->bind_columns( \$code ); + + while ( $sth->fetch ) { + $sth->finish; + return 0; + } + + $sth->finish; + return 1; +} + +=head2 is_circular + + Arg[1] : int seq_region_id + Example : my $circular = $slice_adptor->is_circular($seq_region_id); + Description : Indicates if the sequence region was circular or not + Returntype : Boolean + +=cut + +sub is_circular { + my ($self, $id) = @_; + + if (! defined $self->{is_circular}) { + $self->_build_circular_slice_cache(); + } + + return 0 if $self->{is_circular} == 0; + return (exists $self->{circular_sr_id_cache}->{$id}) ? 1 : 0; +} + +=head2 fetch_by_band + + Title : fetch_by_band + Usage : + Function: Does not work please use fetch_by_chr_band + Example : + Returns : Bio::EnsEMBL::Slice + Args : the band name + Status : AT RISK + +=cut + +sub fetch_by_band { + my ($self,$band) = @_; + + my $sth = $self->dbc->prepare + ("select s.name,max(k.seq_region_id)-min(k.seq_region_id, min(k.seq_region_start), max(k.seq_region_id) " . + "from karyotype as k " . + "where k.band like ? and k.seq_region_id = s.seq_region_id"); + + $sth->bind_param(1,"$band%",SQL_VARCHAR); + $sth->execute(); + my ( $seq_region_name, $discrepancy, $seq_region_start, $seq_region_end) = $sth->fetchrow_array; + + if($seq_region_name && $discrepancy>0) { + throw("Band maps to multiple seq_regions"); + } else { + return $self->fetch_by_region('toplevel',$seq_region_name,$seq_region_start,$seq_region_end); + } + throw("Band not recognised in database"); +} + +=head2 fetch_by_chr_band + + Title : fetch_by_chr_band + Usage : + Function: create a Slice representing a series of bands + Example : + Returns : Bio::EnsEMBL::Slice + Args : the band name + Status : Stable + +=cut + +sub fetch_by_chr_band { + my ( $self, $chr, $band ) = @_; + + my $chr_slice = $self->fetch_by_region( 'toplevel', $chr ); + my $seq_region_id = $self->get_seq_region_id($chr_slice); + + my $sth = + $self->prepare( 'SELECT MIN(k.seq_region_start), ' + . 'MAX(k.seq_region_end) ' + . 'FROM karyotype k ' + . 'WHERE k.seq_region_id = ? ' + . 'AND k.band LIKE ?' ); + + $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); + $sth->bind_param( 2, "$band%", SQL_VARCHAR ); + $sth->execute(); + + my ( $slice_start, $slice_end ) = $sth->fetchrow_array; + + if ( defined $slice_start ) { + return + $self->fetch_by_region( 'toplevel', $chr, + $slice_start, $slice_end ); + } + + throw("Band not recognised in database"); +} ## end sub fetch_by_chr_band + + + +=head2 fetch_by_exon_stable_id + + Arg [1] : string $exonid + The stable id of the exon around which the slice is + desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the exon (0 by default) + Example : $slc = $sa->fetch_by_exon_stable_id('ENSE00000302930',10); + Description: Creates a slice around the region of the specified exon. + If a context size is given, the slice is extended by that + number of basepairs on either side of the exon. + + The slice will be created in the exon's native coordinate system + and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : Thrown if the exon is not in the database. + Caller : general + Status : Stable + +=cut + +sub fetch_by_exon_stable_id{ + my ($self,$exonid,$size) = @_; + + throw('Exon argument is required.') if(!$exonid); + + my $ea = $self->db->get_ExonAdaptor; + my $exon = $ea->fetch_by_stable_id($exonid); + + throw("Exon [$exonid] does not exist in DB.") if(!$exon); + + return $self->fetch_by_Feature($exon, $size); +} + +=head2 fetch_by_transcript_stable_id + + Arg [1] : string $transcriptid + The stable id of the transcript around which the slice is + desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the transcript (0 by default) + Example : $slc = $sa->fetch_by_transcript_stable_id('ENST00000302930',10); + Description: Creates a slice around the region of the specified transcript. + If a context size is given, the slice is extended by that + number of basepairs on either side of the + transcript. + + The slice will be created in the transcript's native coordinate + system and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : Thrown if the transcript is not in the database. + Caller : general + Status : Stable + +=cut + +sub fetch_by_transcript_stable_id{ + my ($self,$transcriptid,$size) = @_; + + throw('Transcript argument is required.') if(!$transcriptid); + + my $ta = $self->db->get_TranscriptAdaptor; + my $transcript = $ta->fetch_by_stable_id($transcriptid); + + throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript); + + return $self->fetch_by_Feature($transcript, $size); +} + + + +=head2 fetch_by_transcript_id + + Arg [1] : int $transcriptid + The unique database identifier of the transcript around which + the slice is desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the transcript (0 by default) + Example : $slc = $sa->fetch_by_transcript_id(24, 1000); + Description: Creates a slice around the region of the specified transcript. + If a context size is given, the slice is extended by that + number of basepairs on either side of the + transcript. + + The slice will be created in the transcript's native coordinate + system and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : throw on incorrect args + throw if transcript is not in database + Caller : general + Status : Stable + +=cut + +sub fetch_by_transcript_id { + my ($self,$transcriptid,$size) = @_; + + throw('Transcript id argument is required.') if(!$transcriptid); + + my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); + my $transcript = $transcript_adaptor->fetch_by_dbID($transcriptid); + + throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript); + + return $self->fetch_by_Feature($transcript, $size); +} + + + +=head2 fetch_by_gene_stable_id + + Arg [1] : string $geneid + The stable id of the gene around which the slice is + desired + Arg [2] : (optional) int $size + The length of the flanking regions the slice should encompass + on either side of the gene (0 by default) + Example : $slc = $sa->fetch_by_gene_stable_id('ENSG00000012123',10); + Description: Creates a slice around the region of the specified gene. + If a context size is given, the slice is extended by that + number of basepairs on either side of the gene. + + The slice will be created in the gene's native coordinate system + and in the forward orientation. + Returntype : Bio::EnsEMBL::Slice + Exceptions : throw on incorrect args + throw if transcript does not exist + Caller : general + Status : Stable + +=cut + +sub fetch_by_gene_stable_id { + my ($self,$geneid,$size) = @_; + + throw('Gene argument is required.') if(!$geneid); + + my $gene_adaptor = $self->db->get_GeneAdaptor(); + my $gene = $gene_adaptor->fetch_by_stable_id($geneid); + + throw("Gene [$geneid] does not exist in DB.") if(!$gene); + + return $self->fetch_by_Feature($gene, $size); +} + + + +=head2 fetch_by_Feature + + Arg [1] : Bio::EnsEMBL::Feature $feat + The feature to fetch the slice around + Arg [2] : int size (optional) + The desired number of flanking basepairs around the feature. + The size may also be provided as a percentage of the feature + size such as 200% or 80.5%. + Example : $slice = $slice_adaptor->fetch_by_Feature($feat, 100); + Description: Retrieves a slice around a specific feature. All this really + does is return a resized version of the slice that the feature + is already on. Note that slices returned from this method + are always on the forward strand of the seq_region regardless of + the strandedness of the feature passed in. + Returntype : Bio::EnsEMBL::Slice + Exceptions : throw if the feature does not have an attached slice + throw if feature argument is not provided + Caller : fetch_by_gene_stable_id, fetch_by_transcript_stable_id, + fetch_by_gene_id, fetch_by_transcript_id + Status : Stable + +=cut + +sub fetch_by_Feature{ + my ($self, $feature, $size) = @_; + + $size ||= 0; + + if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { + throw('Feature argument expected.'); + } + + my $slice = $feature->slice(); + if(!$slice || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice') )) { + throw('Feature must be attached to a valid slice.'); + } + + + my $fstart = $feature->start(); + my $fend = $feature->end(); + if(!defined($fstart) || !defined($fend)) { + throw('Feature must have defined start and end.'); + } + + #convert the feature slice coordinates to seq_region coordinates + my $slice_start = $slice->start(); + my $slice_end = $slice->end(); + my $slice_strand = $slice->strand(); + if($slice_start != 1 || $slice_strand != 1) { + if($slice_strand == 1) { + $fstart = $fstart + $slice_start - 1; + $fend = $fend + $slice_start - 1; + } else { + my $tmp_start = $fstart; + $fstart = $slice_end - $fend + 1; + $fend = $slice_end - $tmp_start + 1; + } + } + + ## Size may be stored as a %age of the length of the feature + ## Size = 100% gives no context + ## Size = 200% gives context - 50% the size of the feature either side of + ## feature + + $size = int( ($1-100)/200 * ($fend-$fstart+1) ) if( $size =~/([\d+\.]+)%/ ); + + #return a new slice covering the region of the feature + my $S = Bio::EnsEMBL::Slice->new_fast({ + 'seq_region_name' => $slice->seq_region_name, + 'seq_region_length' => $slice->seq_region_length, + 'coord_system' => $slice->coord_system, + 'start' => $fstart - $size, + 'end' => $fend + $size, + 'strand' => 1, + 'adaptor' => $self}); + $S->{'_raw_feature_strand'} = $feature->strand * $slice_strand if $feature->can('strand'); + return $S; +} + + + +=head2 fetch_by_misc_feature_attribute + + Arg [1] : string $attribute_type + The code of the attribute type + Arg [2] : (optional) string $attribute_value + The value of the attribute to fetch by + Arg [3] : (optional) int $size + The amount of flanking region around the misc feature desired. + Example : $slice = $sa->fetch_by_misc_feature_attribute('superctg', + 'NT_030871'); + $slice = $sa->fetch_by_misc_feature_attribute('synonym', + 'AL00012311', + $flanking); + Description: Fetches a slice around a MiscFeature with a particular + attribute type and value. If no value is specified then + the feature with the particular attribute is used. + If no size is specified then 0 is used. + Returntype : Bio::EnsEMBL::Slice + Exceptions : Throw if no feature with the specified attribute type and value + exists in the database + Warning if multiple features with the specified attribute type + and value exist in the database. + Caller : webcode + Status : Stable + +=cut + +sub fetch_by_misc_feature_attribute { + my ($self, $attrib_type_code, $attrib_value, $size) = @_; + + my $mfa = $self->db()->get_MiscFeatureAdaptor(); + + my $feats = $mfa->fetch_all_by_attribute_type_value($attrib_type_code, + $attrib_value); + + if(@$feats == 0) { + throw("MiscFeature with $attrib_type_code=$attrib_value does " . + "not exist in DB."); + } + + if(@$feats > 1) { + warning("MiscFeature with $attrib_type_code=$attrib_value is " . + "ambiguous - using first one found."); + } + + my ($feat) = @$feats; + + return $self->fetch_by_Feature($feat, $size); +} + +=head2 fetch_normalized_slice_projection + + Arg [1] : Bio::EnsEMBL::Slice $slice + Example : ( optional ) + Description: gives back a project style result. The returned slices + represent the areas to which there are symlinks for the + given slice. start, end show which area on given slice is + symlinked + Returntype : [[start,end,$slice][]] + Exceptions : none + Caller : BaseFeatureAdaptor + Status : Stable + +=cut + + +sub fetch_normalized_slice_projection { + my $self = shift; + my $slice = shift; + + my $slice_seq_region_id = $self->get_seq_region_id( $slice ); + + $self->_build_exception_cache() if(!exists($self->{'asm_exc_cache'})); + + my $result = $self->{'asm_exc_cache'}->{$slice_seq_region_id}; + + $result ||= []; + + my (@haps, @pars); + + foreach my $row (@$result) { + my ( $seq_region_id, $seq_region_start, $seq_region_end, + $exc_type, $exc_seq_region_id, $exc_seq_region_start, + $exc_seq_region_end ) = @$row; + + # need overlapping PAR and all HAPs if any + if( $exc_type eq "PAR" ) { + if( $seq_region_start <= $slice->end() && + $seq_region_end >= $slice->start() ) { + push( @pars, [ $seq_region_start, $seq_region_end, $exc_seq_region_id, + $exc_seq_region_start, $exc_seq_region_end ] ); + } + } else { + push( @haps, [ $seq_region_start, $seq_region_end, $exc_seq_region_id, + $exc_seq_region_start, $exc_seq_region_end ] ); + } + } + + if(!@pars && !@haps) { + #just return this slice, there were no haps or pars + return [bless ( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment")]; + } + + my @syms; + if( @haps >= 1 ) { + my @sort_haps = sort { $a->[1] <=> $b->[1] } @haps; + + my $count =0; + my $chr_start = 1; + my $hap_start = 1; + my $last = 0; + + my $seq_reg_slice = $self->fetch_by_seq_region_id($slice_seq_region_id); + my $exc_slice = $self->fetch_by_seq_region_id( $sort_haps[0][2] ); + my $len1 = $seq_reg_slice->length(); + my $len2 = $exc_slice->length(); + my $max_len = ($len1 > $len2) ? $len1 : $len2; + + while($count <= scalar(@sort_haps) and !$last){ + my $chr_end; + my $hap_end; + if(defined($sort_haps[$count]) and defined($sort_haps[$count][0]) ){ + $hap_end = $sort_haps[$count][0]-1; + $chr_end = $sort_haps[$count][3]-1 + } + else{ + $last = 1; + $hap_end = $len1; + $chr_end = $len2; + my $diff = ($hap_end-$hap_start)-($chr_end-$chr_start); + if($diff > 0){ + push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end+$diff] ); + } + elsif($diff < 0){ + push( @syms, [ $hap_start, $hap_end - $diff, $sort_haps[0][2], $chr_start, $chr_end] ); + } + else{ + push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] ); + } + next; + } + if($hap_end and $hap_start < $len1){ # if hap at start or end of chromosome + push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] ); + } + $chr_start = $chr_end + ($sort_haps[$count][4]-$sort_haps[$count][3]) + 2; + $hap_start = $hap_end + ($sort_haps[$count][1]-$sort_haps[$count][0]) + 2; + $count++; + } + + + } + + + # for now haps and pars should not be both there, but in theory we + # could handle it here by cleverly merging the pars into the existing syms, + # for now just: + push( @syms, @pars ); + + my $mapper = Bio::EnsEMBL::Mapper->new( "sym", "org" ); + my $count = 0; + for my $sym ( @syms ) { + $mapper->add_map_coordinates( $slice_seq_region_id, $sym->[0], $sym->[1], + 1, $sym->[2], $sym->[3], $sym->[4] ); + } + + my @linked = $mapper->map_coordinates( $slice_seq_region_id, + $slice->start(), $slice->end(), + $slice->strand(), "sym" ); + + # gaps are regions where there is no mapping to another region + my $rel_start = 1; + + #if there was only one coord and it is a gap, we know it is just the + #same slice with no overlapping symlinks + if(@linked == 1 && $linked[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { + return [bless( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment" )]; + } + + my @out; + for my $coord ( @linked ) { + if( $coord->isa( "Bio::EnsEMBL::Mapper::Gap" )) { + my $exc_slice = Bio::EnsEMBL::Slice->new_fast({ + 'start' => $coord->start(), + 'end' => $coord->end(), + 'strand' => $slice->strand(), + 'coord_system' => $slice->coord_system(), + 'adaptor' => $self, + 'seq_region_name' => $slice->seq_region_name(), + 'seq_region_length' => $slice->seq_region_length()}); + push( @out, bless ( [ $rel_start, $coord->length()+$rel_start-1, + $exc_slice ], "Bio::EnsEMBL::ProjectionSegment") ); + } else { + my $exc_slice = $self->fetch_by_seq_region_id( $coord->id() ); + my $exc2_slice = Bio::EnsEMBL::Slice->new_fast({ + + 'start' => $coord->start(), + 'end' => $coord->end(), + 'strand' => $coord->strand(), + 'seq_region_name' => $exc_slice->seq_region_name(), + 'seq_region_length' => $exc_slice->seq_region_length(), + 'coord_system' => $exc_slice->coord_system(), + 'adaptor' => $self + }); + + push( @out, bless( [ $rel_start, $coord->length() + $rel_start - 1, + $exc2_slice ], "Bio::EnsEMBL::ProjectionSegment") ); + } + $rel_start += $coord->length(); + } + + return \@out; +} + + + + +=head2 store + + Arg [1] : Bio::EnsEMBL::Slice $slice + Arg [2] : (optional) $seqref reference to a string + The sequence associated with the slice to be stored. + Example : $slice = Bio::EnsEMBL::Slice->new(...); + $seq_region_id = $slice_adaptor->store($slice, \$sequence); + Description: This stores a slice as a sequence region in the database + and returns the seq region id. The passed in slice must + start at 1, and must have a valid seq_region name and coordinate + system. The attached coordinate system must already be stored in + the database. The sequence region is assumed to start at 1 and + to have a length equalling the length of the slice. The end of + the slice must equal the seq_region_length. + If the slice coordinate system is the sequence level coordinate + system then the seqref argument must also be passed. If the + slice coordinate system is NOT a sequence level coordinate + system then the sequence argument cannot be passed. + Returntype : int + Exceptions : throw if slice has no coord system. + throw if slice coord system is not already stored. + throw if slice coord system is seqlevel and no sequence is + provided. + throw if slice coord system is not seqlevel and sequence is + provided. + throw if slice does not start at 1 + throw if sequence is provided and the sequence length does not + match the slice length. + throw if the SQL insert fails (e.g. on duplicate seq region) + throw if slice argument is not passed + throw if the slice end is not equal to seq_region_length + Caller : database loading scripts + Status : Stable + +=cut + + + +sub store { + my $self = shift; + my $slice = shift; + my $seqref = shift; + + # + # Get all of the sanity checks out of the way before storing anything + # + + if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw('Slice argument is required'); + } + + my $cs = $slice->coord_system(); + throw("Slice must have attached CoordSystem.") if(!$cs); + + my $db = $self->db(); + if(!$cs->is_stored($db)) { + throw("Slice CoordSystem must already be stored in DB.") + } + + if($slice->start != 1 || $slice->strand != 1) { + throw("Slice must have start==1 and strand==1."); + } + + if($slice->end() != $slice->seq_region_length()) { + throw("Slice must have end==seq_region_length"); + } + + my $sr_len = $slice->length(); + my $sr_name = $slice->seq_region_name(); + + if(!$sr_name) { + throw("Slice must have valid seq region name."); + } + + if($cs->is_sequence_level()) { + if(!$seqref) { + throw("Must provide sequence for sequence level coord system."); + } + if(ref($seqref) ne 'SCALAR') { + throw("Sequence must be a scalar reference."); + } + my $seq_len = length($$seqref); + + if($seq_len != $sr_len) { + throw("Sequence length ($seq_len) must match slice length ($sr_len)."); + } + } else { + if($seqref) { + throw("Cannot provide sequence for non-sequence level seq regions."); + } + } + + #store the seq_region + + my $sth = $db->dbc->prepare("INSERT INTO seq_region " . + "SET name = ?, " . + " length = ?, " . + " coord_system_id = ?" ); + + $sth->bind_param(1,$sr_name,SQL_VARCHAR); + $sth->bind_param(2,$sr_len,SQL_INTEGER); + $sth->bind_param(3,$cs->dbID,SQL_INTEGER); + + $sth->execute(); + + my $seq_region_id = $sth->{'mysql_insertid'}; + + if(!$seq_region_id) { + throw("Database seq_region insertion failed."); + } + + if($cs->is_sequence_level()) { + #store sequence if it was provided + my $seq_adaptor = $db->get_SequenceAdaptor(); + $seq_adaptor->store($seq_region_id, $$seqref); + } + + #synonyms + if(defined($slice->{'synonym'})){ + foreach my $syn (@{$slice->{'synonym'}} ){ + $syn->seq_region_id($seq_region_id); # set the seq_region_id + $syn->adaptor->store($syn); + } + } + + + $slice->adaptor($self); + + return $seq_region_id; +} + + +=head2 store_assembly + + Arg [1] : Bio::EnsEMBL::Slice $asm_slice + Arg [2] : Bio::EnsEMBL::Slice $cmp_slice + Example : $asm = $slice_adaptor->store_assembly( $slice1, $slice2 ); + Description: Creates an entry in the analysis table based on the + coordinates of the two slices supplied. Returns a string + representation of the assembly that gets created. + Returntype : string + Exceptions : throw if either slice has no coord system (cs). + throw unless the cs rank of the asm_slice is lower than the + cmp_slice. + throw if there is no mapping path between coord systems + throw if the lengths of each slice are not equal + throw if there are existing mappings between either slice + and the oposite cs + Caller : database loading scripts + Status : Experimental + +=cut + +sub store_assembly{ + my $self = shift; + my $asm_slice = shift; + my $cmp_slice = shift; + + # + # Get all of the sanity checks out of the way before storing anything + # + + if(!ref($asm_slice) || !($asm_slice->isa('Bio::EnsEMBL::Slice') or $asm_slice->isa('Bio::EnsEMBL::LRGSlice'))) { + throw('Assembled Slice argument is required'); + } + if(!ref($cmp_slice) || !($cmp_slice->isa('Bio::EnsEMBL::Slice') or $cmp_slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw('Assembled Slice argument is required'); + } + + my $asm_cs = $asm_slice->coord_system(); + throw("Slice must have attached CoordSystem.") if(!$asm_cs); + my $cmp_cs = $cmp_slice->coord_system(); + throw("Slice must have attached CoordSystem.") if(!$cmp_cs); + + unless( $asm_cs->rank < $cmp_cs->rank ){ + throw("Assembled Slice CoordSystem->rank must be lower than ". + "the component Slice Coord_system" ); + } + + my @path = + @{ $asm_cs->adaptor()->get_mapping_path( $asm_cs, $cmp_cs ) }; + + if ( !@path ) { + throw("No mapping path defined between " + . $asm_cs->name() . " and " + . $cmp_cs->name() ); + } + + if( $asm_slice->length != $cmp_slice->length ){ + throw("The lengths of the assembled and component slices are not equal" ); + } + + # For now we disallow any existing mappings between the asm slice and cmp + # CoordSystem and vice-versa. + # Some cases of multiple mappings may be allowable by the API, but their + # logic needs to be coded below. + + my $asm_proj = $asm_slice->project( $cmp_cs->name, $cmp_cs->version ); + if( @$asm_proj ){ + throw("Regions of the assembled slice are already assembled ". + "into the component CoordSystem" ); + } + my $cmp_proj = $cmp_slice->project( $asm_cs->name, $asm_cs->version ); + if( @$cmp_proj ){ + throw("Regions of the component slice are already assembled ". + "into the assembled CoordSystem" ); + } + + # + # Checks complete. Store the data + # + my $sth = $self->db->dbc->prepare + ("INSERT INTO assembly " . + "SET asm_seq_region_id = ?, " . + " cmp_seq_region_id = ?, " . + " asm_start = ?, " . + " asm_end = ?, " . + " cmp_start = ?, " . + " cmp_end = ?, " . + " ori = ?" ); + + my $asm_seq_region_id = $self->get_seq_region_id( $asm_slice ); + my $cmp_seq_region_id = $self->get_seq_region_id( $cmp_slice ); + my $ori = $asm_slice->strand * $cmp_slice->strand; + + $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER); + $sth->bind_param(3,$asm_slice->start,SQL_INTEGER); + $sth->bind_param(4,$asm_slice->end,SQL_INTEGER); + $sth->bind_param(5,$cmp_slice->start,SQL_INTEGER); + $sth->bind_param(6,$cmp_slice->end,SQL_INTEGER); + $sth->bind_param(7,$ori,SQL_INTEGER); + + $sth->execute(); + + #use Data::Dumper qw( Dumper ); + #warn Dumper( $self->db->{seq_region_cache} ); + #$self->db->{seq_region_cache} = undef; + #$self->_cache_seq_regions(); + + my $ama = $self->db->get_AssemblyMapperAdaptor(); + $ama->delete_cache(); + + + return $asm_slice->name . "<>" . $cmp_slice->name; + +} + + +=head2 prepare + + Arg [1] : String $sql + Example : ( optional ) + Description: overrides the default adaptor prepare method. + All slice sql will usually use the dna_db. + Returntype : DBD::sth + Exceptions : none + Caller : internal, convenience method + Status : Stable + +=cut + +sub prepare { + my ( $self, $sql ) = @_; + return $self->db()->dnadb()->dbc->prepare($sql); +} + +sub _build_exception_cache { + my $self = shift; + + # build up a cache of the entire assembly exception table + # it should be small anyway + my $sth = + $self->prepare( 'SELECT ae.seq_region_id, ae.seq_region_start, ' + . 'ae.seq_region_end, ae.exc_type, ae.exc_seq_region_id, ' + . 'ae.exc_seq_region_start, ae.exc_seq_region_end ' + . 'FROM assembly_exception ae, ' + . 'seq_region sr, coord_system cs ' + . 'WHERE sr.seq_region_id = ae.seq_region_id ' + . 'AND sr.coord_system_id = cs.coord_system_id ' + . 'AND cs.species_id = ?' ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my %hash; + $self->{'asm_exc_cache'} = \%hash; + + my $row; + while ( $row = $sth->fetchrow_arrayref() ) { + my @result = @$row; + $hash{ $result[0] } ||= []; + push( @{ $hash{ $result[0] } }, \@result ); + } + $sth->finish(); +} ## end sub _build_exception_cache + +=head2 cache_toplevel_seq_mappings + + Args : none + Example : $slice_adaptor->cache_toplevel_seq_mappings(); + Description: caches all the assembly mappings needed for genes + Returntype : None + Exceptions : None + Caller : general + Status : At Risk + : New experimental code + +=cut + +sub cache_toplevel_seq_mappings { + my ($self) = @_; + + # Get the sequence level to map too + + my $sql = (<<SSQL); + SELECT name + FROM coord_system + WHERE attrib like "%sequence_level%" + AND species_id = ? +SSQL + + my $sth = $self->prepare($sql); + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my $sequence_level = $sth->fetchrow_array(); + + $sth->finish(); + + my $csa = $self->db->get_CoordSystemAdaptor(); + my $ama = $self->db->get_AssemblyMapperAdaptor(); + + my $cs1 = $csa->fetch_by_name($sequence_level); + + #get level to map too. + + $sql = (<<LSQL); + SELECT DISTINCT(cs.name) + FROM seq_region sr, + seq_region_attrib sra, + attrib_type at, + coord_system cs + WHERE sra.seq_region_id = sr.seq_region_id + AND sra.attrib_type_id = at.attrib_type_id + AND at.code = "toplevel" + AND cs.coord_system_id = sr.coord_system_id + AND cs.species_id = ? +LSQL + + $sth = $self->prepare($sql); + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + while ( my $csn = $sth->fetchrow_array() ) { + if ( $csn eq $sequence_level ) { next } + my $cs2 = $csa->fetch_by_name($csn); + my $am = $ama->fetch_by_CoordSystems( $cs1, $cs2 ); + $am->register_all(); + } + +} ## end sub cache_toplevel_seq_mappings + + +sub _build_circular_slice_cache { + my $self = shift; + + # build up a cache of circular sequence region ids + my $sth = + $self->prepare( "SELECT sra.seq_region_id FROM seq_region_attrib sra " + . "INNER JOIN attrib_type at ON sra.attrib_type_id = at.attrib_type_id " + . "INNER JOIN seq_region sr ON sra.seq_region_id = sr.seq_region_id " + . "INNER JOIN coord_system cs ON sr.coord_system_id = cs.coord_system_id " + . "WHERE code = 'circular_seq' and cs.species_id = ?"); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + my $id; + my %hash; + if ( ($id) = $sth->fetchrow_array() ) { + $self->{'circular_sr_id_cache'} = \%hash; + $self->{'is_circular'} = 1; + $hash{ $id } = $id; + while ( ($id) = $sth->fetchrow_array() ) { + $hash{ $id } = $id; + } + } else { + $self->{'is_circular'} = 0; + } + $sth->finish(); +} ## end _build_circular_slice_cache + + +##################################### +# sub DEPRECATED METHODs +##################################### + +=head2 fetch_by_mapfrag + + Function: DEPRECATED use fetch_by_misc_feature_attribute('synonym',$mapfrag) + +=cut + +sub fetch_by_mapfrag{ + my ($self,$mymapfrag,$flag,$size) = @_; + deprecate('Use fetch_by_misc_feature_attribute instead'); + $flag ||= 'fixed-width'; # alt.. 'context' + $size ||= $flag eq 'fixed-width' ? 100000 : 0; + return $self->fetch_by_misc_feature_attribute('synonym',$mymapfrag,$size); +} + + + +=head2 fetch_by_chr_start_end + + Description: DEPRECATED use fetch_by_region instead + +=cut + +sub fetch_by_chr_start_end { + my ($self,$chr,$start,$end) = @_; + deprecate('Use fetch_by_region() instead'); + + #assume that by chromosome the user actually meant top-level coord + #system since this is the old behaviour of this deprecated method + my $csa = $self->db->get_CoordSystemAdaptor(); + my ($cs) = @{$csa->fetch_all()}; # get the highest coord system + + return $self->fetch_by_region($cs->name,$chr,$start,$end,1,$cs->version); +} + + + +=head2 fetch_by_contig_name + + Description: Deprecated. Use fetch_by_region(), Slice::project(), + Slice::expand() instead + +=cut + +sub fetch_by_contig_name { + my ($self, $name, $size) = @_; + + deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); + + #previously wanted chromosomal slice on a given contig. Assume this means + #a top-level slice on a given seq_region in the seq_level coord system + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $seq_level = $csa->fetch_sequence_level(); + + my $seq_lvl_slice = $self->fetch_by_region($seq_level->name(), $name); + + if(!$seq_lvl_slice) { + return undef; + } + + my @projection = @{$seq_lvl_slice->project('toplevel')}; + + if(@projection != 1) { + warning("$name is mapped to multiple toplevel locations."); + } + + return $projection[0]->[2]->expand($size, $size); +} + + +=head2 fetch_by_clone_accession + + Description: DEPRECATED. Use fetch_by_region, Slice::project, Slice::expand + instead. + +=cut + +sub fetch_by_clone_accession{ + my ($self,$name,$size) = @_; + + deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $clone_cs = $csa->fetch_by_name('clone'); + + if(!$clone_cs) { + warning('Clone coordinate system does not exist for this species'); + return undef; + } + + #this unfortunately needs a version on the end to work + if(! ($name =~ /\./)) { + my $sth = + $self->prepare( "SELECT sr.name " + . "FROM seq_region sr, coord_system cs " + . "WHERE cs.name = 'clone' " + . "AND cs.coord_system_id = sr.coord_system_id " + . "AND sr.name LIKE '$name.%'" + . "AND cs.species_id = ?" ); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + $sth->execute(); + + if(!$sth->rows()) { + $sth->finish(); + throw("Clone $name not found in database"); + } + + ($name) = $sth->fetchrow_array(); + + $sth->finish(); + } + + my $clone = $self->fetch_by_region($clone_cs->name(), $name); + return undef if(!$clone); + + my @projection = @{$clone->project('toplevel')}; + + if(@projection != 1) { + warning("$name is mapped to multiple locations."); + } + + return $projection[0]->[2]->expand($size, $size); +} + + +=head2 fetch_by_supercontig_name + + Description: DEPRECATED. Use fetch_by_region(), Slice::project() and + Slice::expand() instead + +=cut + +sub fetch_by_supercontig_name { + my ($self,$name, $size) = @_; + + deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $sc_level = $csa->fetch_by_name('supercontig'); + + if(!$sc_level) { + warning('No supercontig coordinate system exists for this species.'); + return undef; + } + + my $sc_slice = $self->fetch_by_region($sc_level->name(),$name); + + return undef if(!$sc_slice); + + my @projection = @{$sc_slice->project('toplevel')}; + + if(@projection > 1) { + warning("$name is mapped to multiple locations in toplevel"); + } + + return $projection[0]->[2]->expand($size, $size); +} + + + + +=head2 list_overlapping_supercontigs + + Description: DEPRECATED use Slice::project instead + +=cut + +sub list_overlapping_supercontigs { + my ($self,$slice) = @_; + + deprecate('Use Slice::project() instead.'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + my $sc_level = $csa->fetch_by_name('supercontig'); + + if(!$sc_level) { + warning('No supercontig coordinate system exists for this species.'); + return undef; + } + + my @out; + foreach my $seg (@{$slice->project($sc_level->name(), $sc_level->version)}){ + push @out, $seg->[2]->seq_region_name(); + } + + return \@out; +} + + + +=head2 fetch_by_chr_name + + Description: DEPRECATED. Use fetch by region instead + +=cut + +sub fetch_by_chr_name{ + my ($self,$chr_name) = @_; + deprecate('Use fetch_by_region() instead.'); + + my $csa = $self->db->get_CoordSystemAdaptor(); + + my $top_cs = @{$csa->fetch_all()}; + + return $self->fetch_by_region($top_cs->name(),$chr_name, + undef,undef,undef,$top_cs->version); +} + + + + + + +1;