Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,603 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations + +=head1 SYNOPSIS + + my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' ); + + my $dna = + ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) }; + +=head1 DESCRIPTION + +An adaptor for the retrieval of DNA sequence from the EnsEMBL database + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::SequenceAdaptor; + +use vars qw(@ISA @EXPORT); +use strict; +use warnings; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Utils::Cache; +use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + +our $SEQ_CHUNK_PWR = 18; # 2^18 = approx. 250KB +our $SEQ_CACHE_SZ = 5; +our $SEQ_CACHE_MAX = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ; + +@EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); + +=head2 new + + Arg [1] : none + Example : my $sa = $db_adaptor->get_SequenceAdaptor(); + Description: Constructor. Calls superclass constructor and initialises + internal cache structure. + Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor + Exceptions : none + Caller : DBAdaptor::get_SequenceAdaptor + Status : Stable + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + # use an LRU cache to limit the size + my %seq_cache; + tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ); + + $self->{'seq_cache'} = \%seq_cache; + + +# +# See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these +# in a hash. +# + + my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code like "_rna_edit"'); + + $sth->execute(); + my ($seq_region_id, $value); + $sth->bind_columns(\$seq_region_id, \$value); + my %edits; + my $count = 0; + while($sth->fetch()){ + $count++; + push @{$edits{$seq_region_id}}, $value; + } + $sth->finish; + if($count){ + $self->{_rna_edits_cache} = \%edits; + } + + return $self; +} + +=head2 clear_cache + + Example : $sa->clear_cache(); + Description : Removes all entries from the associcated sequence cache + Returntype : None + Exceptions : None + +=cut + +sub clear_cache { + my ($self) = @_; + %{$self->{seq_cache}} = (); + return; +} + + +=head2 fetch_by_Slice_start_end_strand + + Arg [1] : Bio::EnsEMBL::Slice slice + The slice from which you want the sequence + Arg [2] : (optional) int startBasePair + The start base pair relative to the start of the slice. Negative + values or values greater than the length of the slice are fine. + default = 1 + Arg [3] : (optional) int endBasePair + The end base pair relative to the start of the slice. Negative + values or values greater than the length of the slice are fine, + but the end must be greater than or equal to the start + count from 1 + default = the length of the slice + Arg [4] : (optional) int strand + 1, -1 + default = 1 + Example : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, + 1000, -1); + Description: retrieves from db the sequence for this slice + uses AssemblyMapper to find the assembly + Returntype : string + Exceptions : endBasePair should be less or equal to length of slice + Caller : Bio::EnsEMBL::Slice::seq(), Slice::subseq() + Status : Stable + +=cut + +sub fetch_by_Slice_start_end_strand { + my ( $self, $slice, $start, $end, $strand ) = @_; + + if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { + throw("Slice argument is required."); + } + + $start = 1 if(!defined($start)); + + + if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) { + + if ( !defined($end) || ($start > $end ) ) { + return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand ); + } + + if ( defined($end) && ($end < 0) ) { + $end += $slice->seq_region_length; + } + + if ($start < 0) { + $start += $slice->seq_region_length; + } + + if($slice->start> $slice->end) { + return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand ); + } + } + + if ( ( !defined($end) ) && (not $slice->is_circular) ) { + $end = $slice->end() - $slice->start() + 1; + } + + if ( $start > $end ) { + throw("Start must be less than or equal to end."); + } + + $strand ||= 1; + + #get a new slice that spans the exact region to retrieve dna from + my $right_expand = $end - $slice->length(); #negative is fine + my $left_expand = 1 - $start; #negative is fine + + if($right_expand || $left_expand) { + $slice = $slice->expand($left_expand, $right_expand); + } + + #retrieve normalized 'non-symlinked' slices + #this allows us to support haplotypes and PARs + my $slice_adaptor = $slice->adaptor(); + my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)}; + + if(@symproj == 0) { + throw('Could not retrieve normalized Slices. Database contains ' . + 'incorrect assembly_exception information.'); + } + + #call this method again with any slices that were 'symlinked' to by this + #slice + if(@symproj != 1 || $symproj[0]->[2] != $slice) { + my $seq; + foreach my $segment (@symproj) { + my $symlink_slice = $segment->[2]; + #get sequence from each symlinked area + $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice, + 1,undef,1)}; + } + if($strand == -1) { + reverse_comp(\$seq); + } + return \$seq; + } + + # we need to project this slice onto the sequence coordinate system + # even if the slice is in the same coord system, we want to trim out + # flanking gaps (if the slice is past the edges of the seqregion) + my $csa = $self->db->get_CoordSystemAdaptor(); + my $seqlevel = $csa->fetch_sequence_level(); + + my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())}; + + my $seq = ''; + my $total = 0; + my $tmp_seq; + + #fetch sequence from each of the sequence regions projected onto + foreach my $segment (@projection) { + my ($start, $end, $seq_slice) = @$segment; + + #check for gaps between segments and pad them with Ns + my $gap = $start - $total - 1; + if($gap) { + $seq .= 'N' x $gap; + } + + my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice); + + $tmp_seq = ${$self->_fetch_seq($seq_region_id, + $seq_slice->start, $seq_slice->length())}; + + #reverse compliment on negatively oriented slices + if($seq_slice->strand == -1) { + reverse_comp(\$tmp_seq); + } + + $seq .= $tmp_seq; + + $total = $end; + } + + #check for any remaining gaps at the end + my $gap = $slice->length - $total; + if($gap) { + $seq .= 'N' x $gap; + } + + #if the sequence is too short it is because we came in with a seqlevel + #slice that was partially off of the seq_region. Pad the end with Ns + #to make long enough + if(length($seq) != $slice->length()) { + $seq .= 'N' x ($slice->length() - length($seq)); + } + + if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){ + $self->_rna_edit($slice,\$seq); + } + + #if they asked for the negative slice strand revcomp the whole thing + reverse_comp(\$seq) if($strand == -1); + + return \$seq; +} + + +sub _fetch_by_Slice_start_end_strand_circular { + my ( $self, $slice, $start, $end, $strand ) = @_; + + assert_ref( $slice, 'Bio::EnsEMBL::Slice' ); + + $strand ||= 1; + if ( !defined($start) ) { + $start ||= 1; + } + + if ( !defined($end) ) { + $end = $slice->end() - $slice->start() + 1; + } + + if ( $start > $end && $slice->is_circular() ) { + my ($seq, $seq1, $seq2); + + my $midpoint = $slice->seq_region_length - $slice->start + 1; + $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1, $midpoint, 1 )}; + $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )}; + + $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1"; + + reverse_comp( \$seq ) if ( $strand == -1 ); + + return \$seq; + } + + + + # Get a new slice that spans the exact region to retrieve dna from + my $right_expand = $end - $slice->length(); #negative is fine + my $left_expand = 1 - $start; #negative is fine + + if ( $right_expand || $left_expand ) { + $slice = + $slice->strand > 0 + ? $slice->expand( $left_expand, $right_expand ) + : $slice->expand( $right_expand, $left_expand ); + } + + # Retrieve normalized 'non-symlinked' slices. This allows us to + # support haplotypes and PARs. + my $slice_adaptor = $slice->adaptor(); + my @symproj = + @{ $slice_adaptor->fetch_normalized_slice_projection($slice) }; + + if ( @symproj == 0 ) { + throw( 'Could not retrieve normalized Slices. Database contains ' + . 'incorrect assembly_exception information.' ); + } + + # Call this method again with any slices that were 'symlinked' to by + # this slice. + if ( @symproj != 1 || $symproj[0]->[2] != $slice ) { + my $seq; + foreach my $segment (@symproj) { + my $symlink_slice = $segment->[2]; + + # Get sequence from each symlinked area. + $seq .= ${ + $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1, + undef, 1 ) }; + } + if ( $strand == -1 ) { + reverse_comp( \$seq ); + } + + return \$seq; + } + + # We need to project this slice onto the sequence coordinate system + # even if the slice is in the same coord system, we want to trim out + # flanking gaps (if the slice is past the edges of the seqregion). + my $csa = $self->db->get_CoordSystemAdaptor(); + my $seqlevel = $csa->fetch_sequence_level(); + + my @projection = + @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) }; + + my $seq = ''; + my $total = 0; + my $tmp_seq; + + # Fetch sequence from each of the sequence regions projected onto. + foreach my $segment (@projection) { + my ( $start, $end, $seq_slice ) = @{$segment}; + + # Check for gaps between segments and pad them with Ns + my $gap = $start - $total - 1; + if ($gap) { + $seq .= 'N' x $gap; + } + + my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice); + + $tmp_seq = ${ + $self->_fetch_seq( $seq_region_id, $seq_slice->start(), + $seq_slice->length() ) }; + + # Reverse compliment on negatively oriented slices. + if ( $seq_slice->strand == -1 ) { + reverse_comp( \$tmp_seq ); + } + + $seq .= $tmp_seq; + + $total = $end; + } + + # Check for any remaining gaps at the end. + my $gap = $slice->length() - $total; + + if ($gap) { + $seq .= 'N' x $gap; + } + + # If the sequence is too short it is because we came in with a + # seqlevel slice that was partially off of the seq_region. Pad the + # end with Ns to make long enough + if ( length($seq) != $slice->length() ) { + $seq .= 'N' x ( $slice->length() - length($seq) ); + } + + if ( defined( $self->{_rna_edits_cache} ) + && defined( + $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) ) + { + $self->_rna_edit( $slice, \$seq ); + } + + return \$seq; +} ## end sub _fetch_by_Slice_start_end_strand_circular + + + + + +sub _rna_edit { + my $self = shift; + my $slice = shift; + my $seq = shift; #reference to string + + my $s_start = $slice->start; #substr start at 0 , but seq starts at 1 (so no -1 here) + my $s_end = $s_start+length($$seq); + + foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){ + my ($start, $end, $txt) = split (/\s+/, $edit); +# check that RNA edit is not outside the requested region : happens quite often with LRG regions + next if ($end < $s_start); + next if ($s_end < $start); + substr($$seq,$start-$s_start, ($end-$start)+1, $txt); + } + return; +} + + +sub _fetch_seq { + my $self = shift; + my $seq_region_id = shift; + my $start = shift; + my $length = shift; + + my $cache = $self->{'seq_cache'}; + + if($length < $SEQ_CACHE_MAX) { + my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR; + my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR; + + # piece together sequence from cached component parts + + my $entire_seq = undef; + for(my $i = $chunk_min; $i <= $chunk_max; $i++) { + if($cache->{"$seq_region_id:$i"}) { + $entire_seq .= $cache->{"$seq_region_id:$i"}; + } else { + # retrieve uncached portions of the sequence + + my $sth = + $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) " + . "FROM dna d " + . "WHERE d.seq_region_id = ?" ); + + my $tmp_seq; + + my $min = ($i << $SEQ_CHUNK_PWR) + 1; + + $sth->bind_param( 1, $min, SQL_INTEGER ); + $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER ); + $sth->bind_param( 3, $seq_region_id, SQL_INTEGER ); + + $sth->execute(); + $sth->bind_columns(\$tmp_seq); + $sth->fetch(); + $sth->finish(); + + # always give back uppercased sequence so it can be properly softmasked + $entire_seq .= uc($tmp_seq); + $cache->{"$seq_region_id:$i"} = uc($tmp_seq); + } + } + + # return only the requested portion of the entire sequence + my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1; + # my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR; + my $seq = substr( $entire_seq, $start - $min, $length ); + + return \$seq; + } else { + + # do not do any caching for requests of very large sequences + my $sth = + $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) " + . "FROM dna d " + . "WHERE d.seq_region_id = ?" ); + + my $tmp_seq; + + $sth->bind_param( 1, $start, SQL_INTEGER ); + $sth->bind_param( 2, $length, SQL_INTEGER ); + $sth->bind_param( 3, $seq_region_id, SQL_INTEGER ); + + $sth->execute(); + $sth->bind_columns(\$tmp_seq); + $sth->fetch(); + $sth->finish(); + + # always give back uppercased sequence so it can be properly softmasked + $tmp_seq = uc($tmp_seq); + + return \$tmp_seq; + } +} + + +=head2 store + + Arg [1] : int $seq_region_id the id of the sequence region this dna + will be associated with. + Arg [2] : string $sequence the dna sequence to be stored + in the database. Note that the sequence passed in will be + converted to uppercase. + Example : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA'); + Description: stores a dna sequence in the databases dna table and returns the + database identifier for the new record. + Returntype : none + Exceptions : throw if the database insert fails + Caller : sequence loading scripts + Status : Stable + +=cut + +sub store { + my ($self, $seq_region_id, $sequence) = @_; + + if(!$seq_region_id) { + throw('seq_region_id is required'); + } + + $sequence = uc($sequence); + + my $statement = + $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)"); + + $statement->bind_param(1,$seq_region_id,SQL_INTEGER); + $statement->bind_param(2,$sequence,SQL_LONGVARCHAR); + $statement->execute(); + + $statement->finish(); + + return; +} + + + + +=head2 fetch_by_assembly_location + + Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead. + +=cut + +sub fetch_by_assembly_location { + my ( $self, $chrStart, $chrEnd, + $strand, $chrName, $assemblyType ) = @_; + + deprecate('Use fetch_by_Slice_start_end_strand() instead'); + + my $csa = $self->db->get_CoordSystem(); + my $top_cs = @{$csa->fetch_all}; + + my $slice_adaptor = $self->db->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName, + $chrStart, $chrEnd, + $strand, $top_cs->version); + + return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1); +} + + +=head2 fetch_by_RawContig_start_end_strand + + Description: DEPRECATED use fetch_by_Slice_start_end_strand instead + +=cut + +sub fetch_by_RawContig_start_end_strand { + deprecate('Use fetch_by_Slice_start_end_strand instead.'); + fetch_by_Slice_start_end_strand(@_); +} + + + + +1;