comparison variant_effect_predictor/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations
24
25 =head1 SYNOPSIS
26
27 my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' );
28
29 my $dna =
30 ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) };
31
32 =head1 DESCRIPTION
33
34 An adaptor for the retrieval of DNA sequence from the EnsEMBL database
35
36 =head1 METHODS
37
38 =cut
39
40 package Bio::EnsEMBL::DBSQL::SequenceAdaptor;
41
42 use vars qw(@ISA @EXPORT);
43 use strict;
44 use warnings;
45
46 use Bio::EnsEMBL::DBSQL::BaseAdaptor;
47 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate);
48 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
49 use Bio::EnsEMBL::Utils::Cache;
50 use Bio::EnsEMBL::Utils::Scalar qw( assert_ref );
51
52 @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
53
54 our $SEQ_CHUNK_PWR = 18; # 2^18 = approx. 250KB
55 our $SEQ_CACHE_SZ = 5;
56 our $SEQ_CACHE_MAX = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ;
57
58 @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}});
59
60 =head2 new
61
62 Arg [1] : none
63 Example : my $sa = $db_adaptor->get_SequenceAdaptor();
64 Description: Constructor. Calls superclass constructor and initialises
65 internal cache structure.
66 Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor
67 Exceptions : none
68 Caller : DBAdaptor::get_SequenceAdaptor
69 Status : Stable
70
71 =cut
72
73 sub new {
74 my $caller = shift;
75
76 my $class = ref($caller) || $caller;
77
78 my $self = $class->SUPER::new(@_);
79
80 # use an LRU cache to limit the size
81 my %seq_cache;
82 tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ);
83
84 $self->{'seq_cache'} = \%seq_cache;
85
86
87 #
88 # See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these
89 # in a hash.
90 #
91
92 my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code like "_rna_edit"');
93
94 $sth->execute();
95 my ($seq_region_id, $value);
96 $sth->bind_columns(\$seq_region_id, \$value);
97 my %edits;
98 my $count = 0;
99 while($sth->fetch()){
100 $count++;
101 push @{$edits{$seq_region_id}}, $value;
102 }
103 $sth->finish;
104 if($count){
105 $self->{_rna_edits_cache} = \%edits;
106 }
107
108 return $self;
109 }
110
111 =head2 clear_cache
112
113 Example : $sa->clear_cache();
114 Description : Removes all entries from the associcated sequence cache
115 Returntype : None
116 Exceptions : None
117
118 =cut
119
120 sub clear_cache {
121 my ($self) = @_;
122 %{$self->{seq_cache}} = ();
123 return;
124 }
125
126
127 =head2 fetch_by_Slice_start_end_strand
128
129 Arg [1] : Bio::EnsEMBL::Slice slice
130 The slice from which you want the sequence
131 Arg [2] : (optional) int startBasePair
132 The start base pair relative to the start of the slice. Negative
133 values or values greater than the length of the slice are fine.
134 default = 1
135 Arg [3] : (optional) int endBasePair
136 The end base pair relative to the start of the slice. Negative
137 values or values greater than the length of the slice are fine,
138 but the end must be greater than or equal to the start
139 count from 1
140 default = the length of the slice
141 Arg [4] : (optional) int strand
142 1, -1
143 default = 1
144 Example : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1,
145 1000, -1);
146 Description: retrieves from db the sequence for this slice
147 uses AssemblyMapper to find the assembly
148 Returntype : string
149 Exceptions : endBasePair should be less or equal to length of slice
150 Caller : Bio::EnsEMBL::Slice::seq(), Slice::subseq()
151 Status : Stable
152
153 =cut
154
155 sub fetch_by_Slice_start_end_strand {
156 my ( $self, $slice, $start, $end, $strand ) = @_;
157
158 if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) {
159 throw("Slice argument is required.");
160 }
161
162 $start = 1 if(!defined($start));
163
164
165 if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) {
166
167 if ( !defined($end) || ($start > $end ) ) {
168 return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand );
169 }
170
171 if ( defined($end) && ($end < 0) ) {
172 $end += $slice->seq_region_length;
173 }
174
175 if ($start < 0) {
176 $start += $slice->seq_region_length;
177 }
178
179 if($slice->start> $slice->end) {
180 return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand );
181 }
182 }
183
184 if ( ( !defined($end) ) && (not $slice->is_circular) ) {
185 $end = $slice->end() - $slice->start() + 1;
186 }
187
188 if ( $start > $end ) {
189 throw("Start must be less than or equal to end.");
190 }
191
192 $strand ||= 1;
193
194 #get a new slice that spans the exact region to retrieve dna from
195 my $right_expand = $end - $slice->length(); #negative is fine
196 my $left_expand = 1 - $start; #negative is fine
197
198 if($right_expand || $left_expand) {
199 $slice = $slice->expand($left_expand, $right_expand);
200 }
201
202 #retrieve normalized 'non-symlinked' slices
203 #this allows us to support haplotypes and PARs
204 my $slice_adaptor = $slice->adaptor();
205 my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)};
206
207 if(@symproj == 0) {
208 throw('Could not retrieve normalized Slices. Database contains ' .
209 'incorrect assembly_exception information.');
210 }
211
212 #call this method again with any slices that were 'symlinked' to by this
213 #slice
214 if(@symproj != 1 || $symproj[0]->[2] != $slice) {
215 my $seq;
216 foreach my $segment (@symproj) {
217 my $symlink_slice = $segment->[2];
218 #get sequence from each symlinked area
219 $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice,
220 1,undef,1)};
221 }
222 if($strand == -1) {
223 reverse_comp(\$seq);
224 }
225 return \$seq;
226 }
227
228 # we need to project this slice onto the sequence coordinate system
229 # even if the slice is in the same coord system, we want to trim out
230 # flanking gaps (if the slice is past the edges of the seqregion)
231 my $csa = $self->db->get_CoordSystemAdaptor();
232 my $seqlevel = $csa->fetch_sequence_level();
233
234 my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())};
235
236 my $seq = '';
237 my $total = 0;
238 my $tmp_seq;
239
240 #fetch sequence from each of the sequence regions projected onto
241 foreach my $segment (@projection) {
242 my ($start, $end, $seq_slice) = @$segment;
243
244 #check for gaps between segments and pad them with Ns
245 my $gap = $start - $total - 1;
246 if($gap) {
247 $seq .= 'N' x $gap;
248 }
249
250 my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
251
252 $tmp_seq = ${$self->_fetch_seq($seq_region_id,
253 $seq_slice->start, $seq_slice->length())};
254
255 #reverse compliment on negatively oriented slices
256 if($seq_slice->strand == -1) {
257 reverse_comp(\$tmp_seq);
258 }
259
260 $seq .= $tmp_seq;
261
262 $total = $end;
263 }
264
265 #check for any remaining gaps at the end
266 my $gap = $slice->length - $total;
267 if($gap) {
268 $seq .= 'N' x $gap;
269 }
270
271 #if the sequence is too short it is because we came in with a seqlevel
272 #slice that was partially off of the seq_region. Pad the end with Ns
273 #to make long enough
274 if(length($seq) != $slice->length()) {
275 $seq .= 'N' x ($slice->length() - length($seq));
276 }
277
278 if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){
279 $self->_rna_edit($slice,\$seq);
280 }
281
282 #if they asked for the negative slice strand revcomp the whole thing
283 reverse_comp(\$seq) if($strand == -1);
284
285 return \$seq;
286 }
287
288
289 sub _fetch_by_Slice_start_end_strand_circular {
290 my ( $self, $slice, $start, $end, $strand ) = @_;
291
292 assert_ref( $slice, 'Bio::EnsEMBL::Slice' );
293
294 $strand ||= 1;
295 if ( !defined($start) ) {
296 $start ||= 1;
297 }
298
299 if ( !defined($end) ) {
300 $end = $slice->end() - $slice->start() + 1;
301 }
302
303 if ( $start > $end && $slice->is_circular() ) {
304 my ($seq, $seq1, $seq2);
305
306 my $midpoint = $slice->seq_region_length - $slice->start + 1;
307 $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1, $midpoint, 1 )};
308 $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )};
309
310 $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1";
311
312 reverse_comp( \$seq ) if ( $strand == -1 );
313
314 return \$seq;
315 }
316
317
318
319 # Get a new slice that spans the exact region to retrieve dna from
320 my $right_expand = $end - $slice->length(); #negative is fine
321 my $left_expand = 1 - $start; #negative is fine
322
323 if ( $right_expand || $left_expand ) {
324 $slice =
325 $slice->strand > 0
326 ? $slice->expand( $left_expand, $right_expand )
327 : $slice->expand( $right_expand, $left_expand );
328 }
329
330 # Retrieve normalized 'non-symlinked' slices. This allows us to
331 # support haplotypes and PARs.
332 my $slice_adaptor = $slice->adaptor();
333 my @symproj =
334 @{ $slice_adaptor->fetch_normalized_slice_projection($slice) };
335
336 if ( @symproj == 0 ) {
337 throw( 'Could not retrieve normalized Slices. Database contains '
338 . 'incorrect assembly_exception information.' );
339 }
340
341 # Call this method again with any slices that were 'symlinked' to by
342 # this slice.
343 if ( @symproj != 1 || $symproj[0]->[2] != $slice ) {
344 my $seq;
345 foreach my $segment (@symproj) {
346 my $symlink_slice = $segment->[2];
347
348 # Get sequence from each symlinked area.
349 $seq .= ${
350 $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1,
351 undef, 1 ) };
352 }
353 if ( $strand == -1 ) {
354 reverse_comp( \$seq );
355 }
356
357 return \$seq;
358 }
359
360 # We need to project this slice onto the sequence coordinate system
361 # even if the slice is in the same coord system, we want to trim out
362 # flanking gaps (if the slice is past the edges of the seqregion).
363 my $csa = $self->db->get_CoordSystemAdaptor();
364 my $seqlevel = $csa->fetch_sequence_level();
365
366 my @projection =
367 @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) };
368
369 my $seq = '';
370 my $total = 0;
371 my $tmp_seq;
372
373 # Fetch sequence from each of the sequence regions projected onto.
374 foreach my $segment (@projection) {
375 my ( $start, $end, $seq_slice ) = @{$segment};
376
377 # Check for gaps between segments and pad them with Ns
378 my $gap = $start - $total - 1;
379 if ($gap) {
380 $seq .= 'N' x $gap;
381 }
382
383 my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice);
384
385 $tmp_seq = ${
386 $self->_fetch_seq( $seq_region_id, $seq_slice->start(),
387 $seq_slice->length() ) };
388
389 # Reverse compliment on negatively oriented slices.
390 if ( $seq_slice->strand == -1 ) {
391 reverse_comp( \$tmp_seq );
392 }
393
394 $seq .= $tmp_seq;
395
396 $total = $end;
397 }
398
399 # Check for any remaining gaps at the end.
400 my $gap = $slice->length() - $total;
401
402 if ($gap) {
403 $seq .= 'N' x $gap;
404 }
405
406 # If the sequence is too short it is because we came in with a
407 # seqlevel slice that was partially off of the seq_region. Pad the
408 # end with Ns to make long enough
409 if ( length($seq) != $slice->length() ) {
410 $seq .= 'N' x ( $slice->length() - length($seq) );
411 }
412
413 if ( defined( $self->{_rna_edits_cache} )
414 && defined(
415 $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) )
416 {
417 $self->_rna_edit( $slice, \$seq );
418 }
419
420 return \$seq;
421 } ## end sub _fetch_by_Slice_start_end_strand_circular
422
423
424
425
426
427 sub _rna_edit {
428 my $self = shift;
429 my $slice = shift;
430 my $seq = shift; #reference to string
431
432 my $s_start = $slice->start; #substr start at 0 , but seq starts at 1 (so no -1 here)
433 my $s_end = $s_start+length($$seq);
434
435 foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){
436 my ($start, $end, $txt) = split (/\s+/, $edit);
437 # check that RNA edit is not outside the requested region : happens quite often with LRG regions
438 next if ($end < $s_start);
439 next if ($s_end < $start);
440 substr($$seq,$start-$s_start, ($end-$start)+1, $txt);
441 }
442 return;
443 }
444
445
446 sub _fetch_seq {
447 my $self = shift;
448 my $seq_region_id = shift;
449 my $start = shift;
450 my $length = shift;
451
452 my $cache = $self->{'seq_cache'};
453
454 if($length < $SEQ_CACHE_MAX) {
455 my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR;
456 my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR;
457
458 # piece together sequence from cached component parts
459
460 my $entire_seq = undef;
461 for(my $i = $chunk_min; $i <= $chunk_max; $i++) {
462 if($cache->{"$seq_region_id:$i"}) {
463 $entire_seq .= $cache->{"$seq_region_id:$i"};
464 } else {
465 # retrieve uncached portions of the sequence
466
467 my $sth =
468 $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) "
469 . "FROM dna d "
470 . "WHERE d.seq_region_id = ?" );
471
472 my $tmp_seq;
473
474 my $min = ($i << $SEQ_CHUNK_PWR) + 1;
475
476 $sth->bind_param( 1, $min, SQL_INTEGER );
477 $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER );
478 $sth->bind_param( 3, $seq_region_id, SQL_INTEGER );
479
480 $sth->execute();
481 $sth->bind_columns(\$tmp_seq);
482 $sth->fetch();
483 $sth->finish();
484
485 # always give back uppercased sequence so it can be properly softmasked
486 $entire_seq .= uc($tmp_seq);
487 $cache->{"$seq_region_id:$i"} = uc($tmp_seq);
488 }
489 }
490
491 # return only the requested portion of the entire sequence
492 my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1;
493 # my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR;
494 my $seq = substr( $entire_seq, $start - $min, $length );
495
496 return \$seq;
497 } else {
498
499 # do not do any caching for requests of very large sequences
500 my $sth =
501 $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) "
502 . "FROM dna d "
503 . "WHERE d.seq_region_id = ?" );
504
505 my $tmp_seq;
506
507 $sth->bind_param( 1, $start, SQL_INTEGER );
508 $sth->bind_param( 2, $length, SQL_INTEGER );
509 $sth->bind_param( 3, $seq_region_id, SQL_INTEGER );
510
511 $sth->execute();
512 $sth->bind_columns(\$tmp_seq);
513 $sth->fetch();
514 $sth->finish();
515
516 # always give back uppercased sequence so it can be properly softmasked
517 $tmp_seq = uc($tmp_seq);
518
519 return \$tmp_seq;
520 }
521 }
522
523
524 =head2 store
525
526 Arg [1] : int $seq_region_id the id of the sequence region this dna
527 will be associated with.
528 Arg [2] : string $sequence the dna sequence to be stored
529 in the database. Note that the sequence passed in will be
530 converted to uppercase.
531 Example : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA');
532 Description: stores a dna sequence in the databases dna table and returns the
533 database identifier for the new record.
534 Returntype : none
535 Exceptions : throw if the database insert fails
536 Caller : sequence loading scripts
537 Status : Stable
538
539 =cut
540
541 sub store {
542 my ($self, $seq_region_id, $sequence) = @_;
543
544 if(!$seq_region_id) {
545 throw('seq_region_id is required');
546 }
547
548 $sequence = uc($sequence);
549
550 my $statement =
551 $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)");
552
553 $statement->bind_param(1,$seq_region_id,SQL_INTEGER);
554 $statement->bind_param(2,$sequence,SQL_LONGVARCHAR);
555 $statement->execute();
556
557 $statement->finish();
558
559 return;
560 }
561
562
563
564
565 =head2 fetch_by_assembly_location
566
567 Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead.
568
569 =cut
570
571 sub fetch_by_assembly_location {
572 my ( $self, $chrStart, $chrEnd,
573 $strand, $chrName, $assemblyType ) = @_;
574
575 deprecate('Use fetch_by_Slice_start_end_strand() instead');
576
577 my $csa = $self->db->get_CoordSystem();
578 my $top_cs = @{$csa->fetch_all};
579
580 my $slice_adaptor = $self->db->get_SliceAdaptor();
581 my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName,
582 $chrStart, $chrEnd,
583 $strand, $top_cs->version);
584
585 return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1);
586 }
587
588
589 =head2 fetch_by_RawContig_start_end_strand
590
591 Description: DEPRECATED use fetch_by_Slice_start_end_strand instead
592
593 =cut
594
595 sub fetch_by_RawContig_start_end_strand {
596 deprecate('Use fetch_by_Slice_start_end_strand instead.');
597 fetch_by_Slice_start_end_strand(@_);
598 }
599
600
601
602
603 1;