Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/DBSQL/SequenceAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::DBSQL::SequenceAdaptor - produce sequence strings from locations | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 my $sa = $registry->get_adaptor( 'Human', 'Core', 'Sequence' ); | |
28 | |
29 my $dna = | |
30 ${ $sa->fetch_by_Slice_start_end_strand( $slice, 1, 1000, -1 ) }; | |
31 | |
32 =head1 DESCRIPTION | |
33 | |
34 An adaptor for the retrieval of DNA sequence from the EnsEMBL database | |
35 | |
36 =head1 METHODS | |
37 | |
38 =cut | |
39 | |
40 package Bio::EnsEMBL::DBSQL::SequenceAdaptor; | |
41 | |
42 use vars qw(@ISA @EXPORT); | |
43 use strict; | |
44 use warnings; | |
45 | |
46 use Bio::EnsEMBL::DBSQL::BaseAdaptor; | |
47 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate); | |
48 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
49 use Bio::EnsEMBL::Utils::Cache; | |
50 use Bio::EnsEMBL::Utils::Scalar qw( assert_ref ); | |
51 | |
52 @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); | |
53 | |
54 our $SEQ_CHUNK_PWR = 18; # 2^18 = approx. 250KB | |
55 our $SEQ_CACHE_SZ = 5; | |
56 our $SEQ_CACHE_MAX = (2 ** $SEQ_CHUNK_PWR) * $SEQ_CACHE_SZ; | |
57 | |
58 @EXPORT = (@{$DBI::EXPORT_TAGS{'sql_types'}}); | |
59 | |
60 =head2 new | |
61 | |
62 Arg [1] : none | |
63 Example : my $sa = $db_adaptor->get_SequenceAdaptor(); | |
64 Description: Constructor. Calls superclass constructor and initialises | |
65 internal cache structure. | |
66 Returntype : Bio::EnsEMBL::DBSQL::SequenceAdaptor | |
67 Exceptions : none | |
68 Caller : DBAdaptor::get_SequenceAdaptor | |
69 Status : Stable | |
70 | |
71 =cut | |
72 | |
73 sub new { | |
74 my $caller = shift; | |
75 | |
76 my $class = ref($caller) || $caller; | |
77 | |
78 my $self = $class->SUPER::new(@_); | |
79 | |
80 # use an LRU cache to limit the size | |
81 my %seq_cache; | |
82 tie(%seq_cache, 'Bio::EnsEMBL::Utils::Cache', $SEQ_CACHE_SZ); | |
83 | |
84 $self->{'seq_cache'} = \%seq_cache; | |
85 | |
86 | |
87 # | |
88 # See if this has any seq_region_attrib of type "_rna_edit_cache" if so store these | |
89 # in a hash. | |
90 # | |
91 | |
92 my $sth = $self->dbc->prepare('select sra.seq_region_id, sra.value from seq_region_attrib sra, attrib_type at where sra.attrib_type_id = at.attrib_type_id and code like "_rna_edit"'); | |
93 | |
94 $sth->execute(); | |
95 my ($seq_region_id, $value); | |
96 $sth->bind_columns(\$seq_region_id, \$value); | |
97 my %edits; | |
98 my $count = 0; | |
99 while($sth->fetch()){ | |
100 $count++; | |
101 push @{$edits{$seq_region_id}}, $value; | |
102 } | |
103 $sth->finish; | |
104 if($count){ | |
105 $self->{_rna_edits_cache} = \%edits; | |
106 } | |
107 | |
108 return $self; | |
109 } | |
110 | |
111 =head2 clear_cache | |
112 | |
113 Example : $sa->clear_cache(); | |
114 Description : Removes all entries from the associcated sequence cache | |
115 Returntype : None | |
116 Exceptions : None | |
117 | |
118 =cut | |
119 | |
120 sub clear_cache { | |
121 my ($self) = @_; | |
122 %{$self->{seq_cache}} = (); | |
123 return; | |
124 } | |
125 | |
126 | |
127 =head2 fetch_by_Slice_start_end_strand | |
128 | |
129 Arg [1] : Bio::EnsEMBL::Slice slice | |
130 The slice from which you want the sequence | |
131 Arg [2] : (optional) int startBasePair | |
132 The start base pair relative to the start of the slice. Negative | |
133 values or values greater than the length of the slice are fine. | |
134 default = 1 | |
135 Arg [3] : (optional) int endBasePair | |
136 The end base pair relative to the start of the slice. Negative | |
137 values or values greater than the length of the slice are fine, | |
138 but the end must be greater than or equal to the start | |
139 count from 1 | |
140 default = the length of the slice | |
141 Arg [4] : (optional) int strand | |
142 1, -1 | |
143 default = 1 | |
144 Example : $dna = $seq_adptr->fetch_by_Slice_start_end_strand($slice, 1, | |
145 1000, -1); | |
146 Description: retrieves from db the sequence for this slice | |
147 uses AssemblyMapper to find the assembly | |
148 Returntype : string | |
149 Exceptions : endBasePair should be less or equal to length of slice | |
150 Caller : Bio::EnsEMBL::Slice::seq(), Slice::subseq() | |
151 Status : Stable | |
152 | |
153 =cut | |
154 | |
155 sub fetch_by_Slice_start_end_strand { | |
156 my ( $self, $slice, $start, $end, $strand ) = @_; | |
157 | |
158 if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice')) ) { | |
159 throw("Slice argument is required."); | |
160 } | |
161 | |
162 $start = 1 if(!defined($start)); | |
163 | |
164 | |
165 if ( ( !defined($end) || $start > $end || $start < 0 || $end < 0 || $slice->start> $slice->end ) && $slice->is_circular ) { | |
166 | |
167 if ( !defined($end) || ($start > $end ) ) { | |
168 return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $start, $end, $strand ); | |
169 } | |
170 | |
171 if ( defined($end) && ($end < 0) ) { | |
172 $end += $slice->seq_region_length; | |
173 } | |
174 | |
175 if ($start < 0) { | |
176 $start += $slice->seq_region_length; | |
177 } | |
178 | |
179 if($slice->start> $slice->end) { | |
180 return $self->_fetch_by_Slice_start_end_strand_circular( $slice, $slice->start, $slice->end, $strand ); | |
181 } | |
182 } | |
183 | |
184 if ( ( !defined($end) ) && (not $slice->is_circular) ) { | |
185 $end = $slice->end() - $slice->start() + 1; | |
186 } | |
187 | |
188 if ( $start > $end ) { | |
189 throw("Start must be less than or equal to end."); | |
190 } | |
191 | |
192 $strand ||= 1; | |
193 | |
194 #get a new slice that spans the exact region to retrieve dna from | |
195 my $right_expand = $end - $slice->length(); #negative is fine | |
196 my $left_expand = 1 - $start; #negative is fine | |
197 | |
198 if($right_expand || $left_expand) { | |
199 $slice = $slice->expand($left_expand, $right_expand); | |
200 } | |
201 | |
202 #retrieve normalized 'non-symlinked' slices | |
203 #this allows us to support haplotypes and PARs | |
204 my $slice_adaptor = $slice->adaptor(); | |
205 my @symproj=@{$slice_adaptor->fetch_normalized_slice_projection($slice)}; | |
206 | |
207 if(@symproj == 0) { | |
208 throw('Could not retrieve normalized Slices. Database contains ' . | |
209 'incorrect assembly_exception information.'); | |
210 } | |
211 | |
212 #call this method again with any slices that were 'symlinked' to by this | |
213 #slice | |
214 if(@symproj != 1 || $symproj[0]->[2] != $slice) { | |
215 my $seq; | |
216 foreach my $segment (@symproj) { | |
217 my $symlink_slice = $segment->[2]; | |
218 #get sequence from each symlinked area | |
219 $seq .= ${$self->fetch_by_Slice_start_end_strand($symlink_slice, | |
220 1,undef,1)}; | |
221 } | |
222 if($strand == -1) { | |
223 reverse_comp(\$seq); | |
224 } | |
225 return \$seq; | |
226 } | |
227 | |
228 # we need to project this slice onto the sequence coordinate system | |
229 # even if the slice is in the same coord system, we want to trim out | |
230 # flanking gaps (if the slice is past the edges of the seqregion) | |
231 my $csa = $self->db->get_CoordSystemAdaptor(); | |
232 my $seqlevel = $csa->fetch_sequence_level(); | |
233 | |
234 my @projection=@{$slice->project($seqlevel->name(), $seqlevel->version())}; | |
235 | |
236 my $seq = ''; | |
237 my $total = 0; | |
238 my $tmp_seq; | |
239 | |
240 #fetch sequence from each of the sequence regions projected onto | |
241 foreach my $segment (@projection) { | |
242 my ($start, $end, $seq_slice) = @$segment; | |
243 | |
244 #check for gaps between segments and pad them with Ns | |
245 my $gap = $start - $total - 1; | |
246 if($gap) { | |
247 $seq .= 'N' x $gap; | |
248 } | |
249 | |
250 my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice); | |
251 | |
252 $tmp_seq = ${$self->_fetch_seq($seq_region_id, | |
253 $seq_slice->start, $seq_slice->length())}; | |
254 | |
255 #reverse compliment on negatively oriented slices | |
256 if($seq_slice->strand == -1) { | |
257 reverse_comp(\$tmp_seq); | |
258 } | |
259 | |
260 $seq .= $tmp_seq; | |
261 | |
262 $total = $end; | |
263 } | |
264 | |
265 #check for any remaining gaps at the end | |
266 my $gap = $slice->length - $total; | |
267 if($gap) { | |
268 $seq .= 'N' x $gap; | |
269 } | |
270 | |
271 #if the sequence is too short it is because we came in with a seqlevel | |
272 #slice that was partially off of the seq_region. Pad the end with Ns | |
273 #to make long enough | |
274 if(length($seq) != $slice->length()) { | |
275 $seq .= 'N' x ($slice->length() - length($seq)); | |
276 } | |
277 | |
278 if(defined($self->{_rna_edits_cache}) and defined($self->{_rna_edits_cache}->{$slice->get_seq_region_id})){ | |
279 $self->_rna_edit($slice,\$seq); | |
280 } | |
281 | |
282 #if they asked for the negative slice strand revcomp the whole thing | |
283 reverse_comp(\$seq) if($strand == -1); | |
284 | |
285 return \$seq; | |
286 } | |
287 | |
288 | |
289 sub _fetch_by_Slice_start_end_strand_circular { | |
290 my ( $self, $slice, $start, $end, $strand ) = @_; | |
291 | |
292 assert_ref( $slice, 'Bio::EnsEMBL::Slice' ); | |
293 | |
294 $strand ||= 1; | |
295 if ( !defined($start) ) { | |
296 $start ||= 1; | |
297 } | |
298 | |
299 if ( !defined($end) ) { | |
300 $end = $slice->end() - $slice->start() + 1; | |
301 } | |
302 | |
303 if ( $start > $end && $slice->is_circular() ) { | |
304 my ($seq, $seq1, $seq2); | |
305 | |
306 my $midpoint = $slice->seq_region_length - $slice->start + 1; | |
307 $seq1 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, 1, $midpoint, 1 )}; | |
308 $seq2 = ${ $self->_fetch_by_Slice_start_end_strand_circular( $slice, $midpoint + 1, $slice->length(), 1 )}; | |
309 | |
310 $seq = $slice->strand > 0 ? "$seq1$seq2" : "$seq2$seq1"; | |
311 | |
312 reverse_comp( \$seq ) if ( $strand == -1 ); | |
313 | |
314 return \$seq; | |
315 } | |
316 | |
317 | |
318 | |
319 # Get a new slice that spans the exact region to retrieve dna from | |
320 my $right_expand = $end - $slice->length(); #negative is fine | |
321 my $left_expand = 1 - $start; #negative is fine | |
322 | |
323 if ( $right_expand || $left_expand ) { | |
324 $slice = | |
325 $slice->strand > 0 | |
326 ? $slice->expand( $left_expand, $right_expand ) | |
327 : $slice->expand( $right_expand, $left_expand ); | |
328 } | |
329 | |
330 # Retrieve normalized 'non-symlinked' slices. This allows us to | |
331 # support haplotypes and PARs. | |
332 my $slice_adaptor = $slice->adaptor(); | |
333 my @symproj = | |
334 @{ $slice_adaptor->fetch_normalized_slice_projection($slice) }; | |
335 | |
336 if ( @symproj == 0 ) { | |
337 throw( 'Could not retrieve normalized Slices. Database contains ' | |
338 . 'incorrect assembly_exception information.' ); | |
339 } | |
340 | |
341 # Call this method again with any slices that were 'symlinked' to by | |
342 # this slice. | |
343 if ( @symproj != 1 || $symproj[0]->[2] != $slice ) { | |
344 my $seq; | |
345 foreach my $segment (@symproj) { | |
346 my $symlink_slice = $segment->[2]; | |
347 | |
348 # Get sequence from each symlinked area. | |
349 $seq .= ${ | |
350 $self->fetch_by_Slice_start_end_strand( $symlink_slice, 1, | |
351 undef, 1 ) }; | |
352 } | |
353 if ( $strand == -1 ) { | |
354 reverse_comp( \$seq ); | |
355 } | |
356 | |
357 return \$seq; | |
358 } | |
359 | |
360 # We need to project this slice onto the sequence coordinate system | |
361 # even if the slice is in the same coord system, we want to trim out | |
362 # flanking gaps (if the slice is past the edges of the seqregion). | |
363 my $csa = $self->db->get_CoordSystemAdaptor(); | |
364 my $seqlevel = $csa->fetch_sequence_level(); | |
365 | |
366 my @projection = | |
367 @{ $slice->project( $seqlevel->name(), $seqlevel->version() ) }; | |
368 | |
369 my $seq = ''; | |
370 my $total = 0; | |
371 my $tmp_seq; | |
372 | |
373 # Fetch sequence from each of the sequence regions projected onto. | |
374 foreach my $segment (@projection) { | |
375 my ( $start, $end, $seq_slice ) = @{$segment}; | |
376 | |
377 # Check for gaps between segments and pad them with Ns | |
378 my $gap = $start - $total - 1; | |
379 if ($gap) { | |
380 $seq .= 'N' x $gap; | |
381 } | |
382 | |
383 my $seq_region_id = $slice_adaptor->get_seq_region_id($seq_slice); | |
384 | |
385 $tmp_seq = ${ | |
386 $self->_fetch_seq( $seq_region_id, $seq_slice->start(), | |
387 $seq_slice->length() ) }; | |
388 | |
389 # Reverse compliment on negatively oriented slices. | |
390 if ( $seq_slice->strand == -1 ) { | |
391 reverse_comp( \$tmp_seq ); | |
392 } | |
393 | |
394 $seq .= $tmp_seq; | |
395 | |
396 $total = $end; | |
397 } | |
398 | |
399 # Check for any remaining gaps at the end. | |
400 my $gap = $slice->length() - $total; | |
401 | |
402 if ($gap) { | |
403 $seq .= 'N' x $gap; | |
404 } | |
405 | |
406 # If the sequence is too short it is because we came in with a | |
407 # seqlevel slice that was partially off of the seq_region. Pad the | |
408 # end with Ns to make long enough | |
409 if ( length($seq) != $slice->length() ) { | |
410 $seq .= 'N' x ( $slice->length() - length($seq) ); | |
411 } | |
412 | |
413 if ( defined( $self->{_rna_edits_cache} ) | |
414 && defined( | |
415 $self->{_rna_edits_cache}->{ $slice->get_seq_region_id } ) ) | |
416 { | |
417 $self->_rna_edit( $slice, \$seq ); | |
418 } | |
419 | |
420 return \$seq; | |
421 } ## end sub _fetch_by_Slice_start_end_strand_circular | |
422 | |
423 | |
424 | |
425 | |
426 | |
427 sub _rna_edit { | |
428 my $self = shift; | |
429 my $slice = shift; | |
430 my $seq = shift; #reference to string | |
431 | |
432 my $s_start = $slice->start; #substr start at 0 , but seq starts at 1 (so no -1 here) | |
433 my $s_end = $s_start+length($$seq); | |
434 | |
435 foreach my $edit (@{$self->{_rna_edits_cache}->{$slice->get_seq_region_id}}){ | |
436 my ($start, $end, $txt) = split (/\s+/, $edit); | |
437 # check that RNA edit is not outside the requested region : happens quite often with LRG regions | |
438 next if ($end < $s_start); | |
439 next if ($s_end < $start); | |
440 substr($$seq,$start-$s_start, ($end-$start)+1, $txt); | |
441 } | |
442 return; | |
443 } | |
444 | |
445 | |
446 sub _fetch_seq { | |
447 my $self = shift; | |
448 my $seq_region_id = shift; | |
449 my $start = shift; | |
450 my $length = shift; | |
451 | |
452 my $cache = $self->{'seq_cache'}; | |
453 | |
454 if($length < $SEQ_CACHE_MAX) { | |
455 my $chunk_min = ($start-1) >> $SEQ_CHUNK_PWR; | |
456 my $chunk_max = ($start + $length - 1) >> $SEQ_CHUNK_PWR; | |
457 | |
458 # piece together sequence from cached component parts | |
459 | |
460 my $entire_seq = undef; | |
461 for(my $i = $chunk_min; $i <= $chunk_max; $i++) { | |
462 if($cache->{"$seq_region_id:$i"}) { | |
463 $entire_seq .= $cache->{"$seq_region_id:$i"}; | |
464 } else { | |
465 # retrieve uncached portions of the sequence | |
466 | |
467 my $sth = | |
468 $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) " | |
469 . "FROM dna d " | |
470 . "WHERE d.seq_region_id = ?" ); | |
471 | |
472 my $tmp_seq; | |
473 | |
474 my $min = ($i << $SEQ_CHUNK_PWR) + 1; | |
475 | |
476 $sth->bind_param( 1, $min, SQL_INTEGER ); | |
477 $sth->bind_param( 2, 1 << $SEQ_CHUNK_PWR, SQL_INTEGER ); | |
478 $sth->bind_param( 3, $seq_region_id, SQL_INTEGER ); | |
479 | |
480 $sth->execute(); | |
481 $sth->bind_columns(\$tmp_seq); | |
482 $sth->fetch(); | |
483 $sth->finish(); | |
484 | |
485 # always give back uppercased sequence so it can be properly softmasked | |
486 $entire_seq .= uc($tmp_seq); | |
487 $cache->{"$seq_region_id:$i"} = uc($tmp_seq); | |
488 } | |
489 } | |
490 | |
491 # return only the requested portion of the entire sequence | |
492 my $min = ( $chunk_min << $SEQ_CHUNK_PWR ) + 1; | |
493 # my $max = ( $chunk_max + 1 ) << $SEQ_CHUNK_PWR; | |
494 my $seq = substr( $entire_seq, $start - $min, $length ); | |
495 | |
496 return \$seq; | |
497 } else { | |
498 | |
499 # do not do any caching for requests of very large sequences | |
500 my $sth = | |
501 $self->prepare( "SELECT SUBSTRING(d.sequence, ?, ?) " | |
502 . "FROM dna d " | |
503 . "WHERE d.seq_region_id = ?" ); | |
504 | |
505 my $tmp_seq; | |
506 | |
507 $sth->bind_param( 1, $start, SQL_INTEGER ); | |
508 $sth->bind_param( 2, $length, SQL_INTEGER ); | |
509 $sth->bind_param( 3, $seq_region_id, SQL_INTEGER ); | |
510 | |
511 $sth->execute(); | |
512 $sth->bind_columns(\$tmp_seq); | |
513 $sth->fetch(); | |
514 $sth->finish(); | |
515 | |
516 # always give back uppercased sequence so it can be properly softmasked | |
517 $tmp_seq = uc($tmp_seq); | |
518 | |
519 return \$tmp_seq; | |
520 } | |
521 } | |
522 | |
523 | |
524 =head2 store | |
525 | |
526 Arg [1] : int $seq_region_id the id of the sequence region this dna | |
527 will be associated with. | |
528 Arg [2] : string $sequence the dna sequence to be stored | |
529 in the database. Note that the sequence passed in will be | |
530 converted to uppercase. | |
531 Example : $seq_adaptor->store(11, 'ACTGGGTACCAAACAAACACAACA'); | |
532 Description: stores a dna sequence in the databases dna table and returns the | |
533 database identifier for the new record. | |
534 Returntype : none | |
535 Exceptions : throw if the database insert fails | |
536 Caller : sequence loading scripts | |
537 Status : Stable | |
538 | |
539 =cut | |
540 | |
541 sub store { | |
542 my ($self, $seq_region_id, $sequence) = @_; | |
543 | |
544 if(!$seq_region_id) { | |
545 throw('seq_region_id is required'); | |
546 } | |
547 | |
548 $sequence = uc($sequence); | |
549 | |
550 my $statement = | |
551 $self->prepare("INSERT INTO dna(seq_region_id, sequence) VALUES(?,?)"); | |
552 | |
553 $statement->bind_param(1,$seq_region_id,SQL_INTEGER); | |
554 $statement->bind_param(2,$sequence,SQL_LONGVARCHAR); | |
555 $statement->execute(); | |
556 | |
557 $statement->finish(); | |
558 | |
559 return; | |
560 } | |
561 | |
562 | |
563 | |
564 | |
565 =head2 fetch_by_assembly_location | |
566 | |
567 Description: DEPRECATED use fetch_by_Slice_start_end_strand() instead. | |
568 | |
569 =cut | |
570 | |
571 sub fetch_by_assembly_location { | |
572 my ( $self, $chrStart, $chrEnd, | |
573 $strand, $chrName, $assemblyType ) = @_; | |
574 | |
575 deprecate('Use fetch_by_Slice_start_end_strand() instead'); | |
576 | |
577 my $csa = $self->db->get_CoordSystem(); | |
578 my $top_cs = @{$csa->fetch_all}; | |
579 | |
580 my $slice_adaptor = $self->db->get_SliceAdaptor(); | |
581 my $slice = $slice_adaptor->fetch_by_region($top_cs->name(), $chrName, | |
582 $chrStart, $chrEnd, | |
583 $strand, $top_cs->version); | |
584 | |
585 return $self->fetch_by_Slice_start_end_strand($slice,1, $slice->length,1); | |
586 } | |
587 | |
588 | |
589 =head2 fetch_by_RawContig_start_end_strand | |
590 | |
591 Description: DEPRECATED use fetch_by_Slice_start_end_strand instead | |
592 | |
593 =cut | |
594 | |
595 sub fetch_by_RawContig_start_end_strand { | |
596 deprecate('Use fetch_by_Slice_start_end_strand instead.'); | |
597 fetch_by_Slice_start_end_strand(@_); | |
598 } | |
599 | |
600 | |
601 | |
602 | |
603 1; |