Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/DBSQL/SliceAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::DBSQL::SliceAdaptor - A database aware adaptor responsible for | |
24 the creation of Slice objects. | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 use Bio::EnsEMBL::Utils::Slice qw(split_Slices); | |
29 use Bio::EnsEMBL::Registry; | |
30 | |
31 Bio::EnsEMBL::Registry->load_registry_from_db( | |
32 -host => 'ensembldb.ensembl.org', | |
33 -user => 'anonymous' | |
34 ); | |
35 | |
36 $slice_adaptor = | |
37 Bio::EnsEMBL::Registry->get_adaptor( "human", "core", "slice" ); | |
38 | |
39 # get a slice on the entire chromosome X | |
40 $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', 'X' ); | |
41 | |
42 # get a slice for each clone in the database | |
43 foreach $cln_slice ( @{ $slice_adaptor->fetch_all('clone') } ) { | |
44 # do something with clone | |
45 } | |
46 | |
47 # get a slice which is part of NT_004321 | |
48 $spctg_slice = | |
49 $slice_adaptor->fetch_by_region( 'supercontig', 'NT_004321', | |
50 200_000, 600_000 ); | |
51 | |
52 # get all non-redundant slices from the highest possible coordinate | |
53 # systems | |
54 $slices = $slice_adaptor->fetch_all('toplevel'); | |
55 | |
56 # include non-reference regions | |
57 $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 1 ); | |
58 | |
59 # include non-duplicate regions | |
60 $slices = $slice_adaptor->fetch_all( 'toplevel', undef, 0, 1 ); | |
61 | |
62 # split up a list of slices into smaller slices | |
63 $overlap = 1000; | |
64 $max_length = 1e6; | |
65 $slices = split_Slices( $slices, $max_length, $overlap ); | |
66 | |
67 # store a list of slice names in a file | |
68 open( FILE, ">$filename" ) or die("Could not open file $filename"); | |
69 foreach my $slice (@$slices) { | |
70 print FILE $slice->name(), "\n"; | |
71 } | |
72 close FILE; | |
73 | |
74 # retreive a list of slices from a file | |
75 open( FILE, $filename ) or die("Could not open file $filename"); | |
76 while ( $name = <FILE> ) { | |
77 chomp($name); | |
78 $slice = $slice_adaptor->fetch_by_name($name); | |
79 # do something with slice | |
80 } | |
81 | |
82 =head1 DESCRIPTION | |
83 | |
84 This module is responsible for fetching Slices representing genomic | |
85 regions from a database. A Details on how slices can be used are in the | |
86 Bio::EnsEMBL::Slice module. | |
87 | |
88 =head1 METHODS | |
89 | |
90 =cut | |
91 | |
92 | |
93 package Bio::EnsEMBL::DBSQL::SliceAdaptor; | |
94 use vars qw(@ISA); | |
95 use strict; | |
96 | |
97 | |
98 use Bio::EnsEMBL::DBSQL::BaseAdaptor; | |
99 use Bio::EnsEMBL::Slice; | |
100 use Bio::EnsEMBL::CircularSlice; | |
101 use Bio::EnsEMBL::Mapper; | |
102 use Bio::EnsEMBL::LRGSlice; | |
103 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); | |
104 use Scalar::Util qw/looks_like_number/; | |
105 | |
106 @ISA = ('Bio::EnsEMBL::DBSQL::BaseAdaptor'); | |
107 | |
108 sub new { | |
109 my $caller = shift; | |
110 | |
111 my $class = ref($caller) || $caller; | |
112 | |
113 my $self = $class->SUPER::new(@_); | |
114 | |
115 # use a cache which is shared and also used by the assembly | |
116 # mapper adaptor | |
117 | |
118 my $seq_region_cache = $self->db->get_SeqRegionCache(); | |
119 | |
120 $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'}; | |
121 $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'}; | |
122 | |
123 $self->{'lrg_region_test'} = undef; | |
124 my $meta_container = $self->db->get_MetaContainer(); | |
125 my @values = $meta_container->list_value_by_key("LRG"); | |
126 if(scalar(@values) and $values[0]->[0]){ | |
127 $self->{'lrg_region_test'} = $values[0]->[0]; | |
128 } | |
129 return $self; | |
130 } | |
131 | |
132 | |
133 =head2 fetch_by_region | |
134 | |
135 Arg [1] : string $coord_system_name (optional) | |
136 The name of the coordinate system of the slice to be created | |
137 This may be a name of an actual coordinate system or an alias | |
138 to a coordinate system. Valid aliases are 'seqlevel' or | |
139 'toplevel'. | |
140 Arg [2] : string $seq_region_name | |
141 The name of the sequence region that the slice will be | |
142 created on. | |
143 Arg [3] : int $start (optional, default = 1) | |
144 The start of the slice on the sequence region | |
145 Arg [4] : int $end (optional, default = seq_region length) | |
146 The end of the slice on the sequence region | |
147 Arg [5] : int $strand (optional, default = 1) | |
148 The orientation of the slice on the sequence region | |
149 Arg [6] : string $version (optional, default = default version) | |
150 The version of the coordinate system to use (e.g. NCBI33) | |
151 Arg [7] : boolean $no_fuzz (optional, default = undef (false)) | |
152 If true (non-zero), do not use "fuzzy matching" (see below). | |
153 Example : $slice = $slice_adaptor->fetch_by_region('chromosome', 'X'); | |
154 $slice = $slice_adaptor->fetch_by_region('clone', 'AC008066.4'); | |
155 Description: Retrieves a slice on the requested region. At a minimum the | |
156 name the name of the seq_region to fetch must be provided. | |
157 | |
158 If no coordinate system name is provided than a slice on the | |
159 highest ranked coordinate system with a matching | |
160 seq_region_name will be returned. If a version but no | |
161 coordinate system name is provided, the same behaviour will | |
162 apply, but only coordinate systems of the appropriate version | |
163 are considered. The same applies if the 'toplevel' coordinate | |
164 system is specified, however in this case the version is | |
165 ignored. The coordinate system should always be specified if | |
166 it is known, since this is unambiguous and faster. | |
167 | |
168 Some fuzzy matching is performed if no exact match for | |
169 the provided name is found. This allows clones to be | |
170 fetched even when their version is not known. For | |
171 example fetch_by_region('clone', 'AC008066') will | |
172 retrieve the sequence_region with name 'AC008066.4'. | |
173 | |
174 The fuzzy matching can be turned off by setting the | |
175 $no_fuzz argument to a true value. | |
176 | |
177 If the requested seq_region is not found in the database undef | |
178 is returned. | |
179 | |
180 Returntype : Bio::EnsEMBL::Slice or undef | |
181 Exceptions : throw if no seq_region_name is provided | |
182 throw if invalid coord_system_name is provided | |
183 throw if start > end is provided | |
184 Caller : general | |
185 Status : Stable | |
186 | |
187 =cut | |
188 | |
189 | |
190 # | |
191 # ARNE: This subroutine needs simplification!! | |
192 # | |
193 sub fetch_by_region { | |
194 my ( $self, $coord_system_name, $seq_region_name, $start, $end, | |
195 $strand, $version, $no_fuzz ) | |
196 = @_; | |
197 | |
198 if ( !defined($start) ) { $start = 1 } | |
199 if ( !defined($strand) ) { $strand = 1 } | |
200 | |
201 if ( !defined($seq_region_name) ) { | |
202 throw('seq_region_name argument is required'); | |
203 } | |
204 | |
205 my $cs; | |
206 my $csa = $self->db->get_CoordSystemAdaptor(); | |
207 | |
208 if ( defined($coord_system_name) ) { | |
209 $cs = $csa->fetch_by_name( $coord_system_name, $version ); | |
210 | |
211 ## REMOVE THESE THREE LINES WHEN STICKLEBACK DB IS FIXED! | |
212 ## Anne/ap5 (2007-10-09): | |
213 # The problem was that the stickleback genebuild called the | |
214 # chromosomes 'groups', which meant they weren't being picked out by | |
215 # the karyotype drawing code. Apparently they are usually called | |
216 # 'groups' in the stickleback community, even though they really are | |
217 # chromosomes! | |
218 | |
219 if ( !defined($cs) && $coord_system_name eq 'chromosome' ) { | |
220 $cs = $csa->fetch_by_name( 'group', $version ); | |
221 } | |
222 | |
223 if ( !defined($cs) ) { | |
224 throw( sprintf( "Unknown coordinate system:\n" | |
225 . "name='%s' version='%s'\n", | |
226 $coord_system_name, $version ) ); | |
227 } | |
228 | |
229 # fetching by toplevel is same as fetching w/o name or version | |
230 if ( $cs->is_top_level() ) { | |
231 $cs = undef; | |
232 $version = undef; | |
233 } | |
234 | |
235 } ## end if ( defined($coord_system_name...)) | |
236 | |
237 my $constraint; | |
238 my $sql; | |
239 my @bind_params; | |
240 my $key; | |
241 | |
242 if ( defined($cs) ) { | |
243 $sql = sprintf( "SELECT sr.name, sr.seq_region_id, sr.length, %d " | |
244 . "FROM seq_region sr ", | |
245 $cs->dbID() ); | |
246 | |
247 $constraint = "AND sr.coord_system_id = ?"; | |
248 push( @bind_params, [ $cs->dbID(), SQL_INTEGER ] ); | |
249 | |
250 $key = "$seq_region_name:" . $cs->dbID(); | |
251 } else { | |
252 $sql = | |
253 "SELECT sr.name, sr.seq_region_id, sr.length, cs.coord_system_id " | |
254 . "FROM seq_region sr, coord_system cs "; | |
255 | |
256 $constraint = "AND sr.coord_system_id = cs.coord_system_id " | |
257 . "AND cs.species_id = ? "; | |
258 push( @bind_params, [ $self->species_id(), SQL_INTEGER ] ); | |
259 | |
260 if ( defined($version) ) { | |
261 $constraint .= "AND cs.version = ? "; | |
262 push( @bind_params, [ $version, SQL_VARCHAR ] ); | |
263 } | |
264 | |
265 $constraint .= "ORDER BY cs.rank ASC"; | |
266 } | |
267 | |
268 # check the cache so we only go to the db if necessary | |
269 my $length; | |
270 my $arr; | |
271 | |
272 if ( defined($key) ) { $arr = $self->{'sr_name_cache'}->{$key} } | |
273 | |
274 if ( defined($arr) ) { | |
275 $length = $arr->[3]; | |
276 } else { | |
277 my $sth = | |
278 $self->prepare( $sql . "WHERE sr.name = ? " . $constraint ); | |
279 | |
280 unshift( @bind_params, [ $seq_region_name, SQL_VARCHAR ] ); | |
281 | |
282 my $pos = 0; | |
283 foreach my $param (@bind_params) { | |
284 $sth->bind_param( ++$pos, $param->[0], $param->[1] ); | |
285 } | |
286 | |
287 $sth->execute(); | |
288 | |
289 if ( $sth->rows() == 0 ) { | |
290 $sth->finish(); | |
291 | |
292 | |
293 # try synonyms | |
294 my $syn_sql_sth = $self->prepare("select s.name from seq_region s, seq_region_synonym ss where s.seq_region_id = ss.seq_region_id and ss.synonym = ?"); | |
295 $syn_sql_sth->bind_param(1, $seq_region_name, SQL_VARCHAR); | |
296 $syn_sql_sth->execute(); | |
297 my $new_name; | |
298 $syn_sql_sth->bind_columns( \$new_name); | |
299 if($syn_sql_sth->fetch){ | |
300 $syn_sql_sth->finish; | |
301 return $self->fetch_by_region($coord_system_name, $new_name, $start, $end, $strand, $version, $no_fuzz); | |
302 } | |
303 $syn_sql_sth->finish; | |
304 | |
305 | |
306 if ($no_fuzz) { return undef } | |
307 | |
308 # Do fuzzy matching, assuming that we are just missing a version | |
309 # on the end of the seq_region name. | |
310 | |
311 $sth = | |
312 $self->prepare( $sql . " WHERE sr.name LIKE ? " . $constraint ); | |
313 | |
314 $bind_params[0] = | |
315 [ sprintf( '%s.%%', $seq_region_name ), SQL_VARCHAR ]; | |
316 | |
317 $pos = 0; | |
318 foreach my $param (@bind_params) { | |
319 $sth->bind_param( ++$pos, $param->[0], $param->[1] ); | |
320 } | |
321 | |
322 $sth->execute(); | |
323 | |
324 my $prefix_len = length($seq_region_name) + 1; | |
325 my $high_ver = undef; | |
326 my $high_cs = $cs; | |
327 | |
328 # Find the fuzzy-matched seq_region with the highest postfix | |
329 # (which ought to be a version). | |
330 | |
331 my ( $tmp_name, $id, $tmp_length, $cs_id ); | |
332 $sth->bind_columns( \( $tmp_name, $id, $tmp_length, $cs_id ) ); | |
333 | |
334 my $i = 0; | |
335 | |
336 while ( $sth->fetch ) { | |
337 my $tmp_cs = | |
338 ( defined($cs) ? $cs : $csa->fetch_by_dbID($cs_id) ); | |
339 | |
340 # cache values for future reference | |
341 my $arr = [ $id, $tmp_name, $cs_id, $tmp_length ]; | |
342 $self->{'sr_name_cache'}->{"$tmp_name:$cs_id"} = $arr; | |
343 $self->{'sr_id_cache'}->{"$id"} = $arr; | |
344 | |
345 my $tmp_ver = substr( $tmp_name, $prefix_len ); | |
346 | |
347 # skip versions which are non-numeric and apparently not | |
348 # versions | |
349 if ( $tmp_ver !~ /^\d+$/ ) { next } | |
350 | |
351 # take version with highest num, if two versions match take one | |
352 # with highest ranked coord system (lowest num) | |
353 if ( !defined($high_ver) | |
354 || $tmp_ver > $high_ver | |
355 || ( $tmp_ver == $high_ver && $tmp_cs->rank < $high_cs->rank ) | |
356 ) | |
357 { | |
358 $seq_region_name = $tmp_name; | |
359 $length = $tmp_length; | |
360 $high_ver = $tmp_ver; | |
361 $high_cs = $tmp_cs; | |
362 } | |
363 | |
364 $i++; | |
365 } ## end while ( $sth->fetch ) | |
366 $sth->finish(); | |
367 | |
368 # warn if fuzzy matching found more than one result | |
369 if ( $i > 1 ) { | |
370 warning( | |
371 sprintf( | |
372 "Fuzzy matching of seq_region_name " | |
373 . "returned more than one result.\n" | |
374 . "You might want to check whether the returned seq_region\n" | |
375 . "(%s:%s) is the one you intended to fetch.\n", | |
376 $high_cs->name(), $seq_region_name ) ); | |
377 } | |
378 | |
379 $cs = $high_cs; | |
380 | |
381 # return if we did not find any appropriate match: | |
382 if ( !defined($high_ver) ) { return undef } | |
383 | |
384 } else { | |
385 | |
386 my ( $id, $cs_id ); | |
387 ( $seq_region_name, $id, $length, $cs_id ) = | |
388 $sth->fetchrow_array(); | |
389 $sth->finish(); | |
390 | |
391 # cache to speed up for future queries | |
392 my $arr = [ $id, $seq_region_name, $cs_id, $length ]; | |
393 $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr; | |
394 $self->{'sr_id_cache'}->{"$id"} = $arr; | |
395 $cs = $csa->fetch_by_dbID($cs_id); | |
396 } | |
397 } ## end else [ if ( defined($arr) ) ] | |
398 | |
399 if ( !defined($end) ) { $end = $length } | |
400 | |
401 #If this was given then check if we've got a circular seq region otherwise | |
402 #let it fall through to the normal Slice method | |
403 if ( $end + 1 < $start ) { | |
404 my $cs_id = $cs->dbID(); | |
405 my $seq_region_id = $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"}->[0]; | |
406 if($self->is_circular($seq_region_id)) { | |
407 my $new_sl = | |
408 Bio::EnsEMBL::CircularSlice->new( | |
409 -COORD_SYSTEM => $cs, | |
410 -SEQ_REGION_NAME => $seq_region_name, | |
411 -SEQ_REGION_LENGTH => $length, | |
412 -START => $start, | |
413 -END => $end, | |
414 -STRAND => 1, | |
415 -ADAPTOR => $self ); | |
416 | |
417 return $new_sl; | |
418 } | |
419 } | |
420 | |
421 if ( defined( $self->{'lrg_region_test'} ) | |
422 and substr( $cs->name, 0, 3 ) eq $self->{'lrg_region_test'} ) | |
423 { | |
424 return | |
425 Bio::EnsEMBL::LRGSlice->new( -COORD_SYSTEM => $cs, | |
426 -SEQ_REGION_NAME => $seq_region_name, | |
427 -SEQ_REGION_LENGTH => $length, | |
428 -START => $start, | |
429 -END => $end, | |
430 -STRAND => $strand, | |
431 -ADAPTOR => $self ); | |
432 } else { | |
433 return | |
434 Bio::EnsEMBL::Slice->new_fast( { | |
435 'coord_system' => $cs, | |
436 'seq_region_name' => $seq_region_name, | |
437 'seq_region_length' => $length, | |
438 'start' => $start, | |
439 'end' => $end, | |
440 'strand' => $strand, | |
441 'adaptor' => $self } ); | |
442 } | |
443 } ## end sub fetch_by_region | |
444 | |
445 =head2 fetch_by_toplevel_location | |
446 | |
447 Arg [1] : string $location | |
448 Ensembl formatted location. Can be a format like | |
449 C<name:start-end>, C<name:start..end>, C<name:start:end>, | |
450 C<name:start>, C<name>. We can also support strand | |
451 specification as a +/- or 1/-1. | |
452 | |
453 Location names must be separated by a C<:>. All others can be | |
454 separated by C<..>, C<:> or C<->. | |
455 Arg[2] : boolean $no_warnings | |
456 Suppress warnings from this method | |
457 Example : my $slice = $sa->fetch_by_toplevel_location('X:1-10000') | |
458 my $slice = $sa->fetch_by_toplevel_location('X:1-10000:-1') | |
459 Description : Converts an Ensembl location/region into the sequence region | |
460 name, start and end and passes them onto C<fetch_by_region()>. | |
461 The code assumes that the required slice is on the top level | |
462 coordinate system. The code assumes that location formatting | |
463 is not perfect and will perform basic cleanup before parsing. | |
464 Returntype : Bio::EnsEMBL::Slice | |
465 Exceptions : If $location is false otherwise see C<fetch_by_region()> | |
466 Caller : General | |
467 Status : Beta | |
468 | |
469 =cut | |
470 | |
471 sub fetch_by_toplevel_location { | |
472 my ($self, $location, $no_warnings) = @_; | |
473 | |
474 my ($seq_region_name, $start, $end, $strand) = $self->parse_location_to_values($location, $no_warnings); | |
475 | |
476 if(! $seq_region_name) { | |
477 return; | |
478 } | |
479 | |
480 if(defined $start && defined $end && $start > $end) { | |
481 throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end"; | |
482 } | |
483 | |
484 my $coord_system_name = 'toplevel'; | |
485 my $slice = $self->fetch_by_region($coord_system_name, $seq_region_name, $start, $end, $strand, undef, 0); | |
486 return unless $slice; | |
487 | |
488 my $srl = $slice->seq_region_length(); | |
489 my $name = $slice->seq_region_name(); | |
490 if(defined $start && $start > $srl) { | |
491 throw "Cannot request a slice whose start ($start) is greater than $srl for $name."; | |
492 } | |
493 if(defined $end && $end > $srl) { | |
494 warning "Requested end ($end) is greater than $srl for $name. Resetting to $srl" if ! $no_warnings; | |
495 $slice->{end} = $srl; | |
496 } | |
497 | |
498 return $slice; | |
499 } | |
500 | |
501 =head2 parse_location_to_values | |
502 | |
503 Arg [1] : string $location | |
504 Ensembl formatted location. Can be a format like | |
505 C<name:start-end>, C<name:start..end>, C<name:start:end>, | |
506 C<name:start>, C<name>. We can also support strand | |
507 specification as a +/- or 1/-1. | |
508 | |
509 Location names must be separated by a C<:>. All others can be | |
510 separated by C<..>, C<:> or C<->. | |
511 Arg[2] : boolean $no_warnings | |
512 Suppress warnings from this method | |
513 Example : my ($name, $start, $end, $strand) = $sa->parse_location_to_values('X:1..100:1); | |
514 Description : Takes in an Ensembl location String and returns the parsed | |
515 values | |
516 Returntype : List. Contains name, start, end and strand | |
517 | |
518 =cut | |
519 | |
520 | |
521 sub parse_location_to_values { | |
522 my ($self, $location, $no_warnings, $no_errors) = @_; | |
523 | |
524 throw 'You must specify a location' if ! $location; | |
525 | |
526 #cleanup any nomenclature like 1_000 or 1 000 or 1,000 | |
527 my $number_seps_regex = qr/\s+|,|_/; | |
528 my $separator_regex = qr/(?:-|[.]{2}|\:)?/; | |
529 my $number_regex = qr/[0-9,_ E]+/xms; | |
530 my $strand_regex = qr/[+-1]|-1/xms; | |
531 | |
532 my $regex = qr/^(\w+) \s* :? \s* ($number_regex)? $separator_regex ($number_regex)? $separator_regex ($strand_regex)? $/xms; | |
533 my ($seq_region_name, $start, $end, $strand); | |
534 if(($seq_region_name, $start, $end, $strand) = $location =~ $regex) { | |
535 | |
536 if(defined $strand) { | |
537 if(!looks_like_number($strand)) { | |
538 $strand = ($strand eq '+') ? 1 : -1; | |
539 } | |
540 } | |
541 | |
542 if(defined $start) { | |
543 $start =~ s/$number_seps_regex//g; | |
544 if($start < 1) { | |
545 warning "Start was less than 1 (${start}) which is not allowed. Resetting to 1" if ! $no_warnings; | |
546 $start = 1; | |
547 } | |
548 } | |
549 if(defined $end) { | |
550 $end =~ s/$number_seps_regex//g; | |
551 if($end < 1) { | |
552 throw "Cannot request negative or 0 end indexes through this interface. Given $end but expected something greater than 0" unless $no_errors; | |
553 } | |
554 } | |
555 | |
556 if(defined $start && defined $end && $start > $end) { | |
557 throw "Cannot request a slice whose start is greater than its end. Start: $start. End: $end" unless $no_errors; | |
558 } | |
559 } | |
560 | |
561 return ($seq_region_name, $start, $end, $strand); | |
562 } | |
563 | |
564 =head2 fetch_by_region_unique | |
565 | |
566 Arg [1] : string $coord_system_name (optional) | |
567 The name of the coordinate system of the slice to be created | |
568 This may be a name of an actual coordinate system or an alias | |
569 to a coordinate system. Valid aliases are 'seqlevel' or | |
570 'toplevel'. | |
571 Arg [2] : string $seq_region_name | |
572 The name of the sequence region that the slice will be | |
573 created on. | |
574 Arg [3] : int $start (optional, default = 1) | |
575 The start of the slice on the sequence region | |
576 Arg [4] : int $end (optional, default = seq_region length) | |
577 The end of the slice on the sequence region | |
578 Arg [5] : int $strand (optional, default = 1) | |
579 The orientation of the slice on the sequence region | |
580 Arg [6] : string $version (optional, default = default version) | |
581 The version of the coordinate system to use (e.g. NCBI33) | |
582 Arg [7] : boolean $no_fuzz (optional, default = undef (false)) | |
583 If true (non-zero), do not use "fuzzy matching" (see below). | |
584 Example : $slice = $slice_adaptor->fetch_by_region_unique('chromosome', 'HSCHR6_MHC_COX'); | |
585 Description: Retrieves a slice on the requested region but returns only the unique | |
586 parts of the slice. At a minimum the | |
587 name the name of the seq_region to fetch must be provided. | |
588 | |
589 If no coordinate system name is provided than a slice on the | |
590 highest ranked coordinate system with a matching | |
591 seq_region_name will be returned. If a version but no | |
592 coordinate system name is provided, the same behaviour will | |
593 apply, but only coordinate systems of the appropriate version | |
594 are considered. The same applies if the 'toplevel' coordinate | |
595 system is specified, however in this case the version is | |
596 ignored. The coordinate system should always be specified if | |
597 it is known, since this is unambiguous and faster. | |
598 | |
599 Some fuzzy matching is performed if no exact match for | |
600 the provided name is found. This allows clones to be | |
601 fetched even when their version is not known. For | |
602 example fetch_by_region('clone', 'AC008066') will | |
603 retrieve the sequence_region with name 'AC008066.4'. | |
604 | |
605 The fuzzy matching can be turned off by setting the | |
606 $no_fuzz argument to a true value. | |
607 | |
608 If the requested seq_region is not found in the database undef | |
609 is returned. | |
610 | |
611 Returntype : listref Bio::EnsEMBL::Slice | |
612 Exceptions : throw if no seq_region_name is provided | |
613 throw if invalid coord_system_name is provided | |
614 throw if start > end is provided | |
615 Caller : general | |
616 Status : Stable | |
617 | |
618 =cut | |
619 | |
620 sub fetch_by_region_unique { | |
621 my $self = shift; | |
622 | |
623 my @out = (); | |
624 my $slice = $self->fetch_by_region(@_); | |
625 | |
626 | |
627 if ( !exists( $self->{'asm_exc_cache'} ) ) { | |
628 $self->_build_exception_cache(); | |
629 } | |
630 | |
631 if ( exists( | |
632 $self->{'asm_exc_cache'}->{ $self->get_seq_region_id($slice) } | |
633 ) ) | |
634 { | |
635 # Dereference symlinked assembly regions. Take out any regions | |
636 # which are symlinked because these are duplicates. | |
637 my @projection = | |
638 @{ $self->fetch_normalized_slice_projection($slice) }; | |
639 | |
640 foreach my $segment (@projection) { | |
641 if ( $segment->[2]->seq_region_name() eq $slice->seq_region_name() | |
642 && $segment->[2]->coord_system->equals( $slice->coord_system ) ) | |
643 { | |
644 push( @out, $segment->[2] ); | |
645 } | |
646 } | |
647 } | |
648 | |
649 return \@out; | |
650 } ## end sub fetch_by_region_unique | |
651 | |
652 =head2 fetch_by_name | |
653 | |
654 Arg [1] : string $name | |
655 Example : $name = 'chromosome:NCBI34:X:1000000:2000000:1'; | |
656 $slice = $slice_adaptor->fetch_by_name($name); | |
657 $slice2 = $slice_adaptor->fetch_by_name($slice3->name()); | |
658 Description: Fetches a slice using a slice name (i.e. the value returned by | |
659 the Slice::name method). This is useful if you wish to | |
660 store a unique identifier for a slice in a file or database or | |
661 pass a slice over a network. | |
662 Slice::name allows you to serialise/marshall a slice and this | |
663 method allows you to deserialise/unmarshal it. | |
664 | |
665 Returns undef if no seq_region with the provided name exists in | |
666 the database. | |
667 | |
668 Returntype : Bio::EnsEMBL::Slice or undef | |
669 Exceptions : throw if incorrent arg provided | |
670 Caller : Pipeline | |
671 Status : Stable | |
672 | |
673 =cut | |
674 | |
675 sub fetch_by_name { | |
676 my $self = shift; | |
677 my $name = shift; | |
678 | |
679 if(!$name) { | |
680 throw("name argument is required"); | |
681 } | |
682 | |
683 my @array = split(/:/,$name); | |
684 | |
685 if(scalar(@array) < 3 || scalar(@array) > 6) { | |
686 throw("Malformed slice name [$name]. Format is " . | |
687 "coord_system:version:name:start:end:strand"); | |
688 } | |
689 | |
690 # Rearrange arguments to suit fetch_by_region | |
691 | |
692 my @targetarray; | |
693 | |
694 $targetarray[0]=$array[0]; | |
695 $targetarray[5]=(($array[1]&&$array[1] ne "")?$array[1]:undef); | |
696 $targetarray[1]=(($array[2]&&$array[2] ne "")?$array[2]:undef); | |
697 $targetarray[2]=(($array[3]&&$array[3] ne "")?$array[3]:undef); | |
698 $targetarray[3]=(($array[4]&&$array[4] ne "")?$array[4]:undef); | |
699 $targetarray[4]=(($array[5]&&$array[5] ne "")?$array[5]:undef); | |
700 return $self->fetch_by_region(@targetarray); | |
701 } | |
702 | |
703 | |
704 | |
705 =head2 fetch_by_seq_region_id | |
706 | |
707 Arg [1] : string $seq_region_id | |
708 The internal identifier of the seq_region to create this slice | |
709 on | |
710 Arg [2] : optional start | |
711 Arg [3] : optional end | |
712 Arg [4] : optional strand | |
713 Example : $slice = $slice_adaptor->fetch_by_seq_region_id(34413); | |
714 Description: Creates a slice object of an entire seq_region using the | |
715 seq_region internal identifier to resolve the seq_region. | |
716 Returns undef if no such slice exists. | |
717 Returntype : Bio::EnsEMBL::Slice or undef | |
718 Exceptions : none | |
719 Caller : general | |
720 Status : Stable | |
721 | |
722 =cut | |
723 | |
724 sub fetch_by_seq_region_id { | |
725 my ( $self, $seq_region_id, $start, $end, $strand ) = @_; | |
726 | |
727 my $arr = $self->{'sr_id_cache'}->{$seq_region_id}; | |
728 my ( $name, $length, $cs, $cs_id ); | |
729 | |
730 | |
731 if ( $arr && defined( $arr->[2] ) ) { | |
732 ( $name, $cs_id, $length ) = ( $arr->[1], $arr->[2], $arr->[3] ); | |
733 $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); | |
734 } else { | |
735 my $sth = | |
736 $self->prepare( "SELECT sr.name, sr.coord_system_id, sr.length " | |
737 . "FROM seq_region sr " | |
738 . "WHERE sr.seq_region_id = ? " ); | |
739 | |
740 $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); | |
741 $sth->execute(); | |
742 | |
743 if ( $sth->rows() == 0 ) { return undef } | |
744 | |
745 ( $name, $cs_id, $length ) = $sth->fetchrow_array(); | |
746 $sth->finish(); | |
747 | |
748 $cs = $self->db->get_CoordSystemAdaptor->fetch_by_dbID($cs_id); | |
749 | |
750 #cache results to speed up repeated queries | |
751 my $arr = [ $seq_region_id, $name, $cs_id, $length ]; | |
752 | |
753 $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr; | |
754 $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; | |
755 } | |
756 | |
757 return | |
758 Bio::EnsEMBL::Slice->new_fast({ | |
759 'coord_system' => $cs, | |
760 'seq_region_name' => $name, | |
761 'seq_region_length'=> $length, | |
762 'start' => $start || 1, | |
763 'end' => $end || $length, | |
764 'strand' => $strand || 1, | |
765 'adaptor' => $self} ); | |
766 } ## end sub fetch_by_seq_region_id | |
767 | |
768 | |
769 | |
770 =head2 get_seq_region_id | |
771 | |
772 Arg [1] : Bio::EnsEMBL::Slice $slice | |
773 The slice to fetch a seq_region_id for | |
774 Example : $srid = $slice_adaptor->get_seq_region_id($slice); | |
775 Description: Retrieves the seq_region id (in this database) given a slice | |
776 Seq region ids are not stored on the slices themselves | |
777 because they are intended to be somewhat database independant | |
778 and seq_region_ids vary accross databases. | |
779 Returntype : int | |
780 Exceptions : throw if the seq_region of the slice is not in the db | |
781 throw if incorrect arg provided | |
782 Caller : BaseFeatureAdaptor | |
783 Status : Stable | |
784 | |
785 =cut | |
786 | |
787 sub get_seq_region_id { | |
788 my $self = shift; | |
789 my $slice = shift; | |
790 | |
791 if(!$slice || !ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { | |
792 throw('Slice argument is required'); | |
793 } | |
794 | |
795 my $seq_region_name = $slice->seq_region_name(); | |
796 my $key = $seq_region_name.":".$slice->coord_system->dbID(); | |
797 my $arr = $self->{'sr_name_cache'}->{"$key"}; | |
798 | |
799 if( $arr ) { | |
800 return $arr->[0]; | |
801 } | |
802 | |
803 my $cs_id = $slice->coord_system->dbID(); | |
804 | |
805 my $sth = $self->prepare("SELECT seq_region_id, length " . | |
806 "FROM seq_region " . | |
807 "WHERE name = ? AND coord_system_id = ?"); | |
808 | |
809 #force seq_region_name cast to string so mysql cannot treat as int | |
810 $sth->bind_param(1,"$seq_region_name",SQL_VARCHAR); | |
811 $sth->bind_param(2,$cs_id,SQL_INTEGER); | |
812 $sth->execute(); | |
813 | |
814 if($sth->rows() != 1) { | |
815 throw("Non existant or ambigous seq_region:\n" . | |
816 " coord_system=[$cs_id],\n" . | |
817 " name=[$seq_region_name],\n"); | |
818 | |
819 } | |
820 | |
821 my($seq_region_id, $length) = $sth->fetchrow_array(); | |
822 $sth->finish(); | |
823 | |
824 #cache information for future requests | |
825 $arr = [ $seq_region_id, $seq_region_name, $cs_id, $length ]; | |
826 | |
827 $self->{'sr_name_cache'}->{"$seq_region_name:$cs_id"} = $arr; | |
828 $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; | |
829 | |
830 return $seq_region_id; | |
831 } | |
832 | |
833 | |
834 | |
835 =head2 fetch_all | |
836 | |
837 Arg [1] : string $coord_system_name | |
838 The name of the coordinate system to retrieve slices of. | |
839 This may be a name of an acutal coordinate system or an alias | |
840 to a coordinate system. Valid aliases are 'seqlevel' or | |
841 'toplevel'. | |
842 Arg [2] : string $coord_system_version (optional) | |
843 The version of the coordinate system to retrieve slices of | |
844 Arg [3] : bool $include_non_reference (optional) | |
845 If this argument is not provided then only reference slices | |
846 will be returned. If set, both reference and non refeference | |
847 slices will be rerurned. | |
848 Arg [4] : int $include_duplicates (optional) | |
849 If set duplicate regions will be returned. | |
850 | |
851 NOTE: if you do not use this option and you have a PAR | |
852 (pseudo-autosomal region) at the beginning of your seq_region | |
853 then your slice will not start at position 1, so coordinates | |
854 retrieved from this slice might not be what you expected. | |
855 | |
856 Arg[5] : bool $include_lrg (optional) (default 0) | |
857 If set lrg regions will be returned aswell. | |
858 | |
859 | |
860 Example : @chromos = @{$slice_adaptor->fetch_all('chromosome','NCBI33')}; | |
861 @contigs = @{$slice_adaptor->fetch_all('contig')}; | |
862 | |
863 # get even non-reference regions | |
864 @slices = @{$slice_adaptor->fetch_all('toplevel',undef,1)}; | |
865 | |
866 # include duplicate regions (such as pseudo autosomal regions) | |
867 @slices = @{$slice_adaptor->fetch_all('toplevel', undef,0,1)}; | |
868 | |
869 Description: Retrieves slices of all seq_regions for a given coordinate | |
870 system. This is analagous to the methods fetch_all which were | |
871 formerly on the ChromosomeAdaptor, RawContigAdaptor and | |
872 CloneAdaptor classes. Slices fetched span the entire | |
873 seq_regions and are on the forward strand. | |
874 If the coordinate system with the provided name and version | |
875 does not exist an empty list is returned. | |
876 If the coordinate system name provided is 'toplevel', all | |
877 non-redundant toplevel slices are returned (note that any | |
878 coord_system_version argument is ignored in that case). | |
879 | |
880 Retrieved slices can be broken into smaller slices using the | |
881 Bio::EnsEMBL::Utils::Slice module. | |
882 | |
883 Returntype : listref of Bio::EnsEMBL::Slices | |
884 Exceptions : none | |
885 Caller : general | |
886 Status : Stable | |
887 | |
888 =cut | |
889 | |
890 sub fetch_all { | |
891 my $self = shift; | |
892 my $cs_name = shift; | |
893 my $cs_version = shift || ''; | |
894 | |
895 my ($include_non_reference, $include_duplicates, $include_lrg) = @_; | |
896 | |
897 # | |
898 # verify existance of requested coord system and get its id | |
899 # | |
900 my $csa = $self->db->get_CoordSystemAdaptor(); | |
901 my $orig_cs = $csa->fetch_by_name($cs_name, $cs_version); | |
902 | |
903 return [] if ( !$orig_cs ); | |
904 | |
905 my %bad_vals=(); | |
906 | |
907 | |
908 # | |
909 # Get a hash of non reference seq regions | |
910 # | |
911 if ( !$include_non_reference ) { | |
912 my $sth = | |
913 $self->prepare( 'SELECT sr.seq_region_id ' | |
914 . 'FROM seq_region sr, seq_region_attrib sra, ' | |
915 . 'attrib_type at, coord_system cs ' | |
916 . 'WHERE at.code = "non_ref" ' | |
917 . 'AND sra.seq_region_id = sr.seq_region_id ' | |
918 . 'AND at.attrib_type_id = sra.attrib_type_id ' | |
919 . 'AND sr.coord_system_id = cs.coord_system_id ' | |
920 . 'AND cs.species_id = ?' ); | |
921 | |
922 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
923 $sth->execute(); | |
924 | |
925 my ($seq_region_id); | |
926 $sth->bind_columns( \$seq_region_id ); | |
927 | |
928 while ( $sth->fetch() ) { | |
929 $bad_vals{$seq_region_id} = 1; | |
930 } | |
931 } | |
932 | |
933 # | |
934 # if we do not want lrg's then add them to the bad list; | |
935 # | |
936 if ( !$include_lrg ) { | |
937 my $sth = | |
938 $self->prepare( 'SELECT sr.seq_region_id ' | |
939 . 'FROM seq_region sr, seq_region_attrib sra, ' | |
940 . 'attrib_type at, coord_system cs ' | |
941 . 'WHERE at.code = "LRG" ' | |
942 . 'AND sra.seq_region_id = sr.seq_region_id ' | |
943 . 'AND at.attrib_type_id = sra.attrib_type_id ' | |
944 . 'AND sr.coord_system_id = cs.coord_system_id ' | |
945 . 'AND cs.species_id = ?' ); | |
946 | |
947 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
948 $sth->execute(); | |
949 | |
950 my ($seq_region_id); | |
951 $sth->bind_columns( \$seq_region_id ); | |
952 | |
953 while ( $sth->fetch() ) { | |
954 $bad_vals{$seq_region_id} = 1; | |
955 } | |
956 } | |
957 | |
958 # | |
959 # Retrieve the seq_regions from the database | |
960 # | |
961 | |
962 my $sth; | |
963 if ( $orig_cs->is_top_level() ) { | |
964 $sth = | |
965 $self->prepare( 'SELECT sr.seq_region_id, sr.name, ' | |
966 . 'sr.length, sr.coord_system_id ' | |
967 . 'FROM seq_region sr, seq_region_attrib sra, ' | |
968 . 'attrib_type at, coord_system cs ' | |
969 . 'WHERE at.code = "toplevel" ' | |
970 . 'AND at.attrib_type_id = sra.attrib_type_id ' | |
971 . 'AND sra.seq_region_id = sr.seq_region_id ' | |
972 . 'AND sr.coord_system_id = cs.coord_system_id ' | |
973 . 'AND cs.species_id = ?' ); | |
974 | |
975 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
976 $sth->execute(); | |
977 } else { | |
978 $sth = | |
979 $self->prepare( 'SELECT sr.seq_region_id, sr.name, ' | |
980 . 'sr.length, sr.coord_system_id ' | |
981 . 'FROM seq_region sr ' | |
982 . 'WHERE sr.coord_system_id = ?' ); | |
983 | |
984 $sth->bind_param( 1, $orig_cs->dbID, SQL_INTEGER ); | |
985 $sth->execute(); | |
986 } | |
987 | |
988 my ( $seq_region_id, $name, $length, $cs_id ); | |
989 $sth->bind_columns( \( $seq_region_id, $name, $length, $cs_id ) ); | |
990 | |
991 my $cache_count = 0; | |
992 | |
993 my @out; | |
994 while($sth->fetch()) { | |
995 if(!defined($bad_vals{$seq_region_id})){ | |
996 my $cs = $csa->fetch_by_dbID($cs_id); | |
997 | |
998 if(!$cs) { | |
999 throw("seq_region $name references non-existent coord_system $cs_id."); | |
1000 } | |
1001 | |
1002 #cache values for future reference, but stop adding to the cache once we | |
1003 #we know we have filled it up | |
1004 if($cache_count < $Bio::EnsEMBL::Utils::SeqRegionCache::SEQ_REGION_CACHE_SIZE) { | |
1005 my $arr = [ $seq_region_id, $name, $cs_id, $length ]; | |
1006 | |
1007 $self->{'sr_name_cache'}->{"$name:$cs_id"} = $arr; | |
1008 $self->{'sr_id_cache'}->{"$seq_region_id"} = $arr; | |
1009 | |
1010 $cache_count++; | |
1011 } | |
1012 | |
1013 my $slice = Bio::EnsEMBL::Slice->new_fast({ | |
1014 'start' => 1, | |
1015 'end' => $length, | |
1016 'strand' => 1, | |
1017 'seq_region_name' => $name, | |
1018 'seq_region_length'=> $length, | |
1019 'coord_system' => $cs, | |
1020 'adaptor' => $self}); | |
1021 | |
1022 if(!defined($include_duplicates) or !$include_duplicates){ | |
1023 # test if this slice *could* have a duplicate (exception) region | |
1024 $self->_build_exception_cache() if(!exists $self->{'asm_exc_cache'}); | |
1025 if(exists $self->{asm_exc_cache}->{$seq_region_id}) { | |
1026 | |
1027 # Dereference symlinked assembly regions. Take out | |
1028 # any regions which are symlinked because these are duplicates | |
1029 my @projection = @{$self->fetch_normalized_slice_projection($slice)}; | |
1030 foreach my $segment ( @projection) { | |
1031 if($segment->[2]->seq_region_name() eq $slice->seq_region_name() && | |
1032 $segment->[2]->coord_system->equals($slice->coord_system)) { | |
1033 push @out, $segment->[2]; | |
1034 } | |
1035 } | |
1036 } else { | |
1037 # no duplicate regions | |
1038 push @out, $slice; | |
1039 } | |
1040 } else { | |
1041 # we want duplicates anyway so do not do any checks | |
1042 push @out, $slice; | |
1043 } | |
1044 } | |
1045 } | |
1046 | |
1047 return \@out; | |
1048 } | |
1049 | |
1050 =head2 is_toplevel | |
1051 Arg : int seq_region_id | |
1052 Example : my $top = $slice_adptor->is_toplevel($seq_region_id) | |
1053 Description: Returns 1 if slice is a toplevel slice else 0 | |
1054 Returntype : int | |
1055 Caller : Slice method is_toplevel | |
1056 Status : At Risk | |
1057 | |
1058 =cut | |
1059 | |
1060 sub is_toplevel { | |
1061 my $self = shift; | |
1062 my $id = shift; | |
1063 | |
1064 my $sth = $self->prepare( | |
1065 "SELECT at.code from seq_region_attrib sra, attrib_type at " | |
1066 . "WHERE sra.seq_region_id = ? " | |
1067 . "AND at.attrib_type_id = sra.attrib_type_id " | |
1068 . "AND at.code = 'toplevel'" ); | |
1069 | |
1070 $sth->bind_param( 1, $id, SQL_INTEGER ); | |
1071 $sth->execute(); | |
1072 | |
1073 my $code; | |
1074 $sth->bind_columns( \$code ); | |
1075 | |
1076 while ( $sth->fetch ) { | |
1077 $sth->finish; | |
1078 return 1; | |
1079 } | |
1080 | |
1081 $sth->finish; | |
1082 return 0; | |
1083 } | |
1084 | |
1085 =head2 is_reference | |
1086 Arg : int seq_region_id | |
1087 Example : my $reference = $slice_adptor->is_reference($seq_region_id) | |
1088 Description: Returns 1 if slice is a reference slice else 0 | |
1089 Returntype : int | |
1090 Caller : Slice method is_reference | |
1091 Status : At Risk | |
1092 | |
1093 =cut | |
1094 | |
1095 sub is_reference { | |
1096 my $self = shift; | |
1097 my $id = shift; | |
1098 | |
1099 my $sth = $self->prepare( | |
1100 "SELECT at.code from seq_region_attrib sra, attrib_type at " | |
1101 . "WHERE sra.seq_region_id = ? " | |
1102 . "AND at.attrib_type_id = sra.attrib_type_id " | |
1103 . "AND at.code = 'non_ref'" ); | |
1104 | |
1105 $sth->bind_param( 1, $id, SQL_INTEGER ); | |
1106 $sth->execute(); | |
1107 | |
1108 my $code; | |
1109 $sth->bind_columns( \$code ); | |
1110 | |
1111 while ( $sth->fetch ) { | |
1112 $sth->finish; | |
1113 return 0; | |
1114 } | |
1115 | |
1116 $sth->finish; | |
1117 return 1; | |
1118 } | |
1119 | |
1120 =head2 is_circular | |
1121 | |
1122 Arg[1] : int seq_region_id | |
1123 Example : my $circular = $slice_adptor->is_circular($seq_region_id); | |
1124 Description : Indicates if the sequence region was circular or not | |
1125 Returntype : Boolean | |
1126 | |
1127 =cut | |
1128 | |
1129 sub is_circular { | |
1130 my ($self, $id) = @_; | |
1131 | |
1132 if (! defined $self->{is_circular}) { | |
1133 $self->_build_circular_slice_cache(); | |
1134 } | |
1135 | |
1136 return 0 if $self->{is_circular} == 0; | |
1137 return (exists $self->{circular_sr_id_cache}->{$id}) ? 1 : 0; | |
1138 } | |
1139 | |
1140 =head2 fetch_by_band | |
1141 | |
1142 Title : fetch_by_band | |
1143 Usage : | |
1144 Function: Does not work please use fetch_by_chr_band | |
1145 Example : | |
1146 Returns : Bio::EnsEMBL::Slice | |
1147 Args : the band name | |
1148 Status : AT RISK | |
1149 | |
1150 =cut | |
1151 | |
1152 sub fetch_by_band { | |
1153 my ($self,$band) = @_; | |
1154 | |
1155 my $sth = $self->dbc->prepare | |
1156 ("select s.name,max(k.seq_region_id)-min(k.seq_region_id, min(k.seq_region_start), max(k.seq_region_id) " . | |
1157 "from karyotype as k " . | |
1158 "where k.band like ? and k.seq_region_id = s.seq_region_id"); | |
1159 | |
1160 $sth->bind_param(1,"$band%",SQL_VARCHAR); | |
1161 $sth->execute(); | |
1162 my ( $seq_region_name, $discrepancy, $seq_region_start, $seq_region_end) = $sth->fetchrow_array; | |
1163 | |
1164 if($seq_region_name && $discrepancy>0) { | |
1165 throw("Band maps to multiple seq_regions"); | |
1166 } else { | |
1167 return $self->fetch_by_region('toplevel',$seq_region_name,$seq_region_start,$seq_region_end); | |
1168 } | |
1169 throw("Band not recognised in database"); | |
1170 } | |
1171 | |
1172 =head2 fetch_by_chr_band | |
1173 | |
1174 Title : fetch_by_chr_band | |
1175 Usage : | |
1176 Function: create a Slice representing a series of bands | |
1177 Example : | |
1178 Returns : Bio::EnsEMBL::Slice | |
1179 Args : the band name | |
1180 Status : Stable | |
1181 | |
1182 =cut | |
1183 | |
1184 sub fetch_by_chr_band { | |
1185 my ( $self, $chr, $band ) = @_; | |
1186 | |
1187 my $chr_slice = $self->fetch_by_region( 'toplevel', $chr ); | |
1188 my $seq_region_id = $self->get_seq_region_id($chr_slice); | |
1189 | |
1190 my $sth = | |
1191 $self->prepare( 'SELECT MIN(k.seq_region_start), ' | |
1192 . 'MAX(k.seq_region_end) ' | |
1193 . 'FROM karyotype k ' | |
1194 . 'WHERE k.seq_region_id = ? ' | |
1195 . 'AND k.band LIKE ?' ); | |
1196 | |
1197 $sth->bind_param( 1, $seq_region_id, SQL_INTEGER ); | |
1198 $sth->bind_param( 2, "$band%", SQL_VARCHAR ); | |
1199 $sth->execute(); | |
1200 | |
1201 my ( $slice_start, $slice_end ) = $sth->fetchrow_array; | |
1202 | |
1203 if ( defined $slice_start ) { | |
1204 return | |
1205 $self->fetch_by_region( 'toplevel', $chr, | |
1206 $slice_start, $slice_end ); | |
1207 } | |
1208 | |
1209 throw("Band not recognised in database"); | |
1210 } ## end sub fetch_by_chr_band | |
1211 | |
1212 | |
1213 | |
1214 =head2 fetch_by_exon_stable_id | |
1215 | |
1216 Arg [1] : string $exonid | |
1217 The stable id of the exon around which the slice is | |
1218 desired | |
1219 Arg [2] : (optional) int $size | |
1220 The length of the flanking regions the slice should encompass | |
1221 on either side of the exon (0 by default) | |
1222 Example : $slc = $sa->fetch_by_exon_stable_id('ENSE00000302930',10); | |
1223 Description: Creates a slice around the region of the specified exon. | |
1224 If a context size is given, the slice is extended by that | |
1225 number of basepairs on either side of the exon. | |
1226 | |
1227 The slice will be created in the exon's native coordinate system | |
1228 and in the forward orientation. | |
1229 Returntype : Bio::EnsEMBL::Slice | |
1230 Exceptions : Thrown if the exon is not in the database. | |
1231 Caller : general | |
1232 Status : Stable | |
1233 | |
1234 =cut | |
1235 | |
1236 sub fetch_by_exon_stable_id{ | |
1237 my ($self,$exonid,$size) = @_; | |
1238 | |
1239 throw('Exon argument is required.') if(!$exonid); | |
1240 | |
1241 my $ea = $self->db->get_ExonAdaptor; | |
1242 my $exon = $ea->fetch_by_stable_id($exonid); | |
1243 | |
1244 throw("Exon [$exonid] does not exist in DB.") if(!$exon); | |
1245 | |
1246 return $self->fetch_by_Feature($exon, $size); | |
1247 } | |
1248 | |
1249 =head2 fetch_by_transcript_stable_id | |
1250 | |
1251 Arg [1] : string $transcriptid | |
1252 The stable id of the transcript around which the slice is | |
1253 desired | |
1254 Arg [2] : (optional) int $size | |
1255 The length of the flanking regions the slice should encompass | |
1256 on either side of the transcript (0 by default) | |
1257 Example : $slc = $sa->fetch_by_transcript_stable_id('ENST00000302930',10); | |
1258 Description: Creates a slice around the region of the specified transcript. | |
1259 If a context size is given, the slice is extended by that | |
1260 number of basepairs on either side of the | |
1261 transcript. | |
1262 | |
1263 The slice will be created in the transcript's native coordinate | |
1264 system and in the forward orientation. | |
1265 Returntype : Bio::EnsEMBL::Slice | |
1266 Exceptions : Thrown if the transcript is not in the database. | |
1267 Caller : general | |
1268 Status : Stable | |
1269 | |
1270 =cut | |
1271 | |
1272 sub fetch_by_transcript_stable_id{ | |
1273 my ($self,$transcriptid,$size) = @_; | |
1274 | |
1275 throw('Transcript argument is required.') if(!$transcriptid); | |
1276 | |
1277 my $ta = $self->db->get_TranscriptAdaptor; | |
1278 my $transcript = $ta->fetch_by_stable_id($transcriptid); | |
1279 | |
1280 throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript); | |
1281 | |
1282 return $self->fetch_by_Feature($transcript, $size); | |
1283 } | |
1284 | |
1285 | |
1286 | |
1287 =head2 fetch_by_transcript_id | |
1288 | |
1289 Arg [1] : int $transcriptid | |
1290 The unique database identifier of the transcript around which | |
1291 the slice is desired | |
1292 Arg [2] : (optional) int $size | |
1293 The length of the flanking regions the slice should encompass | |
1294 on either side of the transcript (0 by default) | |
1295 Example : $slc = $sa->fetch_by_transcript_id(24, 1000); | |
1296 Description: Creates a slice around the region of the specified transcript. | |
1297 If a context size is given, the slice is extended by that | |
1298 number of basepairs on either side of the | |
1299 transcript. | |
1300 | |
1301 The slice will be created in the transcript's native coordinate | |
1302 system and in the forward orientation. | |
1303 Returntype : Bio::EnsEMBL::Slice | |
1304 Exceptions : throw on incorrect args | |
1305 throw if transcript is not in database | |
1306 Caller : general | |
1307 Status : Stable | |
1308 | |
1309 =cut | |
1310 | |
1311 sub fetch_by_transcript_id { | |
1312 my ($self,$transcriptid,$size) = @_; | |
1313 | |
1314 throw('Transcript id argument is required.') if(!$transcriptid); | |
1315 | |
1316 my $transcript_adaptor = $self->db()->get_TranscriptAdaptor(); | |
1317 my $transcript = $transcript_adaptor->fetch_by_dbID($transcriptid); | |
1318 | |
1319 throw("Transcript [$transcriptid] does not exist in DB.") if(!$transcript); | |
1320 | |
1321 return $self->fetch_by_Feature($transcript, $size); | |
1322 } | |
1323 | |
1324 | |
1325 | |
1326 =head2 fetch_by_gene_stable_id | |
1327 | |
1328 Arg [1] : string $geneid | |
1329 The stable id of the gene around which the slice is | |
1330 desired | |
1331 Arg [2] : (optional) int $size | |
1332 The length of the flanking regions the slice should encompass | |
1333 on either side of the gene (0 by default) | |
1334 Example : $slc = $sa->fetch_by_gene_stable_id('ENSG00000012123',10); | |
1335 Description: Creates a slice around the region of the specified gene. | |
1336 If a context size is given, the slice is extended by that | |
1337 number of basepairs on either side of the gene. | |
1338 | |
1339 The slice will be created in the gene's native coordinate system | |
1340 and in the forward orientation. | |
1341 Returntype : Bio::EnsEMBL::Slice | |
1342 Exceptions : throw on incorrect args | |
1343 throw if transcript does not exist | |
1344 Caller : general | |
1345 Status : Stable | |
1346 | |
1347 =cut | |
1348 | |
1349 sub fetch_by_gene_stable_id { | |
1350 my ($self,$geneid,$size) = @_; | |
1351 | |
1352 throw('Gene argument is required.') if(!$geneid); | |
1353 | |
1354 my $gene_adaptor = $self->db->get_GeneAdaptor(); | |
1355 my $gene = $gene_adaptor->fetch_by_stable_id($geneid); | |
1356 | |
1357 throw("Gene [$geneid] does not exist in DB.") if(!$gene); | |
1358 | |
1359 return $self->fetch_by_Feature($gene, $size); | |
1360 } | |
1361 | |
1362 | |
1363 | |
1364 =head2 fetch_by_Feature | |
1365 | |
1366 Arg [1] : Bio::EnsEMBL::Feature $feat | |
1367 The feature to fetch the slice around | |
1368 Arg [2] : int size (optional) | |
1369 The desired number of flanking basepairs around the feature. | |
1370 The size may also be provided as a percentage of the feature | |
1371 size such as 200% or 80.5%. | |
1372 Example : $slice = $slice_adaptor->fetch_by_Feature($feat, 100); | |
1373 Description: Retrieves a slice around a specific feature. All this really | |
1374 does is return a resized version of the slice that the feature | |
1375 is already on. Note that slices returned from this method | |
1376 are always on the forward strand of the seq_region regardless of | |
1377 the strandedness of the feature passed in. | |
1378 Returntype : Bio::EnsEMBL::Slice | |
1379 Exceptions : throw if the feature does not have an attached slice | |
1380 throw if feature argument is not provided | |
1381 Caller : fetch_by_gene_stable_id, fetch_by_transcript_stable_id, | |
1382 fetch_by_gene_id, fetch_by_transcript_id | |
1383 Status : Stable | |
1384 | |
1385 =cut | |
1386 | |
1387 sub fetch_by_Feature{ | |
1388 my ($self, $feature, $size) = @_; | |
1389 | |
1390 $size ||= 0; | |
1391 | |
1392 if(!ref($feature) || !$feature->isa('Bio::EnsEMBL::Feature')) { | |
1393 throw('Feature argument expected.'); | |
1394 } | |
1395 | |
1396 my $slice = $feature->slice(); | |
1397 if(!$slice || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice') )) { | |
1398 throw('Feature must be attached to a valid slice.'); | |
1399 } | |
1400 | |
1401 | |
1402 my $fstart = $feature->start(); | |
1403 my $fend = $feature->end(); | |
1404 if(!defined($fstart) || !defined($fend)) { | |
1405 throw('Feature must have defined start and end.'); | |
1406 } | |
1407 | |
1408 #convert the feature slice coordinates to seq_region coordinates | |
1409 my $slice_start = $slice->start(); | |
1410 my $slice_end = $slice->end(); | |
1411 my $slice_strand = $slice->strand(); | |
1412 if($slice_start != 1 || $slice_strand != 1) { | |
1413 if($slice_strand == 1) { | |
1414 $fstart = $fstart + $slice_start - 1; | |
1415 $fend = $fend + $slice_start - 1; | |
1416 } else { | |
1417 my $tmp_start = $fstart; | |
1418 $fstart = $slice_end - $fend + 1; | |
1419 $fend = $slice_end - $tmp_start + 1; | |
1420 } | |
1421 } | |
1422 | |
1423 ## Size may be stored as a %age of the length of the feature | |
1424 ## Size = 100% gives no context | |
1425 ## Size = 200% gives context - 50% the size of the feature either side of | |
1426 ## feature | |
1427 | |
1428 $size = int( ($1-100)/200 * ($fend-$fstart+1) ) if( $size =~/([\d+\.]+)%/ ); | |
1429 | |
1430 #return a new slice covering the region of the feature | |
1431 my $S = Bio::EnsEMBL::Slice->new_fast({ | |
1432 'seq_region_name' => $slice->seq_region_name, | |
1433 'seq_region_length' => $slice->seq_region_length, | |
1434 'coord_system' => $slice->coord_system, | |
1435 'start' => $fstart - $size, | |
1436 'end' => $fend + $size, | |
1437 'strand' => 1, | |
1438 'adaptor' => $self}); | |
1439 $S->{'_raw_feature_strand'} = $feature->strand * $slice_strand if $feature->can('strand'); | |
1440 return $S; | |
1441 } | |
1442 | |
1443 | |
1444 | |
1445 =head2 fetch_by_misc_feature_attribute | |
1446 | |
1447 Arg [1] : string $attribute_type | |
1448 The code of the attribute type | |
1449 Arg [2] : (optional) string $attribute_value | |
1450 The value of the attribute to fetch by | |
1451 Arg [3] : (optional) int $size | |
1452 The amount of flanking region around the misc feature desired. | |
1453 Example : $slice = $sa->fetch_by_misc_feature_attribute('superctg', | |
1454 'NT_030871'); | |
1455 $slice = $sa->fetch_by_misc_feature_attribute('synonym', | |
1456 'AL00012311', | |
1457 $flanking); | |
1458 Description: Fetches a slice around a MiscFeature with a particular | |
1459 attribute type and value. If no value is specified then | |
1460 the feature with the particular attribute is used. | |
1461 If no size is specified then 0 is used. | |
1462 Returntype : Bio::EnsEMBL::Slice | |
1463 Exceptions : Throw if no feature with the specified attribute type and value | |
1464 exists in the database | |
1465 Warning if multiple features with the specified attribute type | |
1466 and value exist in the database. | |
1467 Caller : webcode | |
1468 Status : Stable | |
1469 | |
1470 =cut | |
1471 | |
1472 sub fetch_by_misc_feature_attribute { | |
1473 my ($self, $attrib_type_code, $attrib_value, $size) = @_; | |
1474 | |
1475 my $mfa = $self->db()->get_MiscFeatureAdaptor(); | |
1476 | |
1477 my $feats = $mfa->fetch_all_by_attribute_type_value($attrib_type_code, | |
1478 $attrib_value); | |
1479 | |
1480 if(@$feats == 0) { | |
1481 throw("MiscFeature with $attrib_type_code=$attrib_value does " . | |
1482 "not exist in DB."); | |
1483 } | |
1484 | |
1485 if(@$feats > 1) { | |
1486 warning("MiscFeature with $attrib_type_code=$attrib_value is " . | |
1487 "ambiguous - using first one found."); | |
1488 } | |
1489 | |
1490 my ($feat) = @$feats; | |
1491 | |
1492 return $self->fetch_by_Feature($feat, $size); | |
1493 } | |
1494 | |
1495 =head2 fetch_normalized_slice_projection | |
1496 | |
1497 Arg [1] : Bio::EnsEMBL::Slice $slice | |
1498 Example : ( optional ) | |
1499 Description: gives back a project style result. The returned slices | |
1500 represent the areas to which there are symlinks for the | |
1501 given slice. start, end show which area on given slice is | |
1502 symlinked | |
1503 Returntype : [[start,end,$slice][]] | |
1504 Exceptions : none | |
1505 Caller : BaseFeatureAdaptor | |
1506 Status : Stable | |
1507 | |
1508 =cut | |
1509 | |
1510 | |
1511 sub fetch_normalized_slice_projection { | |
1512 my $self = shift; | |
1513 my $slice = shift; | |
1514 | |
1515 my $slice_seq_region_id = $self->get_seq_region_id( $slice ); | |
1516 | |
1517 $self->_build_exception_cache() if(!exists($self->{'asm_exc_cache'})); | |
1518 | |
1519 my $result = $self->{'asm_exc_cache'}->{$slice_seq_region_id}; | |
1520 | |
1521 $result ||= []; | |
1522 | |
1523 my (@haps, @pars); | |
1524 | |
1525 foreach my $row (@$result) { | |
1526 my ( $seq_region_id, $seq_region_start, $seq_region_end, | |
1527 $exc_type, $exc_seq_region_id, $exc_seq_region_start, | |
1528 $exc_seq_region_end ) = @$row; | |
1529 | |
1530 # need overlapping PAR and all HAPs if any | |
1531 if( $exc_type eq "PAR" ) { | |
1532 if( $seq_region_start <= $slice->end() && | |
1533 $seq_region_end >= $slice->start() ) { | |
1534 push( @pars, [ $seq_region_start, $seq_region_end, $exc_seq_region_id, | |
1535 $exc_seq_region_start, $exc_seq_region_end ] ); | |
1536 } | |
1537 } else { | |
1538 push( @haps, [ $seq_region_start, $seq_region_end, $exc_seq_region_id, | |
1539 $exc_seq_region_start, $exc_seq_region_end ] ); | |
1540 } | |
1541 } | |
1542 | |
1543 if(!@pars && !@haps) { | |
1544 #just return this slice, there were no haps or pars | |
1545 return [bless ( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment")]; | |
1546 } | |
1547 | |
1548 my @syms; | |
1549 if( @haps >= 1 ) { | |
1550 my @sort_haps = sort { $a->[1] <=> $b->[1] } @haps; | |
1551 | |
1552 my $count =0; | |
1553 my $chr_start = 1; | |
1554 my $hap_start = 1; | |
1555 my $last = 0; | |
1556 | |
1557 my $seq_reg_slice = $self->fetch_by_seq_region_id($slice_seq_region_id); | |
1558 my $exc_slice = $self->fetch_by_seq_region_id( $sort_haps[0][2] ); | |
1559 my $len1 = $seq_reg_slice->length(); | |
1560 my $len2 = $exc_slice->length(); | |
1561 my $max_len = ($len1 > $len2) ? $len1 : $len2; | |
1562 | |
1563 while($count <= scalar(@sort_haps) and !$last){ | |
1564 my $chr_end; | |
1565 my $hap_end; | |
1566 if(defined($sort_haps[$count]) and defined($sort_haps[$count][0]) ){ | |
1567 $hap_end = $sort_haps[$count][0]-1; | |
1568 $chr_end = $sort_haps[$count][3]-1 | |
1569 } | |
1570 else{ | |
1571 $last = 1; | |
1572 $hap_end = $len1; | |
1573 $chr_end = $len2; | |
1574 my $diff = ($hap_end-$hap_start)-($chr_end-$chr_start); | |
1575 if($diff > 0){ | |
1576 push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end+$diff] ); | |
1577 } | |
1578 elsif($diff < 0){ | |
1579 push( @syms, [ $hap_start, $hap_end - $diff, $sort_haps[0][2], $chr_start, $chr_end] ); | |
1580 } | |
1581 else{ | |
1582 push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] ); | |
1583 } | |
1584 next; | |
1585 } | |
1586 if($hap_end and $hap_start < $len1){ # if hap at start or end of chromosome | |
1587 push( @syms, [ $hap_start, $hap_end, $sort_haps[0][2], $chr_start, $chr_end] ); | |
1588 } | |
1589 $chr_start = $chr_end + ($sort_haps[$count][4]-$sort_haps[$count][3]) + 2; | |
1590 $hap_start = $hap_end + ($sort_haps[$count][1]-$sort_haps[$count][0]) + 2; | |
1591 $count++; | |
1592 } | |
1593 | |
1594 | |
1595 } | |
1596 | |
1597 | |
1598 # for now haps and pars should not be both there, but in theory we | |
1599 # could handle it here by cleverly merging the pars into the existing syms, | |
1600 # for now just: | |
1601 push( @syms, @pars ); | |
1602 | |
1603 my $mapper = Bio::EnsEMBL::Mapper->new( "sym", "org" ); | |
1604 my $count = 0; | |
1605 for my $sym ( @syms ) { | |
1606 $mapper->add_map_coordinates( $slice_seq_region_id, $sym->[0], $sym->[1], | |
1607 1, $sym->[2], $sym->[3], $sym->[4] ); | |
1608 } | |
1609 | |
1610 my @linked = $mapper->map_coordinates( $slice_seq_region_id, | |
1611 $slice->start(), $slice->end(), | |
1612 $slice->strand(), "sym" ); | |
1613 | |
1614 # gaps are regions where there is no mapping to another region | |
1615 my $rel_start = 1; | |
1616 | |
1617 #if there was only one coord and it is a gap, we know it is just the | |
1618 #same slice with no overlapping symlinks | |
1619 if(@linked == 1 && $linked[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
1620 return [bless( [1,$slice->length, $slice], "Bio::EnsEMBL::ProjectionSegment" )]; | |
1621 } | |
1622 | |
1623 my @out; | |
1624 for my $coord ( @linked ) { | |
1625 if( $coord->isa( "Bio::EnsEMBL::Mapper::Gap" )) { | |
1626 my $exc_slice = Bio::EnsEMBL::Slice->new_fast({ | |
1627 'start' => $coord->start(), | |
1628 'end' => $coord->end(), | |
1629 'strand' => $slice->strand(), | |
1630 'coord_system' => $slice->coord_system(), | |
1631 'adaptor' => $self, | |
1632 'seq_region_name' => $slice->seq_region_name(), | |
1633 'seq_region_length' => $slice->seq_region_length()}); | |
1634 push( @out, bless ( [ $rel_start, $coord->length()+$rel_start-1, | |
1635 $exc_slice ], "Bio::EnsEMBL::ProjectionSegment") ); | |
1636 } else { | |
1637 my $exc_slice = $self->fetch_by_seq_region_id( $coord->id() ); | |
1638 my $exc2_slice = Bio::EnsEMBL::Slice->new_fast({ | |
1639 | |
1640 'start' => $coord->start(), | |
1641 'end' => $coord->end(), | |
1642 'strand' => $coord->strand(), | |
1643 'seq_region_name' => $exc_slice->seq_region_name(), | |
1644 'seq_region_length' => $exc_slice->seq_region_length(), | |
1645 'coord_system' => $exc_slice->coord_system(), | |
1646 'adaptor' => $self | |
1647 }); | |
1648 | |
1649 push( @out, bless( [ $rel_start, $coord->length() + $rel_start - 1, | |
1650 $exc2_slice ], "Bio::EnsEMBL::ProjectionSegment") ); | |
1651 } | |
1652 $rel_start += $coord->length(); | |
1653 } | |
1654 | |
1655 return \@out; | |
1656 } | |
1657 | |
1658 | |
1659 | |
1660 | |
1661 =head2 store | |
1662 | |
1663 Arg [1] : Bio::EnsEMBL::Slice $slice | |
1664 Arg [2] : (optional) $seqref reference to a string | |
1665 The sequence associated with the slice to be stored. | |
1666 Example : $slice = Bio::EnsEMBL::Slice->new(...); | |
1667 $seq_region_id = $slice_adaptor->store($slice, \$sequence); | |
1668 Description: This stores a slice as a sequence region in the database | |
1669 and returns the seq region id. The passed in slice must | |
1670 start at 1, and must have a valid seq_region name and coordinate | |
1671 system. The attached coordinate system must already be stored in | |
1672 the database. The sequence region is assumed to start at 1 and | |
1673 to have a length equalling the length of the slice. The end of | |
1674 the slice must equal the seq_region_length. | |
1675 If the slice coordinate system is the sequence level coordinate | |
1676 system then the seqref argument must also be passed. If the | |
1677 slice coordinate system is NOT a sequence level coordinate | |
1678 system then the sequence argument cannot be passed. | |
1679 Returntype : int | |
1680 Exceptions : throw if slice has no coord system. | |
1681 throw if slice coord system is not already stored. | |
1682 throw if slice coord system is seqlevel and no sequence is | |
1683 provided. | |
1684 throw if slice coord system is not seqlevel and sequence is | |
1685 provided. | |
1686 throw if slice does not start at 1 | |
1687 throw if sequence is provided and the sequence length does not | |
1688 match the slice length. | |
1689 throw if the SQL insert fails (e.g. on duplicate seq region) | |
1690 throw if slice argument is not passed | |
1691 throw if the slice end is not equal to seq_region_length | |
1692 Caller : database loading scripts | |
1693 Status : Stable | |
1694 | |
1695 =cut | |
1696 | |
1697 | |
1698 | |
1699 sub store { | |
1700 my $self = shift; | |
1701 my $slice = shift; | |
1702 my $seqref = shift; | |
1703 | |
1704 # | |
1705 # Get all of the sanity checks out of the way before storing anything | |
1706 # | |
1707 | |
1708 if(!ref($slice) || !($slice->isa('Bio::EnsEMBL::Slice') or $slice->isa('Bio::EnsEMBL::LRGSlice'))) { | |
1709 throw('Slice argument is required'); | |
1710 } | |
1711 | |
1712 my $cs = $slice->coord_system(); | |
1713 throw("Slice must have attached CoordSystem.") if(!$cs); | |
1714 | |
1715 my $db = $self->db(); | |
1716 if(!$cs->is_stored($db)) { | |
1717 throw("Slice CoordSystem must already be stored in DB.") | |
1718 } | |
1719 | |
1720 if($slice->start != 1 || $slice->strand != 1) { | |
1721 throw("Slice must have start==1 and strand==1."); | |
1722 } | |
1723 | |
1724 if($slice->end() != $slice->seq_region_length()) { | |
1725 throw("Slice must have end==seq_region_length"); | |
1726 } | |
1727 | |
1728 my $sr_len = $slice->length(); | |
1729 my $sr_name = $slice->seq_region_name(); | |
1730 | |
1731 if(!$sr_name) { | |
1732 throw("Slice must have valid seq region name."); | |
1733 } | |
1734 | |
1735 if($cs->is_sequence_level()) { | |
1736 if(!$seqref) { | |
1737 throw("Must provide sequence for sequence level coord system."); | |
1738 } | |
1739 if(ref($seqref) ne 'SCALAR') { | |
1740 throw("Sequence must be a scalar reference."); | |
1741 } | |
1742 my $seq_len = length($$seqref); | |
1743 | |
1744 if($seq_len != $sr_len) { | |
1745 throw("Sequence length ($seq_len) must match slice length ($sr_len)."); | |
1746 } | |
1747 } else { | |
1748 if($seqref) { | |
1749 throw("Cannot provide sequence for non-sequence level seq regions."); | |
1750 } | |
1751 } | |
1752 | |
1753 #store the seq_region | |
1754 | |
1755 my $sth = $db->dbc->prepare("INSERT INTO seq_region " . | |
1756 "SET name = ?, " . | |
1757 " length = ?, " . | |
1758 " coord_system_id = ?" ); | |
1759 | |
1760 $sth->bind_param(1,$sr_name,SQL_VARCHAR); | |
1761 $sth->bind_param(2,$sr_len,SQL_INTEGER); | |
1762 $sth->bind_param(3,$cs->dbID,SQL_INTEGER); | |
1763 | |
1764 $sth->execute(); | |
1765 | |
1766 my $seq_region_id = $sth->{'mysql_insertid'}; | |
1767 | |
1768 if(!$seq_region_id) { | |
1769 throw("Database seq_region insertion failed."); | |
1770 } | |
1771 | |
1772 if($cs->is_sequence_level()) { | |
1773 #store sequence if it was provided | |
1774 my $seq_adaptor = $db->get_SequenceAdaptor(); | |
1775 $seq_adaptor->store($seq_region_id, $$seqref); | |
1776 } | |
1777 | |
1778 #synonyms | |
1779 if(defined($slice->{'synonym'})){ | |
1780 foreach my $syn (@{$slice->{'synonym'}} ){ | |
1781 $syn->seq_region_id($seq_region_id); # set the seq_region_id | |
1782 $syn->adaptor->store($syn); | |
1783 } | |
1784 } | |
1785 | |
1786 | |
1787 $slice->adaptor($self); | |
1788 | |
1789 return $seq_region_id; | |
1790 } | |
1791 | |
1792 | |
1793 =head2 store_assembly | |
1794 | |
1795 Arg [1] : Bio::EnsEMBL::Slice $asm_slice | |
1796 Arg [2] : Bio::EnsEMBL::Slice $cmp_slice | |
1797 Example : $asm = $slice_adaptor->store_assembly( $slice1, $slice2 ); | |
1798 Description: Creates an entry in the analysis table based on the | |
1799 coordinates of the two slices supplied. Returns a string | |
1800 representation of the assembly that gets created. | |
1801 Returntype : string | |
1802 Exceptions : throw if either slice has no coord system (cs). | |
1803 throw unless the cs rank of the asm_slice is lower than the | |
1804 cmp_slice. | |
1805 throw if there is no mapping path between coord systems | |
1806 throw if the lengths of each slice are not equal | |
1807 throw if there are existing mappings between either slice | |
1808 and the oposite cs | |
1809 Caller : database loading scripts | |
1810 Status : Experimental | |
1811 | |
1812 =cut | |
1813 | |
1814 sub store_assembly{ | |
1815 my $self = shift; | |
1816 my $asm_slice = shift; | |
1817 my $cmp_slice = shift; | |
1818 | |
1819 # | |
1820 # Get all of the sanity checks out of the way before storing anything | |
1821 # | |
1822 | |
1823 if(!ref($asm_slice) || !($asm_slice->isa('Bio::EnsEMBL::Slice') or $asm_slice->isa('Bio::EnsEMBL::LRGSlice'))) { | |
1824 throw('Assembled Slice argument is required'); | |
1825 } | |
1826 if(!ref($cmp_slice) || !($cmp_slice->isa('Bio::EnsEMBL::Slice') or $cmp_slice->isa('Bio::EnsEMBL::LRGSlice')) ) { | |
1827 throw('Assembled Slice argument is required'); | |
1828 } | |
1829 | |
1830 my $asm_cs = $asm_slice->coord_system(); | |
1831 throw("Slice must have attached CoordSystem.") if(!$asm_cs); | |
1832 my $cmp_cs = $cmp_slice->coord_system(); | |
1833 throw("Slice must have attached CoordSystem.") if(!$cmp_cs); | |
1834 | |
1835 unless( $asm_cs->rank < $cmp_cs->rank ){ | |
1836 throw("Assembled Slice CoordSystem->rank must be lower than ". | |
1837 "the component Slice Coord_system" ); | |
1838 } | |
1839 | |
1840 my @path = | |
1841 @{ $asm_cs->adaptor()->get_mapping_path( $asm_cs, $cmp_cs ) }; | |
1842 | |
1843 if ( !@path ) { | |
1844 throw("No mapping path defined between " | |
1845 . $asm_cs->name() . " and " | |
1846 . $cmp_cs->name() ); | |
1847 } | |
1848 | |
1849 if( $asm_slice->length != $cmp_slice->length ){ | |
1850 throw("The lengths of the assembled and component slices are not equal" ); | |
1851 } | |
1852 | |
1853 # For now we disallow any existing mappings between the asm slice and cmp | |
1854 # CoordSystem and vice-versa. | |
1855 # Some cases of multiple mappings may be allowable by the API, but their | |
1856 # logic needs to be coded below. | |
1857 | |
1858 my $asm_proj = $asm_slice->project( $cmp_cs->name, $cmp_cs->version ); | |
1859 if( @$asm_proj ){ | |
1860 throw("Regions of the assembled slice are already assembled ". | |
1861 "into the component CoordSystem" ); | |
1862 } | |
1863 my $cmp_proj = $cmp_slice->project( $asm_cs->name, $asm_cs->version ); | |
1864 if( @$cmp_proj ){ | |
1865 throw("Regions of the component slice are already assembled ". | |
1866 "into the assembled CoordSystem" ); | |
1867 } | |
1868 | |
1869 # | |
1870 # Checks complete. Store the data | |
1871 # | |
1872 my $sth = $self->db->dbc->prepare | |
1873 ("INSERT INTO assembly " . | |
1874 "SET asm_seq_region_id = ?, " . | |
1875 " cmp_seq_region_id = ?, " . | |
1876 " asm_start = ?, " . | |
1877 " asm_end = ?, " . | |
1878 " cmp_start = ?, " . | |
1879 " cmp_end = ?, " . | |
1880 " ori = ?" ); | |
1881 | |
1882 my $asm_seq_region_id = $self->get_seq_region_id( $asm_slice ); | |
1883 my $cmp_seq_region_id = $self->get_seq_region_id( $cmp_slice ); | |
1884 my $ori = $asm_slice->strand * $cmp_slice->strand; | |
1885 | |
1886 $sth->bind_param(1,$asm_seq_region_id,SQL_INTEGER); | |
1887 $sth->bind_param(2,$cmp_seq_region_id,SQL_INTEGER); | |
1888 $sth->bind_param(3,$asm_slice->start,SQL_INTEGER); | |
1889 $sth->bind_param(4,$asm_slice->end,SQL_INTEGER); | |
1890 $sth->bind_param(5,$cmp_slice->start,SQL_INTEGER); | |
1891 $sth->bind_param(6,$cmp_slice->end,SQL_INTEGER); | |
1892 $sth->bind_param(7,$ori,SQL_INTEGER); | |
1893 | |
1894 $sth->execute(); | |
1895 | |
1896 #use Data::Dumper qw( Dumper ); | |
1897 #warn Dumper( $self->db->{seq_region_cache} ); | |
1898 #$self->db->{seq_region_cache} = undef; | |
1899 #$self->_cache_seq_regions(); | |
1900 | |
1901 my $ama = $self->db->get_AssemblyMapperAdaptor(); | |
1902 $ama->delete_cache(); | |
1903 | |
1904 | |
1905 return $asm_slice->name . "<>" . $cmp_slice->name; | |
1906 | |
1907 } | |
1908 | |
1909 | |
1910 =head2 prepare | |
1911 | |
1912 Arg [1] : String $sql | |
1913 Example : ( optional ) | |
1914 Description: overrides the default adaptor prepare method. | |
1915 All slice sql will usually use the dna_db. | |
1916 Returntype : DBD::sth | |
1917 Exceptions : none | |
1918 Caller : internal, convenience method | |
1919 Status : Stable | |
1920 | |
1921 =cut | |
1922 | |
1923 sub prepare { | |
1924 my ( $self, $sql ) = @_; | |
1925 return $self->db()->dnadb()->dbc->prepare($sql); | |
1926 } | |
1927 | |
1928 sub _build_exception_cache { | |
1929 my $self = shift; | |
1930 | |
1931 # build up a cache of the entire assembly exception table | |
1932 # it should be small anyway | |
1933 my $sth = | |
1934 $self->prepare( 'SELECT ae.seq_region_id, ae.seq_region_start, ' | |
1935 . 'ae.seq_region_end, ae.exc_type, ae.exc_seq_region_id, ' | |
1936 . 'ae.exc_seq_region_start, ae.exc_seq_region_end ' | |
1937 . 'FROM assembly_exception ae, ' | |
1938 . 'seq_region sr, coord_system cs ' | |
1939 . 'WHERE sr.seq_region_id = ae.seq_region_id ' | |
1940 . 'AND sr.coord_system_id = cs.coord_system_id ' | |
1941 . 'AND cs.species_id = ?' ); | |
1942 | |
1943 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
1944 $sth->execute(); | |
1945 | |
1946 my %hash; | |
1947 $self->{'asm_exc_cache'} = \%hash; | |
1948 | |
1949 my $row; | |
1950 while ( $row = $sth->fetchrow_arrayref() ) { | |
1951 my @result = @$row; | |
1952 $hash{ $result[0] } ||= []; | |
1953 push( @{ $hash{ $result[0] } }, \@result ); | |
1954 } | |
1955 $sth->finish(); | |
1956 } ## end sub _build_exception_cache | |
1957 | |
1958 =head2 cache_toplevel_seq_mappings | |
1959 | |
1960 Args : none | |
1961 Example : $slice_adaptor->cache_toplevel_seq_mappings(); | |
1962 Description: caches all the assembly mappings needed for genes | |
1963 Returntype : None | |
1964 Exceptions : None | |
1965 Caller : general | |
1966 Status : At Risk | |
1967 : New experimental code | |
1968 | |
1969 =cut | |
1970 | |
1971 sub cache_toplevel_seq_mappings { | |
1972 my ($self) = @_; | |
1973 | |
1974 # Get the sequence level to map too | |
1975 | |
1976 my $sql = (<<SSQL); | |
1977 SELECT name | |
1978 FROM coord_system | |
1979 WHERE attrib like "%sequence_level%" | |
1980 AND species_id = ? | |
1981 SSQL | |
1982 | |
1983 my $sth = $self->prepare($sql); | |
1984 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
1985 $sth->execute(); | |
1986 | |
1987 my $sequence_level = $sth->fetchrow_array(); | |
1988 | |
1989 $sth->finish(); | |
1990 | |
1991 my $csa = $self->db->get_CoordSystemAdaptor(); | |
1992 my $ama = $self->db->get_AssemblyMapperAdaptor(); | |
1993 | |
1994 my $cs1 = $csa->fetch_by_name($sequence_level); | |
1995 | |
1996 #get level to map too. | |
1997 | |
1998 $sql = (<<LSQL); | |
1999 SELECT DISTINCT(cs.name) | |
2000 FROM seq_region sr, | |
2001 seq_region_attrib sra, | |
2002 attrib_type at, | |
2003 coord_system cs | |
2004 WHERE sra.seq_region_id = sr.seq_region_id | |
2005 AND sra.attrib_type_id = at.attrib_type_id | |
2006 AND at.code = "toplevel" | |
2007 AND cs.coord_system_id = sr.coord_system_id | |
2008 AND cs.species_id = ? | |
2009 LSQL | |
2010 | |
2011 $sth = $self->prepare($sql); | |
2012 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
2013 $sth->execute(); | |
2014 | |
2015 while ( my $csn = $sth->fetchrow_array() ) { | |
2016 if ( $csn eq $sequence_level ) { next } | |
2017 my $cs2 = $csa->fetch_by_name($csn); | |
2018 my $am = $ama->fetch_by_CoordSystems( $cs1, $cs2 ); | |
2019 $am->register_all(); | |
2020 } | |
2021 | |
2022 } ## end sub cache_toplevel_seq_mappings | |
2023 | |
2024 | |
2025 sub _build_circular_slice_cache { | |
2026 my $self = shift; | |
2027 | |
2028 # build up a cache of circular sequence region ids | |
2029 my $sth = | |
2030 $self->prepare( "SELECT sra.seq_region_id FROM seq_region_attrib sra " | |
2031 . "INNER JOIN attrib_type at ON sra.attrib_type_id = at.attrib_type_id " | |
2032 . "INNER JOIN seq_region sr ON sra.seq_region_id = sr.seq_region_id " | |
2033 . "INNER JOIN coord_system cs ON sr.coord_system_id = cs.coord_system_id " | |
2034 . "WHERE code = 'circular_seq' and cs.species_id = ?"); | |
2035 | |
2036 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
2037 $sth->execute(); | |
2038 | |
2039 my $id; | |
2040 my %hash; | |
2041 if ( ($id) = $sth->fetchrow_array() ) { | |
2042 $self->{'circular_sr_id_cache'} = \%hash; | |
2043 $self->{'is_circular'} = 1; | |
2044 $hash{ $id } = $id; | |
2045 while ( ($id) = $sth->fetchrow_array() ) { | |
2046 $hash{ $id } = $id; | |
2047 } | |
2048 } else { | |
2049 $self->{'is_circular'} = 0; | |
2050 } | |
2051 $sth->finish(); | |
2052 } ## end _build_circular_slice_cache | |
2053 | |
2054 | |
2055 ##################################### | |
2056 # sub DEPRECATED METHODs | |
2057 ##################################### | |
2058 | |
2059 =head2 fetch_by_mapfrag | |
2060 | |
2061 Function: DEPRECATED use fetch_by_misc_feature_attribute('synonym',$mapfrag) | |
2062 | |
2063 =cut | |
2064 | |
2065 sub fetch_by_mapfrag{ | |
2066 my ($self,$mymapfrag,$flag,$size) = @_; | |
2067 deprecate('Use fetch_by_misc_feature_attribute instead'); | |
2068 $flag ||= 'fixed-width'; # alt.. 'context' | |
2069 $size ||= $flag eq 'fixed-width' ? 100000 : 0; | |
2070 return $self->fetch_by_misc_feature_attribute('synonym',$mymapfrag,$size); | |
2071 } | |
2072 | |
2073 | |
2074 | |
2075 =head2 fetch_by_chr_start_end | |
2076 | |
2077 Description: DEPRECATED use fetch_by_region instead | |
2078 | |
2079 =cut | |
2080 | |
2081 sub fetch_by_chr_start_end { | |
2082 my ($self,$chr,$start,$end) = @_; | |
2083 deprecate('Use fetch_by_region() instead'); | |
2084 | |
2085 #assume that by chromosome the user actually meant top-level coord | |
2086 #system since this is the old behaviour of this deprecated method | |
2087 my $csa = $self->db->get_CoordSystemAdaptor(); | |
2088 my ($cs) = @{$csa->fetch_all()}; # get the highest coord system | |
2089 | |
2090 return $self->fetch_by_region($cs->name,$chr,$start,$end,1,$cs->version); | |
2091 } | |
2092 | |
2093 | |
2094 | |
2095 =head2 fetch_by_contig_name | |
2096 | |
2097 Description: Deprecated. Use fetch_by_region(), Slice::project(), | |
2098 Slice::expand() instead | |
2099 | |
2100 =cut | |
2101 | |
2102 sub fetch_by_contig_name { | |
2103 my ($self, $name, $size) = @_; | |
2104 | |
2105 deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); | |
2106 | |
2107 #previously wanted chromosomal slice on a given contig. Assume this means | |
2108 #a top-level slice on a given seq_region in the seq_level coord system | |
2109 my $csa = $self->db()->get_CoordSystemAdaptor(); | |
2110 my $seq_level = $csa->fetch_sequence_level(); | |
2111 | |
2112 my $seq_lvl_slice = $self->fetch_by_region($seq_level->name(), $name); | |
2113 | |
2114 if(!$seq_lvl_slice) { | |
2115 return undef; | |
2116 } | |
2117 | |
2118 my @projection = @{$seq_lvl_slice->project('toplevel')}; | |
2119 | |
2120 if(@projection != 1) { | |
2121 warning("$name is mapped to multiple toplevel locations."); | |
2122 } | |
2123 | |
2124 return $projection[0]->[2]->expand($size, $size); | |
2125 } | |
2126 | |
2127 | |
2128 =head2 fetch_by_clone_accession | |
2129 | |
2130 Description: DEPRECATED. Use fetch_by_region, Slice::project, Slice::expand | |
2131 instead. | |
2132 | |
2133 =cut | |
2134 | |
2135 sub fetch_by_clone_accession{ | |
2136 my ($self,$name,$size) = @_; | |
2137 | |
2138 deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); | |
2139 | |
2140 my $csa = $self->db()->get_CoordSystemAdaptor(); | |
2141 my $clone_cs = $csa->fetch_by_name('clone'); | |
2142 | |
2143 if(!$clone_cs) { | |
2144 warning('Clone coordinate system does not exist for this species'); | |
2145 return undef; | |
2146 } | |
2147 | |
2148 #this unfortunately needs a version on the end to work | |
2149 if(! ($name =~ /\./)) { | |
2150 my $sth = | |
2151 $self->prepare( "SELECT sr.name " | |
2152 . "FROM seq_region sr, coord_system cs " | |
2153 . "WHERE cs.name = 'clone' " | |
2154 . "AND cs.coord_system_id = sr.coord_system_id " | |
2155 . "AND sr.name LIKE '$name.%'" | |
2156 . "AND cs.species_id = ?" ); | |
2157 | |
2158 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); | |
2159 $sth->execute(); | |
2160 | |
2161 if(!$sth->rows()) { | |
2162 $sth->finish(); | |
2163 throw("Clone $name not found in database"); | |
2164 } | |
2165 | |
2166 ($name) = $sth->fetchrow_array(); | |
2167 | |
2168 $sth->finish(); | |
2169 } | |
2170 | |
2171 my $clone = $self->fetch_by_region($clone_cs->name(), $name); | |
2172 return undef if(!$clone); | |
2173 | |
2174 my @projection = @{$clone->project('toplevel')}; | |
2175 | |
2176 if(@projection != 1) { | |
2177 warning("$name is mapped to multiple locations."); | |
2178 } | |
2179 | |
2180 return $projection[0]->[2]->expand($size, $size); | |
2181 } | |
2182 | |
2183 | |
2184 =head2 fetch_by_supercontig_name | |
2185 | |
2186 Description: DEPRECATED. Use fetch_by_region(), Slice::project() and | |
2187 Slice::expand() instead | |
2188 | |
2189 =cut | |
2190 | |
2191 sub fetch_by_supercontig_name { | |
2192 my ($self,$name, $size) = @_; | |
2193 | |
2194 deprecate('Use fetch_by_region(), Slice::project() and Slice::expand().'); | |
2195 | |
2196 my $csa = $self->db()->get_CoordSystemAdaptor(); | |
2197 my $sc_level = $csa->fetch_by_name('supercontig'); | |
2198 | |
2199 if(!$sc_level) { | |
2200 warning('No supercontig coordinate system exists for this species.'); | |
2201 return undef; | |
2202 } | |
2203 | |
2204 my $sc_slice = $self->fetch_by_region($sc_level->name(),$name); | |
2205 | |
2206 return undef if(!$sc_slice); | |
2207 | |
2208 my @projection = @{$sc_slice->project('toplevel')}; | |
2209 | |
2210 if(@projection > 1) { | |
2211 warning("$name is mapped to multiple locations in toplevel"); | |
2212 } | |
2213 | |
2214 return $projection[0]->[2]->expand($size, $size); | |
2215 } | |
2216 | |
2217 | |
2218 | |
2219 | |
2220 =head2 list_overlapping_supercontigs | |
2221 | |
2222 Description: DEPRECATED use Slice::project instead | |
2223 | |
2224 =cut | |
2225 | |
2226 sub list_overlapping_supercontigs { | |
2227 my ($self,$slice) = @_; | |
2228 | |
2229 deprecate('Use Slice::project() instead.'); | |
2230 | |
2231 my $csa = $self->db()->get_CoordSystemAdaptor(); | |
2232 my $sc_level = $csa->fetch_by_name('supercontig'); | |
2233 | |
2234 if(!$sc_level) { | |
2235 warning('No supercontig coordinate system exists for this species.'); | |
2236 return undef; | |
2237 } | |
2238 | |
2239 my @out; | |
2240 foreach my $seg (@{$slice->project($sc_level->name(), $sc_level->version)}){ | |
2241 push @out, $seg->[2]->seq_region_name(); | |
2242 } | |
2243 | |
2244 return \@out; | |
2245 } | |
2246 | |
2247 | |
2248 | |
2249 =head2 fetch_by_chr_name | |
2250 | |
2251 Description: DEPRECATED. Use fetch by region instead | |
2252 | |
2253 =cut | |
2254 | |
2255 sub fetch_by_chr_name{ | |
2256 my ($self,$chr_name) = @_; | |
2257 deprecate('Use fetch_by_region() instead.'); | |
2258 | |
2259 my $csa = $self->db->get_CoordSystemAdaptor(); | |
2260 | |
2261 my $top_cs = @{$csa->fetch_all()}; | |
2262 | |
2263 return $self->fetch_by_region($top_cs->name(),$chr_name, | |
2264 undef,undef,undef,$top_cs->version); | |
2265 } | |
2266 | |
2267 | |
2268 | |
2269 | |
2270 | |
2271 | |
2272 1; |