0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::DBSQL::RepeatFeatureAdaptor
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 $rfa = $database_adaptor->get_RepeatFeatureAdaptor();
|
|
28
|
|
29 my $repeat = $rfa->fetch_by_dbID(1234);
|
|
30 my @repeats = @{ $rfa->fetch_all_by_Slice($slice) };
|
|
31
|
|
32 =head1 DESCRIPTION
|
|
33
|
|
34 This is an adaptor for the retrieval and storage of RepeatFeature
|
|
35 objects from the database. Most of the implementation is in the
|
|
36 superclass BaseFeatureAdaptor.
|
|
37
|
|
38 =head1 METHODS
|
|
39
|
|
40 =cut
|
|
41
|
|
42 package Bio::EnsEMBL::DBSQL::RepeatFeatureAdaptor;
|
|
43
|
|
44 use strict;
|
|
45 use Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor;
|
|
46 use Bio::EnsEMBL::RepeatFeature;
|
|
47 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
48 use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
|
|
49
|
|
50 use vars qw(@ISA);
|
|
51
|
|
52 @ISA = qw(Bio::EnsEMBL::DBSQL::BaseFeatureAdaptor);
|
|
53
|
|
54
|
|
55 =head2 fetch_all_by_Slice
|
|
56
|
|
57 Arg [1] : Bio::EnsEMBL::Slice $slice
|
|
58 Arg [2] : (optional) string $logic_name
|
|
59 Limits RepeatFeatures obtained to those having an Analysis with
|
|
60 of the specified logic_name. If no logic name is specified
|
|
61 Repeats of all analysis types are retrieved.
|
|
62 Arg [3] : (optional) string/array $repeat_type
|
|
63 Limits RepeatFeatures obtained to those of specified
|
|
64 repeat_type
|
|
65 Example : @rfeats = @{$rfa->fetch_all_by_Slice($slice, undef, 'Type II Transposons')};
|
|
66 @rfeats = @{$rfa->fetch_all_by_Slice($slice, undef, ['Type II Transposons', 'RNA repeats'])};
|
|
67 Description: Retrieves repeat features overlapping the area designated by
|
|
68 the provided slice argument. Returned features will be in
|
|
69 in the same coordinate system as the provided slice and will
|
|
70 have coordinates relative to the slice start.
|
|
71 Returntype : reference to a list of Bio::EnsEMBL::RepeatFeatures.
|
|
72 Exceptions : throw on bad argument
|
|
73 Caller : Slice::get_all_RepeatFeatures
|
|
74 Status : Stable
|
|
75
|
|
76 =cut
|
|
77
|
|
78 sub fetch_all_by_Slice {
|
|
79 my $self = shift;
|
|
80 my $slice = shift;
|
|
81 my $logic_name = shift;
|
|
82 my $repeat_type = shift;
|
|
83
|
|
84 my $constraint = '';
|
|
85
|
|
86 # MySQL was optimising the query the incorrect way when joining to
|
|
87 # the repeat_consensus table on type
|
|
88 $self->_straight_join(1);
|
|
89
|
|
90 if($repeat_type) {
|
|
91 my $rta = wrap_array($repeat_type);
|
|
92 if(scalar(@{$rta}) > 1) {
|
|
93 $constraint .= sprintf('rc.repeat_type IN (%s)', join(q{,}, map {"'${_}'"} @{$rta}));
|
|
94 }
|
|
95 else {
|
|
96 $constraint .= "rc.repeat_type = '${repeat_type}'";
|
|
97 }
|
|
98 }
|
|
99
|
|
100 my $result =
|
|
101 $self->fetch_all_by_Slice_constraint($slice,$constraint,$logic_name);
|
|
102
|
|
103
|
|
104 $self->_straight_join(0);
|
|
105
|
|
106 return $result;
|
|
107 }
|
|
108
|
|
109
|
|
110 # _tablename
|
|
111 #
|
|
112 # Arg [1] : none
|
|
113 # Example : none
|
|
114 # Description: PROTECTED Implementation of abstract superclass method to
|
|
115 # provide the name of the tables to query
|
|
116 # Returntype : string
|
|
117 # Exceptions : none
|
|
118 # Caller : internal
|
|
119
|
|
120
|
|
121 sub _tables {
|
|
122 my $self = shift;
|
|
123
|
|
124 return (['repeat_feature', 'r'], ['repeat_consensus', 'rc']);
|
|
125 }
|
|
126
|
|
127
|
|
128 # _columns
|
|
129 #
|
|
130 # Arg [1] : none
|
|
131 # Example : none
|
|
132 # Description: PROTECTED Implementation of abstract superclass method to
|
|
133 # provide the name of the columns to query
|
|
134 # Returntype : list of strings
|
|
135 # Exceptions : none
|
|
136 # Caller : internal
|
|
137
|
|
138 sub _columns {
|
|
139 my $self = shift;
|
|
140
|
|
141 return qw (r.repeat_feature_id
|
|
142 r.seq_region_id
|
|
143 r.seq_region_start
|
|
144 r.seq_region_end
|
|
145 r.seq_region_strand
|
|
146 r.repeat_consensus_id
|
|
147 r.repeat_start
|
|
148 r.repeat_end
|
|
149 r.analysis_id
|
|
150 r.score
|
|
151 rc.repeat_name
|
|
152 rc.repeat_class
|
|
153 rc.repeat_type
|
|
154 rc.repeat_consensus);
|
|
155 }
|
|
156
|
|
157
|
|
158 # _default_where_clause
|
|
159 # Arg [1] : none
|
|
160 # Example : none
|
|
161 # Description: Overrides superclass method to provide an additional
|
|
162 # table joining constraint before the SQL query is performed.
|
|
163 # Returntype : string
|
|
164 # Exceptions : none
|
|
165 # Caller : generic_fetch
|
|
166 #
|
|
167
|
|
168 sub _default_where_clause {
|
|
169 my $self = shift;
|
|
170
|
|
171 return 'r.repeat_consensus_id = rc.repeat_consensus_id';
|
|
172 }
|
|
173
|
|
174
|
|
175
|
|
176 # Description: PROTECTED implementation of abstract superclass method.
|
|
177 # responsible for the creation of RepeatFeatures from a
|
|
178 # hashref generated from an SQL query
|
|
179
|
|
180 sub _objs_from_sth {
|
|
181 my ($self, $sth, $mapper, $dest_slice) = @_;
|
|
182
|
|
183 #
|
|
184 # This code is ugly because an attempt has been made to remove as many
|
|
185 # function calls as possible for speed purposes. Thus many caches and
|
|
186 # a fair bit of gymnastics is used.
|
|
187 #
|
|
188
|
|
189 my $rca = $self->db()->get_RepeatConsensusAdaptor();
|
|
190 my $sa = $self->db()->get_SliceAdaptor();
|
|
191 my $aa = $self->db->get_AnalysisAdaptor();
|
|
192
|
|
193 my @features;
|
|
194 my %rc_hash;
|
|
195 my %analysis_hash;
|
|
196 my %slice_hash;
|
|
197 my %sr_name_hash;
|
|
198 my %sr_cs_hash;
|
|
199
|
|
200 my($repeat_feature_id, $seq_region_id, $seq_region_start, $seq_region_end,
|
|
201 $seq_region_strand, $repeat_consensus_id, $repeat_start, $repeat_end,
|
|
202 $analysis_id, $score, $repeat_name, $repeat_class, $repeat_type,
|
|
203 $repeat_consensus);
|
|
204
|
|
205 $sth->bind_columns( \$repeat_feature_id, \$seq_region_id, \$seq_region_start,
|
|
206 \$seq_region_end, \$seq_region_strand,
|
|
207 \$repeat_consensus_id, \$repeat_start,\$repeat_end,
|
|
208 \$analysis_id, \$score, \$repeat_name, \$repeat_class,
|
|
209 \$repeat_type, \$repeat_consensus );
|
|
210
|
|
211 my $asm_cs;
|
|
212 my $cmp_cs;
|
|
213 my $asm_cs_vers;
|
|
214 my $asm_cs_name;
|
|
215 my $cmp_cs_vers;
|
|
216 my $cmp_cs_name;
|
|
217 if($mapper) {
|
|
218 $asm_cs = $mapper->assembled_CoordSystem();
|
|
219 $cmp_cs = $mapper->component_CoordSystem();
|
|
220 $asm_cs_name = $asm_cs->name();
|
|
221 $asm_cs_vers = $asm_cs->version();
|
|
222 $cmp_cs_name = $cmp_cs->name();
|
|
223 $cmp_cs_vers = $cmp_cs->version();
|
|
224 }
|
|
225
|
|
226 my $dest_slice_start;
|
|
227 my $dest_slice_end;
|
|
228 my $dest_slice_strand;
|
|
229 my $dest_slice_length;
|
|
230 my $dest_slice_sr_name;
|
|
231 my $dest_slice_sr_id;
|
|
232 if($dest_slice) {
|
|
233 $dest_slice_start = $dest_slice->start();
|
|
234 $dest_slice_end = $dest_slice->end();
|
|
235 $dest_slice_strand = $dest_slice->strand();
|
|
236 $dest_slice_length = $dest_slice->length();
|
|
237 $dest_slice_sr_name = $dest_slice->seq_region_name();
|
|
238 $dest_slice_sr_id = $dest_slice->get_seq_region_id();
|
|
239 }
|
|
240
|
|
241 FEATURE: while($sth->fetch()) {
|
|
242 #create a repeat consensus object
|
|
243
|
|
244 my $rc = $rc_hash{$repeat_consensus_id} ||=
|
|
245 Bio::EnsEMBL::RepeatConsensus->new_fast
|
|
246 ({'dbID' => $repeat_consensus_id,
|
|
247 'adaptor' => $rca,
|
|
248 'name' => $repeat_name,
|
|
249 'repeat_class' => $repeat_class,
|
|
250 'repeat_type' => $repeat_type,
|
|
251 'repeat_consensus' => $repeat_consensus,
|
|
252 'length' => length($repeat_consensus)});
|
|
253
|
|
254 #get the analysis object
|
|
255 my $analysis = $analysis_hash{$analysis_id} ||=
|
|
256 $aa->fetch_by_dbID($analysis_id);
|
|
257 #need to get the internal_seq_region, if present
|
|
258 $seq_region_id = $self->get_seq_region_id_internal($seq_region_id);
|
|
259 my $slice = $slice_hash{"ID:".$seq_region_id};
|
|
260
|
|
261 if(!$slice) {
|
|
262 $slice = $sa->fetch_by_seq_region_id($seq_region_id);
|
|
263 $slice_hash{"ID:".$seq_region_id} = $slice;
|
|
264 $sr_name_hash{$seq_region_id} = $slice->seq_region_name();
|
|
265 $sr_cs_hash{$seq_region_id} = $slice->coord_system();
|
|
266 }
|
|
267
|
|
268 my $sr_name = $sr_name_hash{$seq_region_id};
|
|
269 my $sr_cs = $sr_cs_hash{$seq_region_id};
|
|
270 #
|
|
271 # remap the feature coordinates to another coord system
|
|
272 # if a mapper was provided
|
|
273 #
|
|
274 if($mapper) {
|
|
275
|
|
276 if (defined $dest_slice && $mapper->isa('Bio::EnsEMBL::ChainedAssemblyMapper') ) {
|
|
277 ( $seq_region_id, $seq_region_start,
|
|
278 $seq_region_end, $seq_region_strand )
|
|
279 =
|
|
280 $mapper->map( $sr_name, $seq_region_start, $seq_region_end,
|
|
281 $seq_region_strand, $sr_cs, 1, $dest_slice);
|
|
282
|
|
283 } else {
|
|
284
|
|
285 ( $seq_region_id, $seq_region_start,
|
|
286 $seq_region_end, $seq_region_strand )
|
|
287 =
|
|
288 $mapper->fastmap( $sr_name, $seq_region_start, $seq_region_end,
|
|
289 $seq_region_strand, $sr_cs );
|
|
290 }
|
|
291
|
|
292 #skip features that map to gaps or coord system boundaries
|
|
293 next FEATURE if(!defined($seq_region_id));
|
|
294
|
|
295 #get a slice in the coord system we just mapped to
|
|
296 # if($asm_cs == $sr_cs || ($cmp_cs != $sr_cs && $asm_cs->equals($sr_cs))) {
|
|
297 $slice = $slice_hash{"ID:".$seq_region_id} ||=
|
|
298 $sa->fetch_by_seq_region_id($seq_region_id);
|
|
299 # } else {
|
|
300 # $slice = $slice_hash{"NAME:$sr_name:$asm_cs_name:$asm_cs_vers"} ||=
|
|
301 # $sa->fetch_by_region($asm_cs_name, $sr_name, undef, undef, undef,
|
|
302 # $asm_cs_vers);
|
|
303 # }
|
|
304 }
|
|
305
|
|
306 #
|
|
307 # If a destination slice was provided convert the coords
|
|
308 # If the dest_slice starts at 1 and is foward strand, nothing needs doing
|
|
309 #
|
|
310 if($dest_slice) {
|
|
311 if($dest_slice_start != 1 || $dest_slice_strand != 1) {
|
|
312 if($dest_slice_strand == 1) {
|
|
313 $seq_region_start = $seq_region_start - $dest_slice_start + 1;
|
|
314 $seq_region_end = $seq_region_end - $dest_slice_start + 1;
|
|
315 } else {
|
|
316 my $tmp_seq_region_start = $seq_region_start;
|
|
317 $seq_region_start = $dest_slice_end - $seq_region_end + 1;
|
|
318 $seq_region_end = $dest_slice_end - $tmp_seq_region_start + 1;
|
|
319 $seq_region_strand *= -1;
|
|
320 }
|
|
321 }
|
|
322
|
|
323 #throw away features off the end of the requested slice
|
|
324 if($seq_region_end < 1 || $seq_region_start > $dest_slice_length ||
|
|
325 ( $dest_slice_sr_id ne $seq_region_id )) {
|
|
326 next FEATURE;
|
|
327 }
|
|
328 $slice = $dest_slice;
|
|
329 }
|
|
330
|
|
331 # Finally, create the new RepeatFeature.
|
|
332 push( @features,
|
|
333 $self->_create_feature_fast( 'Bio::EnsEMBL::RepeatFeature', {
|
|
334 'dbID' => $repeat_feature_id,
|
|
335 'analysis' => $analysis,
|
|
336 'start' => $seq_region_start,
|
|
337 'end' => $seq_region_end,
|
|
338 'strand' => $seq_region_strand,
|
|
339 'score' => $score,
|
|
340 'hstart' => $repeat_start,
|
|
341 'hend' => $repeat_end,
|
|
342 'repeat_consensus' => $rc,
|
|
343 'adaptor' => $self,
|
|
344 'slice' => $slice
|
|
345 } ) );
|
|
346
|
|
347 }
|
|
348
|
|
349 return \@features;
|
|
350 }
|
|
351
|
|
352
|
|
353 =head2 store
|
|
354
|
|
355 Arg [1] : list of Bio::EnsEMBL::RepeatFeatures $repeat_feature_id
|
|
356 the list of repeat features to store in the database
|
|
357 Example : $repeat_feature_adaptor->store(@repeat_features);
|
|
358 Description: stores a repeat feature in the database
|
|
359 Returntype : none
|
|
360 Exceptions : if the repeat features do not have attached sequences
|
|
361 or if repeat_consensus are not present
|
|
362 Caller : general
|
|
363 Status : Stable
|
|
364
|
|
365 =cut
|
|
366
|
|
367 sub store {
|
|
368 my( $self, @repeats ) = @_;
|
|
369
|
|
370 my $db = $self->db();
|
|
371 my $rca = $db->get_RepeatConsensusAdaptor();
|
|
372 my $sa = $db->get_SliceAdaptor();
|
|
373 my ($cons, $db_id);
|
|
374
|
|
375 my $sth = $self->prepare(qq{
|
|
376 INSERT into repeat_feature( repeat_feature_id
|
|
377 , seq_region_id
|
|
378 , seq_region_start
|
|
379 , seq_region_end
|
|
380 , seq_region_strand
|
|
381 , repeat_consensus_id
|
|
382 , repeat_start
|
|
383 , repeat_end
|
|
384 , score
|
|
385 , analysis_id )
|
|
386 VALUES(NULL, ?,?,?,?,?,?,?,?,?)
|
|
387 });
|
|
388
|
|
389 FEATURE: foreach my $rf (@repeats) {
|
|
390 if(!ref($rf) || !$rf->isa('Bio::EnsEMBL::RepeatFeature')) {
|
|
391 throw('Expected RepeatFeature argument not [' . ref($rf) .'].');
|
|
392 }
|
|
393
|
|
394 if($rf->is_stored($db)) {
|
|
395 warning("RepeatFeature [".$rf->dbID."] is already stored in this DB.");
|
|
396 next FEATURE;
|
|
397 }
|
|
398
|
|
399 my $cons = $rf->repeat_consensus();
|
|
400 throw("Must have a RepeatConsensus attached") if(!defined($cons));
|
|
401
|
|
402 # for tandem repeats - simply store consensus and repeat
|
|
403 # one pair per hit. don't need to check consensi stored
|
|
404 # already. consensus has name and class set to 'trf'
|
|
405
|
|
406 if ($cons->repeat_class eq 'trf') {
|
|
407
|
|
408 # Look for matches already stored
|
|
409 my @match = @{$rca->fetch_all_by_class_seq('trf', $cons->repeat_consensus)};
|
|
410 if (@match) {
|
|
411 $cons->dbID($match[0]->dbID());
|
|
412 }
|
|
413 else {
|
|
414 $rca->store($cons);
|
|
415 }
|
|
416
|
|
417 } elsif ($cons->repeat_class eq 'Simple_repeat') {
|
|
418
|
|
419 my $rcon = $cons->name;
|
|
420 $rcon =~ s/\((\S+)\)n/$1/; # get repeat element
|
|
421 $cons->repeat_consensus($rcon);
|
|
422
|
|
423 # Look for matches already stored
|
|
424 my $match = $rca->fetch_by_name_class($cons->name, 'Simple_repeat');
|
|
425 if ($match) {
|
|
426 $cons->dbID($match->dbID());
|
|
427 }
|
|
428 else {
|
|
429 $rca->store($cons);
|
|
430 }
|
|
431
|
|
432 } else {
|
|
433
|
|
434 # for other repeats - need to see if a consensus is stored already
|
|
435 if(!$cons->dbID) {
|
|
436 my $match = ($rca->fetch_by_name($cons->name));
|
|
437
|
|
438 if($match) {
|
|
439 #set the consensus dbID to be the same as the database one
|
|
440 $cons->dbID($match->dbID());
|
|
441 } else {
|
|
442 # if we don't match a consensus already stored create a fake one
|
|
443 # and set consensus to 'N' as null seq not allowed
|
|
444 # FIXME: not happy with this, but ho hum ...
|
|
445 warning("Can't find " . $cons->name . "\n");
|
|
446 $cons->repeat_consensus("N");
|
|
447 $rca->store($cons);
|
|
448 }
|
|
449 }
|
|
450
|
|
451 #if (@match > 1) {
|
|
452 #multiple consensi were matched
|
|
453 # $self->warn(@match . " consensi for " . $cons->name . "\n");
|
|
454 #}
|
|
455 }
|
|
456
|
|
457 my $slice = $rf->slice();
|
|
458 if(!ref($slice) || !($slice->isa("Bio::EnsEMBL::Slice") or $slice->isa('Bio::EnsEMBL::LRGSlice'))) {
|
|
459 throw("RepeatFeature cannot be stored without an associated slice.");
|
|
460 }
|
|
461
|
|
462 my $original = $rf;
|
|
463 my $seq_region_id;
|
|
464 ($rf, $seq_region_id) = $self->_pre_store($rf);
|
|
465
|
|
466 $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
|
|
467 $sth->bind_param(2,$rf->start,SQL_INTEGER);
|
|
468 $sth->bind_param(3,$rf->end,SQL_INTEGER);
|
|
469 $sth->bind_param(4,$rf->strand,SQL_TINYINT);
|
|
470 $sth->bind_param(5,$rf->repeat_consensus->dbID,SQL_INTEGER);
|
|
471 $sth->bind_param(6,$rf->hstart,SQL_INTEGER);
|
|
472 $sth->bind_param(7,$rf->hend,SQL_INTEGER);
|
|
473 $sth->bind_param(8,$rf->score,SQL_DOUBLE);
|
|
474 $sth->bind_param(9,$rf->analysis->dbID,SQL_INTEGER);
|
|
475
|
|
476 $sth->execute();
|
|
477
|
|
478 my $db_id = $sth->{'mysql_insertid'}
|
|
479 or throw("Didn't get an insertid from the INSERT statement");
|
|
480
|
|
481 $original->dbID($db_id);
|
|
482 $original->adaptor($self);
|
|
483 }
|
|
484 }
|
|
485
|
|
486
|
|
487 =head2 list_dbIDs
|
|
488
|
|
489 Arg [1] : none
|
|
490 Example : @feature_ids = @{$repeat_feature_adaptor->list_dbIDs()};
|
|
491 Description: Gets an array of internal ids for all repeat features in the current db
|
|
492 Arg[1] : <optional> int. not 0 for the ids to be sorted by the seq_region.
|
|
493 Returntype : list of ints
|
|
494 Exceptions : none
|
|
495 Caller : ?
|
|
496 Status : Stable
|
|
497
|
|
498 =cut
|
|
499
|
|
500 sub list_dbIDs {
|
|
501 my ($self, $ordered) = @_;
|
|
502
|
|
503 return $self->_list_dbIDs("repeat_feature", undef, $ordered);
|
|
504 }
|
|
505
|
|
506 1;
|
|
507
|
|
508
|
|
509
|
|
510
|
|
511
|