Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/GFF/Aggregator.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 NAME | |
2 | |
3 Bio::DB::GFF::Aggregator -- Aggregate GFF groups into composite features | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 use Bio::DB::GFF; | |
8 | |
9 my $agg1 = Bio::DB::GFF::Aggregator->new(-method => 'cistron', | |
10 -main_method => 'locus', | |
11 -sub_parts => ['allele','variant'] | |
12 ); | |
13 | |
14 my $agg2 = Bio::DB::GFF::Aggregator->new(-method => 'splice_group', | |
15 -sub_parts => 'transcript'); | |
16 | |
17 my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', | |
18 -aggregator => [$agg1,$agg2], | |
19 -dsn => 'dbi:mysql:elegans42', | |
20 ); | |
21 | |
22 | |
23 =head1 DESCRIPTION | |
24 | |
25 Bio::DB::GFF::Aggregator is used to aggregate GFF groups into | |
26 composite features. Each composite feature has a "main part", the | |
27 top-level feature, and a series of zero or more subparts, retrieved | |
28 with the sub_SeqFeature() method. The aggregator class is designed to | |
29 be subclassable, allowing a variety of GFF feature types to be | |
30 supported. | |
31 | |
32 The base Bio::DB::GFF::Aggregator class is generic, and can be used to | |
33 create specific instances to be passed to the -aggregator argument of | |
34 Bio::DB::GFF-E<gt>new() call. The various subclasses of | |
35 Bio::DB::GFF::Aggregator are tuned for specific common feature types | |
36 such as clones, gapped alignments and transcripts. | |
37 | |
38 Instances of Bio::DB::GFF::Aggregator have three attributes: | |
39 | |
40 =over 3 | |
41 | |
42 =item method | |
43 | |
44 This is the GFF method field of the composite feature as a whole. For | |
45 example, "transcript" may be used for a composite feature created by | |
46 aggregating individual intron, exon and UTR features. | |
47 | |
48 =item main method | |
49 | |
50 Sometimes GFF groups are organized hierarchically, with one feature | |
51 logically containing another. For example, in the C. elegans schema, | |
52 methods of type "Sequence:curated" correspond to regions covered by | |
53 curated genes. There can be zero or one main methods. | |
54 | |
55 =item subparts | |
56 | |
57 This is a list of one or more methods that correspond to the component | |
58 features of the aggregates. For example, in the C. elegans database, | |
59 the subparts of transcript are "intron", "exon" and "CDS". | |
60 | |
61 =back | |
62 | |
63 Aggregators have two main methods that can be overridden in | |
64 subclasses: | |
65 | |
66 =over 4 | |
67 | |
68 =item disaggregate() | |
69 | |
70 This method is called by the Adaptor object prior to fetching a list | |
71 of features. The method is passed an associative array containing the | |
72 [method,source] pairs that the user has requested, and it returns a | |
73 list of raw features that it would like the adaptor to fetch. | |
74 | |
75 =item aggregate() | |
76 | |
77 This method is called by the Adaptor object after it has fetched | |
78 features. The method is passed a list of raw features and is expected | |
79 to add its composite features to the list. | |
80 | |
81 =back | |
82 | |
83 The disaggregate() and aggregate() methods provided by the base | |
84 Aggregator class should be sufficient for many applications. In this | |
85 case, it suffices for subclasses to override the following methods: | |
86 | |
87 =over 4 | |
88 | |
89 =item method() | |
90 | |
91 Return the default method for the composite feature as a whole. | |
92 | |
93 =item main_name() | |
94 | |
95 Return the default main method name. | |
96 | |
97 =item part_names() | |
98 | |
99 Return a list of subpart method names. | |
100 | |
101 =back | |
102 | |
103 Provided that method() and part_names() are overridden (and optionally | |
104 main_name() as well), then the bare name of the aggregator subclass | |
105 can be passed to the -aggregator argument of Bio::DB::GFF-E<gt>new(). For example, | |
106 this is a small subclass that will aggregate features of type "allele" | |
107 and "polymorphism" into an aggregate named "mutant": | |
108 | |
109 package Bio::DB::GFF::Aggregator::mutant; | |
110 | |
111 use strict; | |
112 use Bio::DB::GFF::Aggregator; | |
113 | |
114 use vars '@ISA'; | |
115 @ISA = 'Bio::DB::GFF::Aggregator'; | |
116 | |
117 sub method { 'mutant' } | |
118 | |
119 sub part_names { | |
120 return qw(allele polymorphism); | |
121 } | |
122 | |
123 1; | |
124 | |
125 Once installed, this aggregator can be passed to Bio::DB::GFF-E<gt>new() | |
126 by name like so: | |
127 | |
128 my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql', | |
129 -aggregator => 'mutant', | |
130 -dsn => 'dbi:mysql:elegans42', | |
131 ); | |
132 | |
133 =head1 API | |
134 | |
135 The remainder of this document describes the public and private | |
136 methods implemented by this module. | |
137 | |
138 =cut | |
139 | |
140 package Bio::DB::GFF::Aggregator; | |
141 | |
142 use strict; | |
143 use Bio::DB::GFF::Util::Rearrange; # for rearrange() | |
144 use Bio::DB::GFF::Feature; | |
145 use vars qw(@ISA); | |
146 | |
147 @ISA = qw(Bio::Root::Root); | |
148 | |
149 my $ALWAYS_TRUE = sub { 1 }; | |
150 | |
151 =head2 new | |
152 | |
153 Title : new | |
154 Usage : $a = Bio::DB::GFF::Aggregator->new(@args) | |
155 Function: create a new aggregator | |
156 Returns : a Bio::DB::GFF::Aggregator object | |
157 Args : see below | |
158 Status : Public | |
159 | |
160 This is the constructor for Bio::DB::GFF::Aggregator. Named arguments | |
161 are as follows: | |
162 | |
163 -method the method for the composite feature | |
164 | |
165 -main_method the top-level raw feature, if any | |
166 | |
167 -sub_parts the list of raw features that will form the subparts | |
168 of the composite feature (array reference or scalar) | |
169 | |
170 =cut | |
171 | |
# Constructor. Maps the caller's named arguments onto the three instance
# attributes via rearrange():
#
#   -method                      stored as {method}
#   -main_part / -main_method    stored as {main_method}
#   -sub_methods / -sub_parts    stored as {sub_parts} (array ref or scalar)
#
# Returns a hash reference blessed into the invoking class.
sub new {
    my ($class, @args) = @_;

    my @spec = (
        'METHOD',
        [ 'MAIN_PART',   'MAIN_METHOD' ],
        [ 'SUB_METHODS', 'SUB_PARTS' ],
    );
    my ($method, $main, $sub_parts) = rearrange(\@spec, @args);

    my %self = (
        method      => $method,
        main_method => $main,
        sub_parts   => $sub_parts,
    );
    return bless \%self, $class;
}
184 | |
185 =head2 disaggregate | |
186 | |
187 Title : disaggregate | |
188 Usage : $a->disaggregate($types,$factory) | |
189 Function: disaggregate type list into components | |
190 Returns : a true value if this aggregator should be called to reaggregate | |
191 Args : see below | |
192 Status : Public | |
193 | |
194 This method is called to disaggregate a list of types into the set of | |
195 low-level features to be retrieved from the GFF database. The list of | |
196 types is passed as an array reference containing a series of | |
197 [method,source] pairs. This method synthesizes a new set of | |
198 [method,source] pairs, and appends them to the list of requested | |
199 types, changing the list in situ. | |
200 | |
201 Arguments: | |
202 | |
203 $types reference to an array of [method,source] pairs | |
204 | |
205 $factory reference to the Adaptor object that is calling | |
206 this method | |
207 | |
208 Note that the API allows disaggregate() to remove types from the type | |
209 list. This feature is probably not desirable and may be deprecated in | |
210 the future. | |
211 | |
212 =cut | |
213 | |
214 # this is called at the beginning to turn the pseudo-type | |
215 # into its component feature types | |
# Expand this aggregator's pseudo-type into its component feature types.
#
# Arguments:
#   $types    array ref of [method,source] pairs requested by the user;
#             modified in place when it is non-empty
#   $factory  adaptor object; must provide parse_types()
#
# For every requested type whose method equals this aggregator's method
# (case-insensitively), the pair is replaced by one pair per subpart and
# main part; a component without a source of its own inherits the source
# of the requested type. All other requested types are kept unchanged and
# are remembered via passthru(). The synthesized pairs are remembered via
# components() for the later aggregate() call.
#
# Returns true if at least one component type was recorded.
sub disaggregate {
    my ($self, $types, $factory) = @_;

    # Parsed [method,source] pairs for the subparts and for the main part.
    my $sub_features = $factory->parse_types($self->get_part_names);
    my $main_feature = $factory->parse_types($self->get_main_name);
    my @parts        = (@$sub_features, @$main_feature);

    if (@$types) {
        # The composite method does not change while we scan the list,
        # so look it up and lower-case it once.
        my $pseudo_method = lc($self->get_method);    # e.g. "transcript"

        my (@synthetic_types, @unchanged);
        for my $type (@$types) {
            my ($method, $source) = @$type;
            if (lc($method) eq $pseudo_method) {
                push @synthetic_types,
                    map { [ $_->[0], $_->[1] || $source ] } @parts;
            }
            else {
                push @unchanged, $type;
            }
        }

        # remember what we're searching for
        $self->components(\@synthetic_types);
        $self->passthru(\@unchanged);
        @$types = (@unchanged, @synthetic_types);
    }
    else {
        # No search types were listed: record every component type, and
        # leave the (empty) request list alone.
        my @stypes = map { [ $_->[0], $_->[1] ] } @parts;
        $self->components(\@stypes);
        $self->passthru(undef);
    }

    return $self->component_count > 0;
}
250 | |
251 | |
252 =head2 aggregate | |
253 | |
254 Title : aggregate | |
255 Usage : $features = $a->aggregate($features,$factory) | |
256 Function: aggregate a feature list into composite features | |
257 Returns : an array reference containing modified features | |
258 Args : see below | |
259 Status : Public | |
260 | |
261 This method is called to aggregate a list of raw GFF features into the | |
262 set of composite features. The method is passed an array reference to | |
263 a set of Bio::DB::GFF::Feature objects. It runs through the list, | |
264 creating new composite features when appropriate. The referenced array | |
265 is modified in place so that it holds the resulting features. | |
266 | |
267 Arguments: | |
268 | |
269 $features reference to an array of Bio::DB::GFF::Feature objects | |
270 | |
271 $factory reference to the Adaptor object that is calling | |
272 this method | |
273 | |
274 NOTE: The reason that the function result contains the raw features as | |
275 well as the aggregated ones is to allow queries like this one: | |
276 | |
277 @features = $segment->features('exon','transcript:curated'); | |
278 | |
279 Assuming that "transcript" is the name of an aggregated feature and | |
280 that "exon" is one of its components, we do not want the transcript | |
281 aggregator to remove features of type "exon" because the user asked | |
282 for them explicitly. | |
283 | |
284 =cut | |
285 | |
# Build composite features out of the raw features in @$features.
#
# Arguments:
#   $features  array ref of feature objects; rewritten in place
#   $factory   adaptor object, handed on to match_sub()/passthru_sub()
#
# Returns immediately (leaving @$features untouched) when match_sub()
# yields no matcher. Otherwise features that have a group and satisfy the
# matcher are collected per (group, refseq) pair, either as the "base"
# (when their method equals the main method) or as subparts; everything
# else is copied straight to the result. Matched features are also copied
# to the result when the passthru matcher accepts them.
sub aggregate {
    my ($self, $features, $factory) = @_;

    my $main_method = $self->get_main_name;
    my $matchsub    = $self->match_sub($factory);
    return unless $matchsub;
    my $passthru    = $self->passthru_sub($factory);

    my (%aggregates, @result);

    FEATURE:
    for my $feature (@$features) {
        # Ungrouped or non-matching features are passed along untouched.
        unless ($feature->group && $matchsub->($feature)) {
            push @result, $feature;
            next FEATURE;
        }

        my $is_main = $main_method && lc($feature->method) eq lc($main_method);

        # Composite key: group and reference sequence joined by $;.
        my $key = join($;, $feature->group, $feature->refseq);

        if ($is_main) {
            # Only the first main feature seen for a key becomes the base.
            $aggregates{$key}{base} ||= $feature->clone;
        }
        else {
            push @{ $aggregates{$key}{subparts} }, $feature;
        }

        # The user may also have asked for this raw type explicitly.
        push @result, $feature if $passthru && $passthru->($feature);
    }

    # aggregate components
    my $pseudo_method        = $self->get_method;
    my $require_whole_object = $self->require_whole_object;

    for my $key (keys %aggregates) {
        my $entry = $aggregates{$key};

        if ($require_whole_object && $self->components) {
            next unless $entry->{base} && $entry->{subparts};
        }

        my $base = $entry->{base};
        if (!$base) {
            # No main feature was seen: clone the first subpart so the
            # composite inherits its parent coordinate system, etc.
            $base = $entry->{subparts}[0]->clone;
            $base->score(undef);
            $base->phase(undef);
        }

        $base->method($pseudo_method);
        for my $part (@{ $entry->{subparts} }) {
            $base->add_subfeature($part);
        }
        $base->adjust_bounds;
        $base->compound(1);    # set the compound flag
        push @result, $base;
    }

    @$features = @result;
}
332 | |
333 | |
334 =head2 method | |
335 | |
336 Title : method | |
337 Usage : $string = $a->method | |
338 Function: get the method type for the composite feature | |
339 Returns : a string | |
340 Args : none | |
341 Status : Protected | |
342 | |
343 This method is called to get the method to be assigned to the | |
344 composite feature once it is aggregated. It is called if the user did | |
345 not explicitly supply a -method argument when the aggregator was | |
346 created. | |
347 | |
348 This is the method that should be overridden in aggregator subclasses. | |
349 | |
350 =cut | |
351 | |
352 # no default method | |
# Default method name for the composite feature. The base class has none,
# so this yields undef in scalar context and an empty list in list
# context; subclasses override it. get_method() calls this when no
# {method} attribute is defined on the instance.
sub method { return }
357 | |
358 =head2 main_name | |
359 | |
360 Title : main_name | |
361 Usage : $string = $a->main_name | |
362 Function: get the method type for the "main" component of the feature | |
363 Returns : a string | |
364 Args : none | |
365 Status : Protected | |
366 | |
367 This method is called to get the method of the "main component" of the | |
368 composite feature. It is called if the user did not explicitly supply | |
369 a -main_method argument when the aggregator was created. | |
370 | |
371 This is the method that should be overridden in aggregator subclasses. | |
372 | |
373 =cut | |
374 | |
375 # no default main method | |
# Default method name of the "main" component. The base class has none,
# so this yields undef in scalar context and an empty list in list
# context; subclasses override it. get_main_name() calls this when no
# {main_method} attribute is defined on the instance.
sub main_name { return }
380 | |
381 =head2 part_names | |
382 | |
383 Title : part_names | |
384 Usage : @methods = $a->part_names | |
385 Function: get the methods for the various non-main components of the feature | |
386 Returns : a list of strings | |
387 Args : none | |
388 Status : Protected | |
389 | |
390 This method is called to get the list of methods of the subparts of the | |
391 composite feature. It is called if the user did not explicitly supply | |
392 a -sub_parts argument when the aggregator was created. | |
393 | |
394 This is the method that should be overridden in aggregator subclasses. | |
395 | |
396 =cut | |
397 | |
398 # no default part names | |
# Default list of subpart method names. The base class has none, so this
# yields an empty list (undef in scalar context); subclasses override it.
# get_part_names() calls this when the instance has no true {sub_parts}
# attribute.
sub part_names { return }
403 | |
404 =head2 require_whole_object | |
405 | |
406 Title : require_whole_object | |
407 Usage : $bool = $a->require_whole_object | |
408 Function: see below | |
409 Returns : a boolean flag | |
410 Args : none | |
411 Status : Internal | |
412 | |
413 This method returns true if the aggregator should refuse to aggregate | |
414 an object unless both its main part and its subparts are present. | |
415 | |
416 =cut | |
417 | |
# Flag consulted by aggregate(): the base class returns 0 (false), so
# aggregate() does not skip groups that lack a main part or subparts.
sub require_whole_object {
    return 0;
}
419 | |
420 =head2 match_sub | |
421 | |
422 Title : match_sub | |
423 Usage : $coderef = $a->match_sub($factory) | |
424 Function: generate a code reference that will match desired features | |
425 Returns : a code reference | |
426 Args : see below | |
427 Status : Internal | |
428 | |
429 This method is used internally to generate a code sub that will | |
430 quickly filter out the raw features that we're interested in | |
431 aggregating. The returned sub accepts a Feature and returns true if | |
432 we should aggregate it, false otherwise. | |
433 | |
434 =cut | |
435 | |
# Ask the factory for a matcher covering the component types that were
# saved by the preceding disaggregate() call. Returns nothing when no
# component types are stored or the stored list is empty.
sub match_sub {
    my ($self, $factory) = @_;

    my $wanted = $self->components();    # saved from disaggregate call
    return unless $wanted && @$wanted;

    return $factory->make_match_sub($wanted);
}
443 | |
# Ask the factory for a matcher covering the pass-through types stored by
# passthru(). Returns nothing when no pass-through types are stored or
# the stored list is empty.
sub passthru_sub {
    my ($self, $factory) = @_;

    my $kept = $self->passthru();
    return unless $kept && @$kept;

    return $factory->make_match_sub($kept);
}
451 | |
452 =head2 components | |
453 | |
454 Title : components | |
455 Usage : @array= $a->components([$components]) | |
456 Function: get/set stored list of parsed raw feature types | |
457 Returns : an array in list context, an array ref in scalar context | |
458 Args : new arrayref of feature types | |
459 Status : Internal | |
460 | |
461 This method is used internally to remember the parsed list of raw | |
462 features that we will aggregate. The need for this subroutine is | |
463 seen when a user requests a composite feature of type | |
464 "clone:cosmid". This generates a list of components in which the | |
465 source is appended to the method, like "clone_left_end:cosmid" and | |
466 "clone_right_end:cosmid". components() stores this information for | |
467 later use. | |
468 | |
469 =cut | |
470 | |
# Get/set accessor for the {components} slot.
#
# With an argument, the slot is overwritten with it. In every case the
# value reported is the one held *before* the call: nothing if it was
# not a reference, otherwise the list of elements in list context or the
# array reference itself in scalar context.
sub components {
    my ($self, @new) = @_;

    my $previous = $self->{components};
    $self->{components} = $new[0] if @new;

    return unless ref $previous;
    return wantarray ? @$previous : $previous;
}
478 | |
# Number of elements that components() returns in list context.
sub component_count {
    my ($self) = @_;
    my @stored = $self->components;
    return scalar @stored;
}
483 | |
# Get/set accessor for the {passthru} slot.
#
# With an argument, the slot is overwritten with it. In every case the
# value reported is the one held *before* the call: nothing if it was
# not a reference, otherwise the list of elements in list context or the
# array reference itself in scalar context.
sub passthru {
    my ($self, @new) = @_;

    my $previous = $self->{passthru};
    $self->{passthru} = $new[0] if @new;

    return unless ref $previous;
    return wantarray ? @$previous : $previous;
}
491 | |
# Shallow copy: a new hash with the same top-level key/value pairs,
# blessed into the same class as the original. Values that are
# references are shared between the original and the copy.
sub clone {
    my ($self) = @_;
    return bless { %$self }, ref($self);
}
497 | |
498 =head2 get_part_names | |
499 | |
500 Title : get_part_names | |
501 Usage : @array = $a->get_part_names | |
502 Function: get list of sub-parts for this type of feature | |
503 Returns : an array | |
504 Args : none | |
505 Status : Internal | |
506 | |
507 This method is used internally to fetch the list of feature types that | |
508 form the components of the composite feature. Type names in the | |
509 format "method:source" are recognized, as are "method" and | |
510 Bio::DB::GFF::Typename objects as well. It checks instance variables | |
511 first, and if not defined calls the part_names() method. | |
512 | |
513 =cut | |
514 | |
# Subpart names for this aggregator. A true {sub_parts} attribute wins:
# an array reference is flattened, any other value is returned as is.
# Otherwise the class default from part_names() is used.
sub get_part_names {
    my ($self) = @_;

    my $parts = $self->{sub_parts};
    return $self->part_names unless $parts;

    return ref($parts) ? @$parts : $parts;
}
523 | |
524 =head2 get_main_name | |
525 | |
526 Title : get_main_name | |
527 Usage : $string = $a->get_main_name | |
528 Function: get the "main" method type for this feature | |
529 Returns : a string | |
530 Args : none | |
531 Status : Internal | |
532 | |
533 This method is used internally to fetch the type of the "main part" of | |
534 the feature. It checks instance variables first, and if not defined | |
535 calls the main_name() method. | |
536 | |
537 =cut | |
538 | |
# Method name of the "main part": the {main_method} attribute when it is
# defined, otherwise the class default from main_name().
sub get_main_name {
    my ($self) = @_;

    my $explicit = $self->{main_method};
    return defined($explicit) ? $explicit : $self->main_name;
}
544 | |
545 =head2 get_method | |
546 | |
547 Title : get_method | |
548 Usage : $string = $a->get_method | |
549 Function: get the method type for the composite feature | |
550 Returns : a string | |
551 Args : none | |
552 Status : Internal | |
553 | |
554 This method is used internally to fetch the type of the method that | |
555 will be assigned to the composite feature once it is synthesized. | |
556 | |
557 =cut | |
558 | |
# Method name given to the composite feature: the {method} attribute
# when it is defined, otherwise the class default from method().
sub get_method {
    my ($self) = @_;

    my $explicit = $self->{method};
    return defined($explicit) ? $explicit : $self->method;
}
564 | |
565 1; | |
566 | |
567 =head1 BUGS | |
568 | |
569 None known yet. | |
570 | |
571 =head1 SEE ALSO | |
572 | |
573 L<Bio::DB::GFF>, | |
574 L<Bio::DB::GFF::Aggregator::alignment>, | |
575 L<Bio::DB::GFF::Aggregator::clone>, | |
576 L<Bio::DB::GFF::Aggregator::coding>, | |
577 L<Bio::DB::GFF::Aggregator::match>, | |
578 L<Bio::DB::GFF::Aggregator::processed_transcript>, | |
579 L<Bio::DB::GFF::Aggregator::transcript>, | |
580 L<Bio::DB::GFF::Aggregator::none> | |
581 | |
582 =head1 AUTHOR | |
583 | |
584 Lincoln Stein E<lt>lstein@cshl.orgE<gt>. | |
585 | |
586 Copyright (c) 2001 Cold Spring Harbor Laboratory. | |
587 | |
588 This library is free software; you can redistribute it and/or modify | |
589 it under the same terms as Perl itself. | |
590 | |
591 =cut | |
592 |