0
|
1 =head1 NAME
|
|
2
|
|
3 Bio::DB::GFF::Aggregator -- Aggregate GFF groups into composite features
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 use Bio::DB::GFF;
|
|
8
|
|
9 my $agg1 = Bio::DB::GFF::Aggregator->new(-method => 'cistron',
|
|
10 -main_method => 'locus',
|
|
11 -sub_parts => ['allele','variant']
|
|
12 );
|
|
13
|
|
14 my $agg2 = Bio::DB::GFF::Aggregator->new(-method => 'splice_group',
|
|
15 -sub_parts => 'transcript');
|
|
16
|
|
17 my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql',
|
|
18 -aggregator => [$agg1,$agg2],
|
|
19 -dsn => 'dbi:mysql:elegans42',
|
|
20 );
|
|
21
|
|
22
|
|
23 =head1 DESCRIPTION
|
|
24
|
|
25 Bio::DB::GFF::Aggregator is used to aggregate GFF groups into
|
|
26 composite features. Each composite feature has a "main part", the
|
|
27 top-level feature, and a series of zero or more subparts, retrieved
|
|
28 with the sub_SeqFeature() method. The aggregator class is designed to
|
|
29 be subclassable, allowing a variety of GFF feature types to be
|
|
30 supported.
|
|
31
|
|
32 The base Bio::DB::GFF::Aggregator class is generic, and can be used to
|
|
33 create specific instances to be passed to the -aggregator argument of
|
|
34 Bio::DB::GFF-E<gt>new() call. The various subclasses of
|
|
35 Bio::DB::GFF::Aggregator are tuned for specific common feature types
|
|
36 such as clones, gapped alignments and transcripts.
|
|
37
|
|
38 Instances of Bio::DB::GFF::Aggregator have three attributes:
|
|
39
|
|
40 =over 3
|
|
41
|
|
42 =item method
|
|
43
|
|
44 This is the GFF method field of the composite feature as a whole. For
|
|
45 example, "transcript" may be used for a composite feature created by
|
|
46 aggregating individual intron, exon and UTR features.
|
|
47
|
|
48 =item main method
|
|
49
|
|
50 Sometimes GFF groups are organized hierarchically, with one feature
|
|
51 logically containing another. For example, in the C. elegans schema,
|
|
52 methods of type "Sequence:curated" correspond to regions covered by
|
|
53 curated genes. There can be zero or one main methods.
|
|
54
|
|
55 =item subparts
|
|
56
|
|
57 This is a list of one or more methods that correspond to the component
|
|
58 features of the aggregates. For example, in the C. elegans database,
|
|
59 the subparts of transcript are "intron", "exon" and "CDS".
|
|
60
|
|
61 =back
|
|
62
|
|
63 Aggregators have two main methods that can be overridden in
|
|
64 subclasses:
|
|
65
|
|
66 =over 4
|
|
67
|
|
68 =item disaggregate()
|
|
69
|
|
This method is called by the Adaptor object prior to fetching a list
of features.  The method is passed a reference to an array containing
the [method,source] pairs that the user has requested, and it rewrites
that array in place so that it lists the raw feature types that it
would like the adaptor to fetch.
|
|
74
|
|
75 =item aggregate()
|
|
76
|
|
77 This method is called by the Adaptor object after it has fetched
|
|
78 features. The method is passed a list of raw features and is expected
|
|
79 to add its composite features to the list.
|
|
80
|
|
81 =back
|
|
82
|
|
83 The disaggregate() and aggregate() methods provided by the base
|
|
84 Aggregator class should be sufficient for many applications. In this
|
|
85 case, it suffices for subclasses to override the following methods:
|
|
86
|
|
87 =over 4
|
|
88
|
|
89 =item method()
|
|
90
|
|
91 Return the default method for the composite feature as a whole.
|
|
92
|
|
93 =item main_name()
|
|
94
|
|
95 Return the default main method name.
|
|
96
|
|
97 =item part_names()
|
|
98
|
|
99 Return a list of subpart method names.
|
|
100
|
|
101 =back
|
|
102
|
|
103 Provided that method() and part_names() are overridden (and optionally
|
|
104 main_name() as well), then the bare name of the aggregator subclass
|
|
105 can be passed to the -aggregator of Bio::DB::GFF-E<gt>new(). For example,
|
|
106 this is a small subclass that will aggregate features of type "allele"
|
|
107 and "polymorphism" into an aggregate named "mutant":
|
|
108
|
|
109 package Bio::DB::GFF::Aggregator::mutant;
|
|
110
|
|
111 use strict;
|
|
112 use Bio::DB::GFF::Aggregator;
|
|
113
|
|
114 use vars '@ISA';
|
|
115 @ISA = 'Bio::DB::GFF::Aggregator';
|
|
116
|
|
117 sub method { 'mutant' }
|
|
118
|
|
119 sub part_names {
|
|
120 return qw(allele polymorphism);
|
|
121 }
|
|
122
|
|
123 1;
|
|
124
|
|
125 Once installed, this aggregator can be passed to Bio::DB::GFF-E<gt>new()
|
|
126 by name like so:
|
|
127
|
|
128 my $db = Bio::DB::GFF->new( -adaptor => 'dbi:mysql',
|
|
129 -aggregator => 'mutant',
|
|
130 -dsn => 'dbi:mysql:elegans42',
|
|
131 );
|
|
132
|
|
133 =head1 API
|
|
134
|
|
135 The remainder of this document describes the public and private
|
|
136 methods implemented by this module.
|
|
137
|
|
138 =cut
|
|
139
|
|
140 package Bio::DB::GFF::Aggregator;
|
|
141
|
|
142 use strict;
|
|
143 use Bio::DB::GFF::Util::Rearrange; # for rearrange()
|
|
144 use Bio::DB::GFF::Feature;
|
|
145 use vars qw(@ISA);
|
|
146
|
|
147 @ISA = qw(Bio::Root::Root);
|
|
148
|
|
149 my $ALWAYS_TRUE = sub { 1 };
|
|
150
|
|
151 =head2 new
|
|
152
|
|
153 Title : new
|
|
154 Usage : $a = Bio::DB::GFF::Aggregator->new(@args)
|
|
155 Function: create a new aggregator
|
|
156 Returns : a Bio::DB::GFF::Aggregator object
|
|
157 Args : see below
|
|
158 Status : Public
|
|
159
|
|
160 This is the constructor for Bio::DB::GFF::Aggregator. Named arguments
|
|
161 are as follows:
|
|
162
|
|
163 -method the method for the composite feature
|
|
164
|
|
165 -main_method the top-level raw feature, if any
|
|
166
|
|
167 -sub_parts the list of raw features that will form the subparts
|
|
168 of the composite feature (array reference or scalar)
|
|
169
|
|
170 =cut
|
|
171
|
|
sub new {
  my ($class, @args) = @_;

  # One slot per named argument; a nested list gives the alternative
  # spellings accepted for that argument.  The three values are unpacked
  # below in slot order.
  my @slots = (
    'METHOD',
    ['MAIN_PART',   'MAIN_METHOD'],
    ['SUB_METHODS', 'SUB_PARTS'],
  );
  my ($method, $main, $sub_parts) = rearrange(\@slots, @args);

  # Any of these may be undef, in which case the get_*() accessors fall
  # back to the overridable method(), main_name() and part_names().
  my $self = {
    method      => $method,
    main_method => $main,
    sub_parts   => $sub_parts,
  };
  return bless $self, $class;
}
|
|
184
|
|
185 =head2 disaggregate
|
|
186
|
|
187 Title : disaggregate
|
|
188 Usage : $a->disaggregate($types,$factory)
|
|
189 Function: disaggregate type list into components
|
|
190 Returns : a true value if this aggregator should be called to reaggregate
|
|
191 Args : see below
|
|
192 Status : Public
|
|
193
|
|
194 This method is called to disaggregate a list of types into the set of
|
|
195 low-level features to be retrieved from the GFF database. The list of
|
|
196 types is passed as an array reference containing a series of
|
|
197 [method,source] pairs. This method synthesizes a new set of
|
|
198 [method,source] pairs, and appends them to the list of requested
|
|
199 types, changing the list in situ.
|
|
200
|
|
201 Arguments:
|
|
202
|
|
203 $types reference to an array of [method,source] pairs
|
|
204
|
|
205 $factory reference to the Adaptor object that is calling
|
|
206 this method
|
|
207
|
|
208 Note that the API allows disaggregate() to remove types from the type
|
|
209 list. This feature is probably not desirable and may be deprecated in
|
|
210 the future.
|
|
211
|
|
212 =cut
|
|
213
|
|
# Called before features are fetched: replaces this aggregator's
# pseudo-type in the request list with its component feature types, and
# remembers both the components and the untouched types for aggregate().
sub disaggregate {
  my ($self, $types, $factory) = @_;

  my $sub_features = $factory->parse_types($self->get_part_names);
  my $main_feature = $factory->parse_types($self->get_main_name);
  my @parts        = (@$sub_features, @$main_feature);

  # No search types were listed: every component type is of interest and
  # there is nothing to pass through.
  unless (@$types) {
    my @all_components = map { [ $_->[0], $_->[1] ] } @parts;
    $self->components(\@all_components);
    $self->passthru(undef);
    return $self->component_count > 0;
  }

  my (@synthetic_types, @unchanged);
  for my $type (@$types) {
    my ($method, $source) = @$type;
    if (lc($method) eq lc($self->get_method)) {    # e.g. "transcript"
      # a component without a source of its own inherits the source that
      # was requested for the composite type
      push @synthetic_types, map { [ $_->[0], $_->[1] || $source ] } @parts;
    }
    else {
      push @unchanged, $type;
    }
  }

  # remember what we're searching for
  $self->components(\@synthetic_types);
  $self->passthru(\@unchanged);

  # rewrite the caller's type list in place
  @$types = (@unchanged, @synthetic_types);

  return $self->component_count > 0;
}
|
|
250
|
|
251
|
|
252 =head2 aggregate
|
|
253
|
|
254 Title : aggregate
|
|
255 Usage : $features = $a->aggregate($features,$factory)
|
|
256 Function: aggregate a feature list into composite features
|
|
257 Returns : an array reference containing modified features
|
|
258 Args : see below
|
|
259 Status : Public
|
|
260
|
|
This method is called to aggregate a list of raw GFF features into the
set of composite features.  The method is called with an array
reference to a set of Bio::DB::GFF::Feature objects.  It runs through
the list, creating new composite features when appropriate.  The
method result is an array reference containing the composite features.
|
|
266
|
|
267 Arguments:
|
|
268
|
|
269 $features reference to an array of Bio::DB::GFF::Feature objects
|
|
270
|
|
271 $factory reference to the Adaptor object that is calling
|
|
272 this method
|
|
273
|
|
274 NOTE: The reason that the function result contains the raw features as
|
|
275 well as the aggregated ones is to allow queries like this one:
|
|
276
|
|
277 @features = $segment->features('exon','transcript:curated');
|
|
278
|
|
279 Assuming that "transcript" is the name of an aggregated feature and
|
|
280 that "exon" is one of its components, we do not want the transcript
|
|
281 aggregator to remove features of type "exon" because the user asked
|
|
282 for them explicitly.
|
|
283
|
|
284 =cut
|
|
285
|
|
sub aggregate {
  my ($self, $features, $factory) = @_;

  my $main_method = $self->get_main_name;
  my $matchsub    = $self->match_sub($factory) or return;
  my $passthru    = $self->passthru_sub($factory);

  # Pass 1: bucket the matching features by (group, reference sequence).
  # Each bucket holds an optional {base} (a clone of the main feature)
  # and a list of {subparts}.
  my (%aggregates, @result);
  for my $feature (@$features) {

    # ungrouped or non-matching features are returned untouched
    unless ($feature->group && $matchsub->($feature)) {
      push @result, $feature;
      next;
    }

    my $is_main = $main_method
      && lc($feature->method) eq lc($main_method);
    my $bucket = $aggregates{$feature->group, $feature->refseq} ||= {};

    if ($is_main) {
      $bucket->{base} ||= $feature->clone;
    }
    else {
      push @{ $bucket->{subparts} }, $feature;
    }

    # keep the raw feature as well when its type was explicitly requested
    push @result, $feature if $passthru && $passthru->($feature);
  }

  # Pass 2: aggregate components
  my $pseudo_method        = $self->get_method;
  my $require_whole_object = $self->require_whole_object;
  for my $key (keys %aggregates) {
    my $bucket = $aggregates{$key};

    if ($require_whole_object && $self->components) {
      next unless $bucket->{base} && $bucket->{subparts};
    }

    my $base = $bucket->{base};
    unless ($base) {    # no base, so create one
      # clone the first subpart to inherit parent coordinate system, etc
      $base = $bucket->{subparts}[0]->clone;
      $base->score(undef);
      $base->phase(undef);
    }

    $base->method($pseudo_method);
    for my $subpart (@{ $bucket->{subparts} }) {
      $base->add_subfeature($subpart);
    }
    $base->adjust_bounds;
    $base->compound(1);    # set the compound flag
    push @result, $base;
  }

  # Replace the caller's list in place.  This assignment is deliberately
  # the last statement: its value is what the sub returns.
  @$features = @result;
}
|
|
332
|
|
333
|
|
334 =head2 method
|
|
335
|
|
336 Title : method
|
|
337 Usage : $string = $a->method
|
|
338 Function: get the method type for the composite feature
|
|
339 Returns : a string
|
|
340 Args : none
|
|
341 Status : Protected
|
|
342
|
|
343 This method is called to get the method to be assigned to the
|
|
344 composite feature once it is aggregated. It is called if the user did
|
|
345 not explicitly supply a -method argument when the aggregator was
|
|
346 created.
|
|
347
|
|
348 This is the method that should be overridden in aggregator subclasses.
|
|
349
|
|
350 =cut
|
|
351
|
|
# The base class has no default composite method: returns the empty list
# (undef in scalar context).
sub method {
  my ($self) = @_;
  return;
}
|
|
357
|
|
358 =head2 main_name
|
|
359
|
|
360 Title : main_name
|
|
361 Usage : $string = $a->main_name
|
|
362 Function: get the method type for the "main" component of the feature
|
|
363 Returns : a string
|
|
364 Args : none
|
|
365 Status : Protected
|
|
366
|
|
This method is called to get the method of the "main component" of the
composite feature.  It is called if the user did not explicitly supply
a -main_method argument when the aggregator was created.
|
|
370
|
|
371 This is the method that should be overridden in aggregator subclasses.
|
|
372
|
|
373 =cut
|
|
374
|
|
# The base class has no default main method: returns the empty list
# (undef in scalar context).
sub main_name {
  my ($self) = @_;
  return;
}
|
|
380
|
|
381 =head2 part_names
|
|
382
|
|
383 Title : part_names
|
|
384 Usage : @methods = $a->part_names
|
|
385 Function: get the methods for the non-main various components of the feature
|
|
386 Returns : a list of strings
|
|
387 Args : none
|
|
388 Status : Protected
|
|
389
|
|
This method is called to get the list of methods of the subparts of
the composite feature.  It is called if the user did not explicitly
supply a -sub_parts argument when the aggregator was created.
|
|
393
|
|
394 This is the method that should be overridden in aggregator subclasses.
|
|
395
|
|
396 =cut
|
|
397
|
|
# The base class has no default subpart names: returns the empty list
# (undef in scalar context).
sub part_names {
  my ($self) = @_;
  return;
}
|
|
403
|
|
404 =head2 require_whole_object
|
|
405
|
|
406 Title : require_whole_object
|
|
407 Usage : $bool = $a->require_whole_object
|
|
408 Function: see below
|
|
409 Returns : a boolean flag
|
|
410 Args : none
|
|
411 Status : Internal
|
|
412
|
|
413 This method returns true if the aggregator should refuse to aggregate
|
|
414 an object unless both its main part and its subparts are present.
|
|
415
|
|
416 =cut
|
|
417
|
|
# By default an aggregate may be built from partial data (main part or
# subparts alone).
sub require_whole_object {
  return 0;
}
|
|
419
|
|
420 =head2 match_sub
|
|
421
|
|
422 Title : match_sub
|
|
423 Usage : $coderef = $a->match_sub($factory)
|
|
424 Function: generate a code reference that will match desired features
|
|
425 Returns : a code reference
|
|
426 Args : see below
|
|
427 Status : Internal
|
|
428
|
|
429 This method is used internally to generate a code sub that will
|
|
430 quickly filter out the raw features that we're interested in
|
|
431 aggregating. The returned sub accepts a Feature and returns true if
|
|
432 we should aggregate it, false otherwise.
|
|
433
|
|
434 =cut
|
|
435
|
|
sub match_sub {
  my ($self, $factory) = @_;

  # component types saved from the disaggregate call
  my $types_to_aggregate = $self->components();

  # nothing stored, or an empty list: there is nothing to match
  return unless $types_to_aggregate && @$types_to_aggregate;

  return $factory->make_match_sub($types_to_aggregate);
}
|
|
443
|
|
# Counterpart of match_sub() for the stored passthru() types: asks the
# factory for a match sub over them.  Returns nothing when no passthru
# types are stored.
sub passthru_sub {
  my ($self, $factory) = @_;

  my $passthru = $self->passthru();
  return unless $passthru && @$passthru;

  return $factory->make_match_sub($passthru);
}
|
|
451
|
|
452 =head2 components
|
|
453
|
|
454 Title : components
|
|
455 Usage : @array= $a->components([$components])
|
|
456 Function: get/set stored list of parsed raw feature types
|
|
457 Returns : an array in list context, an array ref in scalar context
|
|
458 Args : new arrayref of feature types
|
|
459 Status : Internal
|
|
460
|
|
461 This method is used internally to remember the parsed list of raw
|
|
462 features that we will aggregate. The need for this subroutine is
|
|
463 seen when a user requests a composite feature of type
|
|
464 "clone:cosmid". This generates a list of components in which the
|
|
465 source is appended to the method, like "clone_left_end:cosmid" and
|
|
466 "clone_right_end:cosmid". components() stores this information for
|
|
467 later use.
|
|
468
|
|
469 =cut
|
|
470
|
|
sub components {
  my ($self, @new_value) = @_;

  # Capture the stored value before any update: when called as a setter
  # this method reports the PREVIOUS value, not the new one.
  my $previous = $self->{components};
  $self->{components} = $new_value[0] if @new_value;

  return unless ref $previous;
  return @$previous if wantarray;
  return $previous;
}
|
|
478
|
|
# Number of component types currently stored by components().
sub component_count {
  my ($self) = @_;
  my @stored = $self->components;    # list context: the elements themselves
  return scalar @stored;
}
|
|
483
|
|
# Get/set the stored passthru value (disaggregate() stores an array ref
# of the types it left unchanged, or undef).  As with components(), a
# call that sets a new value returns the PREVIOUS one.
sub passthru {
  my ($self, @new_value) = @_;

  my $previous = $self->{passthru};
  $self->{passthru} = $new_value[0] if @new_value;

  return unless ref $previous;
  return @$previous if wantarray;
  return $previous;
}
|
|
491
|
|
# Shallow copy: the new object gets its own top-level hash, blessed into
# the same class, but nested references are shared with the original.
sub clone {
  my ($self) = @_;
  my $copy = { %$self };
  return bless $copy, ref($self);
}
|
|
497
|
|
498 =head2 get_part_names
|
|
499
|
|
500 Title : get_part_names
|
|
501 Usage : @array = $a->get_part_names
|
|
502 Function: get list of sub-parts for this type of feature
|
|
503 Returns : an array
|
|
504 Args : none
|
|
505 Status : Internal
|
|
506
|
|
507 This method is used internally to fetch the list of feature types that
|
|
508 form the components of the composite feature. Type names in the
|
|
509 format "method:source" are recognized, as are "method" and
|
|
510 Bio::DB::GFF::Typename objects as well. It checks instance variables
|
|
511 first, and if not defined calls the part_names() method.
|
|
512
|
|
513 =cut
|
|
514
|
|
sub get_part_names {
  my ($self) = @_;
  my $parts = $self->{sub_parts};

  # nothing supplied to the constructor: defer to the overridable default
  return $self->part_names unless $parts;

  # -sub_parts may be either an array reference or a single scalar
  return ref $parts ? @$parts : $parts;
}
|
|
523
|
|
524 =head2 get_main_name
|
|
525
|
|
526 Title : get_main_name
|
|
527 Usage : $string = $a->get_main_name
|
|
528 Function: get the "main" method type for this feature
|
|
529 Returns : a string
|
|
530 Args : none
|
|
531 Status : Internal
|
|
532
|
|
533 This method is used internally to fetch the type of the "main part" of
|
|
534 the feature. It checks instance variables first, and if not defined
|
|
535 calls the main_name() method.
|
|
536
|
|
537 =cut
|
|
538
|
|
sub get_main_name {
  my ($self) = @_;
  my $explicit = $self->{main_method};

  # a value stored by the constructor wins; otherwise use the
  # overridable default
  return defined $explicit ? $explicit : $self->main_name;
}
|
|
544
|
|
545 =head2 get_method
|
|
546
|
|
547 Title : get_method
|
|
548 Usage : $string = $a->get_method
|
|
549 Function: get the method type for the composite feature
|
|
550 Returns : a string
|
|
551 Args : none
|
|
552 Status : Internal
|
|
553
|
|
554 This method is used internally to fetch the type of the method that
|
|
555 will be assigned to the composite feature once it is synthesized.
|
|
556
|
|
557 =cut
|
|
558
|
|
sub get_method {
  my ($self) = @_;
  my $explicit = $self->{method};

  # a value stored by the constructor wins; otherwise use the
  # overridable default
  return defined $explicit ? $explicit : $self->method;
}
|
|
564
|
|
565 1;
|
|
566
|
|
567 =head1 BUGS
|
|
568
|
|
569 None known yet.
|
|
570
|
|
571 =head1 SEE ALSO
|
|
572
|
|
573 L<Bio::DB::GFF>,
|
|
574 L<Bio::DB::GFF::Aggregator::alignment>,
|
|
575 L<Bio::DB::GFF::Aggregator::clone>,
|
|
576 L<Bio::DB::GFF::Aggregator::coding>,
|
|
577 L<Bio::DB::GFF::Aggregator::match>,
|
|
578 L<Bio::DB::GFF::Aggregator::processed_transcript>,
|
|
579 L<Bio::DB::GFF::Aggregator::transcript>,
|
|
580 L<Bio::DB::GFF::Aggregator::none>
|
|
581
|
|
582 =head1 AUTHOR
|
|
583
|
|
584 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
|
|
585
|
|
586 Copyright (c) 2001 Cold Spring Harbor Laboratory.
|
|
587
|
|
588 This library is free software; you can redistribute it and/or modify
|
|
589 it under the same terms as Perl itself.
|
|
590
|
|
591 =cut
|
|
592
|