comparison variant_effect_predictor/Bio/DB/GFF.pm @ 0:1f6dce3d34e0

author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400

1 # $Id: GFF.pm,v 1.71.2.2 2003/09/12 13:29:32 lstein Exp $
2
3 =head1 NAME
4
5 Bio::DB::GFF -- Storage and retrieval of sequence annotation data
6
7 =head1 SYNOPSIS
8
9 use Bio::DB::GFF;
10
11 # Open the sequence database
12 my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt',
13 -dsn => 'dbi:mysql:elegans',
14 -fasta => '/usr/local/fasta_files'
15 );
16
17 # fetch a 1 megabase segment of sequence starting at landmark "ZK909"
18 my $segment = $db->segment('ZK909', 1 => 1000000);
19
20 # pull out all transcript features
21 my @transcripts = $segment->features('transcript');
22
23 # for each transcript, total the length of the introns
24 my %totals;
25 for my $t (@transcripts) {
26 my @introns = $t->Intron;
27 $totals{$t->name} += $_->length foreach @introns;
28 }
29
30 # Sort the exons of the first transcript by position
31 my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon;
32
33 # Get a region 1000 bp upstream of first exon
34 my $upstream = $exons[0]->segment(-1000,0);
35
36 # get its DNA
37 my $dna = $upstream->dna;
38
39 # and get all curated polymorphisms inside it
40 my @polymorphisms = $upstream->contained_features('polymorphism:curated');
41
42 # get all feature types in the database
43 my @types = $db->types;
44
45 # count all feature types in the segment
46 my %type_counts = $segment->types(-enumerate=>1);
47
48 # get an iterator on all curated features of type 'exon' or 'intron'
49 my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']);
50
51 while (my $s = $iterator->next_seq) {
52 print $s,"\n";
53 }
54
55 # find all transcripts annotated as having function 'kinase'
56 my $iterator = $db->get_seq_stream(-type=>'transcript',
57 -attributes=>{Function=>'kinase'});
58 while (my $s = $iterator->next_seq) {
59 print $s,"\n";
60 }
61
62 =head1 DESCRIPTION
63
64 Bio::DB::GFF provides fast indexed access to a sequence annotation
65 database. It supports multiple database types (ACeDB, relational),
66 and multiple schemas through a system of adaptors and aggregators.
67
68 The following operations are supported by this module:
69
70 - retrieving a segment of sequence based on the ID of a landmark
71 - retrieving the DNA from that segment
72 - finding all annotations that overlap with the segment
73 - finding all annotations that are completely contained within the
74 segment
75 - retrieving all annotations of a particular type, either within a
76 segment, or globally
77 - conversion from absolute to relative coordinates and back again,
78 using any arbitrary landmark for the relative coordinates
79 - using a sequence segment to create new segments based on relative
80 offsets
81
82 The data model used by Bio::DB::GFF is compatible with the GFF flat
83 file format (http://www.sanger.ac.uk/software/GFF). The module can
84 load a set of GFF files into the database, and serves objects that
85 have methods corresponding to GFF fields.
86
87 The objects returned by Bio::DB::GFF are compatible with the
88 SeqFeatureI interface, allowing their use by the Bio::Graphics and
89 Bio::DAS modules.
90
91 =head2 Auxiliary Scripts
92
93 The bioperl distribution includes several scripts that make it easier
94 to work with Bio::DB::GFF databases. They are located in the scripts
95 directory under a subdirectory named Bio::DB::GFF:
96
97 =over 4
98
99 =item bp_load_gff.pl
100
101 This script will load a Bio::DB::GFF database from a flat GFF file of
102 sequence annotations. Only the relational database version of
103 Bio::DB::GFF is supported. It can be used to create the database from
104 scratch, as well as to incrementally load new data.
105
106 This script takes a --fasta argument to load raw DNA into the database
107 as well. However, GFF databases do not require access to the raw DNA
108 for most of their functionality.
109
110 bp_load_gff.pl also has a --upgrade option, which will perform a
111 non-destructive upgrade of older schemas to newer ones.
112
113 =item bp_bulk_load_gff.pl
114
115 This script will populate a Bio::DB::GFF database from a flat GFF file
116 of sequence annotations. Only the MySQL database version of
117 Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in
118 order to accelerate loading considerably; however, it can only be used
119 for the initial load, and not for updates.
120
121 This script takes a --fasta argument to load raw DNA into the database
122 as well. However, GFF databases do not require access to the raw DNA
123 for most of their functionality.
124
125 =item bp_fast_load_gff.pl
126
127 This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe
128 tricks to allow for incremental updates. It only supports the MySQL
129 database version of Bio::DB::GFF and is guaranteed not to work on
130 non-Unix platforms.
131
132 Arguments are the same as bp_load_gff.pl
133
134 =item gadfly_to_gff.pl
135
136 This script will convert the GFF-like format used by the Berkeley
137 Drosophila Sequencing project into a format suitable for use with this
138 module.
139
140 =item sgd_to_gff.pl
141
142 This script will convert the tab-delimited feature files used by the
143 Saccharomyces Genome Database into a format suitable for use with this
144 module.
145
146 =back
147
148 =head2 GFF Fundamentals
149
150 The GFF format is a flat tab-delimited file, each line of which
151 corresponds to an annotation, or feature. Each line has nine columns
152 and looks like this:
153
154 Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7"
155
156 The 9 columns are as follows:
157
158 =over 4
159
160 =item 1. reference sequence
161
162 This is the ID of the sequence that is used to establish the
163 coordinate system of the annotation. In the example above, the
164 reference sequence is "Chr1".
165
166 =item 2. source
167
168 The source of the annotation. This field describes how the annotation
169 was derived. In the example above, the source is "curated" to
170 indicate that the feature is the result of human curation. The names
171 and versions of software programs are often used for the source field,
172 as in "tRNAScan-SE/1.2".
173
174 =item 3. method
175
176 The annotation method. This field describes the type of the
177 annotation, such as "CDS". Together the method and source describe
178 the annotation type.
179
180 =item 4. start position
181
182 The start of the annotation relative to the reference sequence.
183
184 =item 5. stop position
185
186 The stop of the annotation relative to the reference sequence. Start
187 is always less than or equal to stop.
188
189 =item 6. score
190
191 For annotations that are associated with a numeric score (for example,
192 a sequence similarity), this field describes the score. The score
193 units are completely unspecified, but for sequence similarities, it is
194 typically percent identity. Annotations that don't have a score can
195 use "."
196
197 =item 7. strand
198
199 For those annotations which are strand-specific, this field is the
200 strand on which the annotation resides. It is "+" for the forward
201 strand, "-" for the reverse strand, or "." for annotations that are
202 not stranded.
203
204 =item 8. phase
205
206 For annotations that are linked to proteins, this field describes the
207 phase of the annotation on the codons. It is a number from 0 to 2, or
208 "." for features that have no phase\.
209
210 =item 9. group
211
212 GFF provides a simple way of generating annotation hierarchies ("is
213 composed of" relationships) by providing a group field. The group
214 field contains the class and ID of an annotation which is the logical
215 parent of the current one. In the example given above, the group is
216 the Transcript named "R119.7".
217
218 The group field is also used to store information about the target of
219 sequence similarity hits, and miscellaneous notes. See the next
220 section for a description of how to describe similarity targets.
221
222 The format of the group fields is "Class ID" with a single space (not
223 a tab) separating the class from the ID. It is VERY IMPORTANT to
224 follow this format, or grouping will not work properly.
225
226 =back
227
228 The sequences used to establish the coordinate system for annotations
229 can correspond to sequenced clones, clone fragments, contigs or
230 super-contigs. Thus, this module can be used throughout the lifecycle
231 of a sequencing project.
232
233 In addition to a group ID, the GFF format allows annotations to have a
234 group class. For example, in the ACeDB representation, RNA
235 interference experiments have a class of "RNAi" and an ID that is
236 unique among the RNAi experiments. Since not all databases support
237 this notion, the class is optional in all calls to this module, and
238 defaults to "Sequence" when not provided.
239
240 Double-quotes are sometimes used in GFF files around components of the
241 group field. Strictly, this is only necessary if the group name or
242 class contains whitespace.
243
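To make the column layout concrete, here is a minimal sketch (ordinary Perl, not part of this module's API; Bio::DB::GFF does this for you when loading) that splits one GFF2 line into its nine fields and then separates the group class from the group name:

  # minimal parsing sketch for a single GFF2 line
  my $line = qq(Chr1\tcurated\tCDS\t365647\t365963\t.\t+\t1\tTranscript "R119.7");
  my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group)
      = split /\t/, $line, 9;
  my ($gclass,$gname) = $group =~ /^(\S+)\s+"?([^";]+)"?/;
  # $gclass is now "Transcript", $gname is "R119.7"
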
244 =head2 Making GFF files work with this module
245
246 Some annotations do not need to be individually named. For example,
247 it is probably not useful to assign a unique name to each ALU repeat
248 in a vertebrate genome. Others, such as predicted genes, correspond
249 to named biological objects; you probably want to be able to fetch the
250 positions of these objects by referring to them by name.
251
252 To accommodate named annotations, the GFF format places the object
253 class and name in the group field. The name identifies the object,
254 and the class prevents similarly-named objects, for example clones and
255 sequences, from colliding.
256
257 A named object is shown in the following excerpt from a GFF file:
258
259 Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2
260
261 This object is a predicted transcript named Y95B8A.2. In this case,
262 the group field is used to identify the class and name of the object,
263 even though no other annotation belongs to that group.
264
265 It now becomes possible to retrieve the region of the genome covered
266 by transcript Y95B8A.2 using the segment() method:
267
268 $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2');
269
270 It is not necessary for the annotation's method to correspond to the
271 object class, although this is commonly the case.
272
273 As explained above, each annotation in a GFF file refers to a
274 reference sequence. It is important that each reference sequence also
275 be identified by a line in the GFF file. This allows the Bio::DB::GFF
276 module to determine the length and class of the reference sequence,
277 and makes it possible to do relative arithmetic.
278
279 For example, if "Chr1" is used as a reference sequence, then it should
280 have an entry in the GFF file similar to this one:
281
282 Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1
283
284 This indicates that the reference sequence named "Chr1" has length
285 14972282 bp, method "chromosome" and source "assembly". In addition,
286 as indicated by the group field, Chr1 has class "Sequence" and name
287 "Chr1".
288
289 The object class "Sequence" is used by default when the class is not
290 specified in the segment() call. This allows you to use a shortcut
291 form of the segment() method:
292
293 $segment = $db->segment('Chr1'); # whole chromosome
294 $segment = $db->segment('Chr1',1=>1000); # first 1000 bp
295
296 For your convenience, if, during loading a GFF file, Bio::DB::GFF
297 encounters a line like the following:
298
299 ##sequence-region Chr1 1 14972282
300
301 It will automatically generate the following entry:
302
303 Chr1 reference Component 1 14972282 . + . Sequence Chr1
304
305 This is sufficient to use Chr1 as a reference point.
306 The ##sequence-region line is frequently found in the GFF files
307 distributed by annotation groups.
308
309 =head2 Sequence alignments
310
311 There are two cases in which an annotation indicates the relationship
312 between two sequences. The first case is a similarity hit, where the
313 annotation indicates an alignment. The second case is a map assembly,
314 in which the annotation indicates that a portion of a larger sequence
315 is built up from one or more smaller ones.
316
317 Both cases are indicated by using the B<Target> tag in the group
318 field. For example, a typical similarity hit will look like this:
319
320 Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544
321
322 The group field contains the Target tag, followed by an identifier for
323 the biological object referred to. The GFF format uses the notation
324 I<Class>:I<Name> for the biological object, and even though this is
325 stylistically inconsistent, that's the way it's done. The object
326 identifier is followed by two integers indicating the start and stop
327 of the alignment on the target sequence.
328
329 Unlike the main start and stop columns, it is possible for the target
330 start to be greater than the target end. The previous example
331 indicates that the section of Chr1 from 76,953 to 77,108 aligns to
332 the protein SW:ABL_DROME starting at position 493 and extending to
333 position 544.
334
335 A similar notation is used for sequence assembly information as shown
336 in this example:
337
338 Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826
339 LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742
340
341 This indicates that the region between bases 10922906 and 11177731 of
342 Chr1 is composed of LINK_H06O01 from bp 1 to bp 254826. The region
343 of LINK_H06O01 between 32386 and 64122 is, in turn, composed of
344 bases 6 to 31742 of cosmid F49B2.
345
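As an illustration, the following hedged sketch walks over the similarity features of a segment and reports their targets. It assumes that the returned feature objects expose a target() accessor returning an object with start() and stop() methods, as Bio::DB::GFF::Feature and its Bio::DB::GFF::Homol targets do:

  # hedged sketch: list the alignment targets within a segment
  for my $hit ($segment->features('similarity:BLASTX')) {
      my $target = $hit->target or next;        # e.g. SW:ABL_DROME
      print $hit->name, " -> ", $target, " ",
            $target->start, "..", $target->stop, "\n";
  }
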
346 =head2 Attributes
347
348 While not intended to serve as a general-purpose sequence database
349 (see bioperl-db for that), GFF allows you to tag features with
350 arbitrary attributes. Attributes appear in the Group field following
351 the initial class/name pair. For example:
352
353 Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3
354
355 This line tags the feature named Transcript Y95B8A.2 as being "Gene"
356 named sma-3 and having the Alias "sma3". Features having these
357 attributes can be looked up using the fetch_feature_by_attribute() method.
358
359 Two attributes have special meaning: "Note" is for backward
360 compatibility and is used for unstructured text remarks. "Alias" is
361 considered as a synonym for the feature name and will be consulted
362 when looking up a feature by its name.
363
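A short usage sketch (the attribute and value names are taken from the example line above):

  # fetch all features tagged with Gene "sma-3"
  my @sma3 = $db->fetch_feature_by_attribute(Gene => 'sma-3');

  # or combine an attribute filter with a type filter
  my @hits = $db->features(-types      => ['transcript'],
                           -attributes => {Gene => 'sma-3'});
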
364 =head2 Adaptors and Aggregators
365
366 This module uses a system of adaptors and aggregators in order to make
367 it adaptable to use with a variety of databases.
368
369 =over 4
370
371 =item Adaptors
372
373 The core of the module handles the user API, annotation coordinate
374 arithmetic, and other common issues. The details of fetching
375 information from databases is handled by an adaptor, which is
376 specified during Bio::DB::GFF construction. The adaptor encapsulates
377 database-specific information such as the schema, user authentication
378 and access methods.
379
380 Currently there are two adaptors: 'dbi::mysql' and 'dbi::mysqlopt'.
381 The former is an interface to a simple Mysql schema. The latter is an
382 optimized version of dbi::mysql which uses a binning scheme to
383 accelerate range queries and the Bio::DB::Fasta module for rapid
384 retrieval of sequences. Note the double-colon between the words.
385
386 =item Aggregators
387
388 The GFF format uses a "group" field to indicate aggregation properties
389 of individual features. For example, a set of exons and introns may
390 share a common transcript group, and multiple transcripts may share
391 the same gene group.
392
393 Aggregators are small modules that use the group information to
394 rebuild the hierarchy. When a Bio::DB::GFF object is created, you
395 indicate that it should use a set of one or more aggregators. Each
396 aggregator provides a new composite annotation type. Before the
397 database query is generated, each aggregator is called to
398 "disaggregate" its annotation type into a list of component types
399 contained in the database. After the query is generated, each
400 aggregator is called again in order to build composite annotations
401 from the returned components.
402
403 For example, during disaggregation, the standard
404 "processed_transcript" aggregator generates a list of component
405 feature types including "UTR", "CDS", and "polyA_site". Later, it
406 aggregates these features into a set of annotations of type
407 "processed_transcript".
408
409 During aggregation, the list of aggregators is called in reverse
410 order. This allows aggregators to collaborate to create multi-level
411 structures: the transcript aggregator assembles transcripts from
412 introns and exons; the gene aggregator then assembles genes from sets
413 of transcripts.
414
415 Three default aggregators are provided:
416
417 transcript assembles transcripts from features of type
418 exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA
419 clone assembles clones from Clone_left_end, Clone_right_end
420 and Sequence features.
421 alignment assembles gapped alignments from features of type
422 "similarity".
423
424 In addition, this module provides the optional "wormbase_gene"
425 aggregator, which accommodates the WormBase representation of genes.
426 This aggregator aggregates features of method "exon", "CDS", "5'UTR",
427 "3'UTR", "polyA" and "TSS" into a single object. It also expects to
428 find a single feature of type "Sequence" that spans the entire gene.
429
430 The existing aggregators are easily customized.
431
432 Note that aggregation will not occur unless you specifically request
433 the aggregation type. For example, this call:
434
435 @features = $segment->features('alignment');
436
437 will generate an array of aggregated alignment features. However,
438 this call:
439
440 @features = $segment->features();
441
442 will return a list of unaggregated similarity segments.
443
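As a further illustration, here is a hedged sketch that combines an aggregation request with the per-subpart accessors shown in the SYNOPSIS:

  # aggregated transcripts, then their exon subparts
  for my $t ($segment->features('transcript')) {
      my @exons = sort {$a->start <=> $b->start} $t->Exon;
      print $t->name, ": ", scalar(@exons), " exons\n";
  }
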
444 For more information, see the manual pages for
445 Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone,
446 etc.
447
448 =back
449
450 =head1 API
451
452 The following is the API for Bio::DB::GFF.
453
454 =cut
455
456 package Bio::DB::GFF;
457
458 use strict;
459
460 use Bio::DB::GFF::Util::Rearrange;
461 use Bio::DB::GFF::RelSegment;
462 use Bio::DB::GFF::Feature;
463 use Bio::DB::GFF::Aggregator;
464 use Bio::DasI;
465 use Bio::Root::Root;
466
467 use vars qw(@ISA $VERSION);
468 @ISA = qw(Bio::Root::Root Bio::DasI);
469
470 $VERSION = '1.2003';
471 my %valid_range_types = (overlaps => 1,
472 contains => 1,
473 contained_in => 1);
474
475 =head1 Querying GFF Databases
476
477 =head2 new
478
479 Title : new
480 Usage : my $db = new Bio::DB::GFF(@args);
481 Function: create a new Bio::DB::GFF object
482 Returns : new Bio::DB::GFF object
483 Args : lists of adaptors and aggregators
484 Status : Public
485
486 These are the arguments:
487
488 -adaptor Name of the adaptor module to use. If none
489 provided, defaults to "dbi::mysqlopt".
490
491 -aggregator Array reference to a list of aggregators
492 to apply to the database. If none provided,
493 defaults to ['processed_transcript','alignment'].
494
495 <other> Any other named argument pairs are passed to
496 the adaptor for processing.
497
498 The adaptor argument must correspond to a module contained within the
499 Bio::DB::GFF::Adaptor namespace. For example, the
500 Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying
501 'dbi::mysql'. By Perl convention, the adaptor names are lower case
502 because they are loaded at run time.
503
504 The aggregator array may contain a list of aggregator names, a list of
505 initialized aggregator objects, or a string in the form
506 "aggregator_name{subpart1,subpart2,subpart3/main_method}" (the
507 /main_method part is optional). For example, if you wish to change
508 the components aggregated by the transcript aggregator, you could pass
509 it to the GFF constructor this way:
510
511 my $transcript =
512 Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr
513 polyA spliced_leader)]);
514
515 my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment'],
516 -adaptor => 'dbi::mysql',
517 -dsn => 'dbi:mysql:elegans42');
518
519 Alternatively, you could create an entirely new transcript aggregator
520 this way:
521
522 my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}';
523 my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment'],
524 -adaptor => 'dbi::mysql',
525 -dsn => 'dbi:mysql:elegans42');
526
527 See L<Bio::DB::GFF::Aggregator> for more details.
528
529 The commonly used 'dbi::mysql' adaptor recognizes the following
530 adaptor-specific arguments:
531
532 Argument Description
533 -------- -----------
534
535 -dsn the DBI data source, e.g. 'dbi:mysql:ens0040'
536 If a partial name is given, such as "ens0040", the
537 "dbi:mysql:" prefix will be added automatically.
538
539 -user username for authentication
540
541 -pass the password for authentication
542
543 -refclass landmark Class; defaults to "Sequence"
544
545 The commonly used 'dbi::mysqlopt' adaptor also recognizes the following
546 arguments.
547
548 Argument Description
549 -------- -----------
550
551 -fasta path to a directory containing FASTA files for the DNA
552 contained in this database (e.g. "/usr/local/share/fasta")
553
554 -acedb an acedb URL to use when converting features into ACEDB
555 objects (e.g. sace://localhost:2005)
556
557 =cut
558
559 #'
560
561 sub new {
562 my $package = shift;
563 my ($adaptor,$aggregators,$args,$refclass);
564
565 if (@_ == 1) { # special case, default to dbi::mysqlopt
566 $adaptor = 'dbi::mysqlopt';
567 $args = {DSN => shift};
568 } else {
569 ($adaptor,$aggregators,$refclass,$args) = rearrange([
570 [qw(ADAPTOR FACTORY)],
571 [qw(AGGREGATOR AGGREGATORS)],
572 'REFCLASS',
573 ],@_);
574 }
575
576 $adaptor ||= 'dbi::mysqlopt';
577 my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E";
578 eval "require $class" unless $class->can('new');
579 $package->throw("Unable to load $adaptor adaptor: $@") if $@;
580
581 my $self = $class->new($args);
582 $self->default_class($refclass) if defined $refclass;
583
584 # handle the aggregators.
585 # aggregators are responsible for creating complex multi-part features
586 # from the GFF "group" field. If none are provided, then we provide a
587 # list of the two used in WormBase.
588 # Each aggregator can be a scalar or a ref. In the former case
589 # it is treated as a class name to call new() on. In the latter
590 # the aggreator is treated as a ready made object.
591 $aggregators = $self->default_aggregators unless defined $aggregators;
592 my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators;
593 for my $a (@a) {
594 $self->add_aggregator($a);
595 }
596
597 # default settings go here.....
598 $self->automerge(1); # set automerge to true
599
600 $self;
601 }
602
603
604 =head2 types
605
606 Title : types
607 Usage : $db->types(@args)
608 Function: return list of feature types in range or database
609 Returns : a list of Bio::DB::GFF::Typename objects
610 Args : see below
611 Status : public
612
613 This routine returns a list of feature types known to the database.
614 The list can be database-wide or restricted to a region. It is also
615 possible to find out how many times each feature occurs.
616
617 For range queries, it is usually more convenient to create a
618 Bio::DB::GFF::Segment object, and then invoke its types() method.
619
620 Arguments are as follows:
621
622 -ref ID of reference sequence
623 -class class of reference sequence
624 -start start of segment
625 -stop stop of segment
626 -enumerate if true, count the features
627
628 The returned value will be a list of Bio::DB::GFF::Typename objects,
629 which if evaluated in a string context will return the feature type in
630 "method:source" format. This object class also has method() and
631 source() methods for retrieving the like-named fields.
632
633 If -enumerate is true, then the function returns a hash (not a hash
634 reference) in which the keys are type names in "method:source" format
635 and the values are the number of times each feature appears in the
636 database or segment.
637
638 The argument -end is a synonym for -stop, and -count is a synonym for
639 -enumerate.
640
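For example, a minimal sketch that counts the feature types present in the first 100 kb of the reference sequence "Chr1" (the landmark name is taken from the examples earlier in this document):

  my %counts = $db->types(-ref       => 'Chr1',
                          -start     => 1,
                          -stop      => 100_000,
                          -enumerate => 1);
  print "$_\t$counts{$_}\n" for sort keys %counts;
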
641 =cut
642
643 sub types {
644 my $self = shift;
645 my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([
646 [qw(REF REFSEQ)],
647 qw(START),
648 [qw(STOP END)],
649 [qw(ENUMERATE COUNT)],
650 [qw(CLASS SEQCLASS)],
651 [qw(TYPE TYPES)],
652 ],@_);
653 $types = $self->parse_types($types) if defined $types;
654 $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types);
655 }
656
657 =head2 classes
658
659 Title : classes
660 Usage : $db->classes
661 Function: return list of landmark classes in database
662 Returns : a list of classes
663 Args : none
664 Status : public
665
666 This routine returns the list of reference classes known to the
667 database, or an empty list if classes are not used by the database. Classes
668 are distinct from types, being essentially qualifiers on the reference
669 namespaces.
670
671 =cut
672
673 sub classes {
674 my $self = shift;
675 return ();
676 }
677
678 =head2 segment
679
680 Title : segment
681 Usage : $db->segment(@args);
682 Function: create a segment object
683 Returns : segment object(s)
684 Args : numerous, see below
685 Status : public
686
687 This method generates a segment object, which is a Perl object
688 subclassed from Bio::DB::GFF::Segment. The segment can be used to
689 find overlapping features and the raw DNA.
690
691 When making the segment() call, you specify the ID of a sequence
692 landmark (e.g. an accession number, a clone or contig), and a
693 positional range relative to the landmark. If no range is specified,
694 then the entire extent of the landmark is used to generate the
695 segment.
696
697 You may also provide the ID of a "reference" sequence, which will set
698 the coordinate system and orientation used for all features contained
699 within the segment. The reference sequence can be changed later. If
700 no reference sequence is provided, then the coordinate system is based
701 on the landmark.
702
703 Arguments:
704
705 -name ID of the landmark sequence.
706
707 -class Database object class for the landmark sequence.
708 "Sequence" assumed if not specified. This is
709 irrelevant for databases which do not recognize
710 object classes.
711
712 -start Start of the segment relative to landmark. Positions
713 follow standard 1-based sequence rules. If not specified,
714 defaults to the beginning of the landmark.
715
716 -end Stop of the segment relative to the landmark. If not specified,
717 defaults to the end of the landmark.
718
719 -stop Same as -end.
720
721 -offset For those who prefer 0-based indexing, the offset specifies the
722 position of the new segment relative to the start of the landmark.
723
724 -length For those who prefer 0-based indexing, the length specifies the
725 length of the new segment.
726
727 -refseq Specifies the ID of the reference landmark used to establish the
728 coordinate system for the newly-created segment.
729
730 -refclass Specifies the class of the reference landmark, for those databases
731 that distinguish different object classes. Defaults to "Sequence".
732
733 -absolute
734 Return features in absolute coordinates rather than relative to the
735 parent segment.
736
737 -nocheck Don't check the database for the coordinates and length of this
738 feature. Construct a segment using the indicated name as the
739 reference, a start coordinate of 1, an undefined end coordinate,
740 and a strand of +1.
741
742 -force Same as -nocheck.
743
744 -seq,-sequence,-sourceseq Aliases for -name.
745
746 -begin,-end Aliases for -start and -stop
747
748 -off,-len Aliases for -offset and -length
749
750 -seqclass Alias for -class
751
752 Here's an example to explain how this works:
753
754 my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql');
755
756 If successful, $db will now hold the database accessor object. We now
757 try to fetch the fragment of sequence whose ID is A0000182 and class
758 is "Accession."
759
760 my $segment = $db->segment(-name=>'A0000182',-class=>'Accession');
761
762 If successful, $segment now holds the entire segment corresponding to
763 this accession number. By default, the sequence is used as its own
764 reference sequence, so its first base will be 1 and its last base will
765 be the length of the accession.
766
767 Assuming that this sequence belongs to a longer stretch of DNA, say a
768 contig, we can fetch this information like so:
769
770 my $sourceseq = $segment->sourceseq;
771
772 and find the start and stop on the source like this:
773
774 my $start = $segment->abs_start;
775 my $stop = $segment->abs_stop;
776
777 If we had another segment, say $s2, which is on the same contiguous
778 piece of DNA, we can pass that to the refseq() method in order to
779 establish it as the coordinate reference point:
780
781 $segment->refseq($s2);
782
783 Now calling start() will return the start of the segment relative to
784 the beginning of $s2, accounting for differences in strandedness:
785
786 my $rel_start = $segment->start;
787
788 IMPORTANT NOTE: This method can be used to return the segment spanned
789 by an arbitrary named annotation. However, if the annotation appears
790 at multiple locations on the genome, for example an EST that maps to
791 multiple locations, then, provided that all locations reside on the
792 same physical segment, the method will return a segment that spans the
793 minimum and maximum positions. If the reference sequence occupies
794 ranges on different physical segments, then it returns them all in an
795 array context, and raises a "multiple segment exception" exception in
796 a scalar context.
797
798 =cut
799
800 #'
801
802 sub segment {
803 my $self = shift;
804 my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self,
805 $self->setup_segment_args(@_));
806 foreach (@segments) {
807 $_->absolute(1) if $self->absolute;
808 }
809
810 $self->_multiple_return_args(@segments);
811 }
812
813 sub _multiple_return_args {
814 my $self = shift;
815 my @args = @_;
816 if (@args == 0) {
817 return;
818 } elsif (@args == 1) {
819 return $args[0];
820 } elsif (wantarray) { # more than one reference sequence
821 return @args;
822 } else {
823 $self->error($args[0]->name,
824 " has more than one reference sequence in database. Please call in a list context to retrieve them all.");
825 $self->throw('multiple segment exception');
826 return;
827 }
828
829 }
830
831 # backward compatibility -- don't use!
832 # (deliberately undocumented too)
833 sub abs_segment {
834 my $self = shift;
835 return $self->segment($self->setup_segment_args(@_),-absolute=>1);
836 }
837
838 sub setup_segment_args {
839 my $self = shift;
840 return @_ if defined $_[0] && $_[0] =~ /^-/;
841 return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3;
842 return (-class=>$_[0],-name=>$_[1]) if @_ == 2;
843 return (-name=>$_[0]) if @_ == 1;
844 }
845
846 =head2 features
847
848 Title : features
849 Usage : $db->features(@args)
850 Function: get all features, possibly filtered by type
851 Returns : a list of Bio::DB::GFF::Feature objects
852 Args : see below
853 Status : public
854
855 This routine will retrieve features in the database regardless of
856 position. It can be used to return all features, or a subset based on
857 their method and source.
858
859 Arguments are as follows:
860
861 -types List of feature types to return. Argument is an array
862 reference containing strings of the format "method:source"
863
864 -merge Whether to apply aggregators to the generated features.
865
866 -rare Turn on optimizations suitable for a relatively rare feature type,
867 where it makes more sense to filter by feature type first,
868 and then by position.
869
870 -attributes A hash reference containing attributes to match.
871
872 -iterator Whether to return an iterator across the features.
873
874 -binsize A true value will create a set of artificial features whose
875 start and stop positions indicate bins of the given size, and
876 whose scores are the number of features in the bin. The
877 class and method of the feature will be set to "bin",
878 its source to "method:source", and its group to "bin:method:source".
879 This is a handy way of generating histograms of feature density.
880
881 If -iterator is true, then the method returns a single scalar value
882 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
883 on this object to fetch each of the features in turn. If iterator is
884 false or absent, then all the features are returned as a list.
885
886 Currently aggregation is disabled when iterating over a series of
887 features.
888
889 Types are indicated using the nomenclature "method:source". Either of
890 these fields can be omitted, in which case a wildcard is used for the
891 missing field. Type names without the colon (e.g. "exon") are
892 interpreted as the method name and a source wild card. Regular
893 expressions are allowed in either field, as in: "similarity:BLAST.*".
894
895 The -attributes argument is a hashref containing one or more attributes
896 to match against:
897
898 -attributes => { Gene => 'abc-1',
899 Note => 'confirmed' }
900
901 Attribute matching is simple string matching, and multiple attributes
902 are ANDed together.
903
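Putting these arguments together, a minimal usage sketch:

  # iterate over curated exons carrying a matching Gene attribute
  my $iterator = $db->features(-types      => ['exon:curated'],
                               -attributes => {Gene => 'sma-3'},
                               -iterator   => 1);
  while (my $f = $iterator->next_seq) {
      print $f, "\n";    # features stringify as in the SYNOPSIS
  }
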
904 =cut
905
906 sub features {
907 my $self = shift;
908 my ($types,$automerge,$sparse,$iterator,$other);
909 if (defined $_[0] &&
910 $_[0] =~ /^-/) {
911 ($types,$automerge,$sparse,$iterator,$other) = rearrange([
912 [qw(TYPE TYPES)],
913 [qw(MERGE AUTOMERGE)],
914 [qw(RARE SPARSE)],
915 'ITERATOR'
916 ],@_);
917 } else {
918 $types = \@_;
919 }
920
921 # for whole database retrievals, we probably don't want to automerge!
922 $automerge = $self->automerge unless defined $automerge;
923 $other ||= {};
924 $self->_features({
925 rangetype => 'contains',
926 types => $types,
927 },
928 { sparse => $sparse,
929 automerge => $automerge,
930 iterator =>$iterator,
931 %$other,
932 }
933 );
934 }
935
936 =head2 get_seq_stream
937
938 Title : get_seq_stream
939 Usage : my $seqio = $self->get_seq_stream(@args)
940 Function: Performs a query and returns an iterator over it
941 Returns : a Bio::SeqIO stream capable of producing sequence
942 Args : As in features()
943 Status : public
944
945 This routine takes the same arguments as features(), but returns a
946 Bio::SeqIO::Stream-compliant object. Use it like this:
947
948 $stream = $db->get_seq_stream('exon');
949 while (my $exon = $stream->next_seq) {
950 print $exon,"\n";
951 }
952
953 NOTE: This is also called get_feature_stream(), since that's what it
954 really does.
955
956 =cut
957
958 sub get_seq_stream {
959 my $self = shift;
960 my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1)
961 : (-types=>\@_,-iterator=>1);
962 $self->features(@args);
963 }
964
965 *get_feature_stream = \&get_seq_stream;
966
967 =head2 get_feature_by_name
968
969 Title : get_feature_by_name
970 Usage : $db->get_feature_by_name($class => $name)
971 Function: fetch features by their name
972 Returns : a list of Bio::DB::GFF::Feature objects
973 Args : the class and name of the desired feature
974 Status : public
975
976 This method can be used to fetch a named feature from the database.
977 GFF annotations are named using the group class and name fields, so
978 for features that belong to a group of size one, this method can be
979 used to retrieve that group (and is equivalent to the segment()
980 method). Any Alias attributes are also searched for matching names.
981
982 An alternative syntax allows you to search for features by name within
983 a circumscribed region:
984
985 @f = $db->get_feature_by_name(-class => $class,-name=>$name,
986 -ref => $sequence_name,
987 -start => $start,
988 -end => $end);
989
990 This method may return zero, one, or several Bio::DB::GFF::Feature
991 objects.
992
993 Aggregation is performed on features as usual.
994
995 NOTE: At various times, this function was called fetch_group(),
996 fetch_feature(), fetch_feature_by_name() and segments(). These names
997 are preserved for backward compatibility.
998
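A minimal usage sketch (the transcript name comes from the GFF example earlier in this document):

  my @hits = $db->get_feature_by_name(Transcript => 'Y95B8A.2');
  print scalar(@hits), " feature(s) named Y95B8A.2\n";
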
999 =cut
1000
1001 sub get_feature_by_name {
1002 my $self = shift;
1003 my ($gclass,$gname,$automerge,$ref,$start,$end);
1004 if (@_ == 1) {
1005 $gclass = $self->default_class;
1006 $gname = shift;
1007 } else {
1008 ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE',
1009 ['REF','REFSEQ'],
1010 'START',['STOP','END']
1011 ],@_);
1012 $gclass ||= $self->default_class;
1013 }
1014 $automerge = $self->automerge unless defined $automerge;
1015
1016 # we need to refactor this... It's repeated code (see below)...
1017 my @aggregators;
1018 if ($automerge) {
1019 for my $a ($self->aggregators) {
1020 push @aggregators,$a if $a->disaggregate([],$self);
1021 }
1022 }
1023
1024 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1025 my $features = [];
1026 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1027 my $location = [$ref,$start,$end] if defined $ref;
1028 $self->_feature_by_name($gclass,$gname,$location,$callback);
1029
1030 warn "aggregating...\n" if $self->debug;
1031 foreach my $a (@aggregators) { # last aggregator gets first shot
1032 $a->aggregate($features,$self) or next;
1033 }
1034
1035 @$features;
1036 }
1037
1038 # horrible indecision regarding proper names!
1039 *fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name;
1040 *segments = \&segment;
1041
1042 =head2 get_feature_by_target
1043
1044 Title : get_feature_by_target
1045 Usage : $db->get_feature_by_target($class => $name)
1046 Function: fetch features by their similarity target
1047 Returns : a list of Bio::DB::GFF::Feature objects
1048 Args : the class and name of the desired feature
1049 Status : public
1050
1051 This method can be used to fetch a named feature from the database
1052 based on its similarity hit.
1053
1054 =cut
1055
1056 sub get_feature_by_target {
1057 shift->get_feature_by_name(@_);
1058 }
1059
1060 =head2 get_feature_by_attribute
1061
1062 Title : get_feature_by_attribute
1063 Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2)
1064 Function: fetch segments by combinations of attribute values
1065 Returns : a list of Bio::DB::GFF::Feature objects
1066 Args : one or more attribute=>value pairs
1067 Status : public
1068
1069 This method can be used to fetch a set of features from the database.
1070 Attributes are a list of name=E<gt>value pairs. They will be logically
1071 ANDed together.
1072
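For example, a minimal sketch matching two attributes at once:

  # features tagged with Gene "sma-3" AND Note "confirmed"
  my @features = $db->get_feature_by_attribute(Gene => 'sma-3',
                                               Note => 'confirmed');
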
1073 =cut
1074
1075 sub get_feature_by_attribute {
1076 my $self = shift;
1077 my %attributes = ref($_[0]) ? %{$_[0]} : @_;
1078
1079 # we need to refactor this... It's repeated code (see above)...
1080 my @aggregators;
1081 if ($self->automerge) {
1082 for my $a ($self->aggregators) {
1083 unshift @aggregators,$a if $a->disaggregate([],$self);
1084 }
1085 }
1086
1087 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1088 my $features = [];
1089 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1090 $self->_feature_by_attribute(\%attributes,$callback);
1091
1092 warn "aggregating...\n" if $self->debug;
1093 foreach my $a (@aggregators) { # last aggregator gets first shot
1094 $a->aggregate($features,$self) or next;
1095 }
1096
1097 @$features;
1098 }
1099
1100 # more indecision...
1101 *fetch_feature_by_attribute = \&get_feature_by_attribute;
1102
1103 =head2 get_feature_by_id
1104
1105 Title : get_feature_by_id
1106 Usage : $db->get_feature_by_id($id)
1107 Function: fetch a feature by its feature ID
1108 Returns : a Bio::DB::GFF::Feature object
1109 Args : the feature ID
1110 Status : public
1111
1112 This method can be used to fetch a feature from the database using its
1113 ID. Not all GFF databases support IDs, so be careful with this.
1114
1115 =cut
1116
1117 sub get_feature_by_id {
1118 my $self = shift;
1119 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1120 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1121 my $features = [];
1122 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1123 $self->_feature_by_id($id,'feature',$callback);
1124 return wantarray ? @$features : $features->[0];
1125 }
1126 *fetch_feature_by_id = \&get_feature_by_id;
1127
1128 =head2 get_feature_by_gid
1129
1130 Title : get_feature_by_gid
1131 Usage : $db->get_feature_by_gid($id)
1132 Function: fetch a feature by its group ID
1133 Returns : a Bio::DB::GFF::Feature object
1134 Args : the group ID
1135 Status : public
1136
1137 This method can be used to fetch a feature from the database using its
1138 group ID. Not all GFF databases support IDs, so be careful with this.
1139
1140 The group ID is often more interesting than the feature ID, since
1141 groups can be complex objects containing subobjects.
1142
1143 =cut
1144
1145 sub get_feature_by_gid {
1146 my $self = shift;
1147 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1148 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1149 my $features = [];
1150 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1151 $self->_feature_by_id($id,'group',$callback);
1152 return wantarray ? @$features : $features->[0];
1153 }
1154 *fetch_feature_by_gid = \&get_feature_by_gid;
1155
1156 =head2 delete_features
1157
1158 Title : delete_features
1159 Usage : $db->delete_features(@ids_or_features)
1160 Function: delete one or more features
1161 Returns : count of features deleted
1162 Args : list of features or feature ids
1163 Status : public
1164
1165 Pass this method a list of numeric feature ids or a set of features.
1166 It will attempt to remove the features from the database and return a
1167 count of the features removed.
1168
1169 NOTE: This method is also called delete_feature(). Also see
1170 delete_groups().
1171
1172 =cut
1173
1174 *delete_feature = \&delete_features;
1175
1176 sub delete_features {
1177 my $self = shift;
1178 my @features_or_ids = @_;
1179 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
1180 return unless @ids;
1181 $self->_delete_features(@ids);
1182 }
1183
1184 =head2 delete_groups
1185
1186 Title : delete_groups
1187 Usage : $db->delete_groups(@ids_or_features)
1188 Function: delete one or more feature groups
1189 Returns : count of features deleted
1190 Args : list of features or feature group ids
1191 Status : public
1192
1193 Pass this method a list of numeric group ids or a set of features. It
1194 will attempt to recursively remove the features and ALL members of
1195 their group from the database. It returns a count of the number of
1196 features (not groups) deleted.
1197
1198 NOTE: This method is also called delete_group(). Also see
1199 delete_features().
1200
1201 =cut
1202
1203 *delete_group = \&delete_groups;
1204
1205 sub delete_groups {
1206 my $self = shift;
1207 my @features_or_ids = @_;
1208 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids;
1209 return unless @ids;
1210 $self->_delete_groups(@ids);
1211 }
1212
1213 =head2 delete
1214
1215 Title : delete
1216 Usage : $db->delete(@args)
1217 Function: delete features
1218 Returns : count of features deleted -- if available
1219 Args : numerous, see below
1220 Status : public
1221
1222 This method deletes all features that overlap the specified region or
1223 are of a particular type. If no arguments are provided and the -force
1224 argument is true, then deletes ALL features.
1225
1226 Arguments:
1227
1228 -name ID of the landmark sequence.
1229
1230 -ref ID of the landmark sequence (synonym for -name).
1231
1232 -class Database object class for the landmark sequence.
1233 "Sequence" assumed if not specified. This is
1234 irrelevant for databases which do not recognize
1235 object classes.
1236
1237 -start Start of the segment relative to landmark. Positions
1238 follow standard 1-based sequence rules. If not specified,
1239 defaults to the beginning of the landmark.
1240
1241 -end Stop of the segment relative to the landmark. If not specified,
1242 defaults to the end of the landmark.
1243
1244 -offset Zero-based addressing
1245
1246 -length Length of region
1247
1248 -type,-types Either a single scalar type to be deleted, or a
1249 reference to an array of types.
1250
1251 -force Force operation to be performed even if it would delete
1252 entire feature table.
1253
1254 -range_type Control the range type of the deletion. One of "overlaps" (default)
1255 "contains" or "contained_in"
1256
1257 Examples:
1258
1259 $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats
1260 $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000
1261 $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3
1262
1263 The short form of this call, as described in segment() is also allowed:
1264
1265 $db->delete("chr3",1=>1000);
1266 $db->delete("chr3");
1267
1268 IMPORTANT NOTE: This method only deletes features. It does *NOT*
1269 delete the names of groups that contain the deleted features. Group
1270 IDs will be reused if you later load a feature with the same group
1271 name as one that was previously deleted.
1272
1273 NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the
1274 result code from the SQL DELETE operation. Some dbd drivers return the
1275 count of rows deleted, while others return 0E0. Caveat emptor.
1276
1277 =cut
1278
1279 sub delete {
1280 my $self = shift;
1281 my @args = $self->setup_segment_args(@_);
1282 my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) =
1283 rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET',
1284 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args);
1285 $offset = 0 unless defined $offset;
1286 $start = $offset+1 unless defined $start;
1287 $end = $start+$length-1 if !defined $end and $length;
1288 $class ||= $self->default_class;
1289
1290 my $types = $self->parse_types($type); # parse out list of types
1291
1292 $range_type ||= 'overlaps';
1293 $self->throw("range type must be one of {".
1294 join(',',keys %valid_range_types).
1295 "}\n")
1296 unless $valid_range_types{lc $range_type};
1297
1298
1299 my @segments;
1300 if (defined $name && $name ne '') {
1301 my @args = (-name=>$name,-class=>$class);
1302 push @args,(-start=>$start) if defined $start;
1303 push @args,(-end =>$end) if defined $end;
1304 @segments = $self->segment(@args);
1305 return unless @segments;
1306 }
1307 $self->_delete({segments => \@segments,
1308 types => $types,
1309 range_type => $range_type,
1310 force => $force}
1311 );
1312 }
1313
1314 =head2 absolute
1315
1316 Title : absolute
1317 Usage : $abs = $db->absolute([$abs]);
1318 Function: gets/sets absolute mode
1319 Returns : current setting of absolute mode boolean
1320 Args : new setting for absolute mode boolean
1321 Status : public
1322
1323 $db-E<gt>absolute(1) will turn on absolute mode for the entire database.
1324 All segments retrieved will use absolute coordinates by default,
1325 rather than relative coordinates. You can still set them to use
1326 relative coordinates by calling $segment-E<gt>absolute(0).
1327
1328 Note that this is not the same as calling abs_segment(); it continues
1329 to allow you to look up groups that are not used directly as reference
1330 sequences.
1331
1332 =cut
1333
1334 sub absolute {
1335 my $self = shift;
1336 my $d = $self->{absolute};
1337 $self->{absolute} = shift if @_;
1338 $d;
1339 }
1340
1341 =head2 strict_bounds_checking
1342
1343 Title : strict_bounds_checking
1344 Usage : $flag = $db->strict_bounds_checking([$flag])
1345 Function: gets/sets strict bounds checking
1346 Returns : current setting of bounds checking flag
1347 Args : new setting for bounds checking flag
1348 Status : public
1349
1350 This flag enables extra checks for segment requests that go beyond the
1351 ends of their reference sequences. If bounds checking is enabled,
1352 then retrieved segments will be truncated to their physical length,
1353 and their truncated() methods will return true.
1354
1355 If the flag is off (the default), then the module will return segments
1356 that appear to extend beyond their physical boundaries. Requests for
1357 features beyond the end of the segment will, however, return empty.
1358
1359 =cut
1360
1361 sub strict_bounds_checking {
1362 my $self = shift;
1363 my $d = $self->{strict};
1364 $self->{strict} = shift if @_;
1365 $d;
1366 }
1367
1368 =head2 get_Seq_by_id
1369
1370 Title : get_Seq_by_id
1371 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
1372 Function: Gets a Bio::Seq object by its name
1373 Returns : a Bio::Seq object
1374 Args : the id (as a string) of a sequence
1375 Throws : "id does not exist" exception
1376
1377 NOTE: Bio::DB::RandomAccessI compliant method
1378
1379 =cut
1380
1381 sub get_Seq_by_id {
1382 my $self = shift;
1383 my $id = shift;
1384 my $stream = $self->get_Stream_by_id($id);
1385 return $stream->next_seq;
1386 }
1387
1388
1389 =head2 get_Seq_by_accession
1390
1391 Title : get_Seq_by_accession
1392 Usage : $seq = $db->get_Seq_by_accession('AL12234')
1393 Function: Gets a Bio::Seq object by its accession
1394 Returns : a Bio::Seq object
1395 Args : the id (as a string) of a sequence
1396 Throws : "id does not exist" exception
1397
1398 NOTE: Bio::DB::RandomAccessI compliant method
1399
1400 =cut
1401
1402 sub get_Seq_by_accession {
1403 my $self = shift;
1404 my $id = shift;
1405 my $stream = $self->get_Stream_by_accession($id);
1406 return $stream->next_seq;
1407 }
1408
1409 =head2 get_Stream_by_acc ()
1410
1411 =cut
1412
1413 =head2 get_Seq_by_acc
1414
1415 Title : get_Seq_by_acc
1416 Usage : $seq = $db->get_Seq_by_acc('X77802');
1417 Function: Gets a Bio::Seq object by accession number
1418 Returns : A Bio::Seq object
1419 Args : accession number (as a string)
1420 Throws : "acc does not exist" exception
1421
1422 NOTE: Bio::DB::RandomAccessI compliant method
1423
1424 =cut
1425
1426 sub get_Stream_by_name {
1427 my $self = shift;
1428 my @ids = @_;
1429 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1430 Bio::DB::GFF::ID_Iterator->new($self,$id,'name');
1431 }
1432
1433 =head2 get_Stream_by_id
1434
1435 Title : get_Stream_by_id
1436 Usage : $seq = $db->get_Stream_by_id(@ids);
1437 Function: Retrieves a stream of Seq objects given their ids
1438 Returns : a Bio::SeqIO stream object
1439 Args : an array of unique ids/accession numbers, or
1440 an array reference
1441
1442 NOTE: This is also called get_Stream_by_batch()
1443
1444 =cut
1445
1446 sub get_Stream_by_id {
1447 my $self = shift;
1448 my @ids = @_;
1449 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1450 Bio::DB::GFF::ID_Iterator->new($self,$id,'feature');
1451 }
1452
1453 =head2 get_Stream_by_batch ()
1454
1455 Title : get_Stream_by_batch
1456 Usage : $seq = $db->get_Stream_by_batch(@ids);
1457 Function: Retrieves a stream of Seq objects given their ids
1458 Returns : a Bio::SeqIO stream object
1459 Args : an array of unique ids/accession numbers, or
1460 an array reference
1461
1462 NOTE: This is the same as get_Stream_by_id().
1463
1464 =cut
1465
1466 *get_Stream_by_batch = \&get_Stream_by_id;
1467
1468
1469 =head2 get_Stream_by_group ()
1470
1471 Bioperl compatibility.
1472
1473 =cut
1474
1475 sub get_Stream_by_group {
1476 my $self = shift;
1477 my @ids = @_;
1478 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1479 Bio::DB::GFF::ID_Iterator->new($self,$id,'group');
1480 }
1481
1482 =head2 all_seqfeatures
1483
1484 Title : all_seqfeatures
1485 Usage : @features = $db->all_seqfeatures(@args)
1486 Function: fetch all the features in the database
1487 Returns : an array of features, or an iterator
1488 Args : See below
1489 Status : public
1490
1491 This is equivalent to calling $db-E<gt>features() without any types, and
1492 will return all the features in the database. The -merge and
1493 -iterator arguments are recognized, and behave the same as described
1494 for features().
1495
1496 =cut
1497
1498 sub all_seqfeatures {
1499 my $self = shift;
1500 my ($automerge,$iterator)= rearrange([
1501 [qw(MERGE AUTOMERGE)],
1502 'ITERATOR'
1503 ],@_);
1504 my @args;
1505 push @args,(-merge=>$automerge) if defined $automerge;
1506 push @args,(-iterator=>$iterator) if defined $iterator;
1507 $self->features(@args);
1508 }
1509
1510 =head1 Creating and Loading GFF Databases
1511
1512 =head2 initialize
1513
1514 Title : initialize
1515 Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2);
1516 Function: initialize a GFF database
1517 Returns : true if initialization successful
1518 Args : a set of named parameters
1519 Status : Public
1520
1521 This method can be used to initialize an empty database. It takes the following
1522 named arguments:
1523
1524 -erase A boolean value. If true the database will be wiped clean if it
1525 already contains data.
1526
1527 Other named arguments may be recognized by subclasses. They become database
1528 meta values that control various settable options.
1529
1530 As a shortcut (and for backward compatibility) a single true argument
1531 is the same as initialize(-erase=E<gt>1).
1532
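A minimal usage sketch for (re)creating the schema before a bulk load:

  $db->initialize(-erase => 1)
      or die "unable to initialize the database";
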
1533 =cut
1534
1535 sub initialize {
1536 my $self = shift;
1537 #$self->do_initialize(1) if @_ == 1 && $_[0];
1538 #why was this line (^) here? I can't see that it actually does anything
1539 #one option would be to execute the line and return, but I don't know
1540 #why you would want to do that either.
1541
1542 my ($erase,$meta) = rearrange(['ERASE'],@_);
1543 $meta ||= {};
1544
1545 # initialize (possibly erasing)
1546 return unless $self->do_initialize($erase);
1547 my @default = $self->default_meta_values;
1548
1549 # this is an awkward way of uppercasing the even-indexed
1550 # elements, i.e. the meta keys (necessary for case-insensitive SQL databases)
1551 for (my $i=0; $i<@default; $i++) {
1552 $default[$i] = uc $default[$i] if !($i % 2);
1553 }
1554
1555 my %values = (@default,%$meta);
1556 foreach (keys %values) {
1557 $self->meta($_ => $values{$_});
1558 }
1559 1;
1560 }
1561
1562
1563 =head2 load_gff
1564
1565 Title : load_gff
1566 Usage : $db->load_gff($file|$directory|$filehandle);
1567 Function: load GFF data into database
1568 Returns : count of records loaded
1569 Args : a directory, a file, a list of files,
1570 or a filehandle
1571 Status : Public
1572
1573 This method takes a single overloaded argument, which can be any of:
1574
1575 =over 4
1576
1577 =item 1. a scalar corresponding to a GFF file on the system
1578
1579 A pathname to a local GFF file. Any files ending with the .gz, .Z, or
1580 .bz2 suffixes will be transparently decompressed with the appropriate
1581 command-line utility.
1582
1583 =item 2. an array reference containing a list of GFF files on the system
1584
1585 For example ['/home/gff/gff1.gz','/home/gff/gff2.gz']
1586
1587 =item 3. directory path
1588
1589 The indicated directory will be searched for all files ending in the
1590 suffixes .gff, .gff.gz, .gff.Z or .gff.bz2.
1591
1592 =item 4. filehandle
1593
1594 An open filehandle from which to read the GFF data. Tied filehandles
1595 now work as well.
1596
1597 =item 5. a pipe expression
1598
1599 A pipe expression will also work. For example, a GFF file on a remote
1600 web server can be loaded with an expression like this:
1601
1602 $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |");
1603
1604 =back
1605
1606 If successful, the method will return the number of GFF lines
1607 successfully loaded.
1608
1609 NOTE: this method used to be called load(), but has been changed. The
1610 old method name is also recognized.
1611
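A minimal usage sketch; the directory paths below are hypothetical and stand in for wherever your GFF and FASTA files live:

  # load every GFF file in a directory, then the corresponding DNA
  my $lines = $db->load_gff('/usr/local/share/gff');     # hypothetical path
  print "loaded $lines GFF lines\n";
  $db->load_fasta('/usr/local/share/fasta');             # hypothetical path
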
1612 =cut
1613
1614 sub load_gff {
1615 my $self = shift;
1616 my $file_or_directory = shift || '.';
1617 return $self->do_load_gff($file_or_directory) if ref($file_or_directory) &&
1618 tied *$file_or_directory;
1619
1620 my $tied_stdin = tied(*STDIN);
1621 open SAVEIN,"<&STDIN" unless $tied_stdin;
1622 local @ARGV = $self->setup_argv($file_or_directory,'gff') or return; # to play tricks with reader
1623 my $result = $self->do_load_gff('ARGV');
1624 open STDIN,"<&SAVEIN" unless $tied_stdin; # restore STDIN
1625 return $result;
1626 }
1627
1628 *load = \&load_gff;
1629
1630 =head2 load_fasta
1631
1632 Title : load_fasta
1633 Usage : $db->load_fasta($file|$directory|$filehandle);
1634 Function: load FASTA data into database
1635 Returns : count of records loaded
1636 Args : a directory, a file, a list of files,
1637 or a filehandle
1638 Status : Public
1639
1640 This method takes a single overloaded argument, which can be any of:
1641
1642 =over 4
1643
1644 =item 1. scalar corresponding to a FASTA file on the system
1645
1646 A pathname to a local FASTA file. Any files ending with the .gz, .Z, or
1647 .bz2 suffixes will be transparently decompressed with the appropriate
1648 command-line utility.
1649
1650 =item 2. array reference containing a list of FASTA files on the
1651 system
1652
1653 For example ['/home/fasta/genomic1.fa.gz','/home/fasta/genomic2.fa.gz']
1654
1655 =item 3. path to a directory
1656
1657 The indicated directory will be searched for all files ending in the
1658 suffixes .fa, .fa.gz, .fa.Z or .fa.bz2.
1659
1660 =item 4. filehandle
1661
1662 An open filehandle from which to read the FASTA data.
1663
1664 =item 5. pipe expression
1665
1666 A pipe expression will also work. For example, a FASTA file on a remote
1667 web server can be loaded with an expression like this:
1668
1669 $db->load_fasta("lynx -dump -source http://stein.cshl.org/fasta_test.fa |");
1670
1671 =back
1672
1673 =cut
1674
1675 sub load_fasta {
1676 my $self = shift;
1677 my $file_or_directory = shift || '.';
1678 return $self->load_sequence($file_or_directory) if ref($file_or_directory) &&
1679 tied *$file_or_directory;
1680
1681 my $tied = tied(*STDIN);
1682 open SAVEIN,"<&STDIN" unless $tied;
1683 local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader
1684 my $result = $self->load_sequence('ARGV');
1685 open STDIN,"<&SAVEIN" unless $tied; # restore STDIN
1686 return $result;
1687 }
1688
1689 =head2 load_sequence_string
1690
1691 Title : load_sequence_string
1692 Usage : $db->load_sequence_string($id,$dna)
1693 Function: load a single DNA entry
1694 Returns : true if successfully loaded
1695 Args : a sequence ID and a raw sequence string (DNA, RNA or protein)
1696 Status : Public
1697
1698 =cut
1699
1700 sub load_sequence_string {
1701 my $self = shift;
1702 my ($acc,$seq) = @_;
1703 my $offset = 0;
1704 $self->insert_sequence_chunk($acc,\$offset,\$seq) or return;
1705 $self->insert_sequence($acc,$offset,$seq) or return;
1706 1;
1707 }
1708
1709 sub setup_argv {
1710 my $self = shift;
1711 my $file_or_directory = shift;
1712 my @suffixes = @_;
1713 no strict 'refs'; # so that we can call fileno() on the argument
1714
1715 my @argv;
1716
1717 if (-d $file_or_directory) {
1718 @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes;
1719 }elsif (my $fd = fileno($file_or_directory)) {
1720 open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN");
1721 @argv = '-';
1722 } elsif (ref $file_or_directory) {
1723 @argv = @$file_or_directory;
1724 } else {
1725 @argv = $file_or_directory;
1726 }
1727
1728 foreach (@argv) {
1729 if (/\.gz$/) {
1730 $_ = "gunzip -c $_ |";
1731 } elsif (/\.Z$/) {
1732 $_ = "uncompress -c $_ |";
1733 } elsif (/\.bz2$/) {
1734 $_ = "bunzip2 -c $_ |";
1735 }
1736 }
1737 @argv;
1738 }
1739
1740 =head2 lock_on_load
1741
1742 Title : lock_on_load
1743 Usage : $lock = $db->lock_on_load([$lock])
1744 Function: set write locking during load
1745 Returns : current value of lock-on-load flag
1746 Args : new value of lock-on-load-flag
1747 Status : Public
1748
1749 This method is honored by some of the adaptors. If the value is true,
1750 the tables used by the GFF modules will be locked for writing during
1751 loads and inaccessible to other processes.
1752
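For example, to request write locking around a bulk load (the file name is
illustrative):

  $db->lock_on_load(1);              # lock tables during the load
  $db->load_gff('features.gff');
  $db->lock_on_load(0);              # restore normal behavior
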
1753 =cut
1754
1755 sub lock_on_load {
1756 my $self = shift;
1757 my $d = $self->{lock};
1758 $self->{lock} = shift if @_;
1759 $d;
1760 }
1761
1762 =head2 meta
1763
1764 Title : meta
1765 Usage : $value = $db->meta($name [,$newval])
1766 Function: get or set a meta variable
1767 Returns : a string
1768 Args : meta variable name and optionally value
1769 Status : abstract
1770
1771 Get or set a named meta value for the database. Meta values can be
1772 used for database-specific settings.
1773
1774 By default, this method does nothing!
1775
1776 =cut
1777
1778 sub meta {
1779 my $self = shift;
1780 my ($name,$value) = @_;
1781 return;
1782 }
1783
1784 =head2 default_meta_values
1785
1786 Title : default_meta_values
1787 Usage : %values = $db->default_meta_values
1788 Function: return default meta values for the database
1789 Returns : a list of tag=>value pairs
1790 Args : none
1791 Status : protected
1792
1793 This method returns a list of tag=E<gt>value pairs that contain default
1794 meta information about the database. It is invoked by initialize() to
1795 write out the default meta values. The base class version returns an
1796 empty list.
1797
1798 For things to work properly, meta value names must be UPPERCASE.
1799
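A minimal sketch of how an adaptor subclass might override this method; the
meta names shown are illustrative, not required keys:

  sub default_meta_values {
    my $self = shift;
    # names must be UPPERCASE for case-insensitive SQL databases
    return ( VERSION       => 1,
             DEFAULT_CLASS => 'Sequence' );
  }
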
1800 =cut
1801
1802 sub default_meta_values {
1803 my $self = shift;
1804 return ();
1805 }
1806
1807
1808 =head2 error
1809
1810 Title : error
1811 Usage : $db->error( [$new error] );
1812 Function: read or set error message
1813 Returns : error message
1814 Args : an optional argument to set the error message
1815 Status : Public
1816
1817 This method can be used to retrieve the last error message. Errors
1818 are not reset to empty by successful calls, so contents are only valid
1819 immediately after an error condition has been detected.
1820
1821 =cut
1822
1823 sub error {
1824 my $self = shift;
1825 my $g = $self->{error};
1826 $self->{error} = join '',@_ if @_;
1827 $g;
1828 }
1829
1830 =head2 debug
1831
1832 Title : debug
1833 Usage : $db->debug( [$flag] );
1834 Function: read or set debug flag
1835 Returns : current value of debug flag
1836 Args : new debug flag (optional)
1837 Status : Public
1838
1839 This method can be used to turn on debug messages. The exact nature
1840 of those messages depends on the adaptor in use.
1841
1842 =cut
1843
1844 sub debug {
1845 my $self = shift;
1846 my $g = $self->{debug};
1847 $self->{debug} = shift if @_;
1848 $g;
1849 }
1850
1851
1852 =head2 automerge
1853
1854 Title : automerge
1855 Usage : $db->automerge( [$new automerge] );
1856 Function: get or set automerge value
1857 Returns : current value (boolean)
1858 Args : an optional argument to set the automerge value
1859 Status : Public
1860
1861 By default, this module will use the aggregators to merge groups into
1862 single composite objects. This default can be changed to false by
1863 calling automerge(0).
1864
1865 =cut
1866
1867 sub automerge {
1868 my $self = shift;
1869 my $g = $self->{automerge};
1870 $self->{automerge} = shift if @_;
1871 $g;
1872 }
1873
1874 =head2 attributes
1875
1876 Title : attributes
1877 Usage : @attributes = $db->attributes($id,$name)
1878 Function: get the attributes of a particular feature
1879 Returns : an array of strings
1880 Args : feature ID and optionally an attribute name
1881 Status : public
1882
1883 Some GFF version 2 files use the groups column to store a series of
1884 attribute/value pairs. In this interpretation of GFF, the first such
1885 pair is treated as the primary group for the feature; subsequent pairs
1886 are treated as attributes. Two attributes have special meaning:
1887 "Note" is for backward compatibility and is used for unstructured text
1888 remarks. "Alias" is treated as a synonym for the feature name.
1889
1890 If no name is provided, then attributes() returns a flattened hash of
1891 attribute=E<gt>value pairs. This lets you do:
1892
1893 %attributes = $db->attributes($id);
1894
1895 Normally, attributes() will be called by the feature:
1896
1897 @notes = $feature->attributes('Note');
1898
1899 In a scalar context, attributes() returns the first value of the
1900 attribute if a tag is present, otherwise a hash reference in which the
1901 keys are attribute names and the values are anonymous arrays
1902 containing the values.
1903
1904 =cut
1905
1906 sub attributes {
1907 my $self = shift;
1908 my ($id,$tag) = @_;
1909 my @result = $self->do_attributes($id,$tag) or return;
1910 return @result if wantarray;
1911
1912 # what to do in a scalar context
1913 return $result[0] if $tag;
1914 my %result;
1915 while (my($key,$value) = splice(@result,0,2)) {
1916 push @{$result{$key}},$value;
1917 }
1918 return \%result;
1919 }
1920
1921 =head2 fast_queries
1922
1923 Title : fast_queries
1924 Usage : $flag = $db->fast_queries([$flag])
1925 Function: turn on and off the "fast queries" option
1926 Returns : a boolean
1927 Args : a boolean flag (optional)
1928 Status : public
1929
1930 The mysql database driver (and possibly others) supports a "fast" query
1931 mode that caches results on the server side. This makes queries come
1932 back faster, particularly when creating iterators. The downside is
1933 that while iterating, new queries will die with a "commands out of sync"
1934 error. This method turns the feature on and off.
1935
1936 For databases that do not support a fast query, this method has no
1937 effect.
1938
1939 =cut
1940
1941 # override this method in order to set the mysql_use_result attribute, which is an obscure
1942 # but extremely powerful optimization for both performance and memory.
1943 sub fast_queries {
1944 my $self = shift;
1945 my $d = $self->{fast_queries};
1946 $self->{fast_queries} = shift if @_;
1947 $d;
1948 }
1949
1950 =head2 add_aggregator
1951
1952 Title : add_aggregator
1953 Usage : $db->add_aggregator($aggregator)
1954 Function: add an aggregator to the list
1955 Returns : nothing
1956 Args : an aggregator
1957 Status : public
1958
1959 This method will append an aggregator to the end of the list of
1960 registered aggregators. Three different argument types are accepted:
1961
1962 1) a Bio::DB::GFF::Aggregator object -- will be added
1963 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}"
1964 -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method
1965 part is optional).
1966 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator
1967 subclass, where the token corresponds to the subclass name.
1968
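For example, the three argument styles might look like this (the aggregator
names and subparts are illustrative):

  # 1. a ready-made aggregator object
  $db->add_aggregator(Bio::DB::GFF::Aggregator->new(-method    => 'gene',
                                                     -sub_parts => ['exon','CDS']));

  # 2. a string specification, with an optional /main_method part
  $db->add_aggregator("gene{exon,CDS/transcript}");

  # 3. the name of a Bio::DB::GFF::Aggregator subclass
  $db->add_aggregator('processed_transcript');
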
1969 =cut
1970
1971 sub add_aggregator {
1972 my $self = shift;
1973 my $aggregator = shift;
1974 my $list = $self->{aggregators} ||= [];
1975 if (ref $aggregator) { # an object
1976 @$list = grep {$_->get_method ne $aggregator->get_method} @$list;
1977 push @$list,$aggregator;
1978 }
1979
1980 elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) {
1981 my($agg_name,$subparts,$mainpart) = ($1,$2,$3);
1982 my @subparts = split /,\s*/,$subparts;
1983 my @args = (-method => $agg_name,
1984 -sub_parts => \@subparts);
1985 push @args,(-main_method => $mainpart) if $mainpart;
1986 warn "making an aggregator with (@args), subparts = @subparts" if $self->debug;
1987 push @$list,Bio::DB::GFF::Aggregator->new(@args);
1988 }
1989
1990 else {
1991 my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E";
1992 eval "require $class";
1993 $self->throw("Unable to load $aggregator aggregator: $@") if $@;
1994 push @$list,$class->new();
1995 }
1996 }
1997
1998 =head2 aggregators
1999
2000 Title : aggregators
2001 Usage : $db->aggregators([@new_aggregators]);
2002 Function: retrieve list of aggregators
2003 Returns : list of aggregators
2004 Args : a list of aggregators to set (optional)
2005 Status : public
2006
2007 This method will get or set the list of aggregators assigned to
2008 the database. If one or more arguments are passed, the existing
2009 set is cleared and replaced by the new list.
2010
2011 =cut
2012
2013 sub aggregators {
2014 my $self = shift;
2015 my $d = $self->{aggregators};
2016 if (@_) {
2017 $self->clear_aggregators;
2018 $self->add_aggregator($_) foreach @_;
2019 }
2020 return unless $d;
2021 return @$d;
2022 }
2023
2024 =head2 clear_aggregators
2025
2026 Title : clear_aggregators
2027 Usage : $db->clear_aggregators
2028 Function: clears list of aggregators
2029 Returns : nothing
2030 Args : none
2031 Status : public
2032
2033 This method will clear the aggregators stored in the database object.
2034 Use aggregators() or add_aggregator() to add some back.
2035
2036 =cut
2037
2038 sub clear_aggregators { shift->{aggregators} = [] }
2039
2040 =head1 Methods for use by Subclasses
2041
2042 The following methods are chiefly of interest to subclasses and are
2043 not intended for use by end programmers.
2044
2045 =head2 abscoords
2046
2047 Title : abscoords
2048 Usage : $db->abscoords($name,$class,$refseq)
2049 Function: finds position of a landmark in reference coordinates
2050 Returns : ($ref,$class,$start,$stop,$strand)
2051 Args : name and class of landmark, and an optional reference sequence
2052 Status : public
2053
2054 This method is called by Bio::DB::GFF::RelSegment to obtain the
2055 absolute coordinates of a sequence landmark. The arguments are the
2056 name and class of the landmark. If successful, abscoords() returns
2057 the ID of the reference sequence, its class, its start and stop
2058 positions, and the orientation of the reference sequence's coordinate
2059 system ("+" for forward strand, "-" for reverse strand).
2060
2061 If $refseq is present in the argument list, it forces the query to
2062 search for the landmark in a particular reference sequence.
2063
2064 =cut
2065
2066 sub abscoords {
2067 my $self = shift;
2068 my ($name,$class,$refseq) = @_;
2069 $class ||= $self->{default_class};
2070 $self->get_abscoords($name,$class,$refseq);
2071 }
2072
2073 =head1 Protected API
2074
2075 The following methods are not intended for public consumption, but are
2076 intended to be overridden/implemented by adaptors.
2077
2078 =head2 default_aggregators
2079
2080 Title : default_aggregators
2081 Usage : $db->default_aggregators;
2082 Function: retrieve list of aggregators
2083 Returns : array reference containing list of aggregator names
2084 Args : none
2085 Status : protected
2086
2087 This method (which is intended to be overridden by adaptors) returns an
2088 array reference listing the standard aggregators to be applied when no
2089 aggregators are specified in the constructor.
2090
2091 =cut
2092
2093 sub default_aggregators {
2094 my $self = shift;
2095 return ['processed_transcript','alignment'];
2096 }
2097
2098 =head2 do_load_gff
2099
2100 Title : do_load_gff
2101 Usage : $db->do_load_gff($handle)
2102 Function: load a GFF input stream
2103 Returns : number of features loaded
2104 Args : A filehandle.
2105 Status : protected
2106
2107 This method is called to load a GFF data stream. The method will read
2108 GFF features from E<lt>E<gt> and load them into the database. On exit the
2109 method must return the number of features loaded.
2110
2111 Note that the method is responsible for parsing the GFF lines. This
2112 is to allow for differences in the interpretation of the "group"
2113 field, which are legion.
2114
2115 You probably want to use load_gff() instead. It is more flexible
2116 about the arguments it accepts.
2117
2118 =cut
2119
2120 # load from <>
2121 sub do_load_gff {
2122 my $self = shift;
2123 my $io_handle = shift;
2124
2125 local $self->{gff3_flag} = 0;
2126 $self->setup_load();
2127
2128 my $fasta_sequence_id;
2129
2130 while (<$io_handle>) {
2131 chomp;
2132 $self->{gff3_flag}++ if /^\#\#gff-version\s+3/;
2133 if (/^>(\S+)/) { # uh oh, sequence coming
2134 $fasta_sequence_id = $1;
2135 last;
2136 }
2137 if (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
2138 $self->load_gff_line(
2139 {
2140 ref => $1,
2141 class => 'Sequence',
2142 source => 'reference',
2143 method => 'Component',
2144 start => $2,
2145 stop => $3,
2146 score => undef,
2147 strand => undef,
2148 phase => undef,
2149 gclass => 'Sequence',
2150 gname => $1,
2151 tstart => undef,
2152 tstop => undef,
2153 attributes => [],
2154 }
2155 );
2156 next;
2157 }
2158
2159 next if /^\#/;
2160 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
2161 next unless defined($ref) && defined($method) && defined($start) && defined($stop);
2162 foreach (\$score,\$strand,\$phase) {
2163 undef $$_ if $$_ eq '.';
2164 }
2165
2166 my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{gff3_flag});
2167
2168 # no standard way in the GFF file to denote the class of the reference sequence -- drat!
2169 # so we invoke the factory to do it
2170 my $class = $self->refclass($ref);
2171
2172 # call subclass to do the dirty work
2173 if ($start > $stop) {
2174 ($start,$stop) = ($stop,$start);
2175 if ($strand eq '+') {
2176 $strand = '-';
2177 } elsif ($strand eq '-') {
2178 $strand = '+';
2179 }
2180 }
2181 $self->load_gff_line({ref => $ref,
2182 class => $class,
2183 source => $source,
2184 method => $method,
2185 start => $start,
2186 stop => $stop,
2187 score => $score,
2188 strand => $strand,
2189 phase => $phase,
2190 gclass => $gclass,
2191 gname => $gname,
2192 tstart => $tstart,
2193 tstop => $tstop,
2194 attributes => $attributes}
2195 );
2196 }
2197
2198 my $result = $self->finish_load();
2199 $result += $self->load_sequence($io_handle,$fasta_sequence_id)
2200 if defined $fasta_sequence_id;
2201 $result;
2202
2203 }
2204
2205 =head2 load_sequence
2206
2207 Title : load_sequence
2208 Usage : $db->load_sequence($handle [,$id])
2209 Function: load a FASTA data stream
2210 Returns : number of sequences
2211 Args : a filehandle and optionally the ID of
2212 the first sequence in the stream.
2213 Status : protected
2214
2215 You probably want to use load_fasta() instead. The $id argument is a
2216 hack used to switch from GFF loading to FASTA loading when load_gff()
2217 discovers FASTA data hiding at the bottom of the GFF file (as Artemis
2218 does).
2219
2220 =cut
2221
2222 sub load_sequence {
2223 my $self = shift;
2224 my $io_handle = shift;
2225 my $id = shift; # hack for GFF files that contain fasta data
2226
2227 # read fasta file(s) from ARGV
2228 my ($seq,$offset,$loaded) = (undef,0,0);
2229 while (<$io_handle>) {
2230 chomp;
2231 if (/^>(\S+)/) {
2232 $self->insert_sequence($id,$offset,$seq) if $id;
2233 $id = $1;
2234 $offset = 0;
2235 $seq = '';
2236 $loaded++;
2237 } else {
2238 $seq .= $_;
2239 $self->insert_sequence_chunk($id,\$offset,\$seq);
2240 }
2241 }
2242 $self->insert_sequence($id,$offset,$seq) if $id;
2243 $loaded+0;
2244 }
2245
2246 sub insert_sequence_chunk {
2247 my $self = shift;
2248 my ($id,$offsetp,$seqp) = @_;
2249 if (my $cs = $self->dna_chunk_size) {
2250 while (length($$seqp) >= $cs) {
2251 my $chunk = substr($$seqp,0,$cs);
2252 $self->insert_sequence($id,$$offsetp,$chunk);
2253 $$offsetp += length($chunk);
2254 substr($$seqp,0,$cs) = '';
2255 }
2256 }
2257 return 1; # the calling routine may expect success or failure
2258 }
2259
2260 # used to store big pieces of DNA in itty bitty pieces
2261 sub dna_chunk_size {
2262 return 0;
2263 }
2264
2265 sub insert_sequence {
2266 my $self = shift;
2267 my($id,$offset,$seq) = @_;
2268 $self->throw('insert_sequence(): must be defined in subclass');
2269 }
2270
2271 # This is the default class for reference points. Defaults to Sequence.
2272 sub default_class {
2273 my $self = shift;
2274 my $d = exists($self->{default_class}) ? $self->{default_class} : 'Sequence';
2275 $self->{default_class} = shift if @_;
2276 $d;
2277 }
2278
2279 # gets name of the reference sequence, and returns its class
2280 # currently just calls default_class
2281 sub refclass {
2282 my $self = shift;
2283 my $name = shift;
2284 return $self->default_class;
2285 }
2286
2287 =head2 setup_load
2288
2289 Title : setup_load
2290 Usage : $db->setup_load
2291 Function: called before load_gff_line()
2292 Returns : void
2293 Args : none
2294 Status : abstract
2295
2296 This abstract method gives subclasses a chance to do any
2297 schema-specific initialization prior to loading a set of GFF records.
2298 It must be implemented by a subclass.
2299
2300 =cut
2301
2302 sub setup_load {
2303 # default, do nothing
2304 }
2305
2306 =head2 finish_load
2307
2308 Title : finish_load
2309 Usage : $db->finish_load
2310 Function: called after load_gff_line()
2311 Returns : number of records loaded
2312 Args : none
2313 Status : abstract
2314
2315 This method gives subclasses a chance to do any schema-specific
2316 cleanup after loading a set of GFF records.
2317
2318 =cut
2319
2320 sub finish_load {
2321 # default, do nothing
2322 }
2323
2324 =head2 load_gff_line
2325
2326 Title : load_gff_line
2327 Usage : $db->load_gff_line(@args)
2328 Function: called to load one parsed line of GFF
2329 Returns : true if successfully inserted
2330 Args : see below
2331 Status : abstract
2332
2333 This abstract method is called once per line of the GFF and passed a
2334 hashref containing parsed GFF fields. The fields are:
2335
2336 {ref => $ref,
2337 class => $class,
2338 source => $source,
2339 method => $method,
2340 start => $start,
2341 stop => $stop,
2342 score => $score,
2343 strand => $strand,
2344 phase => $phase,
2345 gclass => $gclass,
2346 gname => $gname,
2347 tstart => $tstart,
2348 tstop => $tstop,
2349 attributes => $attributes}
2350
2351 =cut
2352
2353 sub load_gff_line {
2354 shift->throw("load_gff_line(): must be implemented by an adaptor");
2355 }
2356
2357
2358 =head2 do_initialize
2359
2360 Title : do_initialize
2361 Usage : $db->do_initialize([$erase])
2362 Function: initialize and possibly erase database
2363 Returns : true if successful
2364 Args : optional erase flag
2365 Status : protected
2366
2367 This method implements the initialize() method described above, and
2368 takes the same arguments.
2369
2370 =cut
2371
2372 sub do_initialize {
2373 shift->throw('do_initialize(): must be implemented by an adaptor');
2374 }
2375
2376 =head2 dna
2377
2378 Title : dna
2379 Usage : $db->dna($id,$start,$stop,$class)
2380 Function: return the raw DNA string for a segment
2381 Returns : a raw DNA string
2382 Args : ID of the sequence, start and stop positions, and its class
2383 Status : public
2384
2385 This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA
2386 sequence.
2387
2388 Arguments: -name sequence name
2389 -start start position
2390 -stop stop position
2391 -class sequence class
2392
2393 If start and stop are both undef, then the entire DNA is retrieved.
2394 So to fetch the whole DNA, call like this:
2395
2396 $db->dna($name_of_sequence);
2397
2398 or like this:
2399
2400 $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence);
2401
2402 NOTE: you will probably prefer to create a Segment and then invoke its
2403 dna() method.
2404
2405 =cut
2406
2407 # call to return the DNA string for the indicated region
2408 # real work is done by get_dna()
2409 sub dna {
2410 my $self = shift;
2411 my ($id,$start,$stop,$class) = rearrange([
2412 [qw(NAME ID REF REFSEQ)],
2413 qw(START),
2414 [qw(STOP END)],
2415 'CLASS',
2416 ],@_);
2417 # return unless defined $start && defined $stop;
2418 $self->get_dna($id,$start,$stop,$class);
2419 }
2420
2421 sub features_in_range {
2422 my $self = shift;
2423 my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) =
2424 rearrange([
2425 [qw(RANGE_TYPE)],
2426 [qw(REF REFSEQ)],
2427 qw(CLASS),
2428 qw(START),
2429 [qw(STOP END)],
2430 [qw(TYPE TYPES)],
2431 qw(PARENT),
2432 [qw(RARE SPARSE)],
2433 [qw(MERGE AUTOMERGE)],
2434 'ITERATOR'
2435 ],@_);
2436 $other ||= {};
2437 $automerge = $types && $self->automerge unless defined $automerge;
2438 $self->throw("range type must be one of {".
2439 join(',',keys %valid_range_types).
2440 "}\n")
2441 unless $valid_range_types{lc $range_type};
2442 $self->_features({
2443 rangetype => lc $range_type,
2444 refseq => $refseq,
2445 refclass => $class,
2446 start => $start,
2447 stop => $stop,
2448 types => $types },
2449 {
2450 sparse => $sparse,
2451 automerge => $automerge,
2452 iterator => $iterator,
2453 %$other,
2454 },
2455 $parent);
2456 }
2457
2458 =head2 get_dna
2459
2460 Title : get_dna
2461 Usage : $db->get_dna($id,$start,$stop,$class)
2462 Function: get DNA for indicated segment
2463 Returns : the dna string
2464 Args : sequence ID, start, stop and class
2465 Status : protected
2466
2467 If start E<gt> stop and the sequence is nucleotide, then this method
2468 should return the reverse complement. The sequence class may be
2469 ignored by those databases that do not recognize different object
2470 types.
2471
2472 =cut
2473
2474 sub get_dna {
2475 my $self = shift;
2476 my ($id,$start,$stop,$class,) = @_;
2477 $self->throw("get_dna() must be implemented by an adaptor");
2478 }
2479
2480 =head2 get_features
2481
2482 Title : get_features
2483 Usage : $db->get_features($search,$options,$callback)
2484 Function: get list of features for a region
2485 Returns : count of number of features retrieved
2486 Args : see below
2487 Status : protected
2488
2489 The first argument is a hash reference containing search criteria for
2490 retrieving features. It contains the following keys:
2491
2492 rangetype One of "overlaps", "contains" or "contained_in". Indicates
2493 the type of range query requested.
2494
2495 refseq ID of the landmark that establishes the absolute
2496 coordinate system.
2497
2498 refclass Class of this landmark. Can be ignored by implementations
2499 that don't recognize such distinctions.
2500
2501 start Start of the range, inclusive.
2502
2503 stop Stop of the range, inclusive.
2504
2505 types Array reference containing the list of annotation types
2506 to fetch from the database. Each annotation type is an
2507 array reference consisting of [source,method].
2508
2509 The second argument is a hash reference containing certain options
2510 that affect the way information is retrieved:
2511
2512 sort_by_group
2513 A flag. If true, means that the returned features should be
2514 sorted by the group that they're in.
2515
2516 sparse A flag. If true, means that the expected density of the
2517 features is such that it will be more efficient to search
2518 by type rather than by range. If it is taking a long
2519 time to fetch features, give this a try.
2520
2521 binsize A true value will create a set of artificial features whose
2522 start and stop positions indicate bins of the given size, and
2523 whose scores are the number of features in the bin. The
2524 class of the feature will be set to "bin", and its name to
2525 "method:source". This is a handy way of generating histograms
2526 of feature density.
2527
2528 The third argument, the $callback, is a code reference to which
2529 retrieved features are passed. It is described in more detail below.
2530
2531 This routine is responsible for getting arrays of GFF data out of the
2532 database and passing them to the callback subroutine. The callback
2533 does the work of constructing a Bio::DB::GFF::Feature object out of
2534 that data. The callback expects a list of 13 fields:
2535
2536 $refseq The reference sequence
2537 $start feature start
2538 $stop feature stop
2539 $source feature source
2540 $method feature method
2541 $score feature score
2542 $strand feature strand
2543 $phase feature phase
2544 $groupclass group class (may be undef)
2545 $groupname group ID (may be undef)
2546 $tstart target start for similarity hits (may be undef)
2547 $tstop target stop for similarity hits (may be undef)
2548 $feature_id A unique feature ID (may be undef)
2549
2550 These fields are in the same order as the raw GFF file, with the
2551 exception that the group column has been parsed into group class and
2552 group name fields.
2553
2554 The feature ID, if provided, is a unique identifier of the feature
2555 line. The module does not depend on this ID in any way, but it is
2556 available via Bio::DB::GFF-E<gt>id() if wanted. In the dbi::mysql and
2557 dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb
2558 adaptor it is not used.
2559
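For example, an adaptor's get_features() implementation would invoke the
callback once per retrieved row, roughly as sketched below ($sth is a
hypothetical DBI statement handle; how the rows are actually obtained is
adaptor-specific):

  my $count = 0;
  while (my @row = $sth->fetchrow_array) {
    my ($refseq,$start,$stop,$source,$method,$score,$strand,$phase,
        $gclass,$gname,$tstart,$tstop,$feature_id) = @row;
    $callback->($refseq,$start,$stop,$source,$method,$score,$strand,$phase,
                $gclass,$gname,$tstart,$tstop,$feature_id);
    $count++;
  }
  return $count;
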
2560 =cut
2561
2562 sub get_features{
2563 my $self = shift;
2564 my ($search,$options,$callback) = @_;
2565 $self->throw("get_features() must be implemented by an adaptor");
2566 }
2567
2568
2569 =head2 _feature_by_name
2570
2571 Title : _feature_by_name
2572 Usage : $db->_feature_by_name($class,$name,$location,$callback)
2573 Function: get a list of features by name and class
2574 Returns : count of number of features retrieved
2575 Args : name of feature, class of feature, and a callback
2576 Status : abstract
2577
2578 This method is used internally. The callback arguments are the same
2579 as those used by make_feature(). This method must be overridden by
2580 subclasses.
2581
2582 =cut
2583
2584 sub _feature_by_name {
2585 my $self = shift;
2586 my ($class,$name,$location,$callback) = @_;
2587 $self->throw("_feature_by_name() must be implemented by an adaptor");
2588 }
2589
2590 sub _feature_by_attribute {
2591 my $self = shift;
2592 my ($attributes,$callback) = @_;
2593 $self->throw("_feature_by_attribute() must be implemented by an adaptor");
2594 }
2595
2596 =head2 _feature_by_id
2597
2598 Title : _feature_by_id
2599 Usage : $db->_feature_by_id($ids,$type,$callback)
2600 Function: get features based on their feature or group IDs
2601 Returns : count of number of features retrieved
2602 Args : an arrayref of IDs to fetch, the ID type, and a callback
2603 Status : abstract
2604
2605 This method is used internally to fetch features either by their ID or
2606 their group ID. $ids is an arrayref containing a list of IDs, $type is
2607 one of "feature" or "group", and $callback is a callback. The
2608 callback arguments are the same as those used by make_feature(). This
2609 method must be overridden by subclasses.
2610
2611 =cut
2612
2613 sub _feature_by_id {
2614 my $self = shift;
2615 my ($ids,$type,$callback) = @_;
2616 $self->throw("_feature_by_id() must be implemented by an adaptor");
2617 }
2618
2619 =head2 overlapping_features
2620
2621 Title : overlapping_features
2622 Usage : $db->overlapping_features(@args)
2623 Function: get features that overlap the indicated range
2624 Returns : a list of Bio::DB::GFF::Feature objects
2625 Args : see below
2626 Status : public
2627
2628 This method is invoked by Bio::DB::GFF::Segment-E<gt>features() to find
2629 the list of features that overlap a given range. It is generally
2630 preferable to create the Segment first, and then fetch the features.
2631
2632 This method takes a set of named arguments:
2633
2634 -refseq ID of the reference sequence
2635 -class Class of the reference sequence
2636 -start Start of the desired range in refseq coordinates
2637 -stop Stop of the desired range in refseq coordinates
2638 -types List of feature types to return. Argument is an array
2639 reference containing strings of the format "method:source"
2640 -parent A parent Bio::DB::GFF::Segment object, used to create
2641 relative coordinates in the generated features.
2642 -rare Turn on an optimization suitable for a relatively rare feature type,
2643 where it will be faster to filter by feature type first
2644 and then by position, rather than vice versa.
2645 -merge Whether to apply aggregators to the generated features.
2646 -iterator Whether to return an iterator across the features.
2647
2648 If -iterator is true, then the method returns a single scalar value
2649 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
2650 on this object to fetch each of the features in turn. If iterator is
2651 false or absent, then all the features are returned as a list.
2652
2653 Currently aggregation is disabled when iterating over a series of
2654 features.
2655
2656 Types are indicated using the nomenclature "method:source". Either of
2657 these fields can be omitted, in which case a wildcard is used for the
2658 missing field. Type names without the colon (e.g. "exon") are
2659 interpreted as the method name and a source wild card. Regular
2660 expressions are allowed in either field, as in: "similarity:BLAST.*".
2661
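A usage sketch (the landmark name and coordinates are illustrative):

  my @features = $db->overlapping_features(-refseq => 'CHROMOSOME_I',
                                           -start  => 1,
                                           -stop   => 50_000,
                                           -types  => ['exon:curated','similarity:BLAST.*']);

  # or request an iterator instead of a list
  my $iterator = $db->overlapping_features(-refseq   => 'CHROMOSOME_I',
                                           -start    => 1,
                                           -stop     => 50_000,
                                           -iterator => 1);
  while (my $f = $iterator->next_seq) {
    print $f->type,"\n";
  }
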
2662 =cut
2663
2664 # call to return the features that overlap the named region
2665 # real work is done by get_features
2666 sub overlapping_features {
2667 my $self = shift;
2668 $self->features_in_range(-range_type=>'overlaps',@_);
2669 }
2670
2671 =head2 contained_features
2672
2673 Title : contained_features
2674 Usage : $db->contained_features(@args)
2675 Function: get features that are contained within the indicated range
2676 Returns : a list of Bio::DB::GFF::Feature objects
2677 Args : see overlapping_features()
2678 Status : public
2679
2680 This call is similar to overlapping_features(), except that it only
2681 retrieves features whose end points are completely contained within
2682 the specified range.
2683
2684 Generally you will want to fetch a Bio::DB::GFF::Segment object and
2685 call its contained_features() method rather than call this directly.
2686
2687 =cut
2688
2689 # The same, except that it only returns features that are completely contained within the
2690 # range (much faster usually)
2691 sub contained_features {
2692 my $self = shift;
2693 $self->features_in_range(-range_type=>'contains',@_);
2694 }
2695
2696 =head2 contained_in
2697
2698 Title : contained_in
2699 Usage : @features = $s->contained_in(@args)
2700 Function: get features that contain this segment
2701 Returns : a list of Bio::DB::GFF::Feature objects
2702 Args : see features()
2703 Status : Public
2704
2705 This is identical in behavior to features() except that it returns
2706 only those features that completely contain the segment.
2707
2708 =cut
2709
2710 sub contained_in {
2711 my $self = shift;
2712 $self->features_in_range(-range_type=>'contained_in',@_);
2713 }
2714
2715 =head2 get_abscoords
2716
2717 Title : get_abscoords
2718 Usage : $db->get_abscoords($name,$class,$refseq)
2719 Function: get the absolute coordinates of sequence with name & class
2720 Returns : ($absref,$absstart,$absstop,$absstrand)
2721 Args : name and class of the landmark
2722 Status : protected
2723
2724 Given the name and class of a genomic landmark, this function returns
2725 a four-element array consisting of:
2726
2727 $absref the ID of the reference sequence that contains this landmark
2728 $absstart the position at which the landmark starts
2729 $absstop the position at which the landmark stops
2730 $absstrand the strand of the landmark, relative to the reference sequence
2731
2732 If $refseq is provided, the function searches only within the
2733 specified reference sequence.
2734
2735 =cut
2736
2737 sub get_abscoords {
2738 my $self = shift;
2739 my ($name,$class,$refseq) = @_;
2740 $self->throw("get_abscoords() must be implemented by an adaptor");
2741 }
2742
2743 =head2 get_types
2744
2745 Title : get_types
2746 Usage : $db->get_types($absref,$class,$start,$stop,$count)
2747 Function: get list of all feature types on the indicated segment
2748 Returns : list or hash of Bio::DB::GFF::Typename objects
2749 Args : see below
2750 Status : protected
2751
2752 Arguments are:
2753
2754 $absref the ID of the reference sequence
2755 $class the class of the reference sequence
2756 $start the position to start counting
2757 $stop the position to end counting
2758 $count a boolean indicating whether to count the number
2759 of occurrences of each feature type
2760
2761 If $count is true, then a hash is returned. The keys of the hash are
2762 feature type names in the format "method:source" and the values are
2763 the number of times a feature of this type overlaps the indicated
2764 segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename
2765 objects. If $start or $stop are undef, then all features on the
2766 indicated segment are enumerated. If $absref is undef, then the call
2767 returns all feature types in the database.
2768
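For example, counting the feature types on a region might look like this
(the landmark name and range are illustrative):

  my %counts = $db->get_types('CHROMOSOME_I','Sequence',1,100_000,1);
  while (my ($type,$n) = each %counts) {
    print "$type\t$n\n";
  }
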
2769 =cut
2770
2771 sub get_types {
2772 my $self = shift;
2773 my ($refseq,$class,$start,$stop,$count,$types) = @_;
2774 $self->throw("get_types() must be implemented by an adaptor");
2775 }
2776
2777 =head2 make_feature
2778
2779 Title : make_feature
2780 Usage : $db->make_feature(@args)
2781 Function: Create a Bio::DB::GFF::Feature object from string data
2782 Returns : a Bio::DB::GFF::Feature object
2783 Args : see below
2784 Status : internal
2785
2786 This takes 15 arguments (really!):
2787
2788 $parent A Bio::DB::GFF::RelSegment object
2789 $group_hash A hashref containing unique list of GFF groups
2790 $refname The name of the reference sequence for this feature
2791 $refclass The class of the reference sequence for this feature
2792 $start Start of feature
2793 $stop Stop of feature
2794 $source Feature source field
2795 $method Feature method field
2796 $score Feature score field
2797 $strand Feature strand
2798 $phase Feature phase
2799 $group_class Class of feature group
2800 $group_name Name of feature group
2801 $tstart For homologies, start of hit on target
2802 $tstop Stop of hit on target
2803
2804 The $parent argument, if present, is used to establish relative
2805 coordinates in the resulting Bio::DB::GFF::Feature object. This allows one
2806 feature to generate a list of other features that are relative to its
2807 coordinate system (for example, finding the coordinates of the second
2808 exon relative to the coordinates of the first).
2809
2810 The $group_hash allows the group_class/group_name strings to be turned
2811 into rich database objects via the make_object() method (see above).
2812 Because these objects may be expensive to create, $group_hash is used
2813 to avoid creating duplicates. The index of this hash is the composite key
2814 {$group_class,$group_name,$tstart,$tstop}. Values are whatever object
2815 is returned by the make_object() method.
2816
2817 The remainder of the fields are taken from the GFF line, with the
2818 exception that "Target" features, which contain information about the
2819 target of a homology search, are parsed into their components.
2820
2821 =cut
2822
2823 # This call is responsible for turning a line of GFF into a
2824 # feature object.
2825 # The $parent argument is a Bio::DB::GFF::Segment object and is used
2826 # to establish the coordinate system for the new feature.
2827 # The $group_hash argument is an hash ref that holds previously-
2828 # generated group objects.
2829 # Other arguments are taken right out of the GFF table.
2830 sub make_feature {
2831 my $self = shift;
2832 my ($parent,$group_hash, # these arguments provided by generic mechanisms
2833 $srcseq, # the rest is provided by adaptor
2834 $start,$stop,
2835 $source,$method,
2836 $score,$strand,$phase,
2837 $group_class,$group_name,
2838 $tstart,$tstop,
2839 $db_id,$group_id) = @_;
2840
2841 return unless $srcseq; # return undef if called with no arguments. This behavior is used for
2842 # on-the-fly aggregation.
2843
2844 my $group; # undefined
2845 if (defined $group_class && defined $group_name) {
2846 $tstart ||= '';
2847 $tstop ||= '';
2848 if ($group_hash) {
2849 $group = $group_hash->{$group_class,$group_name,$tstart,$tstop}
2850 ||= $self->make_object($group_class,$group_name,$tstart,$tstop);
2851 } else {
2852 $group = $self->make_object($group_class,$group_name,$tstart,$tstop);
2853 }
2854 }
2855
2856 # fix for some broken GFF files
2857 # unfortunately - has undesired side effects
2858 # if (defined $tstart && defined $tstop && !defined $strand) {
2859 # $strand = $tstart <= $tstop ? '+' : '-';
2860 # }
2861
2862 if (ref $parent) { # note that the src sequence is ignored
2863 return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop,
2864 $method,$source,
2865 $score,$strand,$phase,
2866 $group,$db_id,$group_id,
2867 $tstart,$tstop);
2868 } else {
2869 return Bio::DB::GFF::Feature->new($self,$srcseq,
2870 $start,$stop,
2871 $method,$source,
2872 $score,$strand,$phase,
2873 $group,$db_id,$group_id,
2874 $tstart,$tstop);
2875 }
2876 }
2877
2878 sub make_aggregated_feature {
2879 my $self = shift;
2880 my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3);
2881 my $feature = $self->make_feature($parent,undef,@_);
2882 return [$feature] if $feature && !$feature->group;
2883
2884 # if we have accumulated features and either:
2885 # (1) make_feature() returned undef, indicating the very end, or
2886 # (2) the current group is different from the previous one
2887
2888 local $^W = 0; # irritating uninitialized value warning in next statement
2889 if (@$accumulated_features &&
2890 (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) {
2891 foreach my $a (@$aggregators) { # last aggregator gets first shot
2892 $a->aggregate($accumulated_features,$self) or next;
2893 }
2894 my @result = @$accumulated_features;
2895 @$accumulated_features = $feature ? ($feature) : ();
2896 return unless @result;
2897 return \@result ;
2898 }
2899 push @$accumulated_features,$feature;
2900 return;
2901 }
2902
2903 =head2 parse_types
2904
2905 Title : parse_types
2906 Usage : $db->parse_types(@args)
2907 Function: parses list of types
2908 Returns : an array ref containing ['method','source'] pairs
2909 Args : a list of types in 'method:source' form
2910 Status : internal
2911
2912 This method takes an array of type names in the format "method:source"
2913 and returns an array reference of ['method','source'] pairs. It will
2914 also accept a single argument consisting of an array reference with
2915 the list of type names.
2916
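For example (a sketch of the expected mapping):

  my $pairs = $db->parse_types('exon:curated','intron');
  # $pairs now holds [ ['exon','curated'], ['intron'] ]
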
2917 =cut
2918
2919 # turn feature types in the format "method:source" into a list of [method,source] refs
2920 sub parse_types {
2921 my $self = shift;
2922 return [] if !@_ or !defined($_[0]);
2923 return $_[0] if ref $_[0] eq 'ARRAY' && ref $_[0][0];
2924 my @types = ref($_[0]) ? @{$_[0]} : @_;
2925 my @type_list = map { [split(':',$_,2)] } @types;
2926 return \@type_list;
2927 }
2928
2929 =head2 make_match_sub
2930
2931 Title : make_match_sub
2932 Usage : $db->make_match_sub($types)
2933 Function: creates a subroutine used for filtering features
2934 Returns : a code reference
2935 Args : a list of parsed type names
2936 Status : protected
2937
2938 This method is used internally to generate a code subroutine that will
2939 accept or reject a feature based on its method and source. It takes
2940 an array of parsed type names in the format returned by parse_types(),
2941 and generates an anonymous subroutine. The subroutine takes a single
2942 Bio::DB::GFF::Feature object and returns true if the feature matches
2943 one of the desired feature types, and false otherwise.
2944
2945 =cut
2946
2947 # a subroutine that matches features indicated by list of types
2948 sub make_match_sub {
2949 my $self = shift;
2950 my $types = shift;
2951
2952 return sub { 1 } unless ref $types && @$types;
2953
2954 my @expr;
2955 for my $type (@$types) {
2956 my ($method,$source) = @$type;
2957 $method ||= '.*';
2958 $source = $source ? ":$source" : "(?::.+)?";
2959 push @expr,"${method}${source}";
2960 }
2961 my $expr = join '|',@expr;
2962 return $self->{match_subs}{$expr} if $self->{match_subs}{$expr};
2963
2964 my $sub =<<END;
2965 sub {
2966 my \$feature = shift or return;
2967 return \$feature->type =~ /^($expr)\$/i;
2968 }
2969 END
2970 warn "match sub: $sub\n" if $self->debug;
2971 my $compiled_sub = eval $sub;
2972 $self->throw($@) if $@;
2973 return $self->{match_subs}{$expr} = $compiled_sub;
2974 }
2975
2976 =head2 make_object
2977
2978 Title : make_object
2979 Usage : $db->make_object($class,$name,$start,$stop)
2980 Function: creates a feature object
2981 Returns : a feature object
2982 Args : see below
2983 Status : protected
2984
2985 This method is called to make an object from the GFF "group" field.
2986 By default, all Target groups are turned into Bio::DB::GFF::Homol
2987 objects, and everything else becomes a Bio::DB::GFF::Featname.
2988 However, adaptors are free to override this method to generate more
2989 interesting objects, such as true BioPerl objects, or Acedb objects.
2990
2991 Arguments are:
2992
2993 $name database ID for object
2994 $class class of object
2995 $start for similarities, start of match inside object
2996 $stop for similarities, stop of match inside object
2997
2998 =cut
2999
3000 # abstract call to turn a feature into an object, given its class and name
3001 sub make_object {
3002 my $self = shift;
3003 my ($class,$name,$start,$stop) = @_;
3004 return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop)
3005 if defined $start and length $start;
3006 return Bio::DB::GFF::Featname->new($class,$name);
3007 }
3008
3009
3010 =head2 do_attributes
3011
3012 Title : do_attributes
3013 Usage : $db->do_attributes($id [,$tag]);
3014 Function: internal method to retrieve attributes given an id and tag
3015 Returns : a list of attribute values, or of (tag=E<gt>value) pairs
3016 Args : a feature ID and an attribute tag (optional)
3017 Status : protected
3018
3019 This method is overridden by subclasses in order to return a list of
3020 attributes. If called with a tag, returns the value of attributes of
3021 that tag type. If called without a tag, returns a flattened array of
3022 (tag=E<gt>value) pairs. A particular tag can be present multiple times.
3023
3024 =cut
3025
3026 sub do_attributes {
3027 my $self = shift;
3028 my ($id,$tag) = @_;
3029 return ();
3030 }
3031
3032
3033
3034 =head1 Internal Methods
3035
3036 The following methods are internal to Bio::DB::GFF and are not
3037 guaranteed to remain the same.
3038
3039 =head2 _features
3040
3041 Title : _features
3042 Usage : $db->_features($search,$options,$parent)
3043 Function: internal method
3044 Returns : a list of Bio::DB::GFF::Feature objects
3045 Args : see below
3046 Status : internal
3047
3048 This is an internal method that is called by overlapping_features(),
3049 contained_features() and features() to create features based on a
3050 parent segment's coordinate system. It takes three arguments: a
3051 search hashref, an options hashref, and a parent segment.
3052
3053 The search hashref contains the following keys:
3054
3055 rangetype One of "overlaps", "contains" or "contained_in". Indicates
3056 the type of range query requested.
3057 refseq reference sequence ID
3058 refclass reference sequence class
3059 start start of range
3060 stop stop of range
3061 types arrayref containing list of types in "method:source" form
3062
3063 The options hashref contains zero or more of the following keys:
3064
3065 sparse turn on optimizations for a rare feature
3066 automerge if true, invoke aggregators to merge features
3067 iterator if true, return an iterator
3068
3069 The $parent argument is a Bio::DB::GFF::RelSegment object or a
3070 descendant thereof.
3071
3072 =cut
3073
3074 #'
3075
3076 sub _features {
3077 my $self = shift;
3078 my ($search,$options,$parent) = @_;
3079 (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)})
3080 if defined($search->{start}) && $search->{start} > $search->{stop};
3081
3082 my $types = $self->parse_types($search->{types}); # parse out list of types
3083 my @aggregated_types = @$types; # keep a copy
3084
3085 # allow the aggregators to operate on the original
3086 my @aggregators;
3087 if ($options->{automerge}) {
3088 for my $a ($self->aggregators) {
3089 $a = $a->clone if $options->{iterator};
3090 unshift @aggregators,$a
3091 if $a->disaggregate(\@aggregated_types,$self);
3092 }
3093 }
3094
3095 if ($options->{iterator}) {
3096 my @accumulated_features;
3097 my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) }
3098 : sub { [$self->make_feature($parent,undef,@_)] };
3099 return $self->get_features_iterator({ %$search,
3100 types => \@aggregated_types },
3101 { %$options,
3102 sort_by_group => $options->{automerge} },
3103 $callback
3104 );
3105 }
3106
3107 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
3108 my $features = [];
3109
3110 my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) };
3111 $self->get_features({ %$search,
3112 types => \@aggregated_types },
3113 $options,
3114 $callback);
3115
3116 if ($options->{automerge}) {
3117 warn "aggregating...\n" if $self->debug;
3118 foreach my $a (@aggregators) { # last aggregator gets first shot
3119 warn "Aggregator $a:\n" if $self->debug;
3120 $a->aggregate($features,$self);
3121 }
3122 }
3123
3124 @$features;
3125 }
3126
3127 =head2 get_features_iterator
3128
3129 Title : get_features_iterator
3130 Usage : $db->get_features_iterator($search,$options,$callback)
3131 Function: get an iterator on a features query
3132 Returns : a Bio::SeqIO object
3133 Args : as per get_features()
3134 Status : Public
3135
3136 This method takes the same arguments as get_features(), but returns an
3137 iterator that can be used to fetch features sequentially, as per
3138 Bio::SeqIO.
3139
3140 Internally, this method is simply a front end to range_query().
3141 The latter method constructs and executes the query, returning a
3142 statement handle. This routine passes the statement handle to the
3143 constructor for the iterator, along with the callback.
3144
3145 =cut
3146
3147 sub get_features_iterator {
3148 my $self = shift;
3149 my ($search,$options,$callback) = @_;
3150 $self->throw('feature iteration is not implemented in this adaptor');
3151 }
3152
3153 =head2 split_group
3154
3155 Title : split_group
3156 Usage : $db->split_group($group_field,$gff3_flag)
3157 Function: parse GFF group field
3158 Returns : ($gclass,$gname,$tstart,$tstop,$attributes)
3159 Args : the gff group column and a flag indicating gff3 compatibility
3160 Status : internal
3161
3162 This is a method that is called by load_gff_line to parse out the
3163 contents of one or more group fields. It returns the class of the
3164 group, its name, the start and stop of the target, if any, and an
3165 array reference containing any attributes that were stuck into the
3166 group field, in [attribute_name,attribute_value] format.
3167
3168 =cut
3169
3170 sub split_group {
3171 my $self = shift;
3172 my ($group,$gff3) = @_;
3173 if ($gff3) {
3174 my @groups = split /[;&]/,$group; # so easy!
3175 return $self->_split_gff3_group(@groups);
3176 } else {
3177 # handle group parsing
3178 # protect embedded semicolons in the group; there must be a faster/more elegant way
3179 # to do this.
3180 $group =~ s/\\;/$;/g;
3181 while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 }
3182 my @groups = split(/\s*;\s*/,$group);
3183 foreach (@groups) { s/$;/;/g }
3184 return $self->_split_gff2_group(@groups);
3185 }
3186 }
3187
3188 =head2 _split_gff2_group
3189
3190 This is an internal method called by split_group().
3191
3192 =cut
3193
3194 sub _split_gff2_group {
3195 my $self = shift;
3196 my @groups = @_;
3197
3198 my ($gclass,$gname,$tstart,$tstop,@attributes);
3199
3200 for (@groups) {
3201
3202 my ($tag,$value) = /^(\S+)(?:\s+(.+))?/;
3203 $value ||= '';
3204 if ($value =~ /^\"(.+)\"$/) { #remove quotes
3205 $value = $1;
3206 }
3207 $value =~ s/\\t/\t/g;
3208 $value =~ s/\\r/\r/g;
3209
3210 # Any additional groups become part of the attributes hash
3211 # For historical reasons, the tag "Note" is treated as an
3212 # attribute, even if it is the only group.
3213 $tag ||= '';
3214 if ($tag eq 'Note' or ($gclass && $gname)) {
3215 push @attributes,[$tag => $value];
3216 }
3217
3218 # if the tag eq 'Target' then the class name is embedded in the ID
3219 # (the GFF format is obviously screwed up here)
3220 elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) {
3221 ($gclass,$gname) = ($1,$2);
3222 ($tstart,$tstop) = / (\d+) (\d+)/;
3223 }
3224
3225 elsif (!$value) {
3226 push @attributes,[Note => $tag]; # e.g. "Confirmed_by_EST"
3227 }
3228
3229 # otherwise, the tag and value correspond to the
3230 # group class and name
3231 else {
3232 ($gclass,$gname) = ($tag,$value);
3233 }
3234 }
3235
3236 return ($gclass,$gname,$tstart,$tstop,\@attributes);
3237 }
3238
3239 =head2 _split_gff3_group
3240
3241 This is called internally from split_group().
3242
3243 =cut
3244
3245 sub _split_gff3_group {
3246 my $self = shift;
3247 my @groups = @_;
3248 my ($gclass,$gname,$tstart,$tstop,@attributes);
3249
3250 for my $group (@groups) {
3251 my ($tag,$value) = split /=/,$group;
3252 $tag = unescape($tag);
3253 my @values = map {unescape($_)} split /,/,$value;
3254 if ($tag eq 'Parent') {
3255 $gclass = 'Sequence';
3256 $gname = shift @values;
3257 }
3258 elsif ($tag eq 'ID') {
3259 $gclass = 'Sequence';
3260 $gname = shift @values;
3261 }
3262 elsif ($tag eq 'Target') {
3263 $gclass = 'Sequence';
3264 ($gname,$tstart,$tstop) = split /\s+/,shift @values;
3265 }
3266 push @attributes,[$tag=>$_] foreach @values;
3267 }
3268 return ($gclass,$gname,$tstart,$tstop,\@attributes);
3269 }
3270
3271 =head2 _delete_features(), _delete_groups(), _delete()
3272
3273 Title : _delete_features(), _delete_groups(), _delete()
3274 Usage : $count = $db->_delete_features(@feature_ids)
3275 $count = $db->_delete_groups(@group_ids)
3276 $count = $db->_delete(\%delete_spec)
3277 Function: low-level feature/group deleter
3278 Returns : count of features or groups removed
3279 Args : a list of feature or group IDs, or a deletion spec hashref
3280 Status : for implementation by subclasses
3281
3282 These methods need to be implemented in adaptors. For
3283 _delete_features and _delete_groups, the arguments are a list of
3284 feature or group IDs to remove. For _delete(), the argument is a
3285 hashref with the three keys 'segments', 'types' and 'force'. The
3286 first contains an arrayref of Bio::DB::GFF::RelSegment objects to
3287 delete (all FEATURES within the segment are deleted). The second
3288 contains an arrayref of [method,source] feature types to delete. The
3289 two are ANDed together. If 'force' has a true value, this forces the
3290 operation to continue even if it would delete all features.
3291
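A sketch of the deletion specification an adaptor receives; the segment and
feature type shown are illustrative:

  my $count = $db->_delete({
      segments => [ $db->segment('CHROMOSOME_I', 1 => 100_000) ],
      types    => [ ['exon','curated'] ],   # [method,source] pairs
      force    => 0,
  });
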
3292 =cut
3293
3294 sub _delete_features {
3295 my $self = shift;
3296 my @feature_ids = @_;
3297 $self->throw('_delete_features is not implemented in this adaptor');
3298 }
3299
3300 sub _delete_groups {
3301 my $self = shift;
3302 my @group_ids = @_;
3303 $self->throw('_delete_groups is not implemented in this adaptor');
3304 }
3305
3306 sub _delete {
3307 my $self = shift;
3308 my $delete_options = shift;
3309 $self->throw('_delete is not implemented in this adaptor');
3310 }
3311
3312 sub unescape {
3313 my $v = shift;
3314 $v =~ tr/+/ /;
3315 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
3316 return $v;
3317 }
3318
3319
3320 package Bio::DB::GFF::ID_Iterator;
3321 use strict;
3322
3323 use Bio::Root::Root;
3324 use vars '@ISA';
3325 @ISA = 'Bio::Root::Root';
3326
3327 sub new {
3328 my $class = shift;
3329 my ($db,$ids,$type) = @_;
3330 return bless {ids=>$ids,db=>$db,type=>$type},$class;
3331 }
3332
3333 sub next_seq {
3334 my $self = shift;
3335 my $next = shift @{$self->{ids}};
3336 return unless $next;
3337 my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next;
3338 my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name)
3339 : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name)
3340 : $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name)
3341 : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier");
3342 $self->throw("id does not exist") unless $segment;
3343 return $segment;
3344 }
3345
3346 1;
3347
3348 __END__
3349
3350 =head1 BUGS
3351
3352 Features can only belong to a single group at a time. This must be
3353 addressed soon.
3354
3355 Start coordinate can be greater than stop coordinate for relative
3356 addressing. This breaks strict BioPerl compatibility and must be
3357 fixed.
3358
3359 =head1 SEE ALSO
3360
3361 L<bioperl>,
3362 L<Bio::DB::GFF::RelSegment>,
3363 L<Bio::DB::GFF::Aggregator>,
3364 L<Bio::DB::GFF::Feature>,
3365 L<Bio::DB::GFF::Adaptor::dbi::mysqlopt>,
3366 L<Bio::DB::GFF::Adaptor::dbi::oracle>,
3367 L<Bio::DB::GFF::Adaptor::memory>
3368
3369 =head1 AUTHOR
3370
3371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
3372
3373 Copyright (c) 2001 Cold Spring Harbor Laboratory.
3374
3375 This library is free software; you can redistribute it and/or modify
3376 it under the same terms as Perl itself.
3377
3378 =cut
3379