comparison variant_effect_predictor/Bio/DB/GFF.pm @ 0:1f6dce3d34e0

author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400

1 # $Id: GFF.pm,v 1.71.2.2 2003/09/12 13:29:32 lstein Exp $
2
3 =head1 NAME
4
5 Bio::DB::GFF -- Storage and retrieval of sequence annotation data
6
7 =head1 SYNOPSIS
8
9 use Bio::DB::GFF;
10
11 # Open the sequence database
12 my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt',
13 -dsn => 'dbi:mysql:elegans',
14 -fasta => '/usr/local/fasta_files'
15 );
16
17 # fetch a 1 megabase segment of sequence starting at landmark "ZK909"
18 my $segment = $db->segment('ZK909', 1 => 1000000);
19
20 # pull out all transcript features
21 my @transcripts = $segment->features('transcript');
22
23 # for each transcript, total the length of the introns
24 my %totals;
25 for my $t (@transcripts) {
26 my @introns = $t->Intron;
27 $totals{$t->name} += $_->length foreach @introns;
28 }
29
30 # Sort the exons of the first transcript by position
31 my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon;
32
33 # Get a region 1000 bp upstream of first exon
34 my $upstream = $exons[0]->segment(-1000,0);
35
36 # get its DNA
37 my $dna = $upstream->dna;
38
39 # and get all curated polymorphisms inside it
40 my @polymorphisms = $upstream->contained_features('polymorphism:curated');
41
42 # get all feature types in the database
43 my @types = $db->types;
44
45 # count all feature types in the segment
46 my %type_counts = $segment->types(-enumerate=>1);
47
48 # get an iterator on all curated features of type 'exon' or 'intron'
49 my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']);
50
51 while (my $s = $iterator->next_seq) {
52 print $s,"\n";
53 }
54
55 # find all transcripts annotated as having function 'kinase'
56 my $iterator = $db->get_seq_stream(-type=>'transcript',
57 -attributes=>{Function=>'kinase'});
58 while (my $s = $iterator->next_seq) {
59 print $s,"\n";
60 }
61
62 =head1 DESCRIPTION
63
64 Bio::DB::GFF provides fast indexed access to a sequence annotation
65 database. It supports multiple database types (ACeDB, relational),
66 and multiple schemas through a system of adaptors and aggregators.
67
68 The following operations are supported by this module:
69
70 - retrieving a segment of sequence based on the ID of a landmark
71 - retrieving the DNA from that segment
72 - finding all annotations that overlap with the segment
73 - finding all annotations that are completely contained within the
74 segment
75 - retrieving all annotations of a particular type, either within a
76 segment, or globally
77 - conversion from absolute to relative coordinates and back again,
78 using any arbitrary landmark for the relative coordinates
79 - using a sequence segment to create new segments based on relative
80 offsets
81
82 The data model used by Bio::DB::GFF is compatible with the GFF flat
83 file format (http://www.sanger.ac.uk/software/GFF). The module can
84 load a set of GFF files into the database, and serves objects that
85 have methods corresponding to GFF fields.
86
87 The objects returned by Bio::DB::GFF are compatible with the
88 SeqFeatureI interface, allowing their use by the Bio::Graphics and
89 Bio::DAS modules.
90
91 =head2 Auxiliary Scripts
92
93 The bioperl distribution includes several scripts that make it easier
94 to work with Bio::DB::GFF databases. They are located in the scripts
95 directory under a subdirectory named Bio::DB::GFF:
96
97 =over 4
98
99 =item bp_load_gff.pl
100
101 This script will load a Bio::DB::GFF database from a flat GFF file of
102 sequence annotations. Only the relational database version of
103 Bio::DB::GFF is supported. It can be used to create the database from
104 scratch, as well as to incrementally load new data.
105
106 This script takes a --fasta argument to load raw DNA into the database
107 as well. However, GFF databases do not require access to the raw DNA
108 for most of their functionality.
109
110 bp_load_gff.pl also has a --upgrade option, which will perform a
111 non-destructive upgrade of older schemas to newer ones.
112
113 =item bp_bulk_load_gff.pl
114
115 This script will populate a Bio::DB::GFF database from a flat GFF file
116 of sequence annotations. Only the MySQL database version of
117 Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in
118 order to accelerate loading considerably; however, it can only be used
119 for the initial load, and not for updates.
120
121 This script takes a --fasta argument to load raw DNA into the database
122 as well. However, GFF databases do not require access to the raw DNA
123 for most of their functionality.
124
125 =item bp_fast_load_gff.pl
126
127 This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe
128 tricks to allow for incremental updates. It only supports the MySQL
129 database version of Bio::DB::GFF and is guaranteed not to work on
130 non-Unix platforms.
131
132 Arguments are the same as bp_load_gff.pl
133
134 =item gadfly_to_gff.pl
135
136 This script will convert the GFF-like format used by the Berkeley
137 Drosophila Sequencing project into a format suitable for use with this
138 module.
139
140 =item sgd_to_gff.pl
141
142 This script will convert the tab-delimited feature files used by the
143 Saccharomyces Genome Database into a format suitable for use with this
144 module.
145
146 =back
147
148 =head2 GFF Fundamentals
149
150 The GFF format is a flat tab-delimited file, each line of which
151 corresponds to an annotation, or feature. Each line has nine columns
152 and looks like this:
153
154 Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7"
155
156 The 9 columns are as follows:
157
158 =over 4
159
160 =item 1. reference sequence
161
162 This is the ID of the sequence that is used to establish the
163 coordinate system of the annotation. In the example above, the
164 reference sequence is "Chr1".
165
166 =item 2. source
167
168 The source of the annotation. This field describes how the annotation
169 was derived. In the example above, the source is "curated" to
170 indicate that the feature is the result of human curation. The names
171 and versions of software programs are often used for the source field,
172 as in "tRNAScan-SE/1.2".
173
174 =item 3. method
175
176 The annotation method. This field describes the type of the
177 annotation, such as "CDS". Together the method and source describe
178 the annotation type.
179
180 =item 4. start position
181
182 The start of the annotation relative to the reference sequence.
183
184 =item 5. stop position
185
186 The stop of the annotation relative to the reference sequence. Start
187 is always less than or equal to stop.
188
189 =item 6. score
190
191 For annotations that are associated with a numeric score (for example,
192 a sequence similarity), this field describes the score. The score
193 units are completely unspecified, but for sequence similarities, it is
194 typically percent identity. Annotations that don't have a score can
195 use "."
196
197 =item 7. strand
198
199 For those annotations which are strand-specific, this field is the
200 strand on which the annotation resides. It is "+" for the forward
201 strand, "-" for the reverse strand, or "." for annotations that are
202 not stranded.
203
204 =item 8. phase
205
206 For annotations that are linked to proteins, this field describes the
207 phase of the annotation on the codons. It is a number from 0 to 2, or
208 "." for features that have no phase\.
209
210 =item 9. group
211
212 GFF provides a simple way of generating annotation hierarchies ("is
213 composed of" relationships) by providing a group field. The group
214 field contains the class and ID of an annotation which is the logical
215 parent of the current one. In the example given above, the group is
216 the Transcript named "R119.7".
217
218 The group field is also used to store information about the target of
219 sequence similarity hits, and miscellaneous notes. See the next
220 section for a description of how to describe similarity targets.
221
222 The format of the group fields is "Class ID" with a single space (not
223 a tab) separating the class from the ID. It is VERY IMPORTANT to
224 follow this format, or grouping will not work properly.
225
226 =back
227
228 The sequences used to establish the coordinate system for annotations
229 can correspond to sequenced clones, clone fragments, contigs or
230 super-contigs. Thus, this module can be used throughout the lifecycle
231 of a sequencing project.
232
233 In addition to a group ID, the GFF format allows annotations to have a
234 group class. For example, in the ACeDB representation, RNA
235 interference experiments have a class of "RNAi" and an ID that is
236 unique among the RNAi experiments. Since not all databases support
237 this notion, the class is optional in all calls to this module, and
238 defaults to "Sequence" when not provided.
239
240 Double-quotes are sometimes used in GFF files around components of the
241 group field. Strictly, this is only necessary if the group name or
242 class contains whitespace.
243
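To make the column layout concrete, here is a minimal sketch (ordinary Perl, not part of this module's API; Bio::DB::GFF does this for you when loading) that splits one GFF2 line into its nine fields and then separates the group class from the group name:

  # minimal parsing sketch for a single GFF2 line
  my $line = qq(Chr1\tcurated\tCDS\t365647\t365963\t.\t+\t1\tTranscript "R119.7");
  my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group)
      = split /\t/, $line, 9;
  my ($gclass,$gname) = $group =~ /^(\S+)\s+"?([^";]+)"?/;
  # $gclass is now "Transcript", $gname is "R119.7"
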
244 =head2 Making GFF files work with this module
245
246 Some annotations do not need to be individually named. For example,
247 it is probably not useful to assign a unique name to each ALU repeat
248 in a vertebrate genome. Others, such as predicted genes, correspond
249 to named biological objects; you probably want to be able to fetch the
250 positions of these objects by referring to them by name.
251
252 To accommodate named annotations, the GFF format places the object
253 class and name in the group field. The name identifies the object,
254 and the class prevents similarly-named objects, for example clones and
255 sequences, from colliding.
256
257 A named object is shown in the following excerpt from a GFF file:
258
259 Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2
260
261 This object is a predicted transcript named Y95B8A.2. In this case,
262 the group field is used to identify the class and name of the object,
263 even though no other annotation belongs to that group.
264
265 It now becomes possible to retrieve the region of the genome covered
266 by transcript Y95B8A.2 using the segment() method:
267
268 $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2');
269
270 It is not necessary for the annotation's method to correspond to the
271 object class, although this is commonly the case.
272
273 As explained above, each annotation in a GFF file refers to a
274 reference sequence. It is important that each reference sequence also
275 be identified by a line in the GFF file. This allows the Bio::DB::GFF
276 module to determine the length and class of the reference sequence,
277 and makes it possible to do relative arithmetic.
278
279 For example, if "Chr1" is used as a reference sequence, then it should
280 have an entry in the GFF file similar to this one:
281
282 Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1
283
284 This indicates that the reference sequence named "Chr1" has length
285 14972282 bp, method "chromosome" and source "assembly". In addition,
286 as indicated by the group field, Chr1 has class "Sequence" and name
287 "Chr1".
288
289 The object class "Sequence" is used by default when the class is not
290 specified in the segment() call. This allows you to use a shortcut
291 form of the segment() method:
292
293 $segment = $db->segment('Chr1'); # whole chromosome
294 $segment = $db->segment('Chr1',1=>1000); # first 1000 bp
295
296 For your convenience, if, during loading a GFF file, Bio::DB::GFF
297 encounters a line like the following:
298
299 ##sequence-region Chr1 1 14972282
300
301 It will automatically generate the following entry:
302
303 Chr1 reference Component 1 14972282 . + . Sequence Chr1
304
305 This is sufficient to use Chr1 as a reference point.
306 The ##sequence-region line is frequently found in the GFF files
307 distributed by annotation groups.
308
309 =head2 Sequence alignments
310
311 There are two cases in which an annotation indicates the relationship
312 between two sequences. The first case is a similarity hit, where the
313 annotation indicates an alignment. The second case is a map assembly,
314 in which the annotation indicates that a portion of a larger sequence
315 is built up from one or more smaller ones.
316
317 Both cases are indicated by using the B<Target> tag in the group
318 field. For example, a typical similarity hit will look like this:
319
320 Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544
321
322 The group field contains the Target tag, followed by an identifier for
323 the biological object referred to. The GFF format uses the notation
324 I<Class>:I<Name> for the biological object, and even though this is
325 stylistically inconsistent, that's the way it's done. The object
326 identifier is followed by two integers indicating the start and stop
327 of the alignment on the target sequence.
328
329 Unlike the main start and stop columns, it is possible for the target
330 start to be greater than the target end. The previous example
331 indicates that the section of Chr1 from 76,953 to 77,108 aligns to
332 the protein SW:ABL_DROME starting at position 493 and extending to
333 position 544.
334
335 A similar notation is used for sequence assembly information as shown
336 in this example:
337
338 Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826
339 LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742
340
341 This indicates that the region between bases 10922906 and 11177731 of
342 Chr1 is composed of LINK_H06O01 from bp 1 to bp 254826. The region
343 of LINK_H06O01 between 32386 and 64122 is, in turn, composed of
344 bases 6 to 31742 of cosmid F49B2.
345
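As an illustration, the following hedged sketch walks over the similarity features of a segment and reports their targets. It assumes that the returned feature objects expose a target() accessor returning an object with start() and stop() methods, as Bio::DB::GFF::Feature and its Bio::DB::GFF::Homol targets do:

  # hedged sketch: list the alignment targets within a segment
  for my $hit ($segment->features('similarity:BLASTX')) {
      my $target = $hit->target or next;        # e.g. SW:ABL_DROME
      print $hit->name, " -> ", $target, " ",
            $target->start, "..", $target->stop, "\n";
  }
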
346 =head2 Attributes
347
348 While not intended to serve as a general-purpose sequence database
349 (see bioperl-db for that), GFF allows you to tag features with
350 arbitrary attributes. Attributes appear in the Group field following
351 the initial class/name pair. For example:
352
353 Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3
354
355 This line tags the feature named Transcript Y95B8A.2 as being "Gene"
356 named sma-3 and having the Alias "sma3". Features having these
357 attributes can be looked up using the fetch_feature_by_attribute() method.
358
359 Two attributes have special meaning: "Note" is for backward
360 compatibility and is used for unstructured text remarks. "Alias" is
361 considered as a synonym for the feature name and will be consulted
362 when looking up a feature by its name.
363
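A short usage sketch (the attribute and value names are taken from the example line above):

  # fetch all features tagged with Gene "sma-3"
  my @sma3 = $db->fetch_feature_by_attribute(Gene => 'sma-3');

  # or combine an attribute filter with a type filter
  my @hits = $db->features(-types      => ['transcript'],
                           -attributes => {Gene => 'sma-3'});
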
364 =head2 Adaptors and Aggregators
365
366 This module uses a system of adaptors and aggregators in order to make
367 it adaptable to use with a variety of databases.
368
369 =over 4
370
371 =item Adaptors
372
373 The core of the module handles the user API, annotation coordinate
374 arithmetic, and other common issues. The details of fetching
375 information from databases is handled by an adaptor, which is
376 specified during Bio::DB::GFF construction. The adaptor encapsulates
377 database-specific information such as the schema, user authentication
378 and access methods.
379
380 Currently there are two adaptors: 'dbi::mysql' and 'dbi::mysqlopt'.
381 The former is an interface to a simple Mysql schema. The latter is an
382 optimized version of dbi::mysql which uses a binning scheme to
383 accelerate range queries and the Bio::DB::Fasta module for rapid
384 retrieval of sequences. Note the double-colon between the words.
385
386 =item Aggregators
387
388 The GFF format uses a "group" field to indicate aggregation properties
389 of individual features. For example, a set of exons and introns may
390 share a common transcript group, and multiple transcripts may share
391 the same gene group.
392
393 Aggregators are small modules that use the group information to
394 rebuild the hierarchy. When a Bio::DB::GFF object is created, you
395 indicate that it should use a set of one or more aggregators. Each
396 aggregator provides a new composite annotation type. Before the
397 database query is generated, each aggregator is called to
398 "disaggregate" its annotation type into a list of component types
399 contained in the database. After the query is generated, each
400 aggregator is called again in order to build composite annotations
401 from the returned components.
402
403 For example, during disaggregation, the standard
404 "processed_transcript" aggregator generates a list of component
405 feature types including "UTR", "CDS", and "polyA_site". Later, it
406 aggregates these features into a set of annotations of type
407 "processed_transcript".
408
409 During aggregation, the list of aggregators is called in reverse
410 order. This allows aggregators to collaborate to create multi-level
411 structures: the transcript aggregator assembles transcripts from
412 introns and exons; the gene aggregator then assembles genes from sets
413 of transcripts.
414
415 Three default aggregators are provided:
416
417 transcript assembles transcripts from features of type
418 exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA
419 clone assembles clones from Clone_left_end, Clone_right_end
420 and Sequence features.
421 alignment assembles gapped alignments from features of type
422 "similarity".
423
424 In addition, this module provides the optional "wormbase_gene"
425 aggregator, which accommodates the WormBase representation of genes.
426 This aggregator aggregates features of method "exon", "CDS", "5'UTR",
427 "3'UTR", "polyA" and "TSS" into a single object. It also expects to
428 find a single feature of type "Sequence" that spans the entire gene.
429
430 The existing aggregators are easily customized.
431
432 Note that aggregation will not occur unless you specifically request
433 the aggregation type. For example, this call:
434
435 @features = $segment->features('alignment');
436
437 will generate an array of aggregated alignment features. However,
438 this call:
439
440 @features = $segment->features();
441
442 will return a list of unaggregated similarity segments.
443
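As a further illustration, here is a hedged sketch that combines an aggregation request with the per-subpart accessors shown in the SYNOPSIS:

  # aggregated transcripts, then their exon subparts
  for my $t ($segment->features('transcript')) {
      my @exons = sort {$a->start <=> $b->start} $t->Exon;
      print $t->name, ": ", scalar(@exons), " exons\n";
  }
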
444 For more information, see the manual pages for
445 Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone,
446 etc.
447
448 =back
449
450 =head1 API
451
452 The following is the API for Bio::DB::GFF.
453
454 =cut
455
456 package Bio::DB::GFF;
457
458 use strict;
459
460 use Bio::DB::GFF::Util::Rearrange;
461 use Bio::DB::GFF::RelSegment;
462 use Bio::DB::GFF::Feature;
463 use Bio::DB::GFF::Aggregator;
464 use Bio::DasI;
465 use Bio::Root::Root;
466
467 use vars qw(@ISA $VERSION);
468 @ISA = qw(Bio::Root::Root Bio::DasI);
469
470 $VERSION = '1.2003';
471 my %valid_range_types = (overlaps => 1,
472 contains => 1,
473 contained_in => 1);
474
475 =head1 Querying GFF Databases
476
477 =head2 new
478
479 Title : new
480 Usage : my $db = new Bio::DB::GFF(@args);
481 Function: create a new Bio::DB::GFF object
482 Returns : new Bio::DB::GFF object
483 Args : lists of adaptors and aggregators
484 Status : Public
485
486 These are the arguments:
487
488 -adaptor Name of the adaptor module to use. If none
489 provided, defaults to "dbi::mysqlopt".
490
491 -aggregator Array reference to a list of aggregators
492 to apply to the database. If none provided,
493 defaults to ['processed_transcript','alignment'].
494
495 <other> Any other named argument pairs are passed to
496 the adaptor for processing.
497
498 The adaptor argument must correspond to a module contained within the
499 Bio::DB::GFF::Adaptor namespace. For example, the
500 Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying
501 'dbi::mysql'. By Perl convention, the adaptor names are lower case
502 because they are loaded at run time.
503
504 The aggregator array may contain a list of aggregator names, a list of
505 initialized aggregator objects, or a string in the form
506 "aggregator_name{subpart1,subpart2,subpart3/main_method}" (the
507 /main_method part is optional). For example, if you wish to change
508 the components aggregated by the transcript aggregator, you could pass
509 it to the GFF constructor this way:
510
511 my $transcript =
512 Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr
513 polyA spliced_leader)]);
514
515 my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment'],
516 -adaptor => 'dbi::mysql',
517 -dsn => 'dbi:mysql:elegans42');
518
519 Alternatively, you could create an entirely new transcript aggregator
520 this way:
521
522 my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}';
523 my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment'],
524 -adaptor => 'dbi::mysql',
525 -dsn => 'dbi:mysql:elegans42');
526
527 See L<Bio::DB::GFF::Aggregator> for more details.
528
529 The commonly used 'dbi::mysql' adaptor recognizes the following
530 adaptor-specific arguments:
531
532 Argument Description
533 -------- -----------
534
535 -dsn the DBI data source, e.g. 'dbi:mysql:ens0040'
536 If a partial name is given, such as "ens0040", the
537 "dbi:mysql:" prefix will be added automatically.
538
539 -user username for authentication
540
541 -pass the password for authentication
542
543 -refclass landmark Class; defaults to "Sequence"
544
545 The commonly used 'dbi::mysqlopt' adaptor also recognizes the following
546 arguments.
547
548 Argument Description
549 -------- -----------
550
551 -fasta path to a directory containing FASTA files for the DNA
552 contained in this database (e.g. "/usr/local/share/fasta")
553
554 -acedb an acedb URL to use when converting features into ACEDB
555 objects (e.g. sace://localhost:2005)
556
557 =cut
558
559 #'
560
561 sub new {
562 my $package = shift;
563 my ($adaptor,$aggregators,$args,$refclass);
564
565 if (@_ == 1) { # special case, default to dbi::mysqlopt
566 $adaptor = 'dbi::mysqlopt';
567 $args = {DSN => shift};
568 } else {
569 ($adaptor,$aggregators,$refclass,$args) = rearrange([
570 [qw(ADAPTOR FACTORY)],
571 [qw(AGGREGATOR AGGREGATORS)],
572 'REFCLASS',
573 ],@_);
574 }
575
576 $adaptor ||= 'dbi::mysqlopt';
577 my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E";
578 eval "require $class" unless $class->can('new');
579 $package->throw("Unable to load $adaptor adaptor: $@") if $@;
580
581 my $self = $class->new($args);
582 $self->default_class($refclass) if defined $refclass;
583
584 # handle the aggregators.
585 # aggregators are responsible for creating complex multi-part features
586 # from the GFF "group" field. If none are provided, then we provide a
587 # list of the two used in WormBase.
588 # Each aggregator can be a scalar or a ref. In the former case
589 # it is treated as a class name to call new() on. In the latter
590 # the aggreator is treated as a ready made object.
591 $aggregators = $self->default_aggregators unless defined $aggregators;
592 my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators;
593 for my $a (@a) {
594 $self->add_aggregator($a);
595 }
596
597 # default settings go here.....
598 $self->automerge(1); # set automerge to true
599
600 $self;
601 }
602
603
604 =head2 types
605
606 Title : types
607 Usage : $db->types(@args)
608 Function: return list of feature types in range or database
609 Returns : a list of Bio::DB::GFF::Typename objects
610 Args : see below
611 Status : public
612
613 This routine returns a list of feature types known to the database.
614 The list can be database-wide or restricted to a region. It is also
615 possible to find out how many times each feature occurs.
616
617 For range queries, it is usually more convenient to create a
618 Bio::DB::GFF::Segment object, and then invoke its types() method.
619
620 Arguments are as follows:
621
622 -ref ID of reference sequence
623 -class class of reference sequence
624 -start start of segment
625 -stop stop of segment
626 -enumerate if true, count the features
627
628 The returned value will be a list of Bio::DB::GFF::Typename objects,
629 which if evaluated in a string context will return the feature type in
630 "method:source" format. This object class also has method() and
631 source() methods for retrieving the like-named fields.
632
633 If -enumerate is true, then the function returns a hash (not a hash
634 reference) in which the keys are type names in "method:source" format
635 and the values are the number of times each feature appears in the
636 database or segment.
637
638 The argument -end is a synonym for -stop, and -count is a synonym for
639 -enumerate.
640
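For example, a minimal sketch that counts the feature types present in the first 100 kb of the reference sequence "Chr1" (the landmark name is taken from the examples earlier in this document):

  my %counts = $db->types(-ref       => 'Chr1',
                          -start     => 1,
                          -stop      => 100_000,
                          -enumerate => 1);
  print "$_\t$counts{$_}\n" for sort keys %counts;
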
641 =cut
642
643 sub types {
644 my $self = shift;
645 my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([
646 [qw(REF REFSEQ)],
647 qw(START),
648 [qw(STOP END)],
649 [qw(ENUMERATE COUNT)],
650 [qw(CLASS SEQCLASS)],
651 [qw(TYPE TYPES)],
652 ],@_);
653 $types = $self->parse_types($types) if defined $types;
654 $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types);
655 }
656
657 =head2 classes
658
659 Title : classes
660 Usage : $db->classes
661 Function: return list of landmark classes in database
662 Returns : a list of classes
663 Args : none
664 Status : public
665
666 This routine returns the list of reference classes known to the
667 database, or an empty list if classes are not used by the database. Classes
668 are distinct from types, being essentially qualifiers on the reference
669 namespaces.
670
671 =cut
672
673 sub classes {
674 my $self = shift;
675 return ();
676 }
677
678 =head2 segment
679
680 Title : segment
681 Usage : $db->segment(@args);
682 Function: create a segment object
683 Returns : segment object(s)
684 Args : numerous, see below
685 Status : public
686
687 This method generates a segment object, which is a Perl object
688 subclassed from Bio::DB::GFF::Segment. The segment can be used to
689 find overlapping features and the raw DNA.
690
691 When making the segment() call, you specify the ID of a sequence
692 landmark (e.g. an accession number, a clone or contig), and a
693 positional range relative to the landmark. If no range is specified,
694 then the entire extent of the landmark is used to generate the
695 segment.
696
697 You may also provide the ID of a "reference" sequence, which will set
698 the coordinate system and orientation used for all features contained
699 within the segment. The reference sequence can be changed later. If
700 no reference sequence is provided, then the coordinate system is based
701 on the landmark.
702
703 Arguments:
704
705 -name ID of the landmark sequence.
706
707 -class Database object class for the landmark sequence.
708 "Sequence" assumed if not specified. This is
709 irrelevant for databases which do not recognize
710 object classes.
711
712 -start Start of the segment relative to landmark. Positions
713 follow standard 1-based sequence rules. If not specified,
714 defaults to the beginning of the landmark.
715
716 -end Stop of the segment relative to the landmark. If not specified,
717 defaults to the end of the landmark.
718
719 -stop Same as -end.
720
721 -offset For those who prefer 0-based indexing, the offset specifies the
722 position of the new segment relative to the start of the landmark.
723
724 -length For those who prefer 0-based indexing, the length specifies the
725 length of the new segment.
726
727 -refseq Specifies the ID of the reference landmark used to establish the
728 coordinate system for the newly-created segment.
729
730 -refclass Specifies the class of the reference landmark, for those databases
731 that distinguish different object classes. Defaults to "Sequence".
732
733 -absolute
734 Return features in absolute coordinates rather than relative to the
735 parent segment.
736
737 -nocheck Don't check the database for the coordinates and length of this
738 feature. Construct a segment using the indicated name as the
739 reference, a start coordinate of 1, an undefined end coordinate,
740 and a strand of +1.
741
742 -force Same as -nocheck.
743
744 -seq,-sequence,-sourceseq Aliases for -name.
745
746 -begin,-end Aliases for -start and -stop
747
748 -off,-len Aliases for -offset and -length
749
750 -seqclass Alias for -class
751
752 Here's an example to explain how this works:
753
754 my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql');
755
756 If successful, $db will now hold the database accessor object. We now
757 try to fetch the fragment of sequence whose ID is A0000182 and class
758 is "Accession."
759
760 my $segment = $db->segment(-name=>'A0000182',-class=>'Accession');
761
762 If successful, $segment now holds the entire segment corresponding to
763 this accession number. By default, the sequence is used as its own
764 reference sequence, so its first base will be 1 and its last base will
765 be the length of the accession.
766
767 Assuming that this sequence belongs to a longer stretch of DNA, say a
768 contig, we can fetch this information like so:
769
770 my $sourceseq = $segment->sourceseq;
771
772 and find the start and stop on the source like this:
773
774 my $start = $segment->abs_start;
775 my $stop = $segment->abs_stop;
776
777 If we had another segment, say $s2, which is on the same contiguous
778 piece of DNA, we can pass that to the refseq() method in order to
779 establish it as the coordinate reference point:
780
781 $segment->refseq($s2);
782
783 Now calling start() will return the start of the segment relative to
784 the beginning of $s2, accounting for differences in strandedness:
785
786 my $rel_start = $segment->start;
787
788 IMPORTANT NOTE: This method can be used to return the segment spanned
789 by an arbitrary named annotation. However, if the annotation appears
790 at multiple locations on the genome, for example an EST that maps to
791 multiple locations, then, provided that all locations reside on the
792 same physical segment, the method will return a segment that spans the
793 minimum and maximum positions. If the reference sequence occupies
794 ranges on different physical segments, then it returns them all in an
795 array context, and raises a "multiple segment exception" exception in
796 a scalar context.
797
798 =cut
799
800 #'
801
802 sub segment {
803 my $self = shift;
804 my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self,
805 $self->setup_segment_args(@_));
806 foreach (@segments) {
807 $_->absolute(1) if $self->absolute;
808 }
809
810 $self->_multiple_return_args(@segments);
811 }
812
813 sub _multiple_return_args {
814 my $self = shift;
815 my @args = @_;
816 if (@args == 0) {
817 return;
818 } elsif (@args == 1) {
819 return $args[0];
820 } elsif (wantarray) { # more than one reference sequence
821 return @args;
822 } else {
823 $self->error($args[0]->name,
824 " has more than one reference sequence in database. Please call in a list context to retrieve them all.");
825 $self->throw('multiple segment exception');
826 return;
827 }
828
829 }
830
831 # backward compatibility -- don't use!
832 # (deliberately undocumented too)
833 sub abs_segment {
834 my $self = shift;
835 return $self->segment($self->setup_segment_args(@_),-absolute=>1);
836 }
837
838 sub setup_segment_args {
839 my $self = shift;
840 return @_ if defined $_[0] && $_[0] =~ /^-/;
841 return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3;
842 return (-class=>$_[0],-name=>$_[1]) if @_ == 2;
843 return (-name=>$_[0]) if @_ == 1;
844 }
845
846 =head2 features
847
848 Title : features
849 Usage : $db->features(@args)
850 Function: get all features, possibly filtered by type
851 Returns : a list of Bio::DB::GFF::Feature objects
852 Args : see below
853 Status : public
854
855 This routine will retrieve features in the database regardless of
856 position. It can be used to return all features, or a subset based on
857 their method and source.
858
859 Arguments are as follows:
860
861 -types List of feature types to return. Argument is an array
862 reference containing strings of the format "method:source"
863
864 -merge Whether to apply aggregators to the generated features.
865
866 -rare Turn on optimizations suitable for a relatively rare feature type,
867 where it makes more sense to filter by feature type first,
868 and then by position.
869
870 -attributes A hash reference containing attributes to match.
871
872 -iterator Whether to return an iterator across the features.
873
874 -binsize A true value will create a set of artificial features whose
875 start and stop positions indicate bins of the given size, and
876 whose scores are the number of features in the bin. The
877 class and method of the feature will be set to "bin",
878 its source to "method:source", and its group to "bin:method:source".
879 This is a handy way of generating histograms of feature density.
880
881 If -iterator is true, then the method returns a single scalar value
882 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
883 on this object to fetch each of the features in turn. If iterator is
884 false or absent, then all the features are returned as a list.
885
886 Currently aggregation is disabled when iterating over a series of
887 features.
888
889 Types are indicated using the nomenclature "method:source". Either of
890 these fields can be omitted, in which case a wildcard is used for the
891 missing field. Type names without the colon (e.g. "exon") are
892 interpreted as the method name and a source wild card. Regular
893 expressions are allowed in either field, as in: "similarity:BLAST.*".
894
895 The -attributes argument is a hashref containing one or more attributes
896 to match against:
897
898 -attributes => { Gene => 'abc-1',
899 Note => 'confirmed' }
900
901 Attribute matching is simple string matching, and multiple attributes
902 are ANDed together.
903
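Putting these arguments together, a minimal usage sketch:

  # iterate over curated exons carrying a matching Gene attribute
  my $iterator = $db->features(-types      => ['exon:curated'],
                               -attributes => {Gene => 'sma-3'},
                               -iterator   => 1);
  while (my $f = $iterator->next_seq) {
      print $f, "\n";    # features stringify as in the SYNOPSIS
  }
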
904 =cut
905
906 sub features {
907 my $self = shift;
908 my ($types,$automerge,$sparse,$iterator,$other);
909 if (defined $_[0] &&
910 $_[0] =~ /^-/) {
911 ($types,$automerge,$sparse,$iterator,$other) = rearrange([
912 [qw(TYPE TYPES)],
913 [qw(MERGE AUTOMERGE)],
914 [qw(RARE SPARSE)],
915 'ITERATOR'
916 ],@_);
917 } else {
918 $types = \@_;
919 }
920
921 # for whole database retrievals, we probably don't want to automerge!
922 $automerge = $self->automerge unless defined $automerge;
923 $other ||= {};
924 $self->_features({
925 rangetype => 'contains',
926 types => $types,
927 },
928 { sparse => $sparse,
929 automerge => $automerge,
930 iterator =>$iterator,
931 %$other,
932 }
933 );
934 }
935
936 =head2 get_seq_stream
937
938 Title : get_seq_stream
939 Usage : my $seqio = $self->get_seq_stream(@args)
940 Function: Performs a query and returns an iterator over it
941 Returns : a Bio::SeqIO stream capable of producing sequence
942 Args : As in features()
943 Status : public
944
945 This routine takes the same arguments as features(), but returns a
946 Bio::SeqIO::Stream-compliant object. Use it like this:
947
948 $stream = $db->get_seq_stream('exon');
949 while (my $exon = $stream->next_seq) {
950 print $exon,"\n";
951 }
952
953 NOTE: This is also called get_feature_stream(), since that's what it
954 really does.
955
956 =cut
957
958 sub get_seq_stream {
959 my $self = shift;
960 my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1)
961 : (-types=>\@_,-iterator=>1);
962 $self->features(@args);
963 }
964
965 *get_feature_stream = \&get_seq_stream;
966
967 =head2 get_feature_by_name
968
969 Title : get_feature_by_name
970 Usage : $db->get_feature_by_name($class => $name)
971 Function: fetch features by their name
972 Returns : a list of Bio::DB::GFF::Feature objects
973 Args : the class and name of the desired feature
974 Status : public
975
976 This method can be used to fetch a named feature from the database.
977 GFF annotations are named using the group class and name fields, so
978 for features that belong to a group of size one, this method can be
979 used to retrieve that group (and is equivalent to the segment()
980 method). Any Alias attributes are also searched for matching names.
981
982 An alternative syntax allows you to search for features by name within
983 a circumscribed region:
984
985 @f = $db->get_feature_by_name(-class => $class,-name=>$name,
986 -ref => $sequence_name,
987 -start => $start,
988 -end => $end);
989
990 This method may return zero, one, or several Bio::DB::GFF::Feature
991 objects.
992
993 Aggregation is performed on features as usual.
994
995 NOTE: At various times, this function was called fetch_group(),
996 fetch_feature(), fetch_feature_by_name() and segments(). These names
997 are preserved for backward compatibility.
998
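A minimal usage sketch (the transcript name comes from the GFF example earlier in this document):

  my @hits = $db->get_feature_by_name(Transcript => 'Y95B8A.2');
  print scalar(@hits), " feature(s) named Y95B8A.2\n";
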
999 =cut
1000
1001 sub get_feature_by_name {
1002 my $self = shift;
1003 my ($gclass,$gname,$automerge,$ref,$start,$end);
1004 if (@_ == 1) {
1005 $gclass = $self->default_class;
1006 $gname = shift;
1007 } else {
1008 ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE',
1009 ['REF','REFSEQ'],
1010 'START',['STOP','END']
1011 ],@_);
1012 $gclass ||= $self->default_class;
1013 }
1014 $automerge = $self->automerge unless defined $automerge;
1015
1016 # we need to refactor this... It's repeated code (see below)...
1017 my @aggregators;
1018 if ($automerge) {
1019 for my $a ($self->aggregators) {
1020 push @aggregators,$a if $a->disaggregate([],$self);
1021 }
1022 }
1023
1024 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1025 my $features = [];
1026 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1027 my $location = [$ref,$start,$end] if defined $ref;
1028 $self->_feature_by_name($gclass,$gname,$location,$callback);
1029
1030 warn "aggregating...\n" if $self->debug;
1031 foreach my $a (@aggregators) { # last aggregator gets first shot
1032 $a->aggregate($features,$self) or next;
1033 }
1034
1035 @$features;
1036 }
1037
1038 # horrible indecision regarding proper names!
1039 *fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name;
1040 *segments = \&segment;
1041
1042 =head2 get_feature_by_target
1043
1044 Title : get_feature_by_target
1045 Usage : $db->get_feature_by_target($class => $name)
1046 Function: fetch features by their similarity target
1047 Returns : a list of Bio::DB::GFF::Feature objects
1048 Args : the class and name of the desired feature
1049 Status : public
1050
1051 This method can be used to fetch a named feature from the database
1052 based on its similarity hit.
1053
1054 =cut
1055
1056 sub get_feature_by_target {
1057 shift->get_feature_by_name(@_);
1058 }
1059
1060 =head2 get_feature_by_attribute
1061
1062 Title : get_feature_by_attribute
1063 Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2)
1064 Function: fetch segments by combinations of attribute values
1065 Returns : a list of Bio::DB::GFF::Feature objects
1066 Args : one or more attribute=>value pairs
1067 Status : public
1068
1069 This method can be used to fetch a set of features from the database.
1070 Attributes are a list of name=E<gt>value pairs. They will be logically
1071 ANDed together.
1072
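For example, a minimal sketch matching two attributes at once:

  # features tagged with Gene "sma-3" AND Note "confirmed"
  my @features = $db->get_feature_by_attribute(Gene => 'sma-3',
                                               Note => 'confirmed');
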
1073 =cut
1074
1075 sub get_feature_by_attribute {
1076 my $self = shift;
1077 my %attributes = ref($_[0]) ? %{$_[0]} : @_;
1078
1079 # we need to refactor this... It's repeated code (see above)...
1080 my @aggregators;
1081 if ($self->automerge) {
1082 for my $a ($self->aggregators) {
1083 unshift @aggregators,$a if $a->disaggregate([],$self);
1084 }
1085 }
1086
1087 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1088 my $features = [];
1089 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1090 $self->_feature_by_attribute(\%attributes,$callback);
1091
1092 warn "aggregating...\n" if $self->debug;
1093 foreach my $a (@aggregators) { # last aggregator gets first shot
1094 $a->aggregate($features,$self) or next;
1095 }
1096
1097 @$features;
1098 }
1099
1100 # more indecision...
1101 *fetch_feature_by_attribute = \&get_feature_by_attribute;
1102
1103 =head2 get_feature_by_id
1104
1105 Title : get_feature_by_id
1106 Usage : $db->get_feature_by_id($id)
1107 Function: fetch a feature by its feature ID
1108 Returns : a Bio::DB::GFF::Feature object
1109 Args : the feature ID
1110 Status : public
1111
1112 This method can be used to fetch a feature from the database using its
1113 ID. Not all GFF databases support IDs, so be careful with this.
1114
1115 =cut
1116
1117 sub get_feature_by_id {
1118 my $self = shift;
1119 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1120 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1121 my $features = [];
1122 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1123 $self->_feature_by_id($id,'feature',$callback);
1124 return wantarray ? @$features : $features->[0];
1125 }
1126 *fetch_feature_by_id = \&get_feature_by_id;
1127
1128 =head2 get_feature_by_gid
1129
1130 Title : get_feature_by_gid
1131 Usage : $db->get_feature_by_gid($id)
1132 Function: fetch a feature by its group ID
1133 Returns : a Bio::DB::GFF::Feature object
1134 Args : the group ID
1135 Status : public
1136
1137 This method can be used to fetch a feature from the database using its
1138 group ID. Not all GFF databases support IDs, so be careful with this.
1139
1140 The group ID is often more interesting than the feature ID, since
1141 groups can be complex objects containing subobjects.
1142
1143 =cut
1144
1145 sub get_feature_by_gid {
1146 my $self = shift;
1147 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
1148 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
1149 my $features = [];
1150 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
1151 $self->_feature_by_id($id,'group',$callback);
1152 return wantarray ? @$features : $features->[0];
1153 }
1154 *fetch_feature_by_gid = \&get_feature_by_gid;
1155
1156 =head2 delete_features
1157
1158 Title : delete_features
1159 Usage : $db->delete_features(@ids_or_features)
1160 Function: delete one or more features
1161 Returns : count of features deleted
1162 Args : list of features or feature ids
1163 Status : public
1164
1165 Pass this method a list of numeric feature ids or a set of features.
1166 It will attempt to remove the features from the database and return a
1167 count of the features removed.
1168
1169 NOTE: This method is also called delete_feature(). Also see
1170 delete_groups().
1171
1172 =cut
1173
1174 *delete_feature = \&delete_features;
1175
1176 sub delete_features {
1177 my $self = shift;
1178 my @features_or_ids = @_;
1179 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
1180 return unless @ids;
1181 $self->_delete_features(@ids);
1182 }
1183
1184 =head2 delete_groups
1185
1186 Title : delete_groups
1187 Usage : $db->delete_groups(@ids_or_features)
1188 Function: delete one or more feature groups
1189 Returns : count of features deleted
1190 Args : list of features or feature group ids
1191 Status : public
1192
1193 Pass this method a list of numeric group ids or a set of features. It
1194 will attempt to recursively remove the features and ALL members of
1195 their group from the database. It returns a count of the number of
1196 features (not groups) deleted.
1197
1198 NOTE: This method is also called delete_group(). Also see
1199 delete_features().
1200
1201 =cut
1202
1203 *delete_group = \&delete_groups;
1204
1205 sub delete_groups {
1206 my $self = shift;
1207 my @features_or_ids = @_;
1208 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids;
1209 return unless @ids;
1210 $self->_delete_groups(@ids);
1211 }
1212
1213 =head2 delete
1214
1215 Title : delete
1216 Usage : $db->delete(@args)
1217 Function: delete features
1218 Returns : count of features deleted -- if available
1219 Args : numerous, see below
1220 Status : public
1221
1222 This method deletes all features that overlap the specified region or
1223 are of a particular type. If no arguments are provided and the -force
1224 argument is true, then deletes ALL features.
1225
1226 Arguments:
1227
1228 -name ID of the landmark sequence.
1229
1230 -ref ID of the landmark sequence (synonym for -name).
1231
1232 -class Database object class for the landmark sequence.
1233 "Sequence" assumed if not specified. This is
1234 irrelevant for databases which do not recognize
1235 object classes.
1236
1237 -start Start of the segment relative to landmark. Positions
1238 follow standard 1-based sequence rules. If not specified,
1239 defaults to the beginning of the landmark.
1240
1241 -end Stop of the segment relative to the landmark. If not specified,
1242 defaults to the end of the landmark.
1243
1244 -offset Zero-based addressing
1245
1246 -length Length of region
1247
1248 -type,-types Either a single scalar type to be deleted, or a
1249 reference to an array of types.
1250
1251 -force Force operation to be performed even if it would delete
1252 entire feature table.
1253
1254 -range_type Control the range type of the deletion. One of "overlaps" (default)
1255 "contains" or "contained_in"
1256
1257 Examples:
1258
1259 $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats
1260 $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000
1261 $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3
1262
1263 The short form of this call, as described in segment() is also allowed:
1264
1265 $db->delete("chr3",1=>1000);
1266 $db->delete("chr3");
1267
1268 IMPORTANT NOTE: This method only deletes features. It does *NOT*
1269 delete the names of groups that contain the deleted features. Group
1270 IDs will be reused if you later load a feature with the same group
1271 name as one that was previously deleted.
1272
1273 NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the
1274 result code from the SQL DELETE operation. Some dbd drivers return the
1275 count of rows deleted, while others return 0E0. Caveat emptor.
1276
1277 =cut
1278
1279 sub delete {
1280 my $self = shift;
1281 my @args = $self->setup_segment_args(@_);
1282 my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) =
1283 rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET',
1284 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args);
1285 $offset = 0 unless defined $offset;
1286 $start = $offset+1 unless defined $start;
1287 $end = $start+$length-1 if !defined $end and $length;
1288 $class ||= $self->default_class;
1289
1290 my $types = $self->parse_types($type); # parse out list of types
1291
1292 $range_type ||= 'overlaps';
1293 $self->throw("range type must be one of {".
1294 join(',',keys %valid_range_types).
1295 "}\n")
1296 unless $valid_range_types{lc $range_type};
1297
1298
1299 my @segments;
1300 if (defined $name && $name ne '') {
1301 my @args = (-name=>$name,-class=>$class);
1302 push @args,(-start=>$start) if defined $start;
1303 push @args,(-end =>$end) if defined $end;
1304 @segments = $self->segment(@args);
1305 return unless @segments;
1306 }
1307 $self->_delete({segments => \@segments,
1308 types => $types,
1309 range_type => $range_type,
1310 force => $force}
1311 );
1312 }
1313
1314 =head2 absolute
1315
1316 Title : absolute
1317 Usage : $abs = $db->absolute([$abs]);
1318 Function: gets/sets absolute mode
1319 Returns : current setting of absolute mode boolean
1320 Args : new setting for absolute mode boolean
1321 Status : public
1322
1323 $db-E<gt>absolute(1) will turn on absolute mode for the entire database.
1324 All segments retrieved will use absolute coordinates by default,
1325 rather than relative coordinates. You can still set them to use
1326 relative coordinates by calling $segment-E<gt>absolute(0).
1327
1328 Note that this is not the same as calling abs_segment(); it continues
1329 to allow you to look up groups that are not used directly as reference
1330 sequences.
1331
1332 =cut
1333
1334 sub absolute {
1335 my $self = shift;
1336 my $d = $self->{absolute};
1337 $self->{absolute} = shift if @_;
1338 $d;
1339 }
1340
1341 =head2 strict_bounds_checking
1342
1343 Title : strict_bounds_checking
1344 Usage : $flag = $db->strict_bounds_checking([$flag])
1345 Function: gets/sets strict bounds checking
1346 Returns : current setting of bounds checking flag
1347 Args : new setting for bounds checking flag
1348 Status : public
1349
1350 This flag enables extra checks for segment requests that go beyond the
1351 ends of their reference sequences. If bounds checking is enabled,
1352 then retrieved segments will be truncated to their physical length,
1353 and their truncated() methods will return true.
1354
1355 If the flag is off (the default), then the module will return segments
1356 that appear to extend beyond their physical boundaries. Requests for
1357 features beyond the end of the segment will, however, return empty.
1358
1359 =cut
1360
1361 sub strict_bounds_checking {
1362 my $self = shift;
1363 my $d = $self->{strict};
1364 $self->{strict} = shift if @_;
1365 $d;
1366 }
1367
1368 =head2 get_Seq_by_id
1369
1370 Title : get_Seq_by_id
1371 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
1372 Function: Gets a Bio::Seq object by its name
1373 Returns : a Bio::Seq object
1374 Args : the id (as a string) of a sequence
1375 Throws : "id does not exist" exception
1376
1377 NOTE: Bio::DB::RandomAccessI compliant method
1378
1379 =cut
1380
1381 sub get_Seq_by_id {
1382 my $self = shift;
1383 my $id = shift;
1384 my $stream = $self->get_Stream_by_id($id);
1385 return $stream->next_seq;
1386 }
1387
1388
1389 =head2 get_Seq_by_accession
1390
1391 Title : get_Seq_by_accession
1392 Usage : $seq = $db->get_Seq_by_accession('AL12234')
1393 Function: Gets a Bio::Seq object by its accession
1394 Returns : a Bio::Seq object
1395 Args : the id (as a string) of a sequence
1396 Throws : "id does not exist" exception
1397
1398 NOTE: Bio::DB::RandomAccessI compliant method
1399
1400 =cut
1401
1402 sub get_Seq_by_accession {
1403 my $self = shift;
1404 my $id = shift;
1405 my $stream = $self->get_Stream_by_accession($id);
1406 return $stream->next_seq;
1407 }
1408
1409 =head2 get_Stream_by_acc ()
1410
1411 =cut
1412
1413 =head2 get_Seq_by_acc
1414
1415 Title : get_Seq_by_acc
1416 Usage : $seq = $db->get_Seq_by_acc('X77802');
1417 Function: Gets a Bio::Seq object by accession number
1418 Returns : A Bio::Seq object
1419 Args : accession number (as a string)
1420 Throws : "acc does not exist" exception
1421
1422 NOTE: Bio::DB::RandomAccessI compliant method
1423
1424 =cut
1425
1426 sub get_Stream_by_name {
1427 my $self = shift;
1428 my @ids = @_;
1429 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1430 Bio::DB::GFF::ID_Iterator->new($self,$id,'name');
1431 }
1432
1433 =head2 get_Stream_by_id
1434
1435 Title : get_Stream_by_id
1436 Usage : $seq = $db->get_Stream_by_id(@ids);
1437 Function: Retrieves a stream of Seq objects given their ids
1438 Returns : a Bio::SeqIO stream object
1439 Args : an array of unique ids/accession numbers, or
1440 an array reference
1441
1442 NOTE: This is also called get_Stream_by_batch()
1443
1444 =cut
1445
1446 sub get_Stream_by_id {
1447 my $self = shift;
1448 my @ids = @_;
1449 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1450 Bio::DB::GFF::ID_Iterator->new($self,$id,'feature');
1451 }
1452
1453 =head2 get_Stream_by_batch ()
1454
1455 Title : get_Stream_by_batch
1456 Usage : $seq = $db->get_Stream_by_batch(@ids);
1457 Function: Retrieves a stream of Seq objects given their ids
1458 Returns : a Bio::SeqIO stream object
1459 Args : an array of unique ids/accession numbers, or
1460 an array reference
1461
1462 NOTE: This is the same as get_Stream_by_id().
1463
1464 =cut
1465
1466 *get_Stream_by_batch = \&get_Stream_by_id;
1467
1468
1469 =head2 get_Stream_by_group ()
1470
1471 Bioperl compatibility.
1472
1473 =cut
1474
1475 sub get_Stream_by_group {
1476 my $self = shift;
1477 my @ids = @_;
1478 my $id = ref($ids[0]) ? $ids[0] : \@ids;
1479 Bio::DB::GFF::ID_Iterator->new($self,$id,'group');
1480 }
1481
1482 =head2 all_seqfeatures
1483
1484 Title : all_seqfeatures
1485 Usage : @features = $db->all_seqfeatures(@args)
1486 Function: fetch all the features in the database
1487 Returns : an array of features, or an iterator
1488 Args : See below
1489 Status : public
1490
1491 This is equivalent to calling $db-E<gt>features() without any types, and
1492 will return all the features in the database. The -merge and
1493 -iterator arguments are recognized, and behave the same as described
1494 for features().
1495
1496 =cut
1497
1498 sub all_seqfeatures {
1499 my $self = shift;
1500 my ($automerge,$iterator)= rearrange([
1501 [qw(MERGE AUTOMERGE)],
1502 'ITERATOR'
1503 ],@_);
1504 my @args;
1505 push @args,(-merge=>$automerge) if defined $automerge;
1506 push @args,(-iterator=>$iterator) if defined $iterator;
1507 $self->features(@args);
1508 }
1509
1510 =head1 Creating and Loading GFF Databases
1511
1512 =head2 initialize
1513
1514 Title : initialize
1515 Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2);
1516 Function: initialize a GFF database
1517 Returns : true if initialization successful
1518 Args : a set of named parameters
1519 Status : Public
1520
1521 This method can be used to initialize an empty database. It takes the following
1522 named arguments:
1523
1524 -erase A boolean value. If true the database will be wiped clean if it
1525 already contains data.
1526
1527 Other named arguments may be recognized by subclasses. They become database
1528 meta values that control various settable options.
1529
1530 As a shortcut (and for backward compatibility) a single true argument
1531 is the same as initialize(-erase=E<gt>1).
1532
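A minimal usage sketch for (re)creating the schema before a bulk load:

  $db->initialize(-erase => 1)
      or die "unable to initialize the database";
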
1533 =cut
1534
1535 sub initialize {
1536 my $self = shift;
1537 #$self->do_initialize(1) if @_ == 1 && $_[0];
1538 #why was this line (^) here? I can't see that it actually does anything
1539 #one option would be to execute the line and return, but I don't know
1540 #why you would want to do that either.
1541
1542 my ($erase,$meta) = rearrange(['ERASE'],@_);
1543 $meta ||= {};
1544
1545 # initialize (possibly erasing)
1546 return unless $self->do_initialize($erase);
1547 my @default = $self->default_meta_values;
1548
1549 # this is an awkward way of uppercasing the even-indexed
1550 # elements, i.e. the meta keys (necessary for case-insensitive SQL databases)
1551 for (my $i=0; $i<@default; $i++) {
1552 $default[$i] = uc $default[$i] if !($i % 2);
1553 }
1554
1555 my %values = (@default,%$meta);
1556 foreach (keys %values) {
1557 $self->meta($_ => $values{$_});
1558 }
1559 1;
1560 }
1561
1562
1563 =head2 load_gff
1564
1565 Title : load_gff
1566 Usage : $db->load_gff($file|$directory|$filehandle);
1567 Function: load GFF data into database
1568 Returns : count of records loaded
1569 Args : a directory, a file, a list of files,
1570 or a filehandle
1571 Status : Public
1572
1573 This method takes a single overloaded argument, which can be any of:
1574
1575 =over 4
1576
1577 =item 1. a scalar corresponding to a GFF file on the system
1578
1579 A pathname to a local GFF file. Any files ending with the .gz, .Z, or
1580 .bz2 suffixes will be transparently decompressed with the appropriate
1581 command-line utility.
1582
1583 =item 2. an array reference containing a list of GFF files on the system
1584
1585 For example ['/home/gff/gff1.gz','/home/gff/gff2.gz']
1586
1587 =item 3. directory path
1588
1589 The indicated directory will be searched for all files ending in the
1590 suffixes .gff, .gff.gz, .gff.Z or .gff.bz2.
1591
1592 =item 4. filehandle
1593
1594 An open filehandle from which to read the GFF data. Tied filehandles
1595 now work as well.
1596
1597 =item 5. a pipe expression
1598
1599 A pipe expression will also work. For example, a GFF file on a remote
1600 web server can be loaded with an expression like this:
1601
1602 $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |");
1603
1604 =back
1605
1606 If successful, the method will return the number of GFF lines
1607 successfully loaded.
1608
1609 NOTE: this method used to be called load(), but has been changed. The
1610 old method name is also recognized.
1611
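A minimal usage sketch; the directory paths below are hypothetical and stand in for wherever your GFF and FASTA files live:

  # load every GFF file in a directory, then the corresponding DNA
  my $lines = $db->load_gff('/usr/local/share/gff');     # hypothetical path
  print "loaded $lines GFF lines\n";
  $db->load_fasta('/usr/local/share/fasta');             # hypothetical path
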
1612 =cut
1613
1614 sub load_gff {
1615 my $self = shift;
1616 my $file_or_directory = shift || '.';
1617 return $self->do_load_gff($file_or_directory) if ref($file_or_directory) &&
1618 tied *$file_or_directory;
1619
1620 my $tied_stdin = tied(*STDIN);
1621 open SAVEIN,"<&STDIN" unless $tied_stdin;
1622 local @ARGV = $self->setup_argv($file_or_directory,'gff') or return; # to play tricks with reader
1623 my $result = $self->do_load_gff('ARGV');
1624 open STDIN,"<&SAVEIN" unless $tied_stdin; # restore STDIN
1625 return $result;
1626 }
1627
1628 *load = \&load_gff;
1629
1630 =head2 load_fasta
1631
1632 Title : load_fasta
1633 Usage : $db->load_fasta($file|$directory|$filehandle);
1634 Function: load FASTA data into database
1635 Returns : count of records loaded
1636 Args : a directory, a file, a list of files,
1637 or a filehandle
1638 Status : Public
1639
1640 This method takes a single overloaded argument, which can be any of:
1641
1642 =over 4
1643
1644 =item 1. scalar corresponding to a FASTA file on the system
1645
1646 A pathname to a local FASTA file. Any files ending with the .gz, .Z, or
1647 .bz2 suffixes will be transparently decompressed with the appropriate
1648 command-line utility.
1649
1650 =item 2. array reference containing a list of FASTA files on the
1651 system
1652
1653 For example ['/home/fasta/genomic1.fa.gz','/home/fasta/genomic2.fa.gz']
1654
1655 =item 3. path to a directory
1656
1657 The indicated directory will be searched for all files ending in the
1658 suffixes .fa, .fa.gz, .fa.Z or .fa.bz2.
1659
1660 =item 4. filehandle
1661
1662 An open filehandle from which to read the FASTA data.
1663
1664 =item 5. pipe expression
1665
1666 A pipe expression will also work. For example, a FASTA file on a remote
1667 web server can be loaded with an expression like this:
1668
1669 $db->load_fasta("lynx -dump -source http://stein.cshl.org/fasta_test.fa |");
1670
1671 =back
1672
1673 =cut
1674
1675 sub load_fasta {
1676 my $self = shift;
1677 my $file_or_directory = shift || '.';
1678 return $self->load_sequence($file_or_directory) if ref($file_or_directory) &&
1679 tied *$file_or_directory;
1680
1681 my $tied = tied(*STDIN);
1682 open SAVEIN,"<&STDIN" unless $tied;
1683 local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader
1684 my $result = $self->load_sequence('ARGV');
1685 open STDIN,"<&SAVEIN" unless $tied; # restore STDIN
1686 return $result;
1687 }
1688
1689 =head2 load_sequence_string
1690
1691 Title : load_sequence_string
1692 Usage : $db->load_sequence_string($id,$dna)
1693 Function: load a single DNA entry
1694 Returns : true if successfully loaded
1695 Args : a sequence ID and a raw sequence string (DNA, RNA or protein)
1696 Status : Public
1697
1698 =cut
1699
1700 sub load_sequence_string {
1701 my $self = shift;
1702 my ($acc,$seq) = @_;
1703 my $offset = 0;
1704 $self->insert_sequence_chunk($acc,\$offset,\$seq) or return;
1705 $self->insert_sequence($acc,$offset,$seq) or return;
1706 1;
1707 }
1708
1709 sub setup_argv {
1710 my $self = shift;
1711 my $file_or_directory = shift;
1712 my @suffixes = @_;
1713 no strict 'refs'; # so that we can call fileno() on the argument
1714
1715 my @argv;
1716
1717 if (-d $file_or_directory) {
1718 @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes;
1719 }elsif (my $fd = fileno($file_or_directory)) {
1720 open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN");
1721 @argv = '-';
1722 } elsif (ref $file_or_directory) {
1723 @argv = @$file_or_directory;
1724 } else {
1725 @argv = $file_or_directory;
1726 }
1727
1728 foreach (@argv) {
1729 if (/\.gz$/) {
1730 $_ = "gunzip -c $_ |";
1731 } elsif (/\.Z$/) {
1732 $_ = "uncompress -c $_ |";
1733 } elsif (/\.bz2$/) {
1734 $_ = "bunzip2 -c $_ |";
1735 }
1736 }
1737 @argv;
1738 }
1739
1740 =head2 lock_on_load
1741
1742 Title : lock_on_load
1743 Usage : $lock = $db->lock_on_load([$lock])
1744 Function: set write locking during load
1745 Returns : current value of lock-on-load flag
1746 Args : new value of lock-on-load-flag
1747 Status : Public
1748
1749 This method is honored by some of the adaptors. If the value is true,
1750 the tables used by the GFF modules will be locked for writing during
1751 loads and inaccessible to other processes.
1752
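For example, to request write locking around a bulk load (the file name is
illustrative):

  $db->lock_on_load(1);              # lock tables during the load
  $db->load_gff('features.gff');
  $db->lock_on_load(0);              # restore normal behavior
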
1753 =cut
1754
1755 sub lock_on_load {
1756 my $self = shift;
1757 my $d = $self->{lock};
1758 $self->{lock} = shift if @_;
1759 $d;
1760 }
1761
1762 =head2 meta
1763
1764 Title : meta
1765 Usage : $value = $db->meta($name [,$newval])
1766 Function: get or set a meta variable
1767 Returns : a string
1768 Args : meta variable name and optionally value
1769 Status : abstract
1770
1771 Get or set a named meta value for the database. Meta values can be
1772 used for database-specific settings.
1773
1774 By default, this method does nothing!
1775
1776 =cut
1777
1778 sub meta {
1779 my $self = shift;
1780 my ($name,$value) = @_;
1781 return;
1782 }
1783
1784 =head2 default_meta_values
1785
1786 Title : default_meta_values
1787 Usage : %values = $db->default_meta_values
1788 Function: return default meta values for the database
1789 Returns : a list of tag=>value pairs
1790 Args : none
1791 Status : protected
1792
1793 This method returns a list of tag=E<gt>value pairs that contain default
1794 meta information about the database. It is invoked by initialize() to
1795 write out the default meta values. The base class version returns an
1796 empty list.
1797
1798 For things to work properly, meta value names must be UPPERCASE.
1799
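A minimal sketch of how an adaptor subclass might override this method; the
meta names shown are illustrative, not required keys:

  sub default_meta_values {
    my $self = shift;
    # names must be UPPERCASE for case-insensitive SQL databases
    return ( VERSION       => 1,
             DEFAULT_CLASS => 'Sequence' );
  }
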
1800 =cut
1801
1802 sub default_meta_values {
1803 my $self = shift;
1804 return ();
1805 }
1806
1807
1808 =head2 error
1809
1810 Title : error
1811 Usage : $db->error( [$new error] );
1812 Function: read or set error message
1813 Returns : error message
1814 Args : an optional argument to set the error message
1815 Status : Public
1816
1817 This method can be used to retrieve the last error message. Errors
1818 are not reset to empty by successful calls, so contents are only valid
1819 immediately after an error condition has been detected.
1820
1821 =cut
1822
1823 sub error {
1824 my $self = shift;
1825 my $g = $self->{error};
1826 $self->{error} = join '',@_ if @_;
1827 $g;
1828 }
1829
1830 =head2 debug
1831
1832 Title : debug
1833 Usage : $db->debug( [$flag] );
1834 Function: read or set debug flag
1835 Returns : current value of debug flag
1836 Args : new debug flag (optional)
1837 Status : Public
1838
1839 This method can be used to turn on debug messages. The exact nature
1840 of those messages depends on the adaptor in use.
1841
1842 =cut
1843
1844 sub debug {
1845 my $self = shift;
1846 my $g = $self->{debug};
1847 $self->{debug} = shift if @_;
1848 $g;
1849 }
1850
1851
1852 =head2 automerge
1853
1854 Title : automerge
1855 Usage : $db->automerge( [$new automerge] );
1856 Function: get or set automerge value
1857 Returns : current value (boolean)
1858 Args : an optional argument to set the automerge value
1859 Status : Public
1860
1861 By default, this module will use the aggregators to merge groups into
1862 single composite objects. This default can be changed to false by
1863 calling automerge(0).
1864
1865 =cut
1866
1867 sub automerge {
1868 my $self = shift;
1869 my $g = $self->{automerge};
1870 $self->{automerge} = shift if @_;
1871 $g;
1872 }
1873
1874 =head2 attributes
1875
1876 Title : attributes
1877 Usage : @attributes = $db->attributes($id,$name)
1878 Function: get the attributes of a particular feature
1879 Returns : an array of strings
1880 Args : feature ID and optionally an attribute name
1881 Status : public
1882
1883 Some GFF version 2 files use the groups column to store a series of
1884 attribute/value pairs. In this interpretation of GFF, the first such
1885 pair is treated as the primary group for the feature; subsequent pairs
1886 are treated as attributes. Two attributes have special meaning:
1887 "Note" is for backward compatibility and is used for unstructured text
1888 remarks. "Alias" is treated as a synonym for the feature name.
1889
1890 If no name is provided, then attributes() returns a flattened hash of
1891 attribute=E<gt>value pairs. This lets you do:
1892
1893 %attributes = $db->attributes($id);
1894
1895 Normally, attributes() will be called by the feature:
1896
1897 @notes = $feature->attributes('Note');
1898
1899 In a scalar context, attributes() returns the first value of the
1900 attribute if a tag is present, otherwise a hash reference in which the
1901 keys are attribute names and the values are anonymous arrays
1902 containing the values.
1903
1904 =cut
1905
1906 sub attributes {
1907 my $self = shift;
1908 my ($id,$tag) = @_;
1909 my @result = $self->do_attributes($id,$tag) or return;
1910 return @result if wantarray;
1911
1912 # what to do in a scalar context
1913 return $result[0] if $tag;
1914 my %result;
1915 while (my($key,$value) = splice(@result,0,2)) {
1916 push @{$result{$key}},$value;
1917 }
1918 return \%result;
1919 }
1920
1921 =head2 fast_queries
1922
1923 Title : fast_queries
1924 Usage : $flag = $db->fast_queries([$flag])
1925 Function: turn on and off the "fast queries" option
1926 Returns : a boolean
1927 Args : a boolean flag (optional)
1928 Status : public
1929
1930 The mysql database driver (and possibly others) supports a "fast" query
1931 mode that caches results on the server side. This makes queries come
1932 back faster, particularly when creating iterators. The downside is
1933 that while iterating, new queries will die with a "commands out of sync"
1934 error. This method turns the feature on and off.
1935
1936 For databases that do not support a fast query, this method has no
1937 effect.
1938
1939 =cut
1940
1941 # override this method in order to set the mysql_use_result attribute, which is an obscure
1942 # but extremely powerful optimization for both performance and memory.
1943 sub fast_queries {
1944 my $self = shift;
1945 my $d = $self->{fast_queries};
1946 $self->{fast_queries} = shift if @_;
1947 $d;
1948 }
1949
1950 =head2 add_aggregator
1951
1952 Title : add_aggregator
1953 Usage : $db->add_aggregator($aggregator)
1954 Function: add an aggregator to the list
1955 Returns : nothing
1956 Args : an aggregator
1957 Status : public
1958
1959 This method will append an aggregator to the end of the list of
1960 registered aggregators. Three different argument types are accepted:
1961
1962 1) a Bio::DB::GFF::Aggregator object -- will be added
1963 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}"
1964 -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method
1965 part is optional).
1966 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator
1967 subclass, where the token corresponds to the subclass name.
1968
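For example, the three argument styles might look like this (the aggregator
names and subparts are illustrative):

  # 1. a ready-made aggregator object
  $db->add_aggregator(Bio::DB::GFF::Aggregator->new(-method    => 'gene',
                                                     -sub_parts => ['exon','CDS']));

  # 2. a string specification, with an optional /main_method part
  $db->add_aggregator("gene{exon,CDS/transcript}");

  # 3. the name of a Bio::DB::GFF::Aggregator subclass
  $db->add_aggregator('processed_transcript');
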
1969 =cut
1970
1971 sub add_aggregator {
1972 my $self = shift;
1973 my $aggregator = shift;
1974 my $list = $self->{aggregators} ||= [];
1975 if (ref $aggregator) { # an object
1976 @$list = grep {$_->get_method ne $aggregator->get_method} @$list;
1977 push @$list,$aggregator;
1978 }
1979
1980 elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) {
1981 my($agg_name,$subparts,$mainpart) = ($1,$2,$3);
1982 my @subparts = split /,\s*/,$subparts;
1983 my @args = (-method => $agg_name,
1984 -sub_parts => \@subparts);
1985 push @args,(-main_method => $mainpart) if $mainpart;
1986 warn "making an aggregator with (@args), subparts = @subparts" if $self->debug;
1987 push @$list,Bio::DB::GFF::Aggregator->new(@args);
1988 }
1989
1990 else {
1991 my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E";
1992 eval "require $class";
1993 $self->throw("Unable to load $aggregator aggregator: $@") if $@;
1994 push @$list,$class->new();
1995 }
1996 }
1997
1998 =head2 aggregators
1999
2000 Title : aggregators
2001 Usage : $db->aggregators([@new_aggregators]);
2002 Function: retrieve list of aggregators
2003 Returns : list of aggregators
2004 Args : a list of aggregators to set (optional)
2005 Status : public
2006
2007 This method will get or set the list of aggregators assigned to
2008 the database. If one or more arguments are passed, the existing
2009 set is cleared and replaced by the new list.
2010
2011 =cut
2012
2013 sub aggregators {
2014 my $self = shift;
2015 my $d = $self->{aggregators};
2016 if (@_) {
2017 $self->clear_aggregators;
2018 $self->add_aggregator($_) foreach @_;
2019 }
2020 return unless $d;
2021 return @$d;
2022 }
2023
2024 =head2 clear_aggregators
2025
2026 Title : clear_aggregators
2027 Usage : $db->clear_aggregators
2028 Function: clears list of aggregators
2029 Returns : nothing
2030 Args : none
2031 Status : public
2032
2033 This method will clear the aggregators stored in the database object.
2034 Use aggregators() or add_aggregator() to add some back.
2035
2036 =cut
2037
2038 sub clear_aggregators { shift->{aggregators} = [] }
2039
2040 =head1 Methods for use by Subclasses
2041
2042 The following methods are chiefly of interest to subclasses and are
2043 not intended for use by end programmers.
2044
2045 =head2 abscoords
2046
2047 Title : abscoords
2048 Usage : $db->abscoords($name,$class,$refseq)
2049 Function: finds position of a landmark in reference coordinates
2050 Returns : ($ref,$class,$start,$stop,$strand)
2051 Args : name and class of landmark, and an optional reference sequence
2052 Status : public
2053
2054 This method is called by Bio::DB::GFF::RelSegment to obtain the
2055 absolute coordinates of a sequence landmark. The arguments are the
2056 name and class of the landmark. If successful, abscoords() returns
2057 the ID of the reference sequence, its class, its start and stop
2058 positions, and the orientation of the reference sequence's coordinate
2059 system ("+" for forward strand, "-" for reverse strand).
2060
2061 If $refseq is present in the argument list, it forces the query to
2062 search for the landmark in a particular reference sequence.
2063
2064 =cut
2065
2066 sub abscoords {
2067 my $self = shift;
2068 my ($name,$class,$refseq) = @_;
2069 $class ||= $self->{default_class};
2070 $self->get_abscoords($name,$class,$refseq);
2071 }
2072
2073 =head1 Protected API
2074
2075 The following methods are not intended for public consumption, but are
2076 intended to be overridden/implemented by adaptors.
2077
2078 =head2 default_aggregators
2079
2080 Title : default_aggregators
2081 Usage : $db->default_aggregators;
2082 Function: retrieve list of aggregators
2083 Returns : array reference containing list of aggregator names
2084 Args : none
2085 Status : protected
2086
2087 This method (which is intended to be overridden by adaptors) returns an
2088 array reference listing the standard aggregators to be applied when no
2089 aggregators are specified in the constructor.
2090
2091 =cut
2092
2093 sub default_aggregators {
2094 my $self = shift;
2095 return ['processed_transcript','alignment'];
2096 }
2097
2098 =head2 do_load_gff
2099
2100 Title : do_load_gff
2101 Usage : $db->do_load_gff($handle)
2102 Function: load a GFF input stream
2103 Returns : number of features loaded
2104 Args : A filehandle.
2105 Status : protected
2106
2107 This method is called to load a GFF data stream. The method will read
2108 GFF features from E<lt>E<gt> and load them into the database. On exit the
2109 method must return the number of features loaded.
2110
2111 Note that the method is responsible for parsing the GFF lines. This
2112 is to allow for differences in the interpretation of the "group"
2113 field, which are legion.
2114
2115 You probably want to use load_gff() instead. It is more flexible
2116 about the arguments it accepts.
2117
2118 =cut
2119
2120 # load from <>
2121 sub do_load_gff {
2122 my $self = shift;
2123 my $io_handle = shift;
2124
2125 local $self->{gff3_flag} = 0;
2126 $self->setup_load();
2127
2128 my $fasta_sequence_id;
2129
2130 while (<$io_handle>) {
2131 chomp;
2132 $self->{gff3_flag}++ if /^\#\#gff-version\s+3/;
2133 if (/^>(\S+)/) { # uh oh, sequence coming
2134 $fasta_sequence_id = $1;
2135 last;
2136 }
2137 if (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
2138 $self->load_gff_line(
2139 {
2140 ref => $1,
2141 class => 'Sequence',
2142 source => 'reference',
2143 method => 'Component',
2144 start => $2,
2145 stop => $3,
2146 score => undef,
2147 strand => undef,
2148 phase => undef,
2149 gclass => 'Sequence',
2150 gname => $1,
2151 tstart => undef,
2152 tstop => undef,
2153 attributes => [],
2154 }
2155 );
2156 next;
2157 }
2158
2159 next if /^\#/;
2160 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
2161 next unless defined($ref) && defined($method) && defined($start) && defined($stop);
2162 foreach (\$score,\$strand,\$phase) {
2163 undef $$_ if $$_ eq '.';
2164 }
2165
2166 my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{gff3_flag});
2167
2168 # no standard way in the GFF file to denote the class of the reference sequence -- drat!
2169 # so we invoke the factory to do it
2170 my $class = $self->refclass($ref);
2171
2172 # call subclass to do the dirty work
2173 if ($start > $stop) {
2174 ($start,$stop) = ($stop,$start);
2175 if ($strand eq '+') {
2176 $strand = '-';
2177 } elsif ($strand eq '-') {
2178 $strand = '+';
2179 }
2180 }
2181 $self->load_gff_line({ref => $ref,
2182 class => $class,
2183 source => $source,
2184 method => $method,
2185 start => $start,
2186 stop => $stop,
2187 score => $score,
2188 strand => $strand,
2189 phase => $phase,
2190 gclass => $gclass,
2191 gname => $gname,
2192 tstart => $tstart,
2193 tstop => $tstop,
2194 attributes => $attributes}
2195 );
2196 }
2197
2198 my $result = $self->finish_load();
2199 $result += $self->load_sequence($io_handle,$fasta_sequence_id)
2200 if defined $fasta_sequence_id;
2201 $result;
2202
2203 }
2204
2205 =head2 load_sequence
2206
2207 Title : load_sequence
2208 Usage : $db->load_sequence($handle [,$id])
2209 Function: load a FASTA data stream
2210 Returns : number of sequences
2211 Args : a filehandle and optionally the ID of
2212 the first sequence in the stream.
2213 Status : protected
2214
2215 You probably want to use load_fasta() instead. The $id argument is a
2216 hack used to switch from GFF loading to FASTA loading when load_gff()
2217 discovers FASTA data hiding at the bottom of the GFF file (as Artemis
2218 does).
2219
2220 =cut
2221
2222 sub load_sequence {
2223 my $self = shift;
2224 my $io_handle = shift;
2225 my $id = shift; # hack for GFF files that contain fasta data
2226
2227 # read fasta file(s) from ARGV
2228 my ($seq,$offset,$loaded) = (undef,0,0);
2229 while (<$io_handle>) {
2230 chomp;
2231 if (/^>(\S+)/) {
2232 $self->insert_sequence($id,$offset,$seq) if $id;
2233 $id = $1;
2234 $offset = 0;
2235 $seq = '';
2236 $loaded++;
2237 } else {
2238 $seq .= $_;
2239 $self->insert_sequence_chunk($id,\$offset,\$seq);
2240 }
2241 }
2242 $self->insert_sequence($id,$offset,$seq) if $id;
2243 $loaded+0;
2244 }
2245
2246 sub insert_sequence_chunk {
2247 my $self = shift;
2248 my ($id,$offsetp,$seqp) = @_;
2249 if (my $cs = $self->dna_chunk_size) {
2250 while (length($$seqp) >= $cs) {
2251 my $chunk = substr($$seqp,0,$cs);
2252 $self->insert_sequence($id,$$offsetp,$chunk);
2253 $$offsetp += length($chunk);
2254 substr($$seqp,0,$cs) = '';
2255 }
2256 }
2257 return 1; # the calling routine may expect success or failure
2258 }
2259
2260 # used to store big pieces of DNA in itty bitty pieces
2261 sub dna_chunk_size {
2262 return 0;
2263 }
2264
2265 sub insert_sequence {
2266 my $self = shift;
2267 my($id,$offset,$seq) = @_;
2268 $self->throw('insert_sequence(): must be defined in subclass');
2269 }
2270
2271 # This is the default class for reference points. Defaults to Sequence.
2272 sub default_class {
2273 my $self = shift;
2274 my $d = exists($self->{default_class}) ? $self->{default_class} : 'Sequence';
2275 $self->{default_class} = shift if @_;
2276 $d;
2277 }
2278
2279 # gets name of the reference sequence, and returns its class
2280 # currently just calls default_class
2281 sub refclass {
2282 my $self = shift;
2283 my $name = shift;
2284 return $self->default_class;
2285 }
2286
2287 =head2 setup_load
2288
2289 Title : setup_load
2290 Usage : $db->setup_load
2291 Function: called before load_gff_line()
2292 Returns : void
2293 Args : none
2294 Status : abstract
2295
2296 This abstract method gives subclasses a chance to do any
2297 schema-specific initialization prior to loading a set of GFF records.
2298 It must be implemented by a subclass.
2299
2300 =cut
2301
2302 sub setup_load {
2303 # default, do nothing
2304 }
2305
2306 =head2 finish_load
2307
2308 Title : finish_load
2309 Usage : $db->finish_load
2310 Function: called after load_gff_line()
2311 Returns : number of records loaded
2312 Args : none
2313 Status : abstract
2314
2315 This method gives subclasses a chance to do any schema-specific
2316 cleanup after loading a set of GFF records.
2317
2318 =cut
2319
2320 sub finish_load {
2321 # default, do nothing
2322 }
2323
2324 =head2 load_gff_line
2325
2326 Title : load_gff_line
2327 Usage : $db->load_gff_line(@args)
2328 Function: called to load one parsed line of GFF
2329 Returns : true if successfully inserted
2330 Args : see below
2331 Status : abstract
2332
2333 This abstract method is called once per line of the GFF and passed a
2334 hashref containing parsed GFF fields. The fields are:
2335
2336 {ref => $ref,
2337 class => $class,
2338 source => $source,
2339 method => $method,
2340 start => $start,
2341 stop => $stop,
2342 score => $score,
2343 strand => $strand,
2344 phase => $phase,
2345 gclass => $gclass,
2346 gname => $gname,
2347 tstart => $tstart,
2348 tstop => $tstop,
2349 attributes => $attributes}
2350
2351 =cut
2352
2353 sub load_gff_line {
2354 shift->throw("load_gff_line(): must be implemented by an adaptor");
2355 }
2356
2357
2358 =head2 do_initialize
2359
2360 Title : do_initialize
2361 Usage : $db->do_initialize([$erase])
2362 Function: initialize and possibly erase database
2363 Returns : true if successful
2364 Args : optional erase flag
2365 Status : protected
2366
2367 This method implements the initialize() method described above, and
2368 takes the same arguments.
2369
2370 =cut
2371
2372 sub do_initialize {
2373 shift->throw('do_initialize(): must be implemented by an adaptor');
2374 }
2375
2376 =head2 dna
2377
2378 Title : dna
2379 Usage : $db->dna($id,$start,$stop,$class)
2380 Function: return the raw DNA string for a segment
2381 Returns : a raw DNA string
2382 Args : ID of the sequence, start and stop positions, and its class
2383 Status : public
2384
2385 This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA
2386 sequence.
2387
2388 Arguments: -name sequence name
2389 -start start position
2390 -stop stop position
2391 -class sequence class
2392
2393 If start and stop are both undef, then the entire DNA is retrieved.
2394 So to fetch the whole DNA, call like this:
2395
2396 $db->dna($name_of_sequence);
2397
2398 or like this:
2399
2400 $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence);
2401
2402 NOTE: you will probably prefer to create a Segment and then invoke its
2403 dna() method.
2404
2405 =cut
2406
2407 # call to return the DNA string for the indicated region
2408 # real work is done by get_dna()
2409 sub dna {
2410 my $self = shift;
2411 my ($id,$start,$stop,$class) = rearrange([
2412 [qw(NAME ID REF REFSEQ)],
2413 qw(START),
2414 [qw(STOP END)],
2415 'CLASS',
2416 ],@_);
2417 # return unless defined $start && defined $stop;
2418 $self->get_dna($id,$start,$stop,$class);
2419 }
2420
2421 sub features_in_range {
2422 my $self = shift;
2423 my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) =
2424 rearrange([
2425 [qw(RANGE_TYPE)],
2426 [qw(REF REFSEQ)],
2427 qw(CLASS),
2428 qw(START),
2429 [qw(STOP END)],
2430 [qw(TYPE TYPES)],
2431 qw(PARENT),
2432 [qw(RARE SPARSE)],
2433 [qw(MERGE AUTOMERGE)],
2434 'ITERATOR'
2435 ],@_);
2436 $other ||= {};
2437 $automerge = $types && $self->automerge unless defined $automerge;
2438 $self->throw("range type must be one of {".
2439 join(',',keys %valid_range_types).
2440 "}\n")
2441 unless $valid_range_types{lc $range_type};
2442 $self->_features({
2443 rangetype => lc $range_type,
2444 refseq => $refseq,
2445 refclass => $class,
2446 start => $start,
2447 stop => $stop,
2448 types => $types },
2449 {
2450 sparse => $sparse,
2451 automerge => $automerge,
2452 iterator => $iterator,
2453 %$other,
2454 },
2455 $parent);
2456 }
2457
2458 =head2 get_dna
2459
2460 Title : get_dna
2461 Usage : $db->get_dna($id,$start,$stop,$class)
2462 Function: get DNA for indicated segment
2463 Returns : the dna string
2464 Args : sequence ID, start, stop and class
2465 Status : protected
2466
2467 If start E<gt> stop and the sequence is nucleotide, then this method
2468 should return the reverse complement. The sequence class may be
2469 ignored by those databases that do not recognize different object
2470 types.
2471
2472 =cut
2473
2474 sub get_dna {
2475 my $self = shift;
2476 my ($id,$start,$stop,$class,) = @_;
2477 $self->throw("get_dna() must be implemented by an adaptor");
2478 }
2479
2480 =head2 get_features
2481
2482 Title : get_features
2483 Usage : $db->get_features($search,$options,$callback)
2484 Function: get list of features for a region
2485 Returns : count of number of features retrieved
2486 Args : see below
2487 Status : protected
2488
2489 The first argument is a hash reference containing search criteria for
2490 retrieving features. It contains the following keys:
2491
2492 rangetype One of "overlaps", "contains" or "contained_in". Indicates
2493 the type of range query requested.
2494
2495 refseq ID of the landmark that establishes the absolute
2496 coordinate system.
2497
2498 refclass Class of this landmark. Can be ignored by implementations
2499 that don't recognize such distinctions.
2500
2501 start Start of the range, inclusive.
2502
2503 stop Stop of the range, inclusive.
2504
2505 types Array reference containing the list of annotation types
2506 to fetch from the database. Each annotation type is an
2507 array reference consisting of [source,method].
2508
2509 The second argument is a hash reference containing certain options
2510 that affect the way information is retrieved:
2511
2512 sort_by_group
2513 A flag. If true, means that the returned features should be
2514 sorted by the group that they're in.
2515
2516 sparse A flag. If true, means that the expected density of the
2517 features is such that it will be more efficient to search
2518 by type rather than by range. If it is taking a long
2519 time to fetch features, give this a try.
2520
2521 binsize A true value will create a set of artificial features whose
2522 start and stop positions indicate bins of the given size, and
2523 whose scores are the number of features in the bin. The
2524 class of the feature will be set to "bin", and its name to
2525 "method:source". This is a handy way of generating histograms
2526 of feature density.
2527
2528 The third argument, the $callback, is a code reference to which
2529 retrieved features are passed. It is described in more detail below.
2530
2531 This routine is responsible for getting arrays of GFF data out of the
2532 database and passing them to the callback subroutine. The callback
2533 does the work of constructing a Bio::DB::GFF::Feature object out of
2534 that data. The callback expects a list of 13 fields:
2535
2536 $refseq The reference sequence
2537 $start feature start
2538 $stop feature stop
2539 $source feature source
2540 $method feature method
2541 $score feature score
2542 $strand feature strand
2543 $phase feature phase
2544 $groupclass group class (may be undef)
2545 $groupname group ID (may be undef)
2546 $tstart target start for similarity hits (may be undef)
2547 $tstop target stop for similarity hits (may be undef)
2548 $feature_id A unique feature ID (may be undef)
2549
2550 These fields are in the same order as the raw GFF file, with the
2551 exception that the group column has been parsed into group class and
2552 group name fields.
2553
2554 The feature ID, if provided, is a unique identifier of the feature
2555 line. The module does not depend on this ID in any way, but it is
2556 available via Bio::DB::GFF-E<gt>id() if wanted. In the dbi::mysql and
2557 dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb
2558 adaptor it is not used.
2559
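For example, an adaptor's get_features() implementation would invoke the
callback once per retrieved row, roughly as sketched below ($sth is a
hypothetical DBI statement handle; how the rows are actually obtained is
adaptor-specific):

  my $count = 0;
  while (my @row = $sth->fetchrow_array) {
    my ($refseq,$start,$stop,$source,$method,$score,$strand,$phase,
        $gclass,$gname,$tstart,$tstop,$feature_id) = @row;
    $callback->($refseq,$start,$stop,$source,$method,$score,$strand,$phase,
                $gclass,$gname,$tstart,$tstop,$feature_id);
    $count++;
  }
  return $count;
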
2560 =cut
2561
2562 sub get_features{
2563 my $self = shift;
2564 my ($search,$options,$callback) = @_;
2565 $self->throw("get_features() must be implemented by an adaptor");
2566 }
2567
2568
2569 =head2 _feature_by_name
2570
2571 Title : _feature_by_name
2572 Usage : $db->_feature_by_name($class,$name,$location,$callback)
2573 Function: get a list of features by name and class
2574 Returns : count of number of features retrieved
2575 Args : name of feature, class of feature, and a callback
2576 Status : abstract
2577
2578 This method is used internally. The callback arguments are the same
2579 as those used by make_feature(). This method must be overridden by
2580 subclasses.
2581
2582 =cut
2583
2584 sub _feature_by_name {
2585 my $self = shift;
2586 my ($class,$name,$location,$callback) = @_;
2587 $self->throw("_feature_by_name() must be implemented by an adaptor");
2588 }
2589
2590 sub _feature_by_attribute {
2591 my $self = shift;
2592 my ($attributes,$callback) = @_;
2593 $self->throw("_feature_by_attribute() must be implemented by an adaptor");
2594 }
2595
2596 =head2 _feature_by_id
2597
2598 Title : _feature_by_id
2599 Usage : $db->_feature_by_id($ids,$type,$callback)
2600 Function: get features based on their feature or group IDs
2601 Returns : count of number of features retrieved
2602 Args : an arrayref of IDs to fetch, the ID type, and a callback
2603 Status : abstract
2604
2605 This method is used internally to fetch features either by their ID or
2606 their group ID. $ids is an arrayref containing a list of IDs, $type is
2607 one of "feature" or "group", and $callback is a callback. The
2608 callback arguments are the same as those used by make_feature(). This
2609 method must be overridden by subclasses.
2610
2611 =cut
2612
2613 sub _feature_by_id {
2614 my $self = shift;
2615 my ($ids,$type,$callback) = @_;
2616 $self->throw("_feature_by_id() must be implemented by an adaptor");
2617 }
2618
2619 =head2 overlapping_features
2620
2621 Title : overlapping_features
2622 Usage : $db->overlapping_features(@args)
2623 Function: get features that overlap the indicated range
2624 Returns : a list of Bio::DB::GFF::Feature objects
2625 Args : see below
2626 Status : public
2627
2628 This method is invoked by Bio::DB::GFF::Segment-E<gt>features() to find
2629 the list of features that overlap a given range. It is generally
2630 preferable to create the Segment first, and then fetch the features.
2631
2632 This method takes a set of named arguments:
2633
2634 -refseq ID of the reference sequence
2635 -class Class of the reference sequence
2636 -start Start of the desired range in refseq coordinates
2637 -stop Stop of the desired range in refseq coordinates
2638 -types List of feature types to return. Argument is an array
2639 reference containing strings of the format "method:source"
2640 -parent A parent Bio::DB::GFF::Segment object, used to create
2641 relative coordinates in the generated features.
2642 -rare Turn on an optimization suitable for a relatively rare feature type,
2643 where it will be faster to filter by feature type first
2644 and then by position, rather than vice versa.
2645 -merge Whether to apply aggregators to the generated features.
2646 -iterator Whether to return an iterator across the features.
2647
2648 If -iterator is true, then the method returns a single scalar value
2649 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
2650 on this object to fetch each of the features in turn. If iterator is
2651 false or absent, then all the features are returned as a list.
2652
2653 Currently aggregation is disabled when iterating over a series of
2654 features.
2655
2656 Types are indicated using the nomenclature "method:source". Either of
2657 these fields can be omitted, in which case a wildcard is used for the
2658 missing field. Type names without the colon (e.g. "exon") are
2659 interpreted as the method name and a source wild card. Regular
2660 expressions are allowed in either field, as in: "similarity:BLAST.*".
2661
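A usage sketch (the landmark name and coordinates are illustrative):

  my @features = $db->overlapping_features(-refseq => 'CHROMOSOME_I',
                                           -start  => 1,
                                           -stop   => 50_000,
                                           -types  => ['exon:curated','similarity:BLAST.*']);

  # or request an iterator instead of a list
  my $iterator = $db->overlapping_features(-refseq   => 'CHROMOSOME_I',
                                           -start    => 1,
                                           -stop     => 50_000,
                                           -iterator => 1);
  while (my $f = $iterator->next_seq) {
    print $f->type,"\n";
  }
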
2662 =cut
2663
2664 # call to return the features that overlap the named region
2665 # real work is done by get_features
2666 sub overlapping_features {
2667 my $self = shift;
2668 $self->features_in_range(-range_type=>'overlaps',@_);
2669 }
2670
2671 =head2 contained_features
2672
2673 Title : contained_features
2674 Usage : $db->contained_features(@args)
2675 Function: get features that are contained within the indicated range
2676 Returns : a list of Bio::DB::GFF::Feature objects
2677 Args : see overlapping_features()
2678 Status : public
2679
2680 This call is similar to overlapping_features(), except that it only
2681 retrieves features whose end points are completely contained within
2682 the specified range.
2683
2684 Generally you will want to fetch a Bio::DB::GFF::Segment object and
2685 call its contained_features() method rather than call this directly.
2686
2687 =cut
2688
2689 # The same, except that it only returns features that are completely contained within the
2690 # range (much faster usually)
2691 sub contained_features {
2692 my $self = shift;
2693 $self->features_in_range(-range_type=>'contains',@_);
2694 }
2695
2696 =head2 contained_in
2697
2698 Title : contained_in
2699 Usage : @features = $s->contained_in(@args)
2700 Function: get features that contain this segment
2701 Returns : a list of Bio::DB::GFF::Feature objects
2702 Args : see features()
2703 Status : Public
2704
2705 This is identical in behavior to features() except that it returns
2706 only those features that completely contain the segment.
2707
2708 =cut
2709
2710 sub contained_in {
2711 my $self = shift;
2712 $self->features_in_range(-range_type=>'contained_in',@_);
2713 }
2714
2715 =head2 get_abscoords
2716
2717 Title : get_abscoords
2718 Usage : $db->get_abscoords($name,$class,$refseq)
2719 Function: get the absolute coordinates of sequence with name & class
2720 Returns : ($absref,$absstart,$absstop,$absstrand)
2721 Args : name and class of the landmark
2722 Status : protected
2723
2724 Given the name and class of a genomic landmark, this function returns
2725 a four-element array consisting of:
2726
2727 $absref the ID of the reference sequence that contains this landmark
2728 $absstart the position at which the landmark starts
2729 $absstop the position at which the landmark stops
2730 $absstrand the strand of the landmark, relative to the reference sequence
2731
2732 If $refseq is provided, the function searches only within the
2733 specified reference sequence.
2734
2735 =cut
2736
2737 sub get_abscoords {
2738 my $self = shift;
2739 my ($name,$class,$refseq) = @_;
2740 $self->throw("get_abscoords() must be implemented by an adaptor");
2741 }
2742
2743 =head2 get_types
2744
2745 Title : get_types
2746 Usage : $db->get_types($absref,$class,$start,$stop,$count)
2747 Function: get list of all feature types on the indicated segment
2748 Returns : list or hash of Bio::DB::GFF::Typename objects
2749 Args : see below
2750 Status : protected
2751
2752 Arguments are:
2753
2754 $absref the ID of the reference sequence
2755 $class the class of the reference sequence
2756 $start the position to start counting
2757 $stop the position to end counting
2758 $count a boolean indicating whether to count the number
2759 of occurrences of each feature type
2760
2761 If $count is true, then a hash is returned. The keys of the hash are
2762 feature type names in the format "method:source" and the values are
2763 the number of times a feature of this type overlaps the indicated
2764 segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename
2765 objects. If $start or $stop are undef, then all features on the
2766 indicated segment are enumerated. If $absref is undef, then the call
2767 returns all feature types in the database.
2768
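For example, counting the feature types on a region might look like this
(the landmark name and range are illustrative):

  my %counts = $db->get_types('CHROMOSOME_I','Sequence',1,100_000,1);
  while (my ($type,$n) = each %counts) {
    print "$type\t$n\n";
  }
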
2769 =cut
2770
2771 sub get_types {
2772 my $self = shift;
2773 my ($refseq,$class,$start,$stop,$count,$types) = @_;
2774 $self->throw("get_types() must be implemented by an adaptor");
2775 }
2776
2777 =head2 make_feature
2778
2779 Title : make_feature
2780 Usage : $db->make_feature(@args)
2781 Function: Create a Bio::DB::GFF::Feature object from string data
2782 Returns : a Bio::DB::GFF::Feature object
2783 Args : see below
2784 Status : internal
2785
2786 This takes 15 arguments (really!):
2787
2788 $parent A Bio::DB::GFF::RelSegment object
2789 $group_hash A hashref containing unique list of GFF groups
2790 $refname The name of the reference sequence for this feature
2791 $refclass The class of the reference sequence for this feature
2792 $start Start of feature
2793 $stop Stop of feature
2794 $source Feature source field
2795 $method Feature method field
2796 $score Feature score field
2797 $strand Feature strand
2798 $phase Feature phase
2799 $group_class Class of feature group
2800 $group_name Name of feature group
2801 $tstart For homologies, start of hit on target
2802 $tstop Stop of hit on target
2803
2804 The $parent argument, if present, is used to establish relative
2805 coordinates in the resulting Bio::DB::GFF::Feature object. This allows one
2806 feature to generate a list of other features that are relative to its
2807 coordinate system (for example, finding the coordinates of the second
2808 exon relative to the coordinates of the first).
2809
2810 The $group_hash allows the group_class/group_name strings to be turned
2811 into rich database objects via the make_object() method (see above).
2812 Because these objects may be expensive to create, $group_hash is used
2813 to avoid creating duplicates. The index of this hash is the composite key
2814 {$group_class,$group_name,$tstart,$tstop}. Values are whatever object
2815 is returned by the make_object() method.
2816
2817 The remainder of the fields are taken from the GFF line, with the
2818 exception that "Target" features, which contain information about the
2819 target of a homology search, are parsed into their components.
2820
2821 =cut
2822
2823 # This call is responsible for turning a line of GFF into a
2824 # feature object.
2825 # The $parent argument is a Bio::DB::GFF::Segment object and is used
2826 # to establish the coordinate system for the new feature.
2827 # The $group_hash argument is an hash ref that holds previously-
2828 # generated group objects.
2829 # Other arguments are taken right out of the GFF table.
2830 sub make_feature {
2831 my $self = shift;
2832 my ($parent,$group_hash, # these arguments provided by generic mechanisms
2833 $srcseq, # the rest is provided by adaptor
2834 $start,$stop,
2835 $source,$method,
2836 $score,$strand,$phase,
2837 $group_class,$group_name,
2838 $tstart,$tstop,
2839 $db_id,$group_id) = @_;
2840
2841 return unless $srcseq; # return undef if called with no arguments. This behavior is used for
2842 # on-the-fly aggregation.
2843
2844 my $group; # undefined
2845 if (defined $group_class && defined $group_name) {
2846 $tstart ||= '';
2847 $tstop ||= '';
2848 if ($group_hash) {
2849 $group = $group_hash->{$group_class,$group_name,$tstart,$tstop}
2850 ||= $self->make_object($group_class,$group_name,$tstart,$tstop);
2851 } else {
2852 $group = $self->make_object($group_class,$group_name,$tstart,$tstop);
2853 }
2854 }
2855
2856 # fix for some broken GFF files
2857 # unfortunately - has undesired side effects
2858 # if (defined $tstart && defined $tstop && !defined $strand) {
2859 # $strand = $tstart <= $tstop ? '+' : '-';
2860 # }
2861
2862 if (ref $parent) { # note that the src sequence is ignored
2863 return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop,
2864 $method,$source,
2865 $score,$strand,$phase,
2866 $group,$db_id,$group_id,
2867 $tstart,$tstop);
2868 } else {
2869 return Bio::DB::GFF::Feature->new($self,$srcseq,
2870 $start,$stop,
2871 $method,$source,
2872 $score,$strand,$phase,
2873 $group,$db_id,$group_id,
2874 $tstart,$tstop);
2875 }
2876 }
2877
2878 sub make_aggregated_feature {
2879 my $self = shift;
2880 my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3);
2881 my $feature = $self->make_feature($parent,undef,@_);
2882 return [$feature] if $feature && !$feature->group;
2883
2884 # if we have accumulated features and either:
2885 # (1) make_feature() returned undef, indicating the very end, or
2886 # (2) the current group is different from the previous one
2887
2888 local $^W = 0; # irritating uninitialized value warning in next statement
2889 if (@$accumulated_features &&
2890 (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) {
2891 foreach my $a (@$aggregators) { # last aggregator gets first shot
2892 $a->aggregate($accumulated_features,$self) or next;
2893 }
2894 my @result = @$accumulated_features;
2895 @$accumulated_features = $feature ? ($feature) : ();
2896 return unless @result;
2897 return \@result ;
2898 }
2899 push @$accumulated_features,$feature;
2900 return;
2901 }
2902
2903 =head2 parse_types
2904
2905 Title : parse_types
2906 Usage : $db->parse_types(@args)
2907 Function: parses list of types
2908 Returns : an array ref containing ['method','source'] pairs
2909 Args : a list of types in 'method:source' form
2910 Status : internal
2911
2912 This method takes an array of type names in the format "method:source"
2913 and returns an array reference of ['method','source'] pairs. It will
2914 also accept a single argument consisting of an array reference with
2915 the list of type names.
2916
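For example (a sketch of the expected mapping):

  my $pairs = $db->parse_types('exon:curated','intron');
  # $pairs now holds [ ['exon','curated'], ['intron'] ]
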
2917 =cut
2918
2919 # turn feature types in the format "method:source" into a list of [method,source] refs
2920 sub parse_types {
2921 my $self = shift;
2922 return [] if !@_ or !defined($_[0]);
2923 return $_[0] if ref $_[0] eq 'ARRAY' && ref $_[0][0];
2924 my @types = ref($_[0]) ? @{$_[0]} : @_;
2925 my @type_list = map { [split(':',$_,2)] } @types;
2926 return \@type_list;
2927 }
2928
2929 =head2 make_match_sub
2930
2931 Title : make_match_sub
2932 Usage : $db->make_match_sub($types)
2933 Function: creates a subroutine used for filtering features
2934 Returns : a code reference
2935 Args : a list of parsed type names
2936 Status : protected
2937
2938 This method is used internally to generate a code subroutine that will
2939 accept or reject a feature based on its method and source. It takes
2940 an array of parsed type names in the format returned by parse_types(),
2941 and generates an anonymous subroutine. The subroutine takes a single
2942 Bio::DB::GFF::Feature object and returns true if the feature matches
2943 one of the desired feature types, and false otherwise.
2944
2945 =cut
2946
2947 # a subroutine that matches features indicated by list of types
2948 sub make_match_sub {
2949 my $self = shift;
2950 my $types = shift;
2951
2952 return sub { 1 } unless ref $types && @$types;
2953
2954 my @expr;
2955 for my $type (@$types) {
2956 my ($method,$source) = @$type;
2957 $method ||= '.*';
2958 $source = $source ? ":$source" : "(?::.+)?";
2959 push @expr,"${method}${source}";
2960 }
2961 my $expr = join '|',@expr;
2962 return $self->{match_subs}{$expr} if $self->{match_subs}{$expr};
2963
2964 my $sub =<<END;
2965 sub {
2966 my \$feature = shift or return;
2967 return \$feature->type =~ /^($expr)\$/i;
2968 }
2969 END
2970 warn "match sub: $sub\n" if $self->debug;
2971 my $compiled_sub = eval $sub;
2972 $self->throw($@) if $@;
2973 return $self->{match_subs}{$expr} = $compiled_sub;
2974 }
2975
2976 =head2 make_object
2977
2978 Title : make_object
2979 Usage : $db->make_object($class,$name,$start,$stop)
2980 Function: creates a feature object
2981 Returns : a feature object
2982 Args : see below
2983 Status : protected
2984
2985 This method is called to make an object from the GFF "group" field.
2986 By default, all Target groups are turned into Bio::DB::GFF::Homol
2987 objects, and everything else becomes a Bio::DB::GFF::Featname.
2988 However, adaptors are free to override this method to generate more
2989 interesting objects, such as true BioPerl objects, or Acedb objects.
2990
2991 Arguments are:
2992
2993 $name database ID for object
2994 $class class of object
2995 $start for similarities, start of match inside object
2996 $stop for similarities, stop of match inside object
2997
2998 =cut
2999
3000 # abstract call to turn a feature into an object, given its class and name
3001 sub make_object {
3002 my $self = shift;
3003 my ($class,$name,$start,$stop) = @_;
3004 return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop)
3005 if defined $start and length $start;
3006 return Bio::DB::GFF::Featname->new($class,$name);
3007 }
3008
3009
3010 =head2 do_attributes
3011
3012 Title : do_attributes
3013 Usage : $db->do_attributes($id [,$tag]);
3014 Function: internal method to retrieve attributes given an id and tag
3015 Returns : a list of attribute values, or of (tag=E<gt>value) pairs
3016 Args : a feature ID and an attribute tag (optional)
3017 Status : protected
3018
3019 This method is overridden by subclasses in order to return a list of
3020 attributes. If called with a tag, returns the value of attributes of
3021 that tag type. If called without a tag, returns a flattened array of
3022 (tag=E<gt>value) pairs. A particular tag can be present multiple times.
3023
3024 =cut
3025
3026 sub do_attributes {
3027 my $self = shift;
3028 my ($id,$tag) = @_;
3029 return ();
3030 }
3031
3032
3033
3034 =head1 Internal Methods
3035
3036 The following methods are internal to Bio::DB::GFF and are not
3037 guaranteed to remain the same.
3038
3039 =head2 _features
3040
3041 Title : _features
3042 Usage : $db->_features($search,$options,$parent)
3043 Function: internal method
3044 Returns : a list of Bio::DB::GFF::Feature objects
3045 Args : see below
3046 Status : internal
3047
3048 This is an internal method that is called by overlapping_features(),
3049 contained_features() and features() to create features based on a
3050 parent segment's coordinate system. It takes three arguments: a
3051 search hashref, an options hashref, and a parent segment.
3052
3053 The search hashref contains the following keys:
3054
3055 rangetype One of "overlaps", "contains" or "contained_in". Indicates
3056 the type of range query requested.
3057 refseq reference sequence ID
3058 refclass reference sequence class
3059 start start of range
3060 stop stop of range
3061 types arrayref containing list of types in "method:source" form
3062
3063 The options hashref contains zero or more of the following keys:
3064
3065 sparse turn on optimizations for a rare feature
3066 automerge if true, invoke aggregators to merge features
3067 iterator if true, return an iterator
3068
3069 The $parent argument is a Bio::DB::GFF::RelSegment object or a
3070 descendant thereof.
3071
3072 =cut
3073
3074 #'
3075
3076 sub _features {
3077 my $self = shift;
3078 my ($search,$options,$parent) = @_;
3079 (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)})
3080 if defined($search->{start}) && $search->{start} > $search->{stop};
3081
3082 my $types = $self->parse_types($search->{types}); # parse out list of types
3083 my @aggregated_types = @$types; # keep a copy
3084
3085 # allow the aggregators to operate on the original
3086 my @aggregators;
3087 if ($options->{automerge}) {
3088 for my $a ($self->aggregators) {
3089 $a = $a->clone if $options->{iterator};
3090 unshift @aggregators,$a
3091 if $a->disaggregate(\@aggregated_types,$self);
3092 }
3093 }
3094
3095 if ($options->{iterator}) {
3096 my @accumulated_features;
3097 my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) }
3098 : sub { [$self->make_feature($parent,undef,@_)] };
3099 return $self->get_features_iterator({ %$search,
3100 types => \@aggregated_types },
3101 { %$options,
3102 sort_by_group => $options->{automerge} },
3103 $callback
3104 );
3105 }
3106
3107 my %groups; # cache the groups we create to avoid consuming too much unnecessary memory
3108 my $features = [];
3109
3110 my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) };
3111 $self->get_features({ %$search,
3112 types => \@aggregated_types },
3113 $options,
3114 $callback);
3115
3116 if ($options->{automerge}) {
3117 warn "aggregating...\n" if $self->debug;
3118 foreach my $a (@aggregators) { # last aggregator gets first shot
3119 warn "Aggregator $a:\n" if $self->debug;
3120 $a->aggregate($features,$self);
3121 }
3122 }
3123
3124 @$features;
3125 }
3126
3127 =head2 get_features_iterator
3128
3129 Title : get_features_iterator
3130 Usage : $db->get_features_iterator($search,$options,$callback)
3131 Function: get an iterator on a features query
3132 Returns : a Bio::SeqIO object
3133 Args : as per get_features()
3134 Status : Public
3135
3136 This method takes the same arguments as get_features(), but returns an
3137 iterator that can be used to fetch features sequentially, as per
3138 Bio::SeqIO.
3139
3140 Internally, this method is simply a front end to range_query().
3141 The latter method constructs and executes the query, returning a
3142 statement handle. This routine passes the statement handle to the
3143 constructor for the iterator, along with the callback.
3144
3145 =cut
3146
3147 sub get_features_iterator {
3148 my $self = shift;
3149 my ($search,$options,$callback) = @_;
3150 $self->throw('feature iteration is not implemented in this adaptor');
3151 }
3152
3153 =head2 split_group
3154
3155 Title : split_group
3156 Usage : $db->split_group($group_field,$gff3_flag)
3157 Function: parse GFF group field
3158 Returns : ($gclass,$gname,$tstart,$tstop,$attributes)
3159 Args : the gff group column and a flag indicating gff3 compatibility
3160 Status : internal
3161
3162 This is a method that is called by load_gff_line to parse out the
3163 contents of one or more group fields. It returns the class of the
3164 group, its name, the start and stop of the target, if any, and an
3165 array reference containing any attributes that were stuck into the
3166 group field, in [attribute_name,attribute_value] format.
3167
3168 =cut
3169
3170 sub split_group {
3171 my $self = shift;
3172 my ($group,$gff3) = @_;
3173 if ($gff3) {
3174 my @groups = split /[;&]/,$group; # so easy!
3175 return $self->_split_gff3_group(@groups);
3176 } else {
3177 # handle group parsing
3178 # protect embedded semicolons in the group; there must be a faster/more elegant way
3179 # to do this.
3180 $group =~ s/\\;/$;/g;
3181 while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 }
3182 my @groups = split(/\s*;\s*/,$group);
3183 foreach (@groups) { s/$;/;/g }
3184 return $self->_split_gff2_group(@groups);
3185 }
3186 }
3187
3188 =head2 _split_gff2_group
3189
3190 This is an internal method called by split_group().
3191
3192 =cut
3193
3194 sub _split_gff2_group {
3195 my $self = shift;
3196 my @groups = @_;
3197
3198 my ($gclass,$gname,$tstart,$tstop,@attributes);
3199
3200 for (@groups) {
3201
3202 my ($tag,$value) = /^(\S+)(?:\s+(.+))?/;
3203 $value ||= '';
3204 if ($value =~ /^\"(.+)\"$/) { #remove quotes
3205 $value = $1;
3206 }
3207 $value =~ s/\\t/\t/g;
3208 $value =~ s/\\r/\r/g;
3209
3210 # Any additional groups become part of the attributes hash
3211 # For historical reasons, the tag "Note" is treated as an
3212 # attribute, even if it is the only group.
3213 $tag ||= '';
3214 if ($tag eq 'Note' or ($gclass && $gname)) {
3215 push @attributes,[$tag => $value];
3216 }
3217
3218 # if the tag eq 'Target' then the class name is embedded in the ID
3219 # (the GFF format is obviously screwed up here)
3220 elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) {
3221 ($gclass,$gname) = ($1,$2);
3222 ($tstart,$tstop) = / (\d+) (\d+)/;
3223 }
3224
3225 elsif (!$value) {
3226 push @attributes,[Note => $tag]; # e.g. "Confirmed_by_EST"
3227 }
3228
3229 # otherwise, the tag and value correspond to the
3230 # group class and name
3231 else {
3232 ($gclass,$gname) = ($tag,$value);
3233 }
3234 }
3235
3236 return ($gclass,$gname,$tstart,$tstop,\@attributes);
3237 }
3238
3239 =head2 _split_gff3_group
3240
3241 This is called internally from split_group().
3242
3243 =cut
3244
3245 sub _split_gff3_group {
3246 my $self = shift;
3247 my @groups = @_;
3248 my ($gclass,$gname,$tstart,$tstop,@attributes);
3249
3250 for my $group (@groups) {
3251 my ($tag,$value) = split /=/,$group;
3252 $tag = unescape($tag);
3253 my @values = map {unescape($_)} split /,/,$value;
3254 if ($tag eq 'Parent') {
3255 $gclass = 'Sequence';
3256 $gname = shift @values;
3257 }
3258 elsif ($tag eq 'ID') {
3259 $gclass = 'Sequence';
3260 $gname = shift @values;
3261 }
3262 elsif ($tag eq 'Target') {
3263 $gclass = 'Sequence';
3264 ($gname,$tstart,$tstop) = split /\s+/,shift @values;
3265 }
3266 push @attributes,[$tag=>$_] foreach @values;
3267 }
3268 return ($gclass,$gname,$tstart,$tstop,\@attributes);
3269 }
3270
3271 =head2 _delete_features(), _delete_groups(), _delete()
3272
3273 Title : _delete_features(), _delete_groups(), _delete()
3274 Usage : $count = $db->_delete_features(@feature_ids)
3275 $count = $db->_delete_groups(@group_ids)
3276 $count = $db->_delete(\%delete_spec)
3277 Function: low-level feature/group deleter
3278 Returns : count of features or groups removed
3279 Args : a list of feature or group IDs, or a deletion spec hashref
3280 Status : for implementation by subclasses
3281
3282 These methods need to be implemented in adaptors. For
3283 _delete_features and _delete_groups, the arguments are a list of
3284 feature or group IDs to remove. For _delete(), the argument is a
3285 hashref with the three keys 'segments', 'types' and 'force'. The
3286 first contains an arrayref of Bio::DB::GFF::RelSegment objects to
3287 delete (all FEATURES within the segment are deleted). The second
3288 contains an arrayref of [method,source] feature types to delete. The
3289 two are ANDed together. If 'force' has a true value, this forces the
3290 operation to continue even if it would delete all features.
3291
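A sketch of the deletion specification an adaptor receives; the segment and
feature type shown are illustrative:

  my $count = $db->_delete({
      segments => [ $db->segment('CHROMOSOME_I', 1 => 100_000) ],
      types    => [ ['exon','curated'] ],   # [method,source] pairs
      force    => 0,
  });
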
3292 =cut
3293
3294 sub _delete_features {
3295 my $self = shift;
3296 my @feature_ids = @_;
3297 $self->throw('_delete_features is not implemented in this adaptor');
3298 }
3299
3300 sub _delete_groups {
3301 my $self = shift;
3302 my @group_ids = @_;
3303 $self->throw('_delete_groups is not implemented in this adaptor');
3304 }
3305
3306 sub _delete {
3307 my $self = shift;
3308 my $delete_options = shift;
3309 $self->throw('_delete is not implemented in this adaptor');
3310 }
3311
3312 sub unescape {
3313 my $v = shift;
3314 $v =~ tr/+/ /;
3315 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
3316 return $v;
3317 }
3318
3319
3320 package Bio::DB::GFF::ID_Iterator;
3321 use strict;
3322
3323 use Bio::Root::Root;
3324 use vars '@ISA';
3325 @ISA = 'Bio::Root::Root';
3326
3327 sub new {
3328 my $class = shift;
3329 my ($db,$ids,$type) = @_;
3330 return bless {ids=>$ids,db=>$db,type=>$type},$class;
3331 }
3332
3333 sub next_seq {
3334 my $self = shift;
3335 my $next = shift @{$self->{ids}};
3336 return unless $next;
3337 my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next;
3338 my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name)
3339 : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name)
3340 : $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name)
3341 : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier");
3342 $self->throw("id does not exist") unless $segment;
3343 return $segment;
3344 }
3345
3346 1;
3347
3348 __END__
3349
3350 =head1 BUGS
3351
3352 Features can only belong to a single group at a time. This must be
3353 addressed soon.
3354
3355 Start coordinate can be greater than stop coordinate for relative
3356 addressing. This breaks strict BioPerl compatibility and must be
3357 fixed.
3358
3359 =head1 SEE ALSO
3360
3361 L<bioperl>,
3362 L<Bio::DB::GFF::RelSegment>,
3363 L<Bio::DB::GFF::Aggregator>,
3364 L<Bio::DB::GFF::Feature>,
3365 L<Bio::DB::GFF::Adaptor::dbi::mysqlopt>,
3366 L<Bio::DB::GFF::Adaptor::dbi::oracle>,
3367 L<Bio::DB::GFF::Adaptor::memory>
3368
3369 =head1 AUTHOR
3370
3371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
3372
3373 Copyright (c) 2001 Cold Spring Harbor Laboratory.
3374
3375 This library is free software; you can redistribute it and/or modify
3376 it under the same terms as Perl itself.
3377
3378 =cut
3379