Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/DB/GFF.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/GFF.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,3379 @@ +# $Id: GFF.pm,v 1.71.2.2 2003/09/12 13:29:32 lstein Exp $ + +=head1 NAME + +Bio::DB::GFF -- Storage and retrieval of sequence annotation data + +=head1 SYNOPSIS + + use Bio::DB::GFF; + + # Open the sequence database + my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt', + -dsn => 'dbi:mysql:elegans', + -fasta => '/usr/local/fasta_files' + ); + + # fetch a 1 megabase segment of sequence starting at landmark "ZK909" + my $segment = $db->segment('ZK909', 1 => 1000000); + + # pull out all transcript features + my @transcripts = $segment->features('transcript'); + + # for each transcript, total the length of the introns + my %totals; + for my $t (@transcripts) { + my @introns = $t->Intron; + $totals{$t->name} += $_->length foreach @introns; + } + + # Sort the exons of the first transcript by position + my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon; + + # Get a region 1000 bp upstream of first exon + my $upstream = $exons[0]->segment(-1000,0); + + # get its DNA + my $dna = $upstream->dna; + + # and get all curated polymorphisms inside it + @polymorphisms = $upstream->contained_features('polymorphism:curated'); + + # get all feature types in the database + my @types = $db->types; + + # count all feature types in the segment + my %type_counts = $segment->types(-enumerate=>1); + + # get an iterator on all curated features of type 'exon' or 'intron' + my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']); + + while (my $s = $iterator->next_seq) { + print $s,"\n"; + } + + # find all transcripts annotated as having function 'kinase' + my $iterator = $db->get_seq_stream(-type=>'transcript', + -attributes=>{Function=>'kinase'}); + while (my $s = $iterator->next_seq) { + print $s,"\n"; + } + +=head1 DESCRIPTION + +Bio::DB::GFF provides fast indexed access to a sequence annotation +database. 
It supports multiple database types (ACeDB, relational), +and multiple schemas through a system of adaptors and aggregators. + +The following operations are supported by this module: + + - retrieving a segment of sequence based on the ID of a landmark + - retrieving the DNA from that segment + - finding all annotations that overlap with the segment + - finding all annotations that are completely contained within the + segment + - retrieving all annotations of a particular type, either within a + segment, or globally + - conversion from absolute to relative coordinates and back again, + using any arbitrary landmark for the relative coordinates + - using a sequence segment to create new segments based on relative + offsets + +The data model used by Bio::DB::GFF is compatible with the GFF flat +file format (http://www.sanger.ac.uk/software/GFF). The module can +load a set of GFF files into the database, and serves objects that +have methods corresponding to GFF fields. + +The objects returned by Bio::DB::GFF are compatible with the +SeqFeatureI interface, allowing their use by the Bio::Graphics and +Bio::DAS modules. + +=head2 Auxiliary Scripts + +The bioperl distribution includes several scripts that make it easier +to work with Bio::DB::GFF databases. They are located in the scripts +directory under a subdirectory named Bio::DB::GFF: + +=over 4 + +=item bp_load_gff.pl + +This script will load a Bio::DB::GFF database from a flat GFF file of +sequence annotations. Only the relational database version of +Bio::DB::GFF is supported. It can be used to create the database from +scratch, as well as to incrementally load new data. + +This script takes a --fasta argument to load raw DNA into the database +as well. However, GFF databases do not require access to the raw DNA +for most of their functionality. + +load_gff.pl also has a --upgrade option, which will perform a +non-destructive upgrade of older schemas to newer ones. 
+ +=item bp_bulk_load_gff.pl + +This script will populate a Bio::DB::GFF database from a flat GFF file +of sequence annotations. Only the MySQL database version of +Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in +order to accelerate loading considerably; however, it can only be used +for the initial load, and not for updates. + +This script takes a --fasta argument to load raw DNA into the database +as well. However, GFF databases do not require access to the raw DNA +for most of their functionality. + +=item bp_fast_load_gff.pl + +This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe +tricks to allow for incremental updates. It only supports the MySQL +database version of Bio::DB::GFF and is guaranteed not to work on +non-Unix platforms. + +Arguments are the same as bp_load_gff.pl + +=item gadfly_to_gff.pl + +This script will convert the GFF-like format used by the Berkeley +Drosophila Sequencing project into a format suitable for use with this +module. + +=item sgd_to_gff.pl + +This script will convert the tab-delimited feature files used by the +Saccharomyces Genome Database into a format suitable for use with this +module. + +=back + +=head2 GFF Fundamentals + +The GFF format is a flat tab-delimited file, each line of which +corresponds to an annotation, or feature. Each line has nine columns +and looks like this: + + Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7" + +The 9 columns are as follows: + +=over 4 + +=item 1. reference sequence + +This is the ID of the sequence that is used to establish the +coordinate system of the annotation. In the example above, the +reference sequence is "Chr1". + +=item 2. source + +The source of the annotation. This field describes how the annotation +was derived. In the example above, the source is "curated" to +indicate that the feature is the result of human curation. The names +and versions of software programs are often used for the source field, +as in "tRNAScan-SE/1.2". 
+ +=item 3. method + +The annotation method. This field describes the type of the +annotation, such as "CDS". Together the method and source describe +the annotation type. + +=item 4. start position + +The start of the annotation relative to the reference sequence. + +=item 5. stop position + +The stop of the annotation relative to the reference sequence. Start +is always less than or equal to stop. + +=item 6. score + +For annotations that are associated with a numeric score (for example, +a sequence similarity), this field describes the score. The score +units are completely unspecified, but for sequence similarities, it is +typically percent identity. Annotations that don't have a score can +use "." + +=item 7. strand + +For those annotations which are strand-specific, this field is the +strand on which the annotation resides. It is "+" for the forward +strand, "-" for the reverse strand, or "." for annotations that are +not stranded. + +=item 8. phase + +For annotations that are linked to proteins, this field describes the +phase of the annotation on the codons. It is a number from 0 to 2, or +"." for features that have no phase\. + +=item 9. group + +GFF provides a simple way of generating annotation hierarchies ("is +composed of" relationships) by providing a group field. The group +field contains the class and ID of an annotation which is the logical +parent of the current one. In the example given above, the group is +the Transcript named "R119.7". + +The group field is also used to store information about the target of +sequence similarity hits, and miscellaneous notes. See the next +section for a description of how to describe similarity targets. + +The format of the group fields is "Class ID" with a single space (not +a tab) separating the class from the ID. It is VERY IMPORTANT to +follow this format, or grouping will not work properly. 
+ +=back + +The sequences used to establish the coordinate system for annotations +can correspond to sequenced clones, clone fragments, contigs or +super-contigs. Thus, this module can be used throughout the lifecycle +of a sequencing project. + +In addition to a group ID, the GFF format allows annotations to have a +group class. For example, in the ACeDB representation, RNA +interference experiments have a class of "RNAi" and an ID that is +unique among the RNAi experiments. Since not all databases support +this notion, the class is optional in all calls to this module, and +defaults to "Sequence" when not provided. + +Double-quotes are sometimes used in GFF files around components of the +group field. Strictly, this is only necessary if the group name or +class contains whitespace. + +=head2 Making GFF files work with this module + +Some annotations do not need to be individually named. For example, +it is probably not useful to assign a unique name to each ALU repeat +in a vertebrate genome. Others, such as predicted genes, correspond +to named biological objects; you probably want to be able to fetch the +positions of these objects by referring to them by name. + +To accommodate named annotations, the GFF format places the object +class and name in the group field. The name identifies the object, +and the class prevents similarly-named objects, for example clones and +sequences, from colliding. + +A named object is shown in the following excerpt from a GFF file: + + Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2 + +This object is a predicted transcript named Y95B8A.2. In this case, +the group field is used to identify the class and name of the object, +even though no other annotation belongs to that group. 
+ +It now becomes possible to retrieve the region of the genome covered +by transcript Y95B8A.2 using the segment() method: + + $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2'); + +It is not necessary for the annotation's method to correspond to the +object class, although this is commonly the case. + +As explained above, each annotation in a GFF file refers to a +reference sequence. It is important that each reference sequence also +be identified by a line in the GFF file. This allows the Bio::DB::GFF +module to determine the length and class of the reference sequence, +and makes it possible to do relative arithmetic. + +For example, if "Chr1" is used as a reference sequence, then it should +have an entry in the GFF file similar to this one: + + Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1 + +This indicates that the reference sequence named "Chr1" has length +14972282 bp, method "chromosome" and source "assembly". In addition, +as indicated by the group field, Chr1 has class "Sequence" and name +"Chr1". + +The object class "Sequence" is used by default when the class is not +specified in the segment() call. This allows you to use a shortcut +form of the segment() method: + + $segment = $db->segment('Chr1'); # whole chromosome + $segment = $db->segment('Chr1',1=>1000); # first 1000 bp + +For your convenience, if, during loading a GFF file, Bio::DB::GFF +encounters a line like the following: + + ##sequence-region Chr1 1 14972282 + +It will automatically generate the following entry: + + Chr1 reference Component 1 14972282 . + . Sequence Chr1 + +This is sufficient to use Chr1 as a reference point. +The ##sequence-region line is frequently found in the GFF files +distributed by annotation groups. + +=head2 Sequence alignments + +There are two cases in which an annotation indicates the relationship +between two sequences. The first case is a similarity hit, where the +annotation indicates an alignment. 
The second case is a map assembly, +in which the annotation indicates that a portion of a larger sequence +is built up from one or more smaller ones. + +Both cases are indicated by using the B<Target> tag in the group +field. For example, a typical similarity hit will look like this: + + Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544 + +The group field contains the Target tag, followed by an identifier for +the biological object referred to. The GFF format uses the notation +I<Class>:I<Name> for the biological object, and even though this is +stylistically inconsistent, that's the way it's done. The object +identifier is followed by two integers indicating the start and stop +of the alignment on the target sequence. + +Unlike the main start and stop columns, it is possible for the target +start to be greater than the target end. The previous example +indicates that the section of Chr1 from 76,953 to 77,108 aligns to +the protein SW:ABL_DROME starting at position 493 and extending to +position 544. + +A similar notation is used for sequence assembly information as shown +in this example: + + Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826 + LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742 + +This indicates that the region between bases 10922906 and 11177731 of +Chr1 is composed of LINK_H06O01 from bp 1 to bp 254826. The region +of LINK_H06O01 between 32386 and 64122 is, in turn, composed of the +bases 6 to 31742 of cosmid F49B2. + +=head2 Attributes + +While not intended to serve as a general-purpose sequence database +(see bioperl-db for that), GFF allows you to tag features with +arbitrary attributes. Attributes appear in the Group field following +the initial class/name pair. For example: + + Chr1 cur trans 939 942 . + . 
Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3 + +This line tags the feature named Transcript Y95B8A.2 as being "Gene" +named sma-3 and having the Alias "sma3". Features having these +attributes can be looked up using the get_feature_by_attribute() method. + +Two attributes have special meaning: "Note" is for backward +compatibility and is used for unstructured text remarks. "Alias" is +considered as a synonym for the feature name and will be consulted +when looking up a feature by its name. + +=head2 Adaptors and Aggregators + +This module uses a system of adaptors and aggregators in order to make +it adaptable to use with a variety of databases. + +=over 4 + +=item Adaptors + +The core of the module handles the user API, annotation coordinate +arithmetic, and other common issues. The details of fetching +information from databases are handled by an adaptor, which is +specified during Bio::DB::GFF construction. The adaptor encapsulates +database-specific information such as the schema, user authentication +and access methods. + +Currently there are two adaptors: 'dbi::mysql' and 'dbi::mysqlopt'. +The former is an interface to a simple Mysql schema. The latter is an +optimized version of dbi::mysql which uses a binning scheme to +accelerate range queries and the Bio::DB::Fasta module for rapid +retrieval of sequences. Note the double-colon between the words. + +=item Aggregators + +The GFF format uses a "group" field to indicate aggregation properties +of individual features. For example, a set of exons and introns may +share a common transcript group, and multiple transcripts may share +the same gene group. + +Aggregators are small modules that use the group information to +rebuild the hierarchy. When a Bio::DB::GFF object is created, you +indicate that it should use a set of one or more aggregators. Each +aggregator provides a new composite annotation type. 
Before the +database query is generated each aggregator is called to +"disaggregate" its annotation type into list of component types +contained in the database. After the query is generated, each +aggregator is called again in order to build composite annotations +from the returned components. + +For example, during disaggregation, the standard +"processed_transcript" aggregator generates a list of component +feature types including "UTR", "CDS", and "polyA_site". Later, it +aggregates these features into a set of annotations of type +"processed_transcript". + +During aggregation, the list of aggregators is called in reverse +order. This allows aggregators to collaborate to create multi-level +structures: the transcript aggregator assembles transcripts from +introns and exons; the gene aggregator then assembles genes from sets +of transcripts. + +Three default aggregators are provided: + + transcript assembles transcripts from features of type + exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA + clone assembles clones from Clone_left_end, Clone_right_end + and Sequence features. + alignment assembles gapped alignments from features of type + "similarity". + +In addition, this module provides the optional "wormbase_gene" +aggregator, which accomodates the WormBase representation of genes. +This aggregator aggregates features of method "exon", "CDS", "5'UTR", +"3'UTR", "polyA" and "TSS" into a single object. It also expects to +find a single feature of type "Sequence" that spans the entire gene. + +The existing aggregators are easily customized. + +Note that aggregation will not occur unless you specifically request +the aggregation type. For example, this call: + + @features = $segment->features('alignment'); + +will generate an array of aggregated alignment features. However, +this call: + + @features = $segment->features(); + +will return a list of unaggregated similarity segments. 
+ +For more informnation, see the manual pages for +Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone, +etc. + +=back + +=head1 API + +The following is the API for Bio::DB::GFF. + +=cut + +package Bio::DB::GFF; + +use strict; + +use Bio::DB::GFF::Util::Rearrange; +use Bio::DB::GFF::RelSegment; +use Bio::DB::GFF::Feature; +use Bio::DB::GFF::Aggregator; +use Bio::DasI; +use Bio::Root::Root; + +use vars qw(@ISA $VERSION); +@ISA = qw(Bio::Root::Root Bio::DasI); + +$VERSION = '1.2003'; +my %valid_range_types = (overlaps => 1, + contains => 1, + contained_in => 1); + +=head1 Querying GFF Databases + +=head2 new + + Title : new + Usage : my $db = new Bio::DB::GFF(@args); + Function: create a new Bio::DB::GFF object + Returns : new Bio::DB::GFF object + Args : lists of adaptors and aggregators + Status : Public + +These are the arguments: + + -adaptor Name of the adaptor module to use. If none + provided, defaults to "dbi::mysqlopt". + + -aggregator Array reference to a list of aggregators + to apply to the database. If none provided, + defaults to ['processed_transcript','alignment']. + + <other> Any other named argument pairs are passed to + the adaptor for processing. + +The adaptor argument must correspond to a module contained within the +Bio::DB::GFF::Adaptor namespace. For example, the +Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying +'dbi::mysql'. By Perl convention, the adaptors names are lower case +because they are loaded at run time. + +The aggregator array may contain a list of aggregator names, a list of +initialized aggregator objects, or a string in the form +"aggregator_name{subpart1,subpart2,subpart3/main_method}" (the +/main_method part is optional). 
For example, if you wish to change +the components aggregated by the transcript aggregator, you could pass +it to the GFF constructor this way: + + my $transcript = + Bio::DB::GFF::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr + polyA spliced_leader)]); + + my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment'], + -adaptor => 'dbi::mysql', + -dsn => 'dbi:mysql:elegans42'); + +Alternatively, you could create an entirely new transcript aggregator +this way: + + my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}'; + my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment'], + -adaptor => 'dbi::mysql', + -dsn => 'dbi:mysql:elegans42'); + +See L<Bio::DB::GFF::Aggregator> for more details. + +The commonly used 'dbi::mysql' adaptor recognizes the following +adaptor-specific arguments: + + Argument Description + -------- ----------- + + -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' + If a partial name is given, such as "ens0040", the + "dbi:mysql:" prefix will be added automatically. + + -user username for authentication + + -pass the password for authentication + + -refclass landmark Class; defaults to "Sequence" + +The commonly used 'dbi::mysqlopt' adaptor also recognizes the following +arguments. + + Argument Description + -------- ----------- + + -fasta path to a directory containing FASTA files for the DNA + contained in this database (e.g. "/usr/local/share/fasta") + + -acedb an acedb URL to use when converting features into ACEDB + objects (e.g. 
=pod

sace://localhost:2005)

=cut

#'

sub new {
  my $package = shift;
  my ($adaptor,$aggregators,$args,$refclass);

  if (@_ == 1) { # special case, default to dbi::mysqlopt
    $adaptor = 'dbi::mysqlopt';
    $args    = {DSN => shift};
  } else {
    ($adaptor,$aggregators,$refclass,$args) = rearrange([
        [qw(ADAPTOR FACTORY)],
        [qw(AGGREGATOR AGGREGATORS)],
        'REFCLASS',
      ],@_);
  }

  $adaptor ||= 'dbi::mysqlopt';
  my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E";

  # Load the adaptor class on demand.  The error check is scoped to our
  # own eval: previously $@ was tested even when the eval was skipped
  # because the class was already loaded, so a stale $@ left behind by
  # earlier, unrelated code could raise a spurious "Unable to load"
  # exception.
  unless ($class->can('new')) {
    eval "require $class";
    $package->throw("Unable to load $adaptor adaptor: $@") if $@;
  }

  my $self = $class->new($args);
  $self->default_class($refclass) if defined $refclass;

  # handle the aggregators.
  # aggregators are responsible for creating complex multi-part features
  # from the GFF "group" field.  If none are provided, then we provide a
  # list of the two used in WormBase.
  # Each aggregator can be a scalar or a ref.  In the former case
  # it is treated as a class name to call new() on.  In the latter
  # the aggregator is treated as a ready-made object.
  $aggregators = $self->default_aggregators unless defined $aggregators;
  my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators;
  for my $a (@a) {
    $self->add_aggregator($a);
  }

  # default settings go here.....
  $self->automerge(1);  # set automerge to true

  $self;
}


=head2 types

 Title   : types
 Usage   : $db->types(@args)
 Function: return list of feature types in range or database
 Returns : a list of Bio::DB::GFF::Typename objects
 Args    : see below
 Status  : public

This routine returns a list of feature types known to the database.
The list can be database-wide or restricted to a region.  It is also
possible to find out how many times each feature occurs.

For range queries, it is usually more convenient to create a
Bio::DB::GFF::Segment object, and then invoke its types() method.
=pod

Arguments are as follows:

  -ref        ID of reference sequence
  -class      class of reference sequence
  -start      start of segment
  -stop       stop of segment
  -enumerate  if true, count the features

The returned value will be a list of Bio::DB::GFF::Typename objects,
which if evaluated in a string context will return the feature type in
"method:source" format.  This object class also has method() and
source() methods for retrieving the like-named fields.

If -enumerate is true, then the function returns a hash (not a hash
reference) in which the keys are type names in "method:source" format
and the values are the number of times each feature appears in the
database or segment.

The argument -end is a synonym for -stop, and -count is a synonym for
-enumerate.

=cut

# Unpack the caller's named arguments, canonicalize any type strings,
# and delegate the real work to get_types().
sub types {
  my $self = shift;
  my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange([
      [qw(REF REFSEQ)],
      qw(START),
      [qw(STOP END)],
      [qw(ENUMERATE COUNT)],
      [qw(CLASS SEQCLASS)],
      [qw(TYPE TYPES)],
    ],@_);
  # Turn "method:source" strings into typename objects before querying.
  $types = $self->parse_types($types) if defined $types;
  return $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types);
}

=head2 classes

 Title   : classes
 Usage   : $db->classes
 Function: return list of landmark classes in database
 Returns : a list of classes
 Args    : none
 Status  : public

This routine returns the list of reference classes known to the
database, or empty if classes are not used by the database.  Classes
are distinct from types, being essentially qualifiers on the reference
namespaces.

=cut

# This base implementation reports no landmark classes; presumably
# adaptors that track classes override it -- TODO confirm.
sub classes {
  my $self = shift;
  return ();
}

=head2 segment

 Title   : segment
 Usage   : $db->segment(@args);
 Function: create a segment object
 Returns : segment object(s)
 Args    : numerous, see below
 Status  : public

This method generates a segment object, which is a Perl object
subclassed from Bio::DB::GFF::Segment.  The segment can be used to
find overlapping features and the raw DNA.
+ +When making the segment() call, you specify the ID of a sequence +landmark (e.g. an accession number, a clone or contig), and a +positional range relative to the landmark. If no range is specified, +then the entire extent of the landmark is used to generate the +segment. + +You may also provide the ID of a "reference" sequence, which will set +the coordinate system and orientation used for all features contained +within the segment. The reference sequence can be changed later. If +no reference sequence is provided, then the coordinate system is based +on the landmark. + +Arguments: + + -name ID of the landmark sequence. + + -class Database object class for the landmark sequence. + "Sequence" assumed if not specified. This is + irrelevant for databases which do not recognize + object classes. + + -start Start of the segment relative to landmark. Positions + follow standard 1-based sequence rules. If not specified, + defaults to the beginning of the landmark. + + -end Stop of the segment relative to the landmark. If not specified, + defaults to the end of the landmark. + + -stop Same as -end. + + -offset For those who prefer 0-based indexing, the offset specifies the + position of the new segment relative to the start of the landmark. + + -length For those who prefer 0-based indexing, the length specifies the + length of the new segment. + + -refseq Specifies the ID of the reference landmark used to establish the + coordinate system for the newly-created segment. + + -refclass Specifies the class of the reference landmark, for those databases + that distinguish different object classes. Defaults to "Sequence". + + -absolute + Return features in absolute coordinates rather than relative to the + parent segment. + + -nocheck Don't check the database for the coordinates and length of this + feature. Construct a segment using the indicated name as the + reference, a start coordinate of 1, an undefined end coordinate, + and a strand of +1. + + -force Same as -nocheck. 
+ + -seq,-sequence,-sourceseq Aliases for -name. + + -begin,-end Aliases for -start and -stop + + -off,-len Aliases for -offset and -length + + -seqclass Alias for -class + +Here's an example to explain how this works: + + my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql'); + +If successful, $db will now hold the database accessor object. We now +try to fetch the fragment of sequence whose ID is A0000182 and class +is "Accession." + + my $segment = $db->segment(-name=>'A0000182',-class=>'Accession'); + +If successful, $segment now holds the entire segment corresponding to +this accession number. By default, the sequence is used as its own +reference sequence, so its first base will be 1 and its last base will +be the length of the accession. + +Assuming that this sequence belongs to a longer stretch of DNA, say a +contig, we can fetch this information like so: + + my $sourceseq = $segment->sourceseq; + +and find the start and stop on the source like this: + + my $start = $segment->abs_start; + my $stop = $segment->abs_stop; + +If we had another segment, say $s2, which is on the same contiguous +piece of DNA, we can pass that to the refseq() method in order to +establish it as the coordinate reference point: + + $segment->refseq($s2); + +Now calling start() will return the start of the segment relative to +the beginning of $s2, accounting for differences in strandedness: + + my $rel_start = $segment->start; + +IMPORTANT NOTE: This method can be used to return the segment spanned +by an arbitrary named annotation. However, if the annotation appears +at multiple locations on the genome, for example an EST that maps to +multiple locations, then, provided that all locations reside on the +same physical segment, the method will return a segment that spans the +minimum and maximum positions. 
=pod

If the reference sequence occupies
ranges on different physical segments, then it returns them all in an
array context, and raises a "multiple segment exception" exception in
a scalar context.

=cut

#'

# Build one or more relative segments from the caller's arguments and
# hand them back through _multiple_return_args().
sub segment {
  my $self = shift;
  my @found = Bio::DB::GFF::RelSegment->new(-factory => $self,
                                            $self->setup_segment_args(@_));
  if ($self->absolute) {      # hoisted invariant: same test for every segment
    $_->absolute(1) foreach @found;
  }
  return $self->_multiple_return_args(@found);
}

# Return zero results as nothing, one result directly, several results
# as a list -- and refuse to guess when several results are requested
# in scalar context.
sub _multiple_return_args {
  my $self = shift;
  my @results = @_;
  return             if @results == 0;
  return $results[0] if @results == 1;
  return @results    if wantarray;     # more than one reference sequence
  $self->error($results[0]->name,
               " has more than one reference sequence in database. Please call in a list context to retrieve them all.");
  $self->throw('multiple segment exception');
  return;
}

# backward compatibility -- don't use!
# (deliberately undocumented too)
sub abs_segment {
  my $self = shift;
  return $self->segment($self->setup_segment_args(@_),-absolute=>1);
}

# Normalize the flexible segment() calling conventions into named
# (-name/-class/-start/-stop) argument pairs.
sub setup_segment_args {
  my $self = shift;
  # Already in named-argument form?  Pass through untouched.
  return @_ if defined $_[0] && $_[0] =~ /^-/;
  my $argc = scalar @_;
  return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if $argc == 3;
  return (-class=>$_[0],-name=>$_[1])              if $argc == 2;
  return (-name=>$_[0])                            if $argc == 1;
  return;
}

=head2 features

 Title   : features
 Usage   : $db->features(@args)
 Function: get all features, possibly filtered by type
 Returns : a list of Bio::DB::GFF::Feature objects
 Args    : see below
 Status  : public

This routine will retrieve features in the database regardless of
position.  It can be used to return all features, or a subset based on
their method and source.

Arguments are as follows:

  -types     List of feature types to return.  Argument is an array
             reference containing strings of the format "method:source"

  -merge     Whether to apply aggregators to the generated features.
+ + -rare Turn on optimizations suitable for a relatively rare feature type, + where it makes more sense to filter by feature type first, + and then by position. + + -attributes A hash reference containing attributes to match. + + -iterator Whether to return an iterator across the features. + + -binsize A true value will create a set of artificial features whose + start and stop positions indicate bins of the given size, and + whose scores are the number of features in the bin. The + class and method of the feature will be set to "bin", + its source to "method:source", and its group to "bin:method:source". + This is a handy way of generating histograms of feature density. + +If -iterator is true, then the method returns a single scalar value +consisting of a Bio::SeqIO object. You can call next_seq() repeatedly +on this object to fetch each of the features in turn. If iterator is +false or absent, then all the features are returned as a list. + +Currently aggregation is disabled when iterating over a series of +features. + +Types are indicated using the nomenclature "method:source". Either of +these fields can be omitted, in which case a wildcard is used for the +missing field. Type names without the colon (e.g. "exon") are +interpreted as the method name and a source wild card. Regular +expressions are allowed in either field, as in: "similarity:BLAST.*". + +The -attributes argument is a hashref containing one or more attributes +to match against: + + -attributes => { Gene => 'abc-1', + Note => 'confirmed' } + +Attribute matching is simple string matching, and multiple attributes +are ANDed together. 
=pod

=cut

# Retrieve features database-wide, optionally filtered by type and
# post-processed by the aggregators.
sub features {
  my $self = shift;
  my ($types,$automerge,$sparse,$iterator,$other);
  if (defined $_[0] && $_[0] =~ /^-/) {
    # named-argument style; extras beyond these are collected in $other
    ($types,$automerge,$sparse,$iterator,$other) = rearrange([
        [qw(TYPE TYPES)],
        [qw(MERGE AUTOMERGE)],
        [qw(RARE SPARSE)],
        'ITERATOR'
      ],@_);
  } else {
    # bare list of "method:source" type strings
    $types = \@_;
  }

  # for whole database retrievals, we probably don't want to automerge!
  $automerge = $self->automerge unless defined $automerge;
  $other   ||= {};
  return $self->_features({ rangetype => 'contains',
                            types     => $types,
                          },
                          { sparse    => $sparse,
                            automerge => $automerge,
                            iterator  => $iterator,
                            %$other,
                          });
}

=head2 get_seq_stream

 Title   : get_seq_stream
 Usage   : my $seqio = $self->get_seq_stream(@args)
 Function: Performs a query and returns an iterator over it
 Returns : a Bio::SeqIO stream capable of producing sequence
 Args    : As in features()
 Status  : public

This routine takes the same arguments as features(), but returns a
Bio::SeqIO::Stream-compliant object.  Use it like this:

  $stream = $db->get_seq_stream('exon');
  while (my $exon = $stream->next_seq) {
     print $exon,"\n";
  }

NOTE: This is also called get_feature_stream(), since that's what it
really does.

=cut

# Thin wrapper around features() that forces -iterator mode.
sub get_seq_stream {
  my $self = shift;
  my @args = !defined($_[0]) || $_[0] =~ /^-/
    ? (@_,          -iterator=>1)
    : (-types=>\@_, -iterator=>1);
  return $self->features(@args);
}

*get_feature_stream = \&get_seq_stream;

=head2 get_feature_by_name

 Title   : get_feature_by_name
 Usage   : $db->get_feature_by_name($class => $name)
 Function: fetch features by their name
 Returns : a list of Bio::DB::GFF::Feature objects
 Args    : the class and name of the desired feature
 Status  : public

This method can be used to fetch a named feature from the database.
+GFF annotations are named using the group class and name fields, so
+for features that belong to a group of size one, this method can be
+used to retrieve that group (and is equivalent to the segment()
+method).  Any Alias attributes are also searched for matching names.
+
+An alternative syntax allows you to search for features by name within
+a circumscribed region:
+
+  @f = $db->get_feature_by_name(-class => $class,-name=>$name,
+                                -ref => $sequence_name,
+                                -start => $start,
+                                -end => $end);
+
+This method may return zero, one, or several Bio::DB::GFF::Feature
+objects.
+
+Aggregation is performed on features as usual.
+
+NOTE: At various times, this function was called fetch_group(),
+fetch_feature(), fetch_feature_by_name() and segments().  These names
+are preserved for backward compatibility.
+
+=cut
+
+sub get_feature_by_name {
+  my $self = shift;
+  my ($gclass,$gname,$automerge,$ref,$start,$end);
+  if (@_ == 1) {
+    $gclass = $self->default_class;
+    $gname  = shift;
+  } else  {
+    ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE',
+							     ['REF','REFSEQ'],
+							     'START',['STOP','END']
+							     ],@_);
+    $gclass ||= $self->default_class;
+  }
+  $automerge = $self->automerge unless defined $automerge;
+
+  # we need to refactor this... It's repeated code (see below)...
+  my @aggregators;
+  if ($automerge) {
+    for my $a ($self->aggregators) {
+      push @aggregators,$a if $a->disaggregate([],$self);
+    }
+  }
+
+  my %groups;         # cache the groups we create to avoid consuming too much unnecessary memory
+  my $features = [];
+  my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
+  # NOTE: "my $x = ... if COND" has undefined behavior in Perl (see perlsyn);
+  # declare unconditionally and use a ternary instead.
+  my $location = defined $ref ? [$ref,$start,$end] : undef;
+  $self->_feature_by_name($gclass,$gname,$location,$callback);
+
+  warn "aggregating...\n" if $self->debug;
+  foreach my $a (@aggregators) {  # last aggregator gets first shot
+    $a->aggregate($features,$self) or next;
+  }
+
+  @$features;
+}
+
+# horrible indecision regarding proper names!
+*fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name; +*segments = \&segment; + +=head2 get_feature_by_target + + Title : get_feature_by_target + Usage : $db->get_feature_by_target($class => $name) + Function: fetch features by their similarity target + Returns : a list of Bio::DB::GFF::Feature objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch a named feature from the database +based on its similarity hit. + +=cut + +sub get_feature_by_target { + shift->get_feature_by_name(@_); +} + +=head2 get_feature_by_attribute + + Title : get_feature_by_attribute + Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2) + Function: fetch segments by combinations of attribute values + Returns : a list of Bio::DB::GFF::Feature objects + Args : the class and name of the desired feature + Status : public + +This method can be used to fetch a set of features from the database. +Attributes are a list of name=E<gt>value pairs. They will be logically +ANDED together. + +=cut + +sub get_feature_by_attribute { + my $self = shift; + my %attributes = ref($_[0]) ? %{$_[0]} : @_; + + # we need to refactor this... It's repeated code (see above)... + my @aggregators; + if ($self->automerge) { + for my $a ($self->aggregators) { + unshift @aggregators,$a if $a->disaggregate([],$self); + } + } + + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; + $self->_feature_by_attribute(\%attributes,$callback); + + warn "aggregating...\n" if $self->debug; + foreach my $a (@aggregators) { # last aggregator gets first shot + $a->aggregate($features,$self) or next; + } + + @$features; +} + +# more indecision... 
+*fetch_feature_by_attribute = \&get_feature_by_attribute; + +=head2 get_feature_by_id + + Title : get_feature_by_id + Usage : $db->get_feature_by_id($id) + Function: fetch segments by feature ID + Returns : a Bio::DB::GFF::Feature object + Args : the feature ID + Status : public + +This method can be used to fetch a feature from the database using its +ID. Not all GFF databases support IDs, so be careful with this. + +=cut + +sub get_feature_by_id { + my $self = shift; + my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; + $self->_feature_by_id($id,'feature',$callback); + return wantarray ? @$features : $features->[0]; +} +*fetch_feature_by_id = \&get_feature_by_id; + +=head2 get_feature_by_gid + + Title : get_feature_by_gid + Usage : $db->get_feature_by_gid($id) + Function: fetch segments by feature ID + Returns : a Bio::DB::GFF::Feature object + Args : the feature ID + Status : public + +This method can be used to fetch a feature from the database using its +group ID. Not all GFF databases support IDs, so be careful with this. + +The group ID is often more interesting than the feature ID, since +groups can be complex objects containing subobjects. + +=cut + +sub get_feature_by_gid { + my $self = shift; + my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; + my %groups; # cache the groups we create to avoid consuming too much unecessary memory + my $features = []; + my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; + $self->_feature_by_id($id,'group',$callback); + return wantarray ? 
@$features : $features->[0];
+}
+*fetch_feature_by_gid = \&get_feature_by_gid;
+
+=head2 delete_features
+
+ Title   : delete_features
+ Usage   : $db->delete_features(@ids_or_features)
+ Function: delete one or more features
+ Returns : count of features deleted
+ Args    : list of features or feature ids
+ Status  : public
+
+Pass this method a list of numeric feature ids or a set of features.
+It will attempt to remove the features from the database and return a
+count of the features removed.
+
+NOTE: This method is also called delete_feature().  Also see
+delete_groups().
+
+=cut
+
+*delete_feature = \&delete_features;
+
+sub delete_features {
+  my $self = shift;
+  my @features_or_ids = @_;
+  my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
+  return unless @ids;
+  $self->_delete_features(@ids);
+}
+
+=head2 delete_groups
+
+ Title   : delete_groups
+ Usage   : $db->delete_groups(@ids_or_features)
+ Function: delete one or more feature groups
+ Returns : count of features deleted
+ Args    : list of features or feature group ids
+ Status  : public
+
+Pass this method a list of numeric group ids or a set of features.  It
+will attempt to recursively remove the features and ALL members of
+their group from the database.  It returns a count of the number of
+features (not groups) returned.
+
+NOTE: This method is also called delete_group().  Also see
+delete_features().
+
+=cut
+
+# FIX: alias previously pointed at nonexistent sub "delete_groupss" (typo),
+# so the documented delete_group() call died with an undefined-subroutine error.
+*delete_group = \&delete_groups;
+
+sub delete_groups {
+  my $self = shift;
+  my @features_or_ids = @_;
+  my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids;
+  return unless @ids;
+  $self->_delete_groups(@ids);
+}
+
+=head2 delete
+
+ Title   : delete
+ Usage   : $db->delete(@args)
+ Function: delete features
+ Returns : count of features deleted -- if available
+ Args    : numerous, see below
+ Status  : public
+
+This method deletes all features that overlap the specified region or
+are of a particular type.
If no arguments are provided and the -force +argument is true, then deletes ALL features. + +Arguments: + + -name ID of the landmark sequence. + + -ref ID of the landmark sequence (synonym for -name). + + -class Database object class for the landmark sequence. + "Sequence" assumed if not specified. This is + irrelevant for databases which do not recognize + object classes. + + -start Start of the segment relative to landmark. Positions + follow standard 1-based sequence rules. If not specified, + defaults to the beginning of the landmark. + + -end Stop of the segment relative to the landmark. If not specified, + defaults to the end of the landmark. + + -offset Zero-based addressing + + -length Length of region + + -type,-types Either a single scalar type to be deleted, or an + reference to an array of types. + + -force Force operation to be performed even if it would delete + entire feature table. + + -range_type Control the range type of the deletion. One of "overlaps" (default) + "contains" or "contained_in" + +Examples: + + $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats + $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000 + $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3 + +The short form of this call, as described in segment() is also allowed: + + $db->delete("chr3",1=>1000); + $db->delete("chr3"); + +IMPORTANT NOTE: This method only deletes features. It does *NOT* +delete the names of groups that contain the deleted features. Group +IDs will be reused if you later load a feature with the same group +name as one that was previously deleted. + +NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the +result code from the SQL DELETE operation. Some dbd drivers return the +count of rows deleted, while others return 0E0. Caveat emptor. 
+ +=cut + +sub delete { + my $self = shift; + my @args = $self->setup_segment_args(@_); + my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) = + rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET', + 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args); + $offset = 0 unless defined $offset; + $start = $offset+1 unless defined $start; + $end = $start+$length-1 if !defined $end and $length; + $class ||= $self->default_class; + + my $types = $self->parse_types($type); # parse out list of types + + $range_type ||= 'overlaps'; + $self->throw("range type must be one of {". + join(',',keys %valid_range_types). + "}\n") + unless $valid_range_types{lc $range_type}; + + + my @segments; + if (defined $name && $name ne '') { + my @args = (-name=>$name,-class=>$class); + push @args,(-start=>$start) if defined $start; + push @args,(-end =>$end) if defined $end; + @segments = $self->segment(@args); + return unless @segments; + } + $self->_delete({segments => \@segments, + types => $types, + range_type => $range_type, + force => $force} + ); +} + +=head2 absolute + + Title : absolute + Usage : $abs = $db->absolute([$abs]); + Function: gets/sets absolute mode + Returns : current setting of absolute mode boolean + Args : new setting for absolute mode boolean + Status : public + +$db-E<gt>absolute(1) will turn on absolute mode for the entire database. +All segments retrieved will use absolute coordinates by default, +rather than relative coordinates. You can still set them to use +relative coordinates by calling $segment-E<gt>absolute(0). + +Note that this is not the same as calling abs_segment(); it continues +to allow you to look up groups that are not used directly as reference +sequences. 
+ +=cut + +sub absolute { + my $self = shift; + my $d = $self->{absolute}; + $self->{absolute} = shift if @_; + $d; +} + +=head2 strict_bounds_checking + + Title : strict_bounds_checking + Usage : $flag = $db->strict_bounds_checking([$flag]) + Function: gets/sets strict bounds checking + Returns : current setting of bounds checking flag + Args : new setting for bounds checking flag + Status : public + +This flag enables extra checks for segment requests that go beyond the +ends of their reference sequences. If bounds checking is enabled, +then retrieved segments will be truncated to their physical length, +and their truncated() methods will return true. + +If the flag is off (the default), then the module will return segments +that appear to extend beyond their physical boundaries. Requests for +features beyond the end of the segment will, however, return empty. + +=cut + +sub strict_bounds_checking { + my $self = shift; + my $d = $self->{strict}; + $self->{strict} = shift if @_; + $d; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') + Function: Gets a Bio::Seq object by its name + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +NOTE: Bio::DB::RandomAccessI compliant method + +=cut + +sub get_Seq_by_id { + my $self = shift; + my $id = shift; + my $stream = $self->get_Stream_by_id($id); + return $stream->next_seq; +} + + +=head2 get_Seq_by_accession + + Title : get_Seq_by_accession + Usage : $seq = $db->get_Seq_by_accession('AL12234') + Function: Gets a Bio::Seq object by its accession + Returns : a Bio::Seq object + Args : the id (as a string) of a sequence + Throws : "id does not exist" exception + +NOTE: Bio::DB::RandomAccessI compliant method + +=cut + +sub get_Seq_by_accession { + my $self = shift; + my $id = shift; + my $stream = $self->get_Stream_by_accession($id); + return $stream->next_seq; +} + +=head2 get_Stream_by_acc () + +=cut + 
+=head2 get_Seq_by_acc + + Title : get_Seq_by_acc + Usage : $seq = $db->get_Seq_by_acc('X77802'); + Function: Gets a Bio::Seq object by accession number + Returns : A Bio::Seq object + Args : accession number (as a string) + Throws : "acc does not exist" exception + +NOTE: Bio::DB::RandomAccessI compliant method + +=cut + +sub get_Stream_by_name { + my $self = shift; + my @ids = @_; + my $id = ref($ids[0]) ? $ids[0] : \@ids; + Bio::DB::GFF::ID_Iterator->new($self,$id,'name'); +} + +=head2 get_Stream_by_id + + Title : get_Stream_by_id + Usage : $seq = $db->get_Stream_by_id(@ids); + Function: Retrieves a stream of Seq objects given their ids + Returns : a Bio::SeqIO stream object + Args : an array of unique ids/accession numbers, or + an array reference + +NOTE: This is also called get_Stream_by_batch() + +=cut + +sub get_Stream_by_id { + my $self = shift; + my @ids = @_; + my $id = ref($ids[0]) ? $ids[0] : \@ids; + Bio::DB::GFF::ID_Iterator->new($self,$id,'feature'); +} + +=head2 get_Stream_by_batch () + + Title : get_Stream_by_batch + Usage : $seq = $db->get_Stream_by_batch(@ids); + Function: Retrieves a stream of Seq objects given their ids + Returns : a Bio::SeqIO stream object + Args : an array of unique ids/accession numbers, or + an array reference + +NOTE: This is the same as get_Stream_by_id(). + +=cut + +*get_Stream_by_batch = \&get_Stream_by_id; + + +=head2 get_Stream_by_group () + +Bioperl compatibility. + +=cut + +sub get_Stream_by_group { + my $self = shift; + my @ids = @_; + my $id = ref($ids[0]) ? $ids[0] : \@ids; + Bio::DB::GFF::ID_Iterator->new($self,$id,'group'); +} + +=head2 all_seqfeatures + + Title : all_seqfeatures + Usage : @features = $db->all_seqfeatures(@args) + Function: fetch all the features in the database + Returns : an array of features, or an iterator + Args : See below + Status : public + +This is equivalent to calling $db-E<gt>features() without any types, and +will return all the features in the database. 
The -merge and +-iterator arguments are recognized, and behave the same as described +for features(). + +=cut + +sub all_seqfeatures { + my $self = shift; + my ($automerge,$iterator)= rearrange([ + [qw(MERGE AUTOMERGE)], + 'ITERATOR' + ],@_); + my @args; + push @args,(-merge=>$automerge) if defined $automerge; + push @args,(-iterator=>$iterator) if defined $iterator; + $self->features(@args); +} + +=head1 Creating and Loading GFF Databases + +=head2 initialize + + Title : initialize + Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2); + Function: initialize a GFF database + Returns : true if initialization successful + Args : a set of named parameters + Status : Public + +This method can be used to initialize an empty database. It takes the following +named arguments: + + -erase A boolean value. If true the database will be wiped clean if it + already contains data. + +Other named arguments may be recognized by subclasses. They become database +meta values that control various settable options. + +As a shortcut (and for backward compatibility) a single true argument +is the same as initialize(-erase=E<gt>1). + +=cut + +sub initialize { + my $self = shift; + #$self->do_initialize(1) if @_ == 1 && $_[0]; + #why was this line (^) here? I can't see that it actually does anything + #one option would be to execute the line and return, but I don't know + #why you would want to do that either. 
+ + my ($erase,$meta) = rearrange(['ERASE'],@_); + $meta ||= {}; + + # initialize (possibly erasing) + return unless $self->do_initialize($erase); + my @default = $self->default_meta_values; + + # this is an awkward way of uppercasing the + # even-numbered values (necessary for case-insensitive SQL databases) + for (my $i=0; $i<@default; $i++) { + $default[$i] = uc $default[$i] if !($i % 2); + } + + my %values = (@default,%$meta); + foreach (keys %values) { + $self->meta($_ => $values{$_}); + } + 1; +} + + +=head2 load_gff + + Title : load_gff + Usage : $db->load_gff($file|$directory|$filehandle); + Function: load GFF data into database + Returns : count of records loaded + Args : a directory, a file, a list of files, + or a filehandle + Status : Public + +This method takes a single overloaded argument, which can be any of: + +=over 4 + +=item 1. a scalar corresponding to a GFF file on the system + +A pathname to a local GFF file. Any files ending with the .gz, .Z, or +.bz2 suffixes will be transparently decompressed with the appropriate +command-line utility. + +=item 2. an array reference containing a list of GFF files on the system + +For example ['/home/gff/gff1.gz','/home/gff/gff2.gz'] + +=item 3. directory path + +The indicated directory will be searched for all files ending in the +suffixes .gff, .gff.gz, .gff.Z or .gff.bz2. + +=item 4. filehandle + +An open filehandle from which to read the GFF data. Tied filehandles +now work as well. + +=item 5. a pipe expression + +A pipe expression will also work. For example, a GFF file on a remote +web server can be loaded with an expression like this: + + $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |"); + +=back + +If successful, the method will return the number of GFF lines +successfully loaded. + +NOTE:this method used to be called load(), but has been changed. The +old method name is also recognized. 
+
+=cut
+
+sub load_gff {
+  my $self              = shift;
+  my $file_or_directory = shift || '.';
+  return $self->do_load_gff($file_or_directory) if ref($file_or_directory) &&
+                                                   tied *$file_or_directory;
+
+  my $tied_stdin = tied(*STDIN);
+  open SAVEIN,"<&STDIN" unless $tied_stdin;
+  local @ARGV = $self->setup_argv($file_or_directory,'gff') or return;  # to play tricks with reader
+  my $result = $self->do_load_gff('ARGV');
+  open STDIN,"<&SAVEIN" unless $tied_stdin;  # restore STDIN
+  return $result;
+}
+
+*load = \&load_gff;
+
+=head2 load_fasta
+
+ Title   : load_fasta
+ Usage   : $db->load_fasta($file|$directory|$filehandle);
+ Function: load FASTA data into database
+ Returns : count of records loaded
+ Args    : a directory, a file, a list of files,
+           or a filehandle
+ Status  : Public
+
+This method takes a single overloaded argument, which can be any of:
+
+=over 4
+
+=item 1. scalar corresponding to a FASTA file on the system
+
+A pathname to a local FASTA file.  Any files ending with the .gz, .Z, or
+.bz2 suffixes will be transparently decompressed with the appropriate
+command-line utility.
+
+=item 2. array reference containing a list of FASTA files on the
+system
+
+For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz']
+
+=item 3. path to a directory
+
+The indicated directory will be searched for all files ending in the
+suffixes .fa, .fa.gz, .fa.Z or .fa.bz2.
+
+=item 4. filehandle
+
+An open filehandle from which to read the FASTA data.
+
+=item 5. pipe expression
+
+A pipe expression will also work.
For example, a FASTA file on a remote +web server can be loaded with an expression like this: + + $db->load_gff("lynx -dump -source http://stein.cshl.org/fasta_test.fa |"); + +=back + +=cut + +sub load_fasta { + my $self = shift; + my $file_or_directory = shift || '.'; + return $self->load_sequence($file_or_directory) if ref($file_or_directory) && + tied *$file_or_directory; + + my $tied = tied(*STDIN); + open SAVEIN,"<&STDIN" unless $tied; + local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader + my $result = $self->load_sequence('ARGV'); + open STDIN,"<&SAVEIN" unless $tied; # restore STDIN + return $result; +} + +=head2 load_sequence_string + + Title : load_sequence_string + Usage : $db->load_sequence_string($id,$dna) + Function: load a single DNA entry + Returns : true if successfully loaded + Args : a raw sequence string (DNA, RNA, protein) + Status : Public + +=cut + +sub load_sequence_string { + my $self = shift; + my ($acc,$seq) = @_; + my $offset = 0; + $self->insert_sequence_chunk($acc,\$offset,\$seq) or return; + $self->insert_sequence($acc,$offset,$seq) or return; + 1; +} + +sub setup_argv { + my $self = shift; + my $file_or_directory = shift; + my @suffixes = @_; + no strict 'refs'; # so that we can call fileno() on the argument + + my @argv; + + if (-d $file_or_directory) { + @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes; + }elsif (my $fd = fileno($file_or_directory)) { + open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN"); + @argv = '-'; + } elsif (ref $file_or_directory) { + @argv = @$file_or_directory; + } else { + @argv = $file_or_directory; + } + + foreach (@argv) { + if (/\.gz$/) { + $_ = "gunzip -c $_ |"; + } elsif (/\.Z$/) { + $_ = "uncompress -c $_ |"; + } elsif (/\.bz2$/) { + $_ = "bunzip2 -c $_ |"; + } + } + @argv; +} + +=head2 lock_on_load + + Title : lock_on_load + Usage : $lock = $db->lock_on_load([$lock]) + Function: set write locking during load 
+ Returns : current value of lock-on-load flag + Args : new value of lock-on-load-flag + Status : Public + +This method is honored by some of the adaptors. If the value is true, +the tables used by the GFF modules will be locked for writing during +loads and inaccessible to other processes. + +=cut + +sub lock_on_load { + my $self = shift; + my $d = $self->{lock}; + $self->{lock} = shift if @_; + $d; +} + +=head2 meta + + Title : meta + Usage : $value = $db->meta($name [,$newval]) + Function: get or set a meta variable + Returns : a string + Args : meta variable name and optionally value + Status : abstract + +Get or set a named metavalues for the database. Metavalues can be +used for database-specific settings. + +By default, this method does nothing! + +=cut + +sub meta { + my $self = shift; + my ($name,$value) = @_; + return; +} + +=head2 default_meta_values + + Title : default_meta_values + Usage : %values = $db->default_meta_values + Function: empty the database + Returns : a list of tag=>value pairs + Args : none + Status : protected + +This method returns a list of tag=E<gt>value pairs that contain default +meta information about the database. It is invoked by initialize() to +write out the default meta values. The base class version returns an +empty list. + +For things to work properly, meta value names must be UPPERCASE. + +=cut + +sub default_meta_values { + my $self = shift; + return (); +} + + +=head2 error + + Title : error + Usage : $db->error( [$new error] ); + Function: read or set error message + Returns : error message + Args : an optional argument to set the error message + Status : Public + +This method can be used to retrieve the last error message. Errors +are not reset to empty by successful calls, so contents are only valid +immediately after an error condition has been detected. 
+ +=cut + +sub error { + my $self = shift; + my $g = $self->{error}; + $self->{error} = join '',@_ if @_; + $g; +} + +=head2 debug + + Title : debug + Usage : $db->debug( [$flag] ); + Function: read or set debug flag + Returns : current value of debug flag + Args : new debug flag (optional) + Status : Public + +This method can be used to turn on debug messages. The exact nature +of those messages depends on the adaptor in use. + +=cut + +sub debug { + my $self = shift; + my $g = $self->{debug}; + $self->{debug} = shift if @_; + $g; +} + + +=head2 automerge + + Title : automerge + Usage : $db->automerge( [$new automerge] ); + Function: get or set automerge value + Returns : current value (boolean) + Args : an optional argument to set the automerge value + Status : Public + +By default, this module will use the aggregators to merge groups into +single composite objects. This default can be changed to false by +calling automerge(0). + +=cut + +sub automerge { + my $self = shift; + my $g = $self->{automerge}; + $self->{automerge} = shift if @_; + $g; +} + +=head2 attributes + + Title : attributes + Usage : @attributes = $db->attributes($id,$name) + Function: get the "attributres" on a particular feature + Returns : an array of string + Args : feature ID + Status : public + +Some GFF version 2 files use the groups column to store a series of +attribute/value pairs. In this interpretation of GFF, the first such +pair is treated as the primary group for the feature; subsequent pairs +are treated as attributes. Two attributes have special meaning: +"Note" is for backward compatibility and is used for unstructured text +remarks. "Alias" is considered as a synonym for the feature name. + +If no name is provided, then attributes() returns a flattened hash, of +attribute=E<gt>value pairs. 
This lets you do: + + %attributes = $db->attributes($id); + +Normally, attributes() will be called by the feature: + + @notes = $feature->attributes('Note'); + +In a scalar context, attributes() returns the first value of the +attribute if a tag is present, otherwise a hash reference in which the +keys are attribute names and the values are anonymous arrays +containing the values. + +=cut + +sub attributes { + my $self = shift; + my ($id,$tag) = @_; + my @result = $self->do_attributes($id,$tag) or return; + return @result if wantarray; + + # what to do in an array context + return $result[0] if $tag; + my %result; + while (my($key,$value) = splice(@result,0,2)) { + push @{$result{$key}},$value; + } + return \%result; +} + +=head2 fast_queries + + Title : fast_queries + Usage : $flag = $db->fast_queries([$flag]) + Function: turn on and off the "fast queries" option + Returns : a boolean + Args : a boolean flag (optional) + Status : public + +The mysql database driver (and possibly others) support a "fast" query +mode that caches results on the server side. This makes queries come +back faster, particularly when creating iterators. The downside is +that while iterating, new queries will die with a "command synch" +error. This method turns the feature on and off. + +For databases that do not support a fast query, this method has no +effect. + +=cut + +# override this method in order to set the mysql_use_result attribute, which is an obscure +# but extremely powerful optimization for both performance and memory. +sub fast_queries { + my $self = shift; + my $d = $self->{fast_queries}; + $self->{fast_queries} = shift if @_; + $d; +} + +=head2 add_aggregator + + Title : add_aggregator + Usage : $db->add_aggregator($aggregator) + Function: add an aggregator to the list + Returns : nothing + Args : an aggregator + Status : public + +This method will append an aggregator to the end of the list of +registered aggregators. 
Three different argument types are accepted: + + 1) a Bio::DB::GFF::Aggregator object -- will be added + 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}" + -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method + part is optional). + 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator + subclass, where the token corresponds to the subclass name. + +=cut + +sub add_aggregator { + my $self = shift; + my $aggregator = shift; + my $list = $self->{aggregators} ||= []; + if (ref $aggregator) { # an object + @$list = grep {$_->get_method ne $aggregator->get_method} @$list; + push @$list,$aggregator; + } + + elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) { + my($agg_name,$subparts,$mainpart) = ($1,$2,$3); + my @subparts = split /,\s*/,$subparts; + my @args = (-method => $agg_name, + -sub_parts => \@subparts); + push @args,(-main_method => $mainpart) if $mainpart; + warn "making an aggregator with (@args), subparts = @subparts" if $self->debug; + push @$list,Bio::DB::GFF::Aggregator->new(@args); + } + + else { + my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E"; + eval "require $class"; + $self->throw("Unable to load $aggregator aggregator: $@") if $@; + push @$list,$class->new(); + } +} + +=head2 aggregators + + Title : aggregators + Usage : $db->aggregators([@new_aggregators]); + Function: retrieve list of aggregators + Returns : list of aggregators + Args : a list of aggregators to set (optional) + Status : public + +This method will get or set the list of aggregators assigned to +the database. If 1 or more arguments are passed, the existing +set will be cleared. 
+ +=cut + +sub aggregators { + my $self = shift; + my $d = $self->{aggregators}; + if (@_) { + $self->clear_aggregators; + $self->add_aggregator($_) foreach @_; + } + return unless $d; + return @$d; +} + +=head2 clear_aggregators + + Title : clear_aggregators + Usage : $db->clear_aggregators + Function: clears list of aggregators + Returns : nothing + Args : none + Status : public + +This method will clear the aggregators stored in the database object. +Use aggregators() or add_aggregator() to add some back. + +=cut + +sub clear_aggregators { shift->{aggregators} = [] } + +=head1 Methods for use by Subclasses + +The following methods are chiefly of interest to subclasses and are +not intended for use by end programmers. + +=head2 abscoords + + Title : abscoords + Usage : $db->abscoords($name,$class,$refseq) + Function: finds position of a landmark in reference coordinates + Returns : ($ref,$class,$start,$stop,$strand) + Args : name and class of landmark + Status : public + +This method is called by Bio::DB::GFF::RelSegment to obtain the +absolute coordinates of a sequence landmark. The arguments are the +name and class of the landmark. If successful, abscoords() returns +the ID of the reference sequence, its class, its start and stop +positions, and the orientation of the reference sequence's coordinate +system ("+" for forward strand, "-" for reverse strand). + +If $refseq is present in the argument list, it forces the query to +search for the landmark in a particular reference sequence. + +=cut + +sub abscoords { + my $self = shift; + my ($name,$class,$refseq) = @_; + $class ||= $self->{default_class}; + $self->get_abscoords($name,$class,$refseq); +} + +=head1 Protected API + +The following methods are not intended for public consumption, but are +intended to be overridden/implemented by adaptors. 
+ +=head2 default_aggregators + + Title : default_aggregators + Usage : $db->default_aggregators; + Function: retrieve list of aggregators + Returns : array reference containing list of aggregator names + Args : none + Status : protected + +This method (which is intended to be overridden by adaptors) returns a +list of standard aggregators to be applied when no aggregators are +specified in the constructor. + +=cut + +sub default_aggregators { + my $self = shift; + return ['processed_transcript','alignment']; +} + +=head2 do_load_gff + + Title : do_load_gff + Usage : $db->do_load_gff($handle) + Function: load a GFF input stream + Returns : number of features loaded + Args : A filehandle. + Status : protected + +This method is called to load a GFF data stream. The method will read +GFF features from E<lt>E<gt> and load them into the database. On exit the +method must return the number of features loaded. + +Note that the method is responsible for parsing the GFF lines. This +is to allow for differences in the interpretation of the "group" +field, which are legion. + +You probably want to use load_gff() instead. It is more flexible +about the arguments it accepts. 

=cut

# load from <>
sub do_load_gff {
  my $self = shift;
  my $io_handle = shift;

  # localized so repeated loads start with the GFF2 assumption until a
  # ##gff-version 3 pragma is actually seen in this stream
  local $self->{gff3_flag} = 0;
  $self->setup_load();

  my $fasta_sequence_id;

  while (<$io_handle>) {
    chomp;
    $self->{gff3_flag}++ if /^\#\#gff-version\s+3/;
    if (/^>(\S+)/) {  # uh oh, sequence coming
      $fasta_sequence_id = $1;
      last;
    }
    if (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
      # synthesize a Sequence:Component feature spanning the region
      $self->load_gff_line(
                           {
                            ref    => $1,
                            class  => 'Sequence',
                            source => 'reference',
                            method => 'Component',
                            start  => $2,
                            stop   => $3,
                            score  => undef,
                            strand => undef,
                            phase  => undef,
                            gclass => 'Sequence',
                            gname  => $1,
                            tstart => undef,
                            tstop  => undef,
                            attributes => [],
                           }
                          );
      next;
    }

    next if /^\#/;
    my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
    next unless defined($ref) && defined($method) && defined($start) && defined($stop);
    foreach (\$score,\$strand,\$phase) {
      # FIX: lines with fewer than 9 columns leave these fields undef;
      # comparing undef with eq raised "uninitialized value" warnings
      undef $$_ if defined($$_) && $$_ eq '.';
    }

    # FIX: the group column may be absent entirely; normalize to '' so
    # that split_group() does not operate (and warn) on undef
    $group = '' unless defined $group;
    my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{gff3_flag});

    # no standard way in the GFF file to denote the class of the reference sequence -- drat!
    # so we invoke the factory to do it
    my $class = $self->refclass($ref);

    # call subclass to do the dirty work
    if ($start > $stop) {
      # swap into canonical order and flip the strand to compensate
      ($start,$stop) = ($stop,$start);
      if (defined $strand) {  # FIX: strand may legitimately be undef ('.')
        if ($strand eq '+') {
          $strand = '-';
        } elsif ($strand eq '-') {
          $strand = '+';
        }
      }
    }
    $self->load_gff_line({ref    => $ref,
                          class  => $class,
                          source => $source,
                          method => $method,
                          start  => $start,
                          stop   => $stop,
                          score  => $score,
                          strand => $strand,
                          phase  => $phase,
                          gclass => $gclass,
                          gname  => $gname,
                          tstart => $tstart,
                          tstop  => $tstop,
                          attributes => $attributes}
                        );
  }

  my $result = $self->finish_load();
  # if we bailed out at a ">" line, hand the rest of the stream to the
  # FASTA loader, passing along the ID we already consumed
  $result += $self->load_sequence($io_handle,$fasta_sequence_id)
    if defined $fasta_sequence_id;
  $result;

}

=head2 load_sequence

 Title   : load_sequence
 Usage   : $db->load_sequence($handle [,$id])
 Function: load a FASTA data stream
 Returns : number of sequences
 Args    : a filehandle and optionally the ID of
           the first sequence in the stream.
 Status  : protected

You probably want to use load_fasta() instead.  The $id argument is a
hack used to switch from GFF loading to FASTA loading when load_gff()
discovers FASTA data hiding at the bottom of the GFF file (as Artemis
does).

=cut

sub load_sequence {
  my $self = shift;
  my $io_handle = shift;
  my $id = shift;     # hack for GFF files that contain fasta data

  # read fasta file(s) from ARGV
  my ($seq,$offset,$loaded) = (undef,0,0);
  while (<$io_handle>) {
    chomp;
    if (/^>(\S+)/) {
      # flush whatever remains of the previous record before starting a new one
      $self->insert_sequence($id,$offset,$seq) if $id;
      $id     = $1;
      $offset = 0;
      $seq    = '';
      $loaded++;
    } else {
      $seq .= $_;
      # emit fixed-size chunks as they accumulate; updates $offset/$seq in place
      $self->insert_sequence_chunk($id,\$offset,\$seq);
    }
  }
  # flush the trailing partial chunk of the final record
  $self->insert_sequence($id,$offset,$seq) if $id;
  $loaded+0;   # force numeric return value
}

sub insert_sequence_chunk {
  my $self = shift;
  my ($id,$offsetp,$seqp) = @_;
  # a chunk size of 0 (the default) disables chunking entirely
  if (my $cs = $self->dna_chunk_size) {
    while (length($$seqp) >= $cs) {
      my $chunk = substr($$seqp,0,$cs);
      $self->insert_sequence($id,$$offsetp,$chunk);
      $$offsetp += length($chunk);
      substr($$seqp,0,$cs) = '';
    }
  }
  return 1;  # the calling routine may expect success or failure
}

# used to store big pieces of DNA in itty bitty pieces
sub dna_chunk_size {
  return 0;
}

# abstract: adaptors must provide the actual storage operation
sub insert_sequence {
  my $self = shift;
  my($id,$offset,$seq) = @_;
  $self->throw('insert_sequence(): must be defined in subclass');
}

# This is the default class for reference points.  Defaults to Sequence.
# Acts as a get/set accessor; returns the value in effect BEFORE the set.
sub default_class {
  my $self = shift;
  my $d = exists($self->{default_class}) ? $self->{default_class} : 'Sequence';
  $self->{default_class} = shift if @_;
  $d;
}

# gets name of the reference sequence, and returns its class
# currently just calls default_class (the $name argument is ignored here,
# but subclasses may use it)
sub refclass {
  my $self = shift;
  my $name = shift;
  return $self->default_class;
}

=head2 setup_load

 Title   : setup_load
 Usage   : $db->setup_load
 Function: called before load_gff_line()
 Returns : void
 Args    : none
 Status  : abstract

This abstract method gives subclasses a chance to do any
schema-specific initialization prior to loading a set of GFF records.
It must be implemented by a subclass.

=cut

sub setup_load {
  # default, do nothing
}

=head2 finish_load

 Title   : finish_load
 Usage   : $db->finish_load
 Function: called after load_gff_line()
 Returns : number of records loaded
 Args    : none
 Status  : abstract

This method gives subclasses a chance to do any schema-specific
cleanup after loading a set of GFF records.

=cut

sub finish_load {
  # default, do nothing
}

=head2 load_gff_line

 Title   : load_gff_line
 Usage   : $db->load_gff_line(@args)
 Function: called to load one parsed line of GFF
 Returns : true if successfully inserted
 Args    : see below
 Status  : abstract

This abstract method is called once per line of the GFF and passed a
hashref containing parsed GFF fields.  The fields are:

 {ref    => $ref,
  class  => $class,
  source => $source,
  method => $method,
  start  => $start,
  stop   => $stop,
  score  => $score,
  strand => $strand,
  phase  => $phase,
  gclass => $gclass,
  gname  => $gname,
  tstart => $tstart,
  tstop  => $tstop,
  attributes => $attributes}

=cut

sub load_gff_line {
  shift->throw("load_gff_line(): must be implemented by an adaptor");
}


=head2 do_initialize

 Title   : do_initialize
 Usage   : $db->do_initialize([$erase])
 Function: initialize and possibly erase database
 Returns : true if successful
 Args    : optional erase flag
 Status  : protected

This method implements the initialize() method described above, and
takes the same arguments.

=cut

sub do_initialize {
  shift->throw('do_initialize(): must be implemented by an adaptor');
}

=head2 dna

 Title   : dna
 Usage   : $db->dna($id,$start,$stop,$class)
 Function: return the raw DNA string for a segment
 Returns : a raw DNA string
 Args    : id of the sequence, its class, start and stop positions
 Status  : public

This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA
sequence.

Arguments: -name          sequence name
           -start         start position
           -stop          stop position
           -class         sequence class

If start and stop are both undef, then the entire DNA is retrieved.
So to fetch the whole dna, call like this:

  $db->dna($name_of_sequence);

or like this:

   $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence);

NOTE: you will probably prefer to create a Segment and then invoke its
dna() method.

=cut

# call to return the DNA string for the indicated region
# real work is done by get_dna()
sub dna {
  my $self = shift;
  my ($id,$start,$stop,$class) = rearrange([
                                            [qw(NAME ID REF REFSEQ)],
                                            qw(START),
                                            [qw(STOP END)],
                                            'CLASS',
                                           ],@_);
# return unless defined $start && defined $stop;
  $self->get_dna($id,$start,$stop,$class);
}

# Dispatch a range query of the requested type ('overlaps', 'contains',
# 'contained_in') down to _features().  Used by overlapping_features(),
# contained_features() and contained_in().
sub features_in_range {
  my $self = shift;
  my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) =
    rearrange([
               [qw(RANGE_TYPE)],
               [qw(REF REFSEQ)],
               qw(CLASS),
               qw(START),
               [qw(STOP END)],
               [qw(TYPE TYPES)],
               qw(PARENT),
               [qw(RARE SPARSE)],
               [qw(MERGE AUTOMERGE)],
               'ITERATOR'
              ],@_);
  # $other receives what rearrange() leaves over beyond the named
  # arguments above -- presumably extra adaptor-specific options; it is
  # merged into the options hash below (NOTE(review): confirm against
  # the rearrange() implementation in use)
  $other ||= {};
  # aggregation defaults to on only when specific types were requested
  $automerge = $types && $self->automerge unless defined $automerge;
  $self->throw("range type must be one of {".
               join(',',keys %valid_range_types).
               "}\n")
    unless $valid_range_types{lc $range_type};
  $self->_features({
                    rangetype => lc $range_type,
                    refseq    => $refseq,
                    refclass  => $class,
                    start     => $start,
                    stop      => $stop,
                    types     => $types },
                   {
                    sparse    => $sparse,
                    automerge => $automerge,
                    iterator  => $iterator,
                    %$other,
                   },
                   $parent);
}

=head2 get_dna

 Title   : get_dna
 Usage   : $db->get_dna($id,$start,$stop,$class)
 Function: get DNA for indicated segment
 Returns : the dna string
 Args    : sequence ID, start, stop and class
 Status  : protected

If start E<gt> stop and the sequence is nucleotide, then this method
should return the reverse complement.
The sequence class may be +ignored by those databases that do not recognize different object +types. + +=cut + +sub get_dna { + my $self = shift; + my ($id,$start,$stop,$class,) = @_; + $self->throw("get_dna() must be implemented by an adaptor"); +} + +=head2 get_features + + Title : get_features + Usage : $db->get_features($search,$options,$callback) + Function: get list of features for a region + Returns : count of number of features retrieved + Args : see below + Status : protected + +The first argument is a hash reference containing search criteria for +retrieving features. It contains the following keys: + + rangetype One of "overlaps", "contains" or "contained_in". Indicates + the type of range query requested. + + refseq ID of the landmark that establishes the absolute + coordinate system. + + refclass Class of this landmark. Can be ignored by implementations + that don't recognize such distinctions. + + start Start of the range, inclusive. + + stop Stop of the range, inclusive. + + types Array reference containing the list of annotation types + to fetch from the database. Each annotation type is an + array reference consisting of [source,method]. + +The second argument is a hash reference containing certain options +that affect the way information is retrieved: + + sort_by_group + A flag. If true, means that the returned features should be + sorted by the group that they're in. + + sparse A flag. If true, means that the expected density of the + features is such that it will be more efficient to search + by type rather than by range. If it is taking a long + time to fetch features, give this a try. + + binsize A true value will create a set of artificial features whose + start and stop positions indicate bins of the given size, and + whose scores are the number of features in the bin. The + class of the feature will be set to "bin", and its name to + "method:source". This is a handy way of generating histograms + of feature density. 

The third argument, the $callback, is a code reference to which
retrieved features are passed.  It is described in more detail below.

This routine is responsible for getting arrays of GFF data out of the
database and passing them to the callback subroutine.  The callback
does the work of constructing a Bio::DB::GFF::Feature object out of
that data.  The callback expects a list of 13 fields:

  $refseq      The reference sequence
  $start       feature start
  $stop        feature stop
  $source      feature source
  $method      feature method
  $score       feature score
  $strand      feature strand
  $phase       feature phase
  $groupclass  group class (may be undef)
  $groupname   group ID (may be undef)
  $tstart      target start for similarity hits (may be undef)
  $tstop       target stop for similarity hits (may be undef)
  $feature_id  A unique feature ID (may be undef)

These fields are in the same order as the raw GFF file, with the
exception that the group column has been parsed into group class and
group name fields.

The feature ID, if provided, is a unique identifier of the feature
line.  The module does not depend on this ID in any way, but it is
available via Bio::DB::GFF-E<gt>id() if wanted.  In the dbi::mysql and
dbi::mysqlopt adaptor, the ID is a unique row ID.  In the acedb
adaptor it is not used.

=cut

sub get_features{
  my $self = shift;
  my ($search,$options,$callback) = @_;
  $self->throw("get_features() must be implemented by an adaptor");
}


=head2 _feature_by_name

 Title   : _feature_by_name
 Usage   : $db->_feature_by_name($class,$name,$location,$callback)
 Function: get a list of features by name and class
 Returns : count of number of features retrieved
 Args    : name of feature, class of feature, and a callback
 Status  : abstract

This method is used internally.  The callback arguments are the same
as those used by make_feature().  This method must be overridden by
subclasses.

=cut

sub _feature_by_name {
  my $self = shift;
  my ($class,$name,$location,$callback) = @_;
  $self->throw("_feature_by_name() must be implemented by an adaptor");
}

# Same contract as _feature_by_name(), but the search criterion is a
# hashref of attribute name/value pairs rather than a name and class.
sub _feature_by_attribute {
  my $self = shift;
  my ($attributes,$callback) = @_;
  # FIX: the exception previously named _feature_by_name(), a copy-paste
  # error that misdirected adaptor authors to the wrong missing method
  $self->throw("_feature_by_attribute() must be implemented by an adaptor");
}

=head2 _feature_by_id

 Title   : _feature_by_id
 Usage   : $db->_feature_by_id($ids,$type,$callback)
 Function: get a feature based
 Returns : count of number of features retrieved
 Args    : arrayref to feature IDs to fetch
 Status  : abstract

This method is used internally to fetch features either by their ID or
their group ID.  $ids is a arrayref containing a list of IDs, $type is
one of "feature" or "group", and $callback is a callback.  The
callback arguments are the same as those used by make_feature().  This
method must be overridden by subclasses.

=cut

sub _feature_by_id {
  my $self = shift;
  my ($ids,$type,$callback) = @_;
  $self->throw("_feature_by_id() must be implemented by an adaptor");
}

=head2 overlapping_features

 Title   : overlapping_features
 Usage   : $db->overlapping_features(@args)
 Function: get features that overlap the indicated range
 Returns : a list of Bio::DB::GFF::Feature objects
 Args    : see below
 Status  : public

This method is invoked by Bio::DB::GFF::Segment-E<gt>features() to find
the list of features that overlap a given range.  It is generally
preferable to create the Segment first, and then fetch the features.

This method takes set of named arguments:

  -refseq    ID of the reference sequence
  -class     Class of the reference sequence
  -start     Start of the desired range in refseq coordinates
  -stop      Stop of the desired range in refseq coordinates
  -types     List of feature types to return.
Argument is an array + reference containing strings of the format "method:source" + -parent A parent Bio::DB::GFF::Segment object, used to create + relative coordinates in the generated features. + -rare Turn on an optimization suitable for a relatively rare feature type, + where it will be faster to filter by feature type first + and then by position, rather than vice versa. + -merge Whether to apply aggregators to the generated features. + -iterator Whether to return an iterator across the features. + +If -iterator is true, then the method returns a single scalar value +consisting of a Bio::SeqIO object. You can call next_seq() repeatedly +on this object to fetch each of the features in turn. If iterator is +false or absent, then all the features are returned as a list. + +Currently aggregation is disabled when iterating over a series of +features. + +Types are indicated using the nomenclature "method:source". Either of +these fields can be omitted, in which case a wildcard is used for the +missing field. Type names without the colon (e.g. "exon") are +interpreted as the method name and a source wild card. Regular +expressions are allowed in either field, as in: "similarity:BLAST.*". + +=cut + +# call to return the features that overlap the named region +# real work is done by get_features +sub overlapping_features { + my $self = shift; + $self->features_in_range(-range_type=>'overlaps',@_); +} + +=head2 contained_features + + Title : contained_features + Usage : $db->contained_features(@args) + Function: get features that are contained within the indicated range + Returns : a list of Bio::DB::GFF::Feature objects + Args : see overlapping_features() + Status : public + +This call is similar to overlapping_features(), except that it only +retrieves features whose end points are completely contained within +the specified range. 

Generally you will want to fetch a Bio::DB::GFF::Segment object and
call its contained_features() method rather than call this directly.

=cut

# The same, except that it only returns features that are completely contained within the
# range (much faster usually)
sub contained_features {
  my $self = shift;
  $self->features_in_range(-range_type=>'contains',@_);
}

=head2 contained_in

 Title   : contained_in
 Usage   : @features = $s->contained_in(@args)
 Function: get features that contain this segment
 Returns : a list of Bio::DB::GFF::Feature objects
 Args    : see features()
 Status  : Public

This is identical in behavior to features() except that it returns
only those features that completely contain the segment.

=cut

sub contained_in {
  my $self = shift;
  $self->features_in_range(-range_type=>'contained_in',@_);
}

=head2 get_abscoords

 Title   : get_abscoords
 Usage   : $db->get_abscoords($name,$class,$refseq)
 Function: get the absolute coordinates of sequence with name & class
 Returns : ($absref,$absstart,$absstop,$absstrand)
 Args    : name and class of the landmark
 Status  : protected

Given the name and class of a genomic landmark, this function returns
a four-element array consisting of:

  $absref      the ID of the reference sequence that contains this landmark
  $absstart    the position at which the landmark starts
  $absstop     the position at which the landmark stops
  $absstrand   the strand of the landmark, relative to the reference sequence

If $refseq is provided, the function searches only within the
specified reference sequence.

=cut

sub get_abscoords {
  my $self = shift;
  my ($name,$class,$refseq) = @_;
  $self->throw("get_abscoords() must be implemented by an adaptor");
}

=head2 get_types

 Title   : get_types
 Usage   : $db->get_types($absref,$class,$start,$stop,$count)
 Function: get list of all feature types on the indicated segment
 Returns : list or hash of Bio::DB::GFF::Typename objects
 Args    : see below
 Status  : protected

Arguments are:

  $absref      the ID of the reference sequence
  $class       the class of the reference sequence
  $start       the position to start counting
  $stop        the position to end counting
  $count       a boolean indicating whether to count the number
               of occurrences of each feature type

If $count is true, then a hash is returned.  The keys of the hash are
feature type names in the format "method:source" and the values are
the number of times a feature of this type overlaps the indicated
segment.  Otherwise, the call returns a set of Bio::DB::GFF::Typename
objects.  If $start or $stop are undef, then all features on the
indicated segment are enumerated.  If $absref is undef, then the call
returns all feature types in the database.

=cut

sub get_types {
  my $self = shift;
  my ($refseq,$class,$start,$stop,$count,$types) = @_;
  $self->throw("get_types() must be implemented by an adaptor");
}

=head2 make_feature

 Title   : make_feature
 Usage   : $db->make_feature(@args)
 Function: Create a Bio::DB::GFF::Feature object from string data
 Returns : a Bio::DB::GFF::Feature object
 Args    : see below
 Status  : internal

 This takes 15 arguments (really!):

  $parent       A Bio::DB::GFF::RelSegment object
  $group_hash   A hashref containing unique list of GFF groups
  $refname      The name of the reference sequence for this feature
  $refclass     The class of the reference sequence for this feature
  $start        Start of feature
  $stop         Stop of feature
  $source       Feature source field
  $method       Feature method field
  $score        Feature score field
  $strand       Feature strand
  $phase        Feature phase
  $group_class  Class of feature group
  $group_name   Name of feature group
  $tstart       For homologies, start of hit on target
  $tstop        Stop of hit on target

The $parent argument, if present, is used to establish relative
coordinates in the resulting Bio::DB::Feature object.  This allows one
feature to generate a list of other features that are relative to its
coordinate system (for example, finding the coordinates of the second
exon relative to the coordinates of the first).

The $group_hash allows the group_class/group_name strings to be turned
into rich database objects via the make_object() method (see above).
Because these objects may be expensive to create, $group_hash is used
to uniquefy them.  The index of this hash is the composite key
{$group_class,$group_name,$tstart,$tstop}.  Values are whatever object
is returned by the make_object() method.

The remainder of the fields are taken from the GFF line, with the
exception that "Target" features, which contain information about the
target of a homology search, are parsed into their components.

=cut

# This call is responsible for turning a line of GFF into a
# feature object.
# The $parent argument is a Bio::DB::GFF::Segment object and is used
# to establish the coordinate system for the new feature.
# The $group_hash argument is an hash ref that holds previously-
# generated group objects.
# Other arguments are taken right out of the GFF table.
sub make_feature {
  my $self = shift;
  my ($parent,$group_hash,         # these arguments provided by generic mechanisms
      $srcseq,                     # the rest is provided by adaptor
      $start,$stop,
      $source,$method,
      $score,$strand,$phase,
      $group_class,$group_name,
      $tstart,$tstop,
      $db_id,$group_id) = @_;

  return unless $srcseq;            # return undef if called with no arguments.  This behavior is used for
                                    # on-the-fly aggregation.

  my $group;  # undefined
  if (defined $group_class && defined $group_name) {
    # normalize undef target coordinates so the multidimensional hash key
    # below is stable
    $tstart ||= '';
    $tstop  ||= '';
    if ($group_hash) {
      # memoize group objects: creation may be expensive, and many
      # features share the same group
      $group = $group_hash->{$group_class,$group_name,$tstart,$tstop}
        ||= $self->make_object($group_class,$group_name,$tstart,$tstop);
    } else {
      $group = $self->make_object($group_class,$group_name,$tstart,$tstop);
    }
  }

# fix for some broken GFF files
# unfortunately - has undesired side effects
#  if (defined $tstart && defined $tstop && !defined $strand) {
#    $strand = $tstart <= $tstop ? '+' : '-';
#  }

  if (ref $parent) { # note that the src sequence is ignored
    return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop,
                                                  $method,$source,
                                                  $score,$strand,$phase,
                                                  $group,$db_id,$group_id,
                                                  $tstart,$tstop);
  } else {
    return Bio::DB::GFF::Feature->new($self,$srcseq,
                                      $start,$stop,
                                      $method,$source,
                                      $score,$strand,$phase,
                                      $group,$db_id,$group_id,
                                      $tstart,$tstop);
  }
}

# Aggregating variant of make_feature(): buffers features until the group
# changes (or the stream ends, signalled by make_feature() returning
# undef), then runs the aggregators over the buffered batch and returns
# it as an arrayref.  Returns nothing while a batch is still accumulating.
sub make_aggregated_feature {
  my $self                 = shift;
  my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3);
  my $feature = $self->make_feature($parent,undef,@_);
  return [$feature] if $feature && !$feature->group;

  # if we have accumulated features and either:
  # (1) make_feature() returned undef, indicated very end or
  # (2) the current group is different from the previous one

  local $^W = 0;  # irritating uninitialized value warning in next statement
  if (@$accumulated_features &&
      (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) {
    foreach my $a (@$aggregators) {  # last aggregator gets first shot
      $a->aggregate($accumulated_features,$self) or next;
    }
    my @result = @$accumulated_features;
    # start the next batch with the feature that triggered the flush
    @$accumulated_features = $feature ? ($feature) : ();
    return unless @result;
    return \@result ;
  }
  push @$accumulated_features,$feature;
  return;
}

=head2 parse_types

 Title   : parse_types
 Usage   : $db->parse_types(@args)
 Function: parses list of types
 Returns : an array ref containing ['method','source'] pairs
 Args    : a list of types in 'method:source' form
 Status  : internal

This method takes an array of type names in the format "method:source"
and returns an array reference of ['method','source'] pairs.  It will
also accept a single argument consisting of an array reference with
the list of type names.

=cut

# turn feature types in the format "method:source" into a list of [method,source] refs
sub parse_types {
  my $self  = shift;
  return [] if !@_ or !defined($_[0]);
  # already in parsed form (arrayref of arrayrefs): pass through unchanged
  return $_[0] if ref $_[0] eq 'ARRAY' && ref $_[0][0];
  my @types = ref($_[0]) ? @{$_[0]} : @_;
  # 2-limit split so a source containing ':' is kept intact
  my @type_list = map { [split(':',$_,2)] } @types;
  return \@type_list;
}

=head2 make_match_sub

 Title   : make_match_sub
 Usage   : $db->make_match_sub($types)
 Function: creates a subroutine used for filtering features
 Returns : a code reference
 Args    : a list of parsed type names
 Status  : protected

This method is used internally to generate a code subroutine that will
accept or reject a feature based on its method and source.  It takes
an array of parsed type names in the format returned by parse_types(),
and generates an anonymous subroutine.  The subroutine takes a single
Bio::DB::GFF::Feature object and returns true if the feature matches
one of the desired feature types, and false otherwise.

=cut

# a subroutine that matches features indicated by list of types
sub make_match_sub {
  my $self = shift;
  my $types = shift;

  # no types requested means "match everything"
  return sub { 1 } unless ref $types && @$types;

  my @expr;
  for my $type (@$types) {
    my ($method,$source) = @$type;
    # missing method matches any method; missing source matches an
    # optional ":source" suffix
    $method ||= '.*';
    $source  = $source ? ":$source" : "(?::.+)?";
    push @expr,"${method}${source}";
  }
  my $expr = join '|',@expr;
  # compiled matchers are cached per alternation expression
  return $self->{match_subs}{$expr} if $self->{match_subs}{$expr};

  # NOTE: string eval compiles the regex into a closure; $expr is built
  # from caller-supplied type names, which are trusted here
  my $sub =<<END;
sub {
  my \$feature = shift or return;
  return \$feature->type =~ /^($expr)\$/i;
}
END
  warn "match sub: $sub\n" if $self->debug;
  my $compiled_sub = eval $sub;
  $self->throw($@) if $@;
  return $self->{match_subs}{$expr} = $compiled_sub;
}

=head2 make_object

 Title   : make_object
 Usage   : $db->make_object($class,$name,$start,$stop)
 Function: creates a feature object
 Returns : a feature object
 Args    : see below
 Status  : protected

This method is called to make an object from the GFF "group" field.
By default, all Target groups are turned into Bio::DB::GFF::Homol
objects, and everything else becomes a Bio::DB::GFF::Featname.
However, adaptors are free to override this method to generate more
interesting objects, such as true BioPerl objects, or Acedb objects.

Arguments are:

  $class      class of object
  $name       database ID for object
  $start      for similarities, start of match inside object
  $stop       for similarities, stop of match inside object

=cut

# abstract call to turn a feature into an object, given its class and name
sub make_object {
  my $self = shift;
  my ($class,$name,$start,$stop) = @_;
  # a defined, non-empty start position marks this as a similarity hit
  return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop)
    if defined $start and length $start;
  return Bio::DB::GFF::Featname->new($class,$name);
}


=head2 do_attributes

 Title   : do_attributes
 Usage   : $db->do_attributes($id [,$tag]);
 Function: internal method to retrieve attributes given an id and tag
 Returns : a list of Bio::DB::GFF::Feature objects
 Args    : a feature id and a attribute tag (optional)
 Status  : protected

This method is overridden by subclasses in order to return a list of
attributes.  If called with a tag, returns the value of attributes of
that tag type.
If called without a tag, returns a flattened array of +(tag=E<gt>value) pairs. A particular tag can be present multiple times. + +=cut + +sub do_attributes { + my $self = shift; + my ($id,$tag) = @_; + return (); +} + + + +=head1 Internal Methods + +The following methods are internal to Bio::DB::GFF and are not +guaranteed to remain the same. + +=head2 _features + + Title : _features + Usage : $db->_features($search,$options,$parent) + Function: internal method + Returns : a list of Bio::DB::GFF::Feature objects + Args : see below + Status : internal + +This is an internal method that is called by overlapping_features(), +contained_features() and features() to create features based on a +parent segment's coordinate system. It takes three arguments, a +search options hashref, an options hashref, and a parent segment. + +The search hashref contains the following keys: + + rangetype One of "overlaps", "contains" or "contained_in". Indicates + the type of range query requested. + refseq reference sequence ID + refclass reference sequence class + start start of range + stop stop of range + types arrayref containing list of types in "method:source" form + +The options hashref contains zero or more of the following keys: + + sparse turn on optimizations for a rare feature + automerge if true, invoke aggregators to merge features + iterator if true, return an iterator + +The $parent argument is a scalar object containing a +Bio::DB::GFF::RelSegment object or descendent. 

=cut

#'

sub _features {
  my $self = shift;
  my ($search,$options,$parent) = @_;
  # normalize the range so start <= stop
  (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)})
      if defined($search->{start}) && $search->{start} > $search->{stop};

  my $types = $self->parse_types($search->{types});  # parse out list of types
  my @aggregated_types = @$types;         # keep a copy

  # allow the aggregators to operate on the original
  my @aggregators;
  if ($options->{automerge}) {
    for my $a ($self->aggregators) {
      # clone so iterator-local aggregation state doesn't leak between calls
      $a = $a->clone if $options->{iterator};
      # an aggregator participates only if it claims some of the
      # requested types (disaggregate rewrites @aggregated_types in place)
      unshift @aggregators,$a
        if $a->disaggregate(\@aggregated_types,$self);
    }
  }

  if ($options->{iterator}) {
    my @accumulated_features;
    my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) }
                                         : sub { [$self->make_feature($parent,undef,@_)] };
    # group-sorted output is required for streaming aggregation to work
    return $self->get_features_iterator({ %$search,
                                          types => \@aggregated_types },
                                        { %$options,
                                          sort_by_group => $options->{automerge} },
                                        $callback
                                       );
  }

  my %groups;         # cache the groups we create to avoid consuming too much unnecessary memory
  my $features = [];

  my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) };
  $self->get_features({ %$search,
                        types => \@aggregated_types },
                      $options,
                      $callback);

  if ($options->{automerge}) {
    warn "aggregating...\n" if $self->debug;
    foreach my $a (@aggregators) {  # last aggregator gets first shot
      warn "Aggregator $a:\n" if $self->debug;
      $a->aggregate($features,$self);
    }
  }

  @$features;
}

=head2 get_features_iterator

 Title   : get_features_iterator
 Usage   : $db->get_features_iterator($search,$options,$callback)
 Function: get an iterator on a features query
 Returns : a Bio::SeqIO object
 Args    : as per get_features()
 Status  : Public

This method takes the same arguments as get_features(), but returns an
iterator that can be used to fetch features sequentially, as per
Bio::SeqIO.

Internally, this method is simply a front end to range_query().
The latter method constructs and executes the query, returning a
statement handle.  This routine passes the statement handle to the
constructor for the iterator, along with the callback.

=cut

sub get_features_iterator {
  my $self = shift;
  my ($search,$options,$callback) = @_;
  $self->throw('feature iteration is not implemented in this adaptor');
}

=head2 split_group

 Title   : split_group
 Usage   : $db->split_group($group_field,$gff3_flag)
 Function: parse GFF group field
 Returns : ($gclass,$gname,$tstart,$tstop,$attributes)
 Args    : the gff group column and a flag indicating gff3 compatibility
 Status  : internal

This is a method that is called by load_gff_line to parse out the
contents of one or more group fields.  It returns the class of the
group, its name, the start and stop of the target, if any, and an
array reference containing any attributes that were stuck into the
group field, in [attribute_name,attribute_value] format.

=cut

sub split_group {
  my $self = shift;
  my ($group,$gff3) = @_;
  if ($gff3) {
    my @groups = split /[;&]/,$group;  # so easy!
    return $self->_split_gff3_group(@groups);
  } else {
    # handle group parsing
    # protect embedded semicolons in the group; there must be faster/more elegant way
    # to do this.
    # Escaped and quoted semicolons are temporarily replaced by the $;
    # subscript separator, restored after splitting on ';'.
    $group =~ s/\\;/$;/g;
    while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 }
    my @groups = split(/\s*;\s*/,$group);
    foreach (@groups) { s/$;/;/g }
    return $self->_split_gff2_group(@groups);
  }
}

=head2 _split_gff2_group

This is an internal method called by split_group().

=cut

sub _split_gff2_group {
  my $self = shift;
  my @groups = @_;

  my ($gclass,$gname,$tstart,$tstop,@attributes);

  for (@groups) {

    # each group entry is "Tag value..." with the value optional
    my ($tag,$value) = /^(\S+)(?:\s+(.+))?/;
    $value ||= '';
    if ($value =~ /^\"(.+)\"$/) {  #remove quotes
      $value = $1;
    }
    # undo GFF2 escaping of tabs and carriage returns
    $value =~ s/\\t/\t/g;
    $value =~ s/\\r/\r/g;

    # Any additional groups become part of the attributes hash
    # For historical reasons, the tag "Note" is treated as an
    # attribute, even if it is the only group.
    $tag ||= '';
    if ($tag eq 'Note' or ($gclass && $gname)) {
      push @attributes,[$tag => $value];
    }

    # if the tag eq 'Target' then the class name is embedded in the ID
    # (the GFF format is obviously screwed up here)
    elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) {
      ($gclass,$gname) = ($1,$2);
      # target coordinates follow as two space-separated integers
      ($tstart,$tstop) = / (\d+) (\d+)/;
    }

    elsif (!$value) {
      push @attributes,[Note => $tag];  # e.g. "Confirmed_by_EST"
    }

    # otherwise, the tag and value correspond to the
    # group class and name
    else {
      ($gclass,$gname) = ($tag,$value);
    }
  }

  return ($gclass,$gname,$tstart,$tstop,\@attributes);
}

=head2 _split_gff3_group

This is called internally from split_group().

=cut

# Parse GFF3 "tag=value[,value...]" pairs.  Parent/ID/Target establish
# the group identity (class is always 'Sequence'); every remaining value
# of every tag is recorded as a [tag => value] attribute.
sub _split_gff3_group {
  my $self  = shift;
  my @pairs = @_;
  my ($class,$name,$target_start,$target_stop,@attrs);

  for my $pair (@pairs) {
    my ($tag,$value) = split /=/,$pair;
    $tag = unescape($tag);
    my @values = map { unescape($_) } split /,/,$value;

    # Parent and ID are handled identically: the first value becomes the
    # group name (a later ID overrides an earlier Parent, and vice versa).
    if ($tag eq 'Parent' or $tag eq 'ID') {
      $class = 'Sequence';
      $name  = shift @values;
    }
    # Target's first value packs "name start stop" into one field
    elsif ($tag eq 'Target') {
      $class = 'Sequence';
      ($name,$target_start,$target_stop) = split /\s+/, shift @values;
    }
    # leftover values (all values, for ordinary tags) become attributes
    push @attrs, [$tag => $_] for @values;
  }
  return ($class,$name,$target_start,$target_stop,\@attrs);
}

=head2 _delete_features(), _delete_groups(),_delete()

 Title   : _delete_features(), _delete_groups(),_delete()
 Usage   : $count = $db->_delete_features(@feature_ids)
           $count = $db->_delete_groups(@group_ids)
           $count = $db->_delete(\%delete_spec)
 Function: low-level feature/group deleter
 Returns : count of groups removed
 Args    : list of feature or group ids removed
 Status  : for implementation by subclasses

These methods need to be implemented in adaptors.  For
_delete_features and _delete_groups, the arguments are a list of
feature or group IDs to remove.  For _delete(), the argument is a
hashref with the three keys 'segments', 'types' and 'force'.  The
first contains an arrayref of Bio::DB::GFF::RelSegment objects to
delete (all FEATURES within the segment are deleted).  The second
contains an arrayref of [method,source] feature types to delete.  The
two are ANDed together.  If 'force' has a true value, this forces the
operation to continue even if it would delete all features. 

=cut

# Abstract deletion hooks: the base class only throws; adaptors with
# real storage override these.

sub _delete_features {
  my ($self,@ids) = @_;
  $self->throw('_delete_features is not implemented in this adaptor');
}

sub _delete_groups {
  my ($self,@ids) = @_;
  $self->throw('_delete_groups is not implemented in this adaptor');
}

sub _delete {
  my ($self,$opts) = @_;
  $self->throw('_delete is not implemented in this adaptor');
}

# URL-style decoder used on GFF3 tags and values: '+' becomes a space,
# then %XX hex escapes are decoded.  Plain function, not a method.
sub unescape {
  my $str = shift;
  $str =~ tr/+/ /;
  $str =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  return $str;
}


# Simple pull iterator over a list of stored identifiers.  Each call to
# next_seq() consumes one id and resolves it through the database using
# the lookup style fixed at construction time ('name', 'feature' or
# 'group').
package Bio::DB::GFF::ID_Iterator;
use strict;

use Bio::Root::Root;
use vars '@ISA';
@ISA = 'Bio::Root::Root';

# new($db, \@ids, $type): remember the database handle, the id queue,
# and which fetch method to use.
sub new {
  my ($class,$db,$ids,$type) = @_;
  my $self = { ids => $ids, db => $db, type => $type };
  return bless $self, $class;
}

# Return the next resolved segment/feature, or an empty list when the
# id queue is exhausted.  Throws on an unknown iterator type or when an
# id fails to resolve.
sub next_seq {
  my $self = shift;
  my $id   = shift @{$self->{ids}};
  return unless $id;
  # two-element arrayrefs are [class,name] pairs needing a Featname
  my $name = ref($id) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$id) : $id;
  my $type = $self->{type};
  my $segment;
  if ($type eq 'name') {
    $segment = $self->{db}->segment($name);
  }
  elsif ($type eq 'feature') {
    $segment = $self->{db}->fetch_feature_by_id($name);
  }
  elsif ($type eq 'group') {
    $segment = $self->{db}->fetch_feature_by_gid($name);
  }
  else {
    $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier");
  }
  $self->throw("id does not exist") unless $segment;
  return $segment;
}

1;

__END__

=head1 BUGS

Features can only belong to a single group at a time.  This must be
addressed soon.

Start coordinate can be greater than stop coordinate for relative
addressing.  This breaks strict BioPerl compatibility and must be
fixed.

=head1 SEE ALSO

L<bioperl>,
L<Bio::DB::GFF::RelSegment>,
L<Bio::DB::GFF::Aggregator>,
L<Bio::DB::GFF::Feature>,
L<Bio::DB::GFF::Adaptor::dbi::mysqlopt>,
L<Bio::DB::GFF::Adaptor::dbi::oracle>,
L<Bio::DB::GFF::Adaptor::memory>

=head1 AUTHOR

Lincoln Stein E<lt>lstein@cshl.orgE<gt>.

Copyright (c) 2001 Cold Spring Harbor Laboratory. 
+ +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut +