Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/GFF.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: GFF.pm,v 1.71.2.2 2003/09/12 13:29:32 lstein Exp $ | |
| 2 | |
| 3 =head1 NAME | |
| 4 | |
| 5 Bio::DB::GFF -- Storage and retrieval of sequence annotation data | |
| 6 | |
| 7 =head1 SYNOPSIS | |
| 8 | |
| 9 use Bio::DB::GFF; | |
| 10 | |
| 11 # Open the sequence database | |
| 12 my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt', | |
| 13 -dsn => 'dbi:mysql:elegans', | |
| 14 -fasta => '/usr/local/fasta_files' | |
| 15 ); | |
| 16 | |
| 17 # fetch a 1 megabase segment of sequence starting at landmark "ZK909" | |
| 18 my $segment = $db->segment('ZK909', 1 => 1000000); | |
| 19 | |
| 20 # pull out all transcript features | |
| 21 my @transcripts = $segment->features('transcript'); | |
| 22 | |
| 23 # for each transcript, total the length of the introns | |
| 24 my %totals; | |
| 25 for my $t (@transcripts) { | |
| 26 my @introns = $t->Intron; | |
| 27 $totals{$t->name} += $_->length foreach @introns; | |
| 28 } | |
| 29 | |
| 30 # Sort the exons of the first transcript by position | |
| 31 my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon; | |
| 32 | |
| 33 # Get a region 1000 bp upstream of first exon | |
| 34 my $upstream = $exons[0]->segment(-1000,0); | |
| 35 | |
| 36 # get its DNA | |
| 37 my $dna = $upstream->dna; | |
| 38 | |
| 39 # and get all curated polymorphisms inside it | |
| 40 @polymorphisms = $upstream->contained_features('polymorphism:curated'); | |
| 41 | |
| 42 # get all feature types in the database | |
| 43 my @types = $db->types; | |
| 44 | |
| 45 # count all feature types in the segment | |
| 46 my %type_counts = $segment->types(-enumerate=>1); | |
| 47 | |
| 48 # get an iterator on all curated features of type 'exon' or 'intron' | |
| 49 my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']); | |
| 50 | |
| 51 while (my $s = $iterator->next_seq) { | |
| 52 print $s,"\n"; | |
| 53 } | |
| 54 | |
| 55 # find all transcripts annotated as having function 'kinase' | |
| 56 my $iterator = $db->get_seq_stream(-type=>'transcript', | |
| 57 -attributes=>{Function=>'kinase'}); | |
| 58 while (my $s = $iterator->next_seq) { | |
| 59 print $s,"\n"; | |
| 60 } | |
| 61 | |
| 62 =head1 DESCRIPTION | |
| 63 | |
| 64 Bio::DB::GFF provides fast indexed access to a sequence annotation | |
| 65 database. It supports multiple database types (ACeDB, relational), | |
| 66 and multiple schemas through a system of adaptors and aggregators. | |
| 67 | |
| 68 The following operations are supported by this module: | |
| 69 | |
| 70 - retrieving a segment of sequence based on the ID of a landmark | |
| 71 - retrieving the DNA from that segment | |
| 72 - finding all annotations that overlap with the segment | |
| 73 - finding all annotations that are completely contained within the | |
| 74 segment | |
| 75 - retrieving all annotations of a particular type, either within a | |
| 76 segment, or globally | |
| 77 - conversion from absolute to relative coordinates and back again, | |
| 78 using any arbitrary landmark for the relative coordinates | |
| 79 - using a sequence segment to create new segments based on relative | |
| 80 offsets | |
| 81 | |
| 82 The data model used by Bio::DB::GFF is compatible with the GFF flat | |
| 83 file format (http://www.sanger.ac.uk/software/GFF). The module can | |
| 84 load a set of GFF files into the database, and serves objects that | |
| 85 have methods corresponding to GFF fields. | |
| 86 | |
| 87 The objects returned by Bio::DB::GFF are compatible with the | |
| 88 SeqFeatureI interface, allowing their use by the Bio::Graphics and | |
| 89 Bio::DAS modules. | |
| 90 | |
| 91 =head2 Auxiliary Scripts | |
| 92 | |
| 93 The bioperl distribution includes several scripts that make it easier | |
| 94 to work with Bio::DB::GFF databases. They are located in the scripts | |
| 95 directory under a subdirectory named Bio::DB::GFF: | |
| 96 | |
| 97 =over 4 | |
| 98 | |
| 99 =item bp_load_gff.pl | |
| 100 | |
| 101 This script will load a Bio::DB::GFF database from a flat GFF file of | |
| 102 sequence annotations. Only the relational database version of | |
| 103 Bio::DB::GFF is supported. It can be used to create the database from | |
| 104 scratch, as well as to incrementally load new data. | |
| 105 | |
| 106 This script takes a --fasta argument to load raw DNA into the database | |
| 107 as well. However, GFF databases do not require access to the raw DNA | |
| 108 for most of their functionality. | |
| 109 | |
| 110 load_gff.pl also has a --upgrade option, which will perform a | |
| 111 non-destructive upgrade of older schemas to newer ones. | |
| 112 | |
| 113 =item bp_bulk_load_gff.pl | |
| 114 | |
| 115 This script will populate a Bio::DB::GFF database from a flat GFF file | |
| 116 of sequence annotations. Only the MySQL database version of | |
| 117 Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in | |
| 118 order to accelerate loading considerably; however, it can only be used | |
| 119 for the initial load, and not for updates. | |
| 120 | |
| 121 This script takes a --fasta argument to load raw DNA into the database | |
| 122 as well. However, GFF databases do not require access to the raw DNA | |
| 123 for most of their functionality. | |
| 124 | |
| 125 =item bp_fast_load_gff.pl | |
| 126 | |
| 127 This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe | |
| 128 tricks to allow for incremental updates. It only supports the MySQL | |
| 129 database version of Bio::DB::GFF and is guaranteed not to work on | |
| 130 non-Unix platforms. | |
| 131 | |
| 132 Arguments are the same as bp_load_gff.pl | |
| 133 | |
| 134 =item gadfly_to_gff.pl | |
| 135 | |
| 136 This script will convert the GFF-like format used by the Berkeley | |
| 137 Drosophila Sequencing project into a format suitable for use with this | |
| 138 module. | |
| 139 | |
| 140 =item sgd_to_gff.pl | |
| 141 | |
| 142 This script will convert the tab-delimited feature files used by the | |
| 143 Saccharomyces Genome Database into a format suitable for use with this | |
| 144 module. | |
| 145 | |
| 146 =back | |
| 147 | |
| 148 =head2 GFF Fundamentals | |
| 149 | |
| 150 The GFF format is a flat tab-delimited file, each line of which | |
| 151 corresponds to an annotation, or feature. Each line has nine columns | |
| 152 and looks like this: | |
| 153 | |
| 154 Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7" | |
| 155 | |
| 156 The 9 columns are as follows: | |
| 157 | |
| 158 =over 4 | |
| 159 | |
| 160 =item 1. reference sequence | |
| 161 | |
| 162 This is the ID of the sequence that is used to establish the | |
| 163 coordinate system of the annotation. In the example above, the | |
| 164 reference sequence is "Chr1". | |
| 165 | |
| 166 =item 2. source | |
| 167 | |
| 168 The source of the annotation. This field describes how the annotation | |
| 169 was derived. In the example above, the source is "curated" to | |
| 170 indicate that the feature is the result of human curation. The names | |
| 171 and versions of software programs are often used for the source field, | |
| 172 as in "tRNAScan-SE/1.2". | |
| 173 | |
| 174 =item 3. method | |
| 175 | |
| 176 The annotation method. This field describes the type of the | |
| 177 annotation, such as "CDS". Together the method and source describe | |
| 178 the annotation type. | |
| 179 | |
| 180 =item 4. start position | |
| 181 | |
| 182 The start of the annotation relative to the reference sequence. | |
| 183 | |
| 184 =item 5. stop position | |
| 185 | |
| 186 The stop of the annotation relative to the reference sequence. Start | |
| 187 is always less than or equal to stop. | |
| 188 | |
| 189 =item 6. score | |
| 190 | |
| 191 For annotations that are associated with a numeric score (for example, | |
| 192 a sequence similarity), this field describes the score. The score | |
| 193 units are completely unspecified, but for sequence similarities, it is | |
| 194 typically percent identity. Annotations that don't have a score can | |
| 195 use "." | |
| 196 | |
| 197 =item 7. strand | |
| 198 | |
| 199 For those annotations which are strand-specific, this field is the | |
| 200 strand on which the annotation resides. It is "+" for the forward | |
| 201 strand, "-" for the reverse strand, or "." for annotations that are | |
| 202 not stranded. | |
| 203 | |
| 204 =item 8. phase | |
| 205 | |
| 206 For annotations that are linked to proteins, this field describes the | |
| 207 phase of the annotation on the codons. It is a number from 0 to 2, or | |
| 208 "." for features that have no phase\. | |
| 209 | |
| 210 =item 9. group | |
| 211 | |
| 212 GFF provides a simple way of generating annotation hierarchies ("is | |
| 213 composed of" relationships) by providing a group field. The group | |
| 214 field contains the class and ID of an annotation which is the logical | |
| 215 parent of the current one. In the example given above, the group is | |
| 216 the Transcript named "R119.7". | |
| 217 | |
| 218 The group field is also used to store information about the target of | |
| 219 sequence similarity hits, and miscellaneous notes. See the next | |
| 220 section for a description of how to describe similarity targets. | |
| 221 | |
| 222 The format of the group fields is "Class ID" with a single space (not | |
| 223 a tab) separating the class from the ID. It is VERY IMPORTANT to | |
| 224 follow this format, or grouping will not work properly. | |
| 225 | |
| 226 =back | |
| 227 | |
| 228 The sequences used to establish the coordinate system for annotations | |
| 229 can correspond to sequenced clones, clone fragments, contigs or | |
| 230 super-contigs. Thus, this module can be used throughout the lifecycle | |
| 231 of a sequencing project. | |
| 232 | |
| 233 In addition to a group ID, the GFF format allows annotations to have a | |
| 234 group class. For example, in the ACeDB representation, RNA | |
| 235 interference experiments have a class of "RNAi" and an ID that is | |
| 236 unique among the RNAi experiments. Since not all databases support | |
| 237 this notion, the class is optional in all calls to this module, and | |
| 238 defaults to "Sequence" when not provided. | |
| 239 | |
| 240 Double-quotes are sometimes used in GFF files around components of the | |
| 241 group field. Strictly, this is only necessary if the group name or | |
| 242 class contains whitespace. | |
| 243 | |
| 244 =head2 Making GFF files work with this module | |
| 245 | |
| 246 Some annotations do not need to be individually named. For example, | |
| 247 it is probably not useful to assign a unique name to each ALU repeat | |
| 248 in a vertebrate genome. Others, such as predicted genes, correspond | |
| 249 to named biological objects; you probably want to be able to fetch the | |
| 250 positions of these objects by referring to them by name. | |
| 251 | |
| 252 To accomodate named annotations, the GFF format places the object | |
| 253 class and name in the group field. The name identifies the object, | |
| 254 and the class prevents similarly-named objects, for example clones and | |
| 255 sequences, from collding. | |
| 256 | |
| 257 A named object is shown in the following excerpt from a GFF file: | |
| 258 | |
| 259 Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2 | |
| 260 | |
| 261 This object is a predicted transcript named Y95BA.2. In this case, | |
| 262 the group field is used to identify the class and name of the object, | |
| 263 even though no other annotation belongs to that group. | |
| 264 | |
| 265 It now becomes possible to retrieve the region of the genome covered | |
| 266 by transcript Y95B8A.2 using the segment() method: | |
| 267 | |
| 268 $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2'); | |
| 269 | |
| 270 It is not necessary for the annotation's method to correspond to the | |
| 271 object class, although this is commonly the case. | |
| 272 | |
| 273 As explained above, each annotation in a GFF file refers to a | |
| 274 reference sequence. It is important that each reference sequence also | |
| 275 be identified by a line in the GFF file. This allows the Bio::DB::GFF | |
| 276 module to determine the length and class of the reference sequence, | |
| 277 and makes it possible to do relative arithmetic. | |
| 278 | |
| 279 For example, if "Chr1" is used as a reference sequence, then it should | |
| 280 have an entry in the GFF file similar to this one: | |
| 281 | |
| 282 Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1 | |
| 283 | |
| 284 This indicates that the reference sequence named "Chr1" has length | |
| 285 14972282 bp, method "chromosome" and source "assembly". In addition, | |
| 286 as indicated by the group field, Chr1 has class "Sequence" and name | |
| 287 "Chr1". | |
| 288 | |
| 289 The object class "Sequence" is used by default when the class is not | |
| 290 specified in the segment() call. This allows you to use a shortcut | |
| 291 form of the segment() method: | |
| 292 | |
| 293 $segment = $db->segment('Chr1'); # whole chromosome | |
| 294 $segment = $db->segment('Chr1',1=>1000); # first 1000 bp | |
| 295 | |
| 296 For your convenience, if, during loading a GFF file, Bio::DB::GFF | |
| 297 encounters a line like the following: | |
| 298 | |
| 299 ##sequence-region Chr1 1 14972282 | |
| 300 | |
| 301 It will automatically generate the following entry: | |
| 302 | |
| 303 Chr1 reference Component 1 14972282 . + . Sequence Chr1 | |
| 304 | |
| 305 This is sufficient to use Chr1 as a reference point. | |
| 306 The ##sequence-region line is frequently found in the GFF files | |
| 307 distributed by annotation groups. | |
| 308 | |
| 309 =head2 Sequence alignments | |
| 310 | |
| 311 There are two cases in which an annotation indicates the relationship | |
| 312 between two sequences. The first case is a similarity hit, where the | |
| 313 annotation indicates an alignment. The second case is a map assembly, | |
| 314 in which the annotation indicates that a portion of a larger sequence | |
| 315 is built up from one or more smaller ones. | |
| 316 | |
| 317 Both cases are indicated by using the B<Target> tag in the group | |
| 318 field. For example, a typical similarity hit will look like this: | |
| 319 | |
| 320 Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544 | |
| 321 | |
| 322 The group field contains the Target tag, followed by an identifier for | |
| 323 the biological object referred to. The GFF format uses the notation | |
| 324 I<Class>:I<Name> for the biological object, and even though this is | |
| 325 stylistically inconsistent, that's the way it's done. The object | |
| 326 identifier is followed by two integers indicating the start and stop | |
| 327 of the alignment on the target sequence. | |
| 328 | |
| 329 Unlike the main start and stop columns, it is possible for the target | |
| 330 start to be greater than the target end. The previous example | |
| 331 indicates that the the section of Chr1 from 76,953 to 77,108 aligns to | |
| 332 the protein SW:ABL_DROME starting at position 493 and extending to | |
| 333 position 544. | |
| 334 | |
| 335 A similar notation is used for sequence assembly information as shown | |
| 336 in this example: | |
| 337 | |
| 338 Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826 | |
| 339 LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742 | |
| 340 | |
| 341 This indicates that the region between bases 10922906 and 11177731 of | |
| 342 Chr1 are composed of LINK_H06O01 from bp 1 to bp 254826. The region | |
| 343 of LINK_H0601 between 32386 and 64122 is, in turn, composed of the | |
| 344 bases 5 to 31742 of cosmid F49B2. | |
| 345 | |
| 346 =head2 Attributes | |
| 347 | |
| 348 While not intended to serve as a general-purpose sequence database | |
| 349 (see bioperl-db for that), GFF allows you to tag features with | |
| 350 arbitrary attributes. Attributes appear in the Group field following | |
| 351 the initial class/name pair. For example: | |
| 352 | |
| 353 Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3 | |
| 354 | |
| 355 This line tags the feature named Transcript Y95B8A.2 as being "Gene" | |
| 356 named sma-3 and having the Alias "sma3". Features having these | |
| 357 attributes can be looked up using the fetch_feature_by_attribute() method. | |
| 358 | |
| 359 Two attributes have special meaning: "Note" is for backward | |
| 360 compatibility and is used for unstructured text remarks. "Alias" is | |
| 361 considered as a synonym for the feature name and will be consulted | |
| 362 when looking up a feature by its name. | |
| 363 | |
| 364 =head2 Adaptors and Aggregators | |
| 365 | |
| 366 This module uses a system of adaptors and aggregators in order to make | |
| 367 it adaptable to use with a variety of databases. | |
| 368 | |
| 369 =over 4 | |
| 370 | |
| 371 =item Adaptors | |
| 372 | |
| 373 The core of the module handles the user API, annotation coordinate | |
| 374 arithmetic, and other common issues. The details of fetching | |
| 375 information from databases is handled by an adaptor, which is | |
| 376 specified during Bio::DB::GFF construction. The adaptor encapsulates | |
| 377 database-specific information such as the schema, user authentication | |
| 378 and access methods. | |
| 379 | |
| 380 Currently there are two adaptors: 'dbi::mysql' and 'dbi::mysqlopt'. | |
| 381 The former is an interface to a simple Mysql schema. The latter is an | |
| 382 optimized version of dbi::mysql which uses a binning scheme to | |
| 383 accelerate range queries and the Bio::DB::Fasta module for rapid | |
| 384 retrieval of sequences. Note the double-colon between the words. | |
| 385 | |
| 386 =item Aggregators | |
| 387 | |
| 388 The GFF format uses a "group" field to indicate aggregation properties | |
| 389 of individual features. For example, a set of exons and introns may | |
| 390 share a common transcript group, and multiple transcripts may share | |
| 391 the same gene group. | |
| 392 | |
| 393 Aggregators are small modules that use the group information to | |
| 394 rebuild the hierarchy. When a Bio::DB::GFF object is created, you | |
| 395 indicate that it use a set of one or more aggregators. Each | |
| 396 aggregator provides a new composite annotation type. Before the | |
| 397 database query is generated each aggregator is called to | |
| 398 "disaggregate" its annotation type into list of component types | |
| 399 contained in the database. After the query is generated, each | |
| 400 aggregator is called again in order to build composite annotations | |
| 401 from the returned components. | |
| 402 | |
| 403 For example, during disaggregation, the standard | |
| 404 "processed_transcript" aggregator generates a list of component | |
| 405 feature types including "UTR", "CDS", and "polyA_site". Later, it | |
| 406 aggregates these features into a set of annotations of type | |
| 407 "processed_transcript". | |
| 408 | |
| 409 During aggregation, the list of aggregators is called in reverse | |
| 410 order. This allows aggregators to collaborate to create multi-level | |
| 411 structures: the transcript aggregator assembles transcripts from | |
| 412 introns and exons; the gene aggregator then assembles genes from sets | |
| 413 of transcripts. | |
| 414 | |
| 415 Three default aggregators are provided: | |
| 416 | |
| 417 transcript assembles transcripts from features of type | |
| 418 exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA | |
| 419 clone assembles clones from Clone_left_end, Clone_right_end | |
| 420 and Sequence features. | |
| 421 alignment assembles gapped alignments from features of type | |
| 422 "similarity". | |
| 423 | |
| 424 In addition, this module provides the optional "wormbase_gene" | |
| 425 aggregator, which accomodates the WormBase representation of genes. | |
| 426 This aggregator aggregates features of method "exon", "CDS", "5'UTR", | |
| 427 "3'UTR", "polyA" and "TSS" into a single object. It also expects to | |
| 428 find a single feature of type "Sequence" that spans the entire gene. | |
| 429 | |
| 430 The existing aggregators are easily customized. | |
| 431 | |
| 432 Note that aggregation will not occur unless you specifically request | |
| 433 the aggregation type. For example, this call: | |
| 434 | |
| 435 @features = $segment->features('alignment'); | |
| 436 | |
| 437 will generate an array of aggregated alignment features. However, | |
| 438 this call: | |
| 439 | |
| 440 @features = $segment->features(); | |
| 441 | |
| 442 will return a list of unaggregated similarity segments. | |
| 443 | |
| 444 For more informnation, see the manual pages for | |
| 445 Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone, | |
| 446 etc. | |
| 447 | |
| 448 =back | |
| 449 | |
| 450 =head1 API | |
| 451 | |
| 452 The following is the API for Bio::DB::GFF. | |
| 453 | |
| 454 =cut | |
| 455 | |
| 456 package Bio::DB::GFF; | |
| 457 | |
| 458 use strict; | |
| 459 | |
| 460 use Bio::DB::GFF::Util::Rearrange; | |
| 461 use Bio::DB::GFF::RelSegment; | |
| 462 use Bio::DB::GFF::Feature; | |
| 463 use Bio::DB::GFF::Aggregator; | |
| 464 use Bio::DasI; | |
| 465 use Bio::Root::Root; | |
| 466 | |
| 467 use vars qw(@ISA $VERSION); | |
| 468 @ISA = qw(Bio::Root::Root Bio::DasI); | |
| 469 | |
| 470 $VERSION = '1.2003'; | |
| 471 my %valid_range_types = (overlaps => 1, | |
| 472 contains => 1, | |
| 473 contained_in => 1); | |
| 474 | |
| 475 =head1 Querying GFF Databases | |
| 476 | |
| 477 =head2 new | |
| 478 | |
| 479 Title : new | |
| 480 Usage : my $db = new Bio::DB::GFF(@args); | |
| 481 Function: create a new Bio::DB::GFF object | |
| 482 Returns : new Bio::DB::GFF object | |
| 483 Args : lists of adaptors and aggregators | |
| 484 Status : Public | |
| 485 | |
| 486 These are the arguments: | |
| 487 | |
| 488 -adaptor Name of the adaptor module to use. If none | |
| 489 provided, defaults to "dbi::mysqlopt". | |
| 490 | |
| 491 -aggregator Array reference to a list of aggregators | |
| 492 to apply to the database. If none provided, | |
| 493 defaults to ['processed_transcript','alignment']. | |
| 494 | |
| 495 <other> Any other named argument pairs are passed to | |
| 496 the adaptor for processing. | |
| 497 | |
| 498 The adaptor argument must correspond to a module contained within the | |
| 499 Bio::DB::GFF::Adaptor namespace. For example, the | |
| 500 Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying | |
| 501 'dbi::mysql'. By Perl convention, the adaptors names are lower case | |
| 502 because they are loaded at run time. | |
| 503 | |
| 504 The aggregator array may contain a list of aggregator names, a list of | |
| 505 initialized aggregator objects, or a string in the form | |
| 506 "aggregator_name{subpart1,subpart2,subpart3/main_method}" (the | |
| 507 /main_method part is optional). For example, if you wish to change | |
| 508 the components aggregated by the transcript aggregator, you could pass | |
| 509 it to the GFF constructor this way: | |
| 510 | |
| 511 my $transcript = | |
| 512 Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr | |
| 513 polyA spliced_leader)]); | |
| 514 | |
| 515 my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment], | |
| 516 -adaptor => 'dbi::mysql', | |
| 517 -dsn => 'dbi:mysql:elegans42'); | |
| 518 | |
| 519 Alternatively, you could create an entirely new transcript aggregator | |
| 520 this way: | |
| 521 | |
| 522 my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}'; | |
| 523 my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment], | |
| 524 -adaptor => 'dbi::mysql', | |
| 525 -dsn => 'dbi:mysql:elegans42'); | |
| 526 | |
| 527 See L<Bio::DB::GFF::Aggregator> for more details. | |
| 528 | |
| 529 The commonly used 'dbi::mysql' adaptor recognizes the following | |
| 530 adaptor-specific arguments: | |
| 531 | |
| 532 Argument Description | |
| 533 -------- ----------- | |
| 534 | |
| 535 -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' | |
| 536 If a partial name is given, such as "ens0040", the | |
| 537 "dbi:mysql:" prefix will be added automatically. | |
| 538 | |
| 539 -user username for authentication | |
| 540 | |
| 541 -pass the password for authentication | |
| 542 | |
| 543 -refclass landmark Class; defaults to "Sequence" | |
| 544 | |
| 545 The commonly used 'dbi::mysqlopt' adaptor also recogizes the following | |
| 546 arguments. | |
| 547 | |
| 548 Argument Description | |
| 549 -------- ----------- | |
| 550 | |
| 551 -fasta path to a directory containing FASTA files for the DNA | |
| 552 contained in this database (e.g. "/usr/local/share/fasta") | |
| 553 | |
| 554 -acedb an acedb URL to use when converting features into ACEDB | |
| 555 objects (e.g. sace://localhost:2005) | |
| 556 | |
| 557 =cut | |
| 558 | |
| 559 #' | |
| 560 | |
| 561 sub new { | |
| 562 my $package = shift; | |
| 563 my ($adaptor,$aggregators,$args,$refclass); | |
| 564 | |
| 565 if (@_ == 1) { # special case, default to dbi::mysqlopt | |
| 566 $adaptor = 'dbi::mysqlopt'; | |
| 567 $args = {DSN => shift}; | |
| 568 } else { | |
| 569 ($adaptor,$aggregators,$refclass,$args) = rearrange([ | |
| 570 [qw(ADAPTOR FACTORY)], | |
| 571 [qw(AGGREGATOR AGGREGATORS)], | |
| 572 'REFCLASS', | |
| 573 ],@_); | |
| 574 } | |
| 575 | |
| 576 $adaptor ||= 'dbi::mysqlopt'; | |
| 577 my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E"; | |
| 578 eval "require $class" unless $class->can('new'); | |
| 579 $package->throw("Unable to load $adaptor adaptor: $@") if $@; | |
| 580 | |
| 581 my $self = $class->new($args); | |
| 582 $self->default_class($refclass) if defined $refclass; | |
| 583 | |
| 584 # handle the aggregators. | |
| 585 # aggregators are responsible for creating complex multi-part features | |
| 586 # from the GFF "group" field. If none are provided, then we provide a | |
| 587 # list of the two used in WormBase. | |
| 588 # Each aggregator can be a scalar or a ref. In the former case | |
| 589 # it is treated as a class name to call new() on. In the latter | |
| 590 # the aggreator is treated as a ready made object. | |
| 591 $aggregators = $self->default_aggregators unless defined $aggregators; | |
| 592 my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators; | |
| 593 for my $a (@a) { | |
| 594 $self->add_aggregator($a); | |
| 595 } | |
| 596 | |
| 597 # default settings go here..... | |
| 598 $self->automerge(1); # set automerge to true | |
| 599 | |
| 600 $self; | |
| 601 } | |
| 602 | |
| 603 | |
| 604 =head2 types | |
| 605 | |
| 606 Title : types | |
| 607 Usage : $db->types(@args) | |
| 608 Function: return list of feature types in range or database | |
| 609 Returns : a list of Bio::DB::GFF::Typename objects | |
| 610 Args : see below | |
| 611 Status : public | |
| 612 | |
| 613 This routine returns a list of feature types known to the database. | |
| 614 The list can be database-wide or restricted to a region. It is also | |
| 615 possible to find out how many times each feature occurs. | |
| 616 | |
| 617 For range queries, it is usually more convenient to create a | |
| 618 Bio::DB::GFF::Segment object, and then invoke it's types() method. | |
| 619 | |
| 620 Arguments are as follows: | |
| 621 | |
| 622 -ref ID of reference sequence | |
| 623 -class class of reference sequence | |
| 624 -start start of segment | |
| 625 -stop stop of segment | |
| 626 -enumerate if true, count the features | |
| 627 | |
| 628 The returned value will be a list of Bio::DB::GFF::Typename objects, | |
| 629 which if evaluated in a string context will return the feature type in | |
| 630 "method:source" format. This object class also has method() and | |
| 631 source() methods for retrieving the like-named fields. | |
| 632 | |
| 633 If -enumerate is true, then the function returns a hash (not a hash | |
| 634 reference) in which the keys are type names in "method:source" format | |
| 635 and the values are the number of times each feature appears in the | |
| 636 database or segment. | |
| 637 | |
| 638 The argument -end is a synonum for -stop, and -count is a synonym for | |
| 639 -enumerate. | |
| 640 | |
| 641 =cut | |
| 642 | |
| 643 sub types { | |
| 644 my $self = shift; | |
| 645 my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([ | |
| 646 [qw(REF REFSEQ)], | |
| 647 qw(START), | |
| 648 [qw(STOP END)], | |
| 649 [qw(ENUMERATE COUNT)], | |
| 650 [qw(CLASS SEQCLASS)], | |
| 651 [qw(TYPE TYPES)], | |
| 652 ],@_); | |
| 653 $types = $self->parse_types($types) if defined $types; | |
| 654 $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types); | |
| 655 } | |
| 656 | |
| 657 =head2 classes | |
| 658 | |
| 659 Title : classes | |
| 660 Usage : $db->classes | |
| 661 Function: return list of landmark classes in database | |
| 662 Returns : a list of classes | |
| 663 Args : none | |
| 664 Status : public | |
| 665 | |
| 666 This routine returns the list of reference classes known to the | |
| 667 database, or empty if classes are not used by the database. Classes | |
| 668 are distinct from types, being essentially qualifiers on the reference | |
| 669 namespaces. | |
| 670 | |
| 671 =cut | |
| 672 | |
| 673 sub classes { | |
| 674 my $self = shift; | |
| 675 return (); | |
| 676 } | |
| 677 | |
| 678 =head2 segment | |
| 679 | |
| 680 Title : segment | |
| 681 Usage : $db->segment(@args); | |
| 682 Function: create a segment object | |
| 683 Returns : segment object(s) | |
| 684 Args : numerous, see below | |
| 685 Status : public | |
| 686 | |
| 687 This method generates a segment object, which is a Perl object | |
| 688 subclassed from Bio::DB::GFF::Segment. The segment can be used to | |
| 689 find overlapping features and the raw DNA. | |
| 690 | |
| 691 When making the segment() call, you specify the ID of a sequence | |
| 692 landmark (e.g. an accession number, a clone or contig), and a | |
| 693 positional range relative to the landmark. If no range is specified, | |
| 694 then the entire extent of the landmark is used to generate the | |
| 695 segment. | |
| 696 | |
| 697 You may also provide the ID of a "reference" sequence, which will set | |
| 698 the coordinate system and orientation used for all features contained | |
| 699 within the segment. The reference sequence can be changed later. If | |
| 700 no reference sequence is provided, then the coordinate system is based | |
| 701 on the landmark. | |
| 702 | |
| 703 Arguments: | |
| 704 | |
| 705 -name ID of the landmark sequence. | |
| 706 | |
| 707 -class Database object class for the landmark sequence. | |
| 708 "Sequence" assumed if not specified. This is | |
| 709 irrelevant for databases which do not recognize | |
| 710 object classes. | |
| 711 | |
| 712 -start Start of the segment relative to landmark. Positions | |
| 713 follow standard 1-based sequence rules. If not specified, | |
| 714 defaults to the beginning of the landmark. | |
| 715 | |
| 716 -end Stop of the segment relative to the landmark. If not specified, | |
| 717 defaults to the end of the landmark. | |
| 718 | |
| 719 -stop Same as -end. | |
| 720 | |
| 721 -offset For those who prefer 0-based indexing, the offset specifies the | |
| 722 position of the new segment relative to the start of the landmark. | |
| 723 | |
| 724 -length For those who prefer 0-based indexing, the length specifies the | |
| 725 length of the new segment. | |
| 726 | |
| 727 -refseq Specifies the ID of the reference landmark used to establish the | |
| 728 coordinate system for the newly-created segment. | |
| 729 | |
| 730 -refclass Specifies the class of the reference landmark, for those databases | |
| 731 that distinguish different object classes. Defaults to "Sequence". | |
| 732 | |
| 733 -absolute | |
| 734 Return features in absolute coordinates rather than relative to the | |
| 735 parent segment. | |
| 736 | |
| 737 -nocheck Don't check the database for the coordinates and length of this | |
| 738 feature. Construct a segment using the indicated name as the | |
| 739 reference, a start coordinate of 1, an undefined end coordinate, | |
| 740 and a strand of +1. | |
| 741 | |
| 742 -force Same as -nocheck. | |
| 743 | |
| 744 -seq,-sequence,-sourceseq Aliases for -name. | |
| 745 | |
| 746 -begin,-end Aliases for -start and -stop | |
| 747 | |
| 748 -off,-len Aliases for -offset and -length | |
| 749 | |
| 750 -seqclass Alias for -class | |
| 751 | |
| 752 Here's an example to explain how this works: | |
| 753 | |
| 754 my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql'); | |
| 755 | |
| 756 If successful, $db will now hold the database accessor object. We now | |
| 757 try to fetch the fragment of sequence whose ID is A0000182 and class | |
| 758 is "Accession." | |
| 759 | |
| 760 my $segment = $db->segment(-name=>'A0000182',-class=>'Accession'); | |
| 761 | |
| 762 If successful, $segment now holds the entire segment corresponding to | |
| 763 this accession number. By default, the sequence is used as its own | |
| 764 reference sequence, so its first base will be 1 and its last base will | |
| 765 be the length of the accession. | |
| 766 | |
| 767 Assuming that this sequence belongs to a longer stretch of DNA, say a | |
| 768 contig, we can fetch this information like so: | |
| 769 | |
| 770 my $sourceseq = $segment->sourceseq; | |
| 771 | |
| 772 and find the start and stop on the source like this: | |
| 773 | |
| 774 my $start = $segment->abs_start; | |
| 775 my $stop = $segment->abs_stop; | |
| 776 | |
| 777 If we had another segment, say $s2, which is on the same contiguous | |
| 778 piece of DNA, we can pass that to the refseq() method in order to | |
| 779 establish it as the coordinate reference point: | |
| 780 | |
| 781 $segment->refseq($s2); | |
| 782 | |
| 783 Now calling start() will return the start of the segment relative to | |
| 784 the beginning of $s2, accounting for differences in strandedness: | |
| 785 | |
| 786 my $rel_start = $segment->start; | |
| 787 | |
| 788 IMPORTANT NOTE: This method can be used to return the segment spanned | |
| 789 by an arbitrary named annotation. However, if the annotation appears | |
| 790 at multiple locations on the genome, for example an EST that maps to | |
| 791 multiple locations, then, provided that all locations reside on the | |
| 792 same physical segment, the method will return a segment that spans the | |
| 793 minimum and maximum positions. If the reference sequence occupies | |
| 794 ranges on different physical segments, then it returns them all in an | |
| 795 array context, and raises a "multiple segment exception" exception in | |
| 796 a scalar context. | |
| 797 | |
| 798 =cut | |
| 799 | |
| 800 #' | |
| 801 | |
| 802 sub segment { | |
| 803 my $self = shift; | |
| 804 my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self, | |
| 805 $self->setup_segment_args(@_)); | |
| 806 foreach (@segments) { | |
| 807 $_->absolute(1) if $self->absolute; | |
| 808 } | |
| 809 | |
| 810 $self->_multiple_return_args(@segments); | |
| 811 } | |
| 812 | |
| 813 sub _multiple_return_args { | |
| 814 my $self = shift; | |
| 815 my @args = @_; | |
| 816 if (@args == 0) { | |
| 817 return; | |
| 818 } elsif (@args == 1) { | |
| 819 return $args[0]; | |
| 820 } elsif (wantarray) { # more than one reference sequence | |
| 821 return @args; | |
| 822 } else { | |
| 823 $self->error($args[0]->name, | |
| 824 " has more than one reference sequence in database. Please call in a list context to retrieve them all."); | |
| 825 $self->throw('multiple segment exception'); | |
| 826 return; | |
| 827 } | |
| 828 | |
| 829 } | |
| 830 | |
| 831 # backward compatibility -- don't use! | |
| 832 # (deliberately undocumented too) | |
| 833 sub abs_segment { | |
| 834 my $self = shift; | |
| 835 return $self->segment($self->setup_segment_args(@_),-absolute=>1); | |
| 836 } | |
| 837 | |
| 838 sub setup_segment_args { | |
| 839 my $self = shift; | |
| 840 return @_ if defined $_[0] && $_[0] =~ /^-/; | |
| 841 return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3; | |
| 842 return (-class=>$_[0],-name=>$_[1]) if @_ == 2; | |
| 843 return (-name=>$_[0]) if @_ == 1; | |
| 844 } | |
| 845 | |
| 846 =head2 features | |
| 847 | |
| 848 Title : features | |
| 849 Usage : $db->features(@args) | |
| 850 Function: get all features, possibly filtered by type | |
| 851 Returns : a list of Bio::DB::GFF::Feature objects | |
| 852 Args : see below | |
| 853 Status : public | |
| 854 | |
| 855 This routine will retrieve features in the database regardless of | |
| 856 position. It can be used to return all features, or a subset based on | |
| 857 their method and source. | |
| 858 | |
| 859 Arguments are as follows: | |
| 860 | |
| 861 -types List of feature types to return. Argument is an array | |
| 862 reference containing strings of the format "method:source" | |
| 863 | |
| 864 -merge Whether to apply aggregators to the generated features. | |
| 865 | |
| 866 -rare Turn on optimizations suitable for a relatively rare feature type, | |
| 867 where it makes more sense to filter by feature type first, | |
| 868 and then by position. | |
| 869 | |
| 870 -attributes A hash reference containing attributes to match. | |
| 871 | |
| 872 -iterator Whether to return an iterator across the features. | |
| 873 | |
| 874 -binsize A true value will create a set of artificial features whose | |
| 875 start and stop positions indicate bins of the given size, and | |
| 876 whose scores are the number of features in the bin. The | |
| 877 class and method of the feature will be set to "bin", | |
| 878 its source to "method:source", and its group to "bin:method:source". | |
| 879 This is a handy way of generating histograms of feature density. | |
| 880 | |
| 881 If -iterator is true, then the method returns a single scalar value | |
| 882 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly | |
| 883 on this object to fetch each of the features in turn. If iterator is | |
| 884 false or absent, then all the features are returned as a list. | |
| 885 | |
| 886 Currently aggregation is disabled when iterating over a series of | |
| 887 features. | |
| 888 | |
| 889 Types are indicated using the nomenclature "method:source". Either of | |
| 890 these fields can be omitted, in which case a wildcard is used for the | |
| 891 missing field. Type names without the colon (e.g. "exon") are | |
| 892 interpreted as the method name and a source wild card. Regular | |
| 893 expressions are allowed in either field, as in: "similarity:BLAST.*". | |
| 894 | |
| 895 The -attributes argument is a hashref containing one or more attributes | |
| 896 to match against: | |
| 897 | |
| 898 -attributes => { Gene => 'abc-1', | |
| 899 Note => 'confirmed' } | |
| 900 | |
| 901 Attribute matching is simple string matching, and multiple attributes | |
| 902 are ANDed together. | |
| 903 | |
| 904 =cut | |
| 905 | |
| 906 sub features { | |
| 907 my $self = shift; | |
| 908 my ($types,$automerge,$sparse,$iterator,$other); | |
| 909 if (defined $_[0] && | |
| 910 $_[0] =~ /^-/) { | |
| 911 ($types,$automerge,$sparse,$iterator,$other) = rearrange([ | |
| 912 [qw(TYPE TYPES)], | |
| 913 [qw(MERGE AUTOMERGE)], | |
| 914 [qw(RARE SPARSE)], | |
| 915 'ITERATOR' | |
| 916 ],@_); | |
| 917 } else { | |
| 918 $types = \@_; | |
| 919 } | |
| 920 | |
| 921 # for whole database retrievals, we probably don't want to automerge! | |
| 922 $automerge = $self->automerge unless defined $automerge; | |
| 923 $other ||= {}; | |
| 924 $self->_features({ | |
| 925 rangetype => 'contains', | |
| 926 types => $types, | |
| 927 }, | |
| 928 { sparse => $sparse, | |
| 929 automerge => $automerge, | |
| 930 iterator =>$iterator, | |
| 931 %$other, | |
| 932 } | |
| 933 ); | |
| 934 } | |
| 935 | |
| 936 =head2 get_seq_stream | |
| 937 | |
| 938 Title : get_seq_stream | |
| 939 Usage : my $seqio = $self->get_seq_sream(@args) | |
| 940 Function: Performs a query and returns an iterator over it | |
| 941 Returns : a Bio::SeqIO stream capable of producing sequence | |
| 942 Args : As in features() | |
| 943 Status : public | |
| 944 | |
| 945 This routine takes the same arguments as features(), but returns a | |
| 946 Bio::SeqIO::Stream-compliant object. Use it like this: | |
| 947 | |
| 948 $stream = $db->get_seq_stream('exon'); | |
| 949 while (my $exon = $stream->next_seq) { | |
| 950 print $exon,"\n"; | |
| 951 } | |
| 952 | |
| 953 NOTE: This is also called get_feature_stream(), since that's what it | |
| 954 really does. | |
| 955 | |
| 956 =cut | |
| 957 | |
| 958 sub get_seq_stream { | |
| 959 my $self = shift; | |
| 960 my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1) | |
| 961 : (-types=>\@_,-iterator=>1); | |
| 962 $self->features(@args); | |
| 963 } | |
| 964 | |
| 965 *get_feature_stream = \&get_seq_stream; | |
| 966 | |
| 967 =head2 get_feature_by_name | |
| 968 | |
| 969 Title : get_feature_by_name | |
| 970 Usage : $db->get_feature_by_name($class => $name) | |
| 971 Function: fetch features by their name | |
| 972 Returns : a list of Bio::DB::GFF::Feature objects | |
| 973 Args : the class and name of the desired feature | |
| 974 Status : public | |
| 975 | |
| 976 This method can be used to fetch a named feature from the database. | |
| 977 GFF annotations are named using the group class and name fields, so | |
| 978 for features that belong to a group of size one, this method can be | |
| 979 used to retrieve that group (and is equivalent to the segment() | |
| 980 method). Any Alias attributes are also searched for matching names. | |
| 981 | |
| 982 An alternative syntax allows you to search for features by name within | |
| 983 a circumscribed region: | |
| 984 | |
| 985 @f = $db->get_feature_by_name(-class => $class,-name=>$name, | |
| 986 -ref => $sequence_name, | |
| 987 -start => $start, | |
| 988 -end => $end); | |
| 989 | |
| 990 This method may return zero, one, or several Bio::DB::GFF::Feature | |
| 991 objects. | |
| 992 | |
| 993 Aggregation is performed on features as usual. | |
| 994 | |
| 995 NOTE: At various times, this function was called fetch_group(), | |
| 996 fetch_feature(), fetch_feature_by_name() and segments(). These names | |
| 997 are preserved for backward compatibility. | |
| 998 | |
| 999 =cut | |
| 1000 | |
| 1001 sub get_feature_by_name { | |
| 1002 my $self = shift; | |
| 1003 my ($gclass,$gname,$automerge,$ref,$start,$end); | |
| 1004 if (@_ == 1) { | |
| 1005 $gclass = $self->default_class; | |
| 1006 $gname = shift; | |
| 1007 } else { | |
| 1008 ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE', | |
| 1009 ['REF','REFSEQ'], | |
| 1010 'START',['STOP','END'] | |
| 1011 ],@_); | |
| 1012 $gclass ||= $self->default_class; | |
| 1013 } | |
| 1014 $automerge = $self->automerge unless defined $automerge; | |
| 1015 | |
| 1016 # we need to refactor this... It's repeated code (see below)... | |
| 1017 my @aggregators; | |
| 1018 if ($automerge) { | |
| 1019 for my $a ($self->aggregators) { | |
| 1020 push @aggregators,$a if $a->disaggregate([],$self); | |
| 1021 } | |
| 1022 } | |
| 1023 | |
| 1024 my %groups; # cache the groups we create to avoid consuming too much unecessary memory | |
| 1025 my $features = []; | |
| 1026 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
| 1027 my $location = [$ref,$start,$end] if defined $ref; | |
| 1028 $self->_feature_by_name($gclass,$gname,$location,$callback); | |
| 1029 | |
| 1030 warn "aggregating...\n" if $self->debug; | |
| 1031 foreach my $a (@aggregators) { # last aggregator gets first shot | |
| 1032 $a->aggregate($features,$self) or next; | |
| 1033 } | |
| 1034 | |
| 1035 @$features; | |
| 1036 } | |
| 1037 | |
| 1038 # horrible indecision regarding proper names! | |
| 1039 *fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name; | |
| 1040 *segments = \&segment; | |
| 1041 | |
| 1042 =head2 get_feature_by_target | |
| 1043 | |
| 1044 Title : get_feature_by_target | |
| 1045 Usage : $db->get_feature_by_target($class => $name) | |
| 1046 Function: fetch features by their similarity target | |
| 1047 Returns : a list of Bio::DB::GFF::Feature objects | |
| 1048 Args : the class and name of the desired feature | |
| 1049 Status : public | |
| 1050 | |
| 1051 This method can be used to fetch a named feature from the database | |
| 1052 based on its similarity hit. | |
| 1053 | |
| 1054 =cut | |
| 1055 | |
| 1056 sub get_feature_by_target { | |
| 1057 shift->get_feature_by_name(@_); | |
| 1058 } | |
| 1059 | |
| 1060 =head2 get_feature_by_attribute | |
| 1061 | |
| 1062 Title : get_feature_by_attribute | |
| 1063 Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2) | |
| 1064 Function: fetch segments by combinations of attribute values | |
| 1065 Returns : a list of Bio::DB::GFF::Feature objects | |
| 1066 Args : the class and name of the desired feature | |
| 1067 Status : public | |
| 1068 | |
| 1069 This method can be used to fetch a set of features from the database. | |
| 1070 Attributes are a list of name=E<gt>value pairs. They will be logically | |
| 1071 ANDED together. | |
| 1072 | |
| 1073 =cut | |
| 1074 | |
| 1075 sub get_feature_by_attribute { | |
| 1076 my $self = shift; | |
| 1077 my %attributes = ref($_[0]) ? %{$_[0]} : @_; | |
| 1078 | |
| 1079 # we need to refactor this... It's repeated code (see above)... | |
| 1080 my @aggregators; | |
| 1081 if ($self->automerge) { | |
| 1082 for my $a ($self->aggregators) { | |
| 1083 unshift @aggregators,$a if $a->disaggregate([],$self); | |
| 1084 } | |
| 1085 } | |
| 1086 | |
| 1087 my %groups; # cache the groups we create to avoid consuming too much unecessary memory | |
| 1088 my $features = []; | |
| 1089 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
| 1090 $self->_feature_by_attribute(\%attributes,$callback); | |
| 1091 | |
| 1092 warn "aggregating...\n" if $self->debug; | |
| 1093 foreach my $a (@aggregators) { # last aggregator gets first shot | |
| 1094 $a->aggregate($features,$self) or next; | |
| 1095 } | |
| 1096 | |
| 1097 @$features; | |
| 1098 } | |
| 1099 | |
| 1100 # more indecision... | |
| 1101 *fetch_feature_by_attribute = \&get_feature_by_attribute; | |
| 1102 | |
| 1103 =head2 get_feature_by_id | |
| 1104 | |
| 1105 Title : get_feature_by_id | |
| 1106 Usage : $db->get_feature_by_id($id) | |
| 1107 Function: fetch segments by feature ID | |
| 1108 Returns : a Bio::DB::GFF::Feature object | |
| 1109 Args : the feature ID | |
| 1110 Status : public | |
| 1111 | |
| 1112 This method can be used to fetch a feature from the database using its | |
| 1113 ID. Not all GFF databases support IDs, so be careful with this. | |
| 1114 | |
| 1115 =cut | |
| 1116 | |
| 1117 sub get_feature_by_id { | |
| 1118 my $self = shift; | |
| 1119 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; | |
| 1120 my %groups; # cache the groups we create to avoid consuming too much unecessary memory | |
| 1121 my $features = []; | |
| 1122 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
| 1123 $self->_feature_by_id($id,'feature',$callback); | |
| 1124 return wantarray ? @$features : $features->[0]; | |
| 1125 } | |
| 1126 *fetch_feature_by_id = \&get_feature_by_id; | |
| 1127 | |
| 1128 =head2 get_feature_by_gid | |
| 1129 | |
| 1130 Title : get_feature_by_gid | |
| 1131 Usage : $db->get_feature_by_gid($id) | |
| 1132 Function: fetch segments by feature ID | |
| 1133 Returns : a Bio::DB::GFF::Feature object | |
| 1134 Args : the feature ID | |
| 1135 Status : public | |
| 1136 | |
| 1137 This method can be used to fetch a feature from the database using its | |
| 1138 group ID. Not all GFF databases support IDs, so be careful with this. | |
| 1139 | |
| 1140 The group ID is often more interesting than the feature ID, since | |
| 1141 groups can be complex objects containing subobjects. | |
| 1142 | |
| 1143 =cut | |
| 1144 | |
| 1145 sub get_feature_by_gid { | |
| 1146 my $self = shift; | |
| 1147 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; | |
| 1148 my %groups; # cache the groups we create to avoid consuming too much unecessary memory | |
| 1149 my $features = []; | |
| 1150 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
| 1151 $self->_feature_by_id($id,'group',$callback); | |
| 1152 return wantarray ? @$features : $features->[0]; | |
| 1153 } | |
| 1154 *fetch_feature_by_gid = \&get_feature_by_gid; | |
| 1155 | |
| 1156 =head2 delete_features | |
| 1157 | |
| 1158 Title : delete_features | |
| 1159 Usage : $db->delete_features(@ids_or_features) | |
| 1160 Function: delete one or more features | |
| 1161 Returns : count of features deleted | |
| 1162 Args : list of features or feature ids | |
| 1163 Status : public | |
| 1164 | |
| 1165 Pass this method a list of numeric feature ids or a set of features. | |
| 1166 It will attempt to remove the features from the database and return a | |
| 1167 count of the features removed. | |
| 1168 | |
| 1169 NOTE: This method is also called delete_feature(). Also see | |
| 1170 delete_groups(). | |
| 1171 | |
| 1172 =cut | |
| 1173 | |
| 1174 *delete_feature = \&delete_features; | |
| 1175 | |
| 1176 sub delete_features { | |
| 1177 my $self = shift; | |
| 1178 my @features_or_ids = @_; | |
| 1179 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids; | |
| 1180 return unless @ids; | |
| 1181 $self->_delete_features(@ids); | |
| 1182 } | |
| 1183 | |
| 1184 =head2 delete_groups | |
| 1185 | |
| 1186 Title : delete_groups | |
| 1187 Usage : $db->delete_groups(@ids_or_features) | |
| 1188 Function: delete one or more feature groups | |
| 1189 Returns : count of features deleted | |
| 1190 Args : list of features or feature group ids | |
| 1191 Status : public | |
| 1192 | |
| 1193 Pass this method a list of numeric group ids or a set of features. It | |
| 1194 will attempt to recursively remove the features and ALL members of | |
| 1195 their group from the database. It returns a count of the number of | |
| 1196 features (not groups) returned. | |
| 1197 | |
| 1198 NOTE: This method is also called delete_group(). Also see | |
| 1199 delete_features(). | |
| 1200 | |
| 1201 =cut | |
| 1202 | |
| 1203 *delete_group = \&delete_groupss; | |
| 1204 | |
| 1205 sub delete_groups { | |
| 1206 my $self = shift; | |
| 1207 my @features_or_ids = @_; | |
| 1208 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids; | |
| 1209 return unless @ids; | |
| 1210 $self->_delete_groups(@ids); | |
| 1211 } | |
| 1212 | |
| 1213 =head2 delete | |
| 1214 | |
| 1215 Title : delete | |
| 1216 Usage : $db->delete(@args) | |
| 1217 Function: delete features | |
| 1218 Returns : count of features deleted -- if available | |
| 1219 Args : numerous, see below | |
| 1220 Status : public | |
| 1221 | |
| 1222 This method deletes all features that overlap the specified region or | |
| 1223 are of a particular type. If no arguments are provided and the -force | |
| 1224 argument is true, then deletes ALL features. | |
| 1225 | |
| 1226 Arguments: | |
| 1227 | |
| 1228 -name ID of the landmark sequence. | |
| 1229 | |
| 1230 -ref ID of the landmark sequence (synonym for -name). | |
| 1231 | |
| 1232 -class Database object class for the landmark sequence. | |
| 1233 "Sequence" assumed if not specified. This is | |
| 1234 irrelevant for databases which do not recognize | |
| 1235 object classes. | |
| 1236 | |
| 1237 -start Start of the segment relative to landmark. Positions | |
| 1238 follow standard 1-based sequence rules. If not specified, | |
| 1239 defaults to the beginning of the landmark. | |
| 1240 | |
| 1241 -end Stop of the segment relative to the landmark. If not specified, | |
| 1242 defaults to the end of the landmark. | |
| 1243 | |
| 1244 -offset Zero-based addressing | |
| 1245 | |
| 1246 -length Length of region | |
| 1247 | |
| 1248 -type,-types Either a single scalar type to be deleted, or an | |
| 1249 reference to an array of types. | |
| 1250 | |
| 1251 -force Force operation to be performed even if it would delete | |
| 1252 entire feature table. | |
| 1253 | |
| 1254 -range_type Control the range type of the deletion. One of "overlaps" (default) | |
| 1255 "contains" or "contained_in" | |
| 1256 | |
| 1257 Examples: | |
| 1258 | |
| 1259 $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats | |
| 1260 $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000 | |
| 1261 $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3 | |
| 1262 | |
| 1263 The short form of this call, as described in segment() is also allowed: | |
| 1264 | |
| 1265 $db->delete("chr3",1=>1000); | |
| 1266 $db->delete("chr3"); | |
| 1267 | |
| 1268 IMPORTANT NOTE: This method only deletes features. It does *NOT* | |
| 1269 delete the names of groups that contain the deleted features. Group | |
| 1270 IDs will be reused if you later load a feature with the same group | |
| 1271 name as one that was previously deleted. | |
| 1272 | |
| 1273 NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the | |
| 1274 result code from the SQL DELETE operation. Some dbd drivers return the | |
| 1275 count of rows deleted, while others return 0E0. Caveat emptor. | |
| 1276 | |
| 1277 =cut | |
| 1278 | |
| 1279 sub delete { | |
| 1280 my $self = shift; | |
| 1281 my @args = $self->setup_segment_args(@_); | |
| 1282 my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) = | |
| 1283 rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET', | |
| 1284 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args); | |
| 1285 $offset = 0 unless defined $offset; | |
| 1286 $start = $offset+1 unless defined $start; | |
| 1287 $end = $start+$length-1 if !defined $end and $length; | |
| 1288 $class ||= $self->default_class; | |
| 1289 | |
| 1290 my $types = $self->parse_types($type); # parse out list of types | |
| 1291 | |
| 1292 $range_type ||= 'overlaps'; | |
| 1293 $self->throw("range type must be one of {". | |
| 1294 join(',',keys %valid_range_types). | |
| 1295 "}\n") | |
| 1296 unless $valid_range_types{lc $range_type}; | |
| 1297 | |
| 1298 | |
| 1299 my @segments; | |
| 1300 if (defined $name && $name ne '') { | |
| 1301 my @args = (-name=>$name,-class=>$class); | |
| 1302 push @args,(-start=>$start) if defined $start; | |
| 1303 push @args,(-end =>$end) if defined $end; | |
| 1304 @segments = $self->segment(@args); | |
| 1305 return unless @segments; | |
| 1306 } | |
| 1307 $self->_delete({segments => \@segments, | |
| 1308 types => $types, | |
| 1309 range_type => $range_type, | |
| 1310 force => $force} | |
| 1311 ); | |
| 1312 } | |
| 1313 | |
| 1314 =head2 absolute | |
| 1315 | |
| 1316 Title : absolute | |
| 1317 Usage : $abs = $db->absolute([$abs]); | |
| 1318 Function: gets/sets absolute mode | |
| 1319 Returns : current setting of absolute mode boolean | |
| 1320 Args : new setting for absolute mode boolean | |
| 1321 Status : public | |
| 1322 | |
| 1323 $db-E<gt>absolute(1) will turn on absolute mode for the entire database. | |
| 1324 All segments retrieved will use absolute coordinates by default, | |
| 1325 rather than relative coordinates. You can still set them to use | |
| 1326 relative coordinates by calling $segment-E<gt>absolute(0). | |
| 1327 | |
| 1328 Note that this is not the same as calling abs_segment(); it continues | |
| 1329 to allow you to look up groups that are not used directly as reference | |
| 1330 sequences. | |
| 1331 | |
| 1332 =cut | |
| 1333 | |
| 1334 sub absolute { | |
| 1335 my $self = shift; | |
| 1336 my $d = $self->{absolute}; | |
| 1337 $self->{absolute} = shift if @_; | |
| 1338 $d; | |
| 1339 } | |
| 1340 | |
| 1341 =head2 strict_bounds_checking | |
| 1342 | |
| 1343 Title : strict_bounds_checking | |
| 1344 Usage : $flag = $db->strict_bounds_checking([$flag]) | |
| 1345 Function: gets/sets strict bounds checking | |
| 1346 Returns : current setting of bounds checking flag | |
| 1347 Args : new setting for bounds checking flag | |
| 1348 Status : public | |
| 1349 | |
| 1350 This flag enables extra checks for segment requests that go beyond the | |
| 1351 ends of their reference sequences. If bounds checking is enabled, | |
| 1352 then retrieved segments will be truncated to their physical length, | |
| 1353 and their truncated() methods will return true. | |
| 1354 | |
| 1355 If the flag is off (the default), then the module will return segments | |
| 1356 that appear to extend beyond their physical boundaries. Requests for | |
| 1357 features beyond the end of the segment will, however, return empty. | |
| 1358 | |
| 1359 =cut | |
| 1360 | |
| 1361 sub strict_bounds_checking { | |
| 1362 my $self = shift; | |
| 1363 my $d = $self->{strict}; | |
| 1364 $self->{strict} = shift if @_; | |
| 1365 $d; | |
| 1366 } | |
| 1367 | |
| 1368 =head2 get_Seq_by_id | |
| 1369 | |
| 1370 Title : get_Seq_by_id | |
| 1371 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
| 1372 Function: Gets a Bio::Seq object by its name | |
| 1373 Returns : a Bio::Seq object | |
| 1374 Args : the id (as a string) of a sequence | |
| 1375 Throws : "id does not exist" exception | |
| 1376 | |
| 1377 NOTE: Bio::DB::RandomAccessI compliant method | |
| 1378 | |
| 1379 =cut | |
| 1380 | |
| 1381 sub get_Seq_by_id { | |
| 1382 my $self = shift; | |
| 1383 my $id = shift; | |
| 1384 my $stream = $self->get_Stream_by_id($id); | |
| 1385 return $stream->next_seq; | |
| 1386 } | |
| 1387 | |
| 1388 | |
| 1389 =head2 get_Seq_by_accession | |
| 1390 | |
| 1391 Title : get_Seq_by_accession | |
| 1392 Usage : $seq = $db->get_Seq_by_accession('AL12234') | |
| 1393 Function: Gets a Bio::Seq object by its accession | |
| 1394 Returns : a Bio::Seq object | |
| 1395 Args : the id (as a string) of a sequence | |
| 1396 Throws : "id does not exist" exception | |
| 1397 | |
| 1398 NOTE: Bio::DB::RandomAccessI compliant method | |
| 1399 | |
| 1400 =cut | |
| 1401 | |
| 1402 sub get_Seq_by_accession { | |
| 1403 my $self = shift; | |
| 1404 my $id = shift; | |
| 1405 my $stream = $self->get_Stream_by_accession($id); | |
| 1406 return $stream->next_seq; | |
| 1407 } | |
| 1408 | |
| 1409 =head2 get_Stream_by_acc () | |
| 1410 | |
| 1411 =cut | |
| 1412 | |
| 1413 =head2 get_Seq_by_acc | |
| 1414 | |
| 1415 Title : get_Seq_by_acc | |
| 1416 Usage : $seq = $db->get_Seq_by_acc('X77802'); | |
| 1417 Function: Gets a Bio::Seq object by accession number | |
| 1418 Returns : A Bio::Seq object | |
| 1419 Args : accession number (as a string) | |
| 1420 Throws : "acc does not exist" exception | |
| 1421 | |
| 1422 NOTE: Bio::DB::RandomAccessI compliant method | |
| 1423 | |
| 1424 =cut | |
| 1425 | |
| 1426 sub get_Stream_by_name { | |
| 1427 my $self = shift; | |
| 1428 my @ids = @_; | |
| 1429 my $id = ref($ids[0]) ? $ids[0] : \@ids; | |
| 1430 Bio::DB::GFF::ID_Iterator->new($self,$id,'name'); | |
| 1431 } | |
| 1432 | |
| 1433 =head2 get_Stream_by_id | |
| 1434 | |
| 1435 Title : get_Stream_by_id | |
| 1436 Usage : $seq = $db->get_Stream_by_id(@ids); | |
| 1437 Function: Retrieves a stream of Seq objects given their ids | |
| 1438 Returns : a Bio::SeqIO stream object | |
| 1439 Args : an array of unique ids/accession numbers, or | |
| 1440 an array reference | |
| 1441 | |
| 1442 NOTE: This is also called get_Stream_by_batch() | |
| 1443 | |
| 1444 =cut | |
| 1445 | |
| 1446 sub get_Stream_by_id { | |
| 1447 my $self = shift; | |
| 1448 my @ids = @_; | |
| 1449 my $id = ref($ids[0]) ? $ids[0] : \@ids; | |
| 1450 Bio::DB::GFF::ID_Iterator->new($self,$id,'feature'); | |
| 1451 } | |
| 1452 | |
| 1453 =head2 get_Stream_by_batch () | |
| 1454 | |
| 1455 Title : get_Stream_by_batch | |
| 1456 Usage : $seq = $db->get_Stream_by_batch(@ids); | |
| 1457 Function: Retrieves a stream of Seq objects given their ids | |
| 1458 Returns : a Bio::SeqIO stream object | |
| 1459 Args : an array of unique ids/accession numbers, or | |
| 1460 an array reference | |
| 1461 | |
| 1462 NOTE: This is the same as get_Stream_by_id(). | |
| 1463 | |
| 1464 =cut | |
| 1465 | |
| 1466 *get_Stream_by_batch = \&get_Stream_by_id; | |
| 1467 | |
| 1468 | |
| 1469 =head2 get_Stream_by_group () | |
| 1470 | |
| 1471 Bioperl compatibility. | |
| 1472 | |
| 1473 =cut | |
| 1474 | |
| 1475 sub get_Stream_by_group { | |
| 1476 my $self = shift; | |
| 1477 my @ids = @_; | |
| 1478 my $id = ref($ids[0]) ? $ids[0] : \@ids; | |
| 1479 Bio::DB::GFF::ID_Iterator->new($self,$id,'group'); | |
| 1480 } | |
| 1481 | |
| 1482 =head2 all_seqfeatures | |
| 1483 | |
| 1484 Title : all_seqfeatures | |
| 1485 Usage : @features = $db->all_seqfeatures(@args) | |
| 1486 Function: fetch all the features in the database | |
| 1487 Returns : an array of features, or an iterator | |
| 1488 Args : See below | |
| 1489 Status : public | |
| 1490 | |
| 1491 This is equivalent to calling $db-E<gt>features() without any types, and | |
| 1492 will return all the features in the database. The -merge and | |
| 1493 -iterator arguments are recognized, and behave the same as described | |
| 1494 for features(). | |
| 1495 | |
| 1496 =cut | |
| 1497 | |
| 1498 sub all_seqfeatures { | |
| 1499 my $self = shift; | |
| 1500 my ($automerge,$iterator)= rearrange([ | |
| 1501 [qw(MERGE AUTOMERGE)], | |
| 1502 'ITERATOR' | |
| 1503 ],@_); | |
| 1504 my @args; | |
| 1505 push @args,(-merge=>$automerge) if defined $automerge; | |
| 1506 push @args,(-iterator=>$iterator) if defined $iterator; | |
| 1507 $self->features(@args); | |
| 1508 } | |
| 1509 | |
| 1510 =head1 Creating and Loading GFF Databases | |
| 1511 | |
| 1512 =head2 initialize | |
| 1513 | |
| 1514 Title : initialize | |
| 1515 Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2); | |
| 1516 Function: initialize a GFF database | |
| 1517 Returns : true if initialization successful | |
| 1518 Args : a set of named parameters | |
| 1519 Status : Public | |
| 1520 | |
| 1521 This method can be used to initialize an empty database. It takes the following | |
| 1522 named arguments: | |
| 1523 | |
| 1524 -erase A boolean value. If true the database will be wiped clean if it | |
| 1525 already contains data. | |
| 1526 | |
| 1527 Other named arguments may be recognized by subclasses. They become database | |
| 1528 meta values that control various settable options. | |
| 1529 | |
| 1530 As a shortcut (and for backward compatibility) a single true argument | |
| 1531 is the same as initialize(-erase=E<gt>1). | |
| 1532 | |
| 1533 =cut | |
| 1534 | |
| 1535 sub initialize { | |
| 1536 my $self = shift; | |
| 1537 #$self->do_initialize(1) if @_ == 1 && $_[0]; | |
| 1538 #why was this line (^) here? I can't see that it actually does anything | |
| 1539 #one option would be to execute the line and return, but I don't know | |
| 1540 #why you would want to do that either. | |
| 1541 | |
| 1542 my ($erase,$meta) = rearrange(['ERASE'],@_); | |
| 1543 $meta ||= {}; | |
| 1544 | |
| 1545 # initialize (possibly erasing) | |
| 1546 return unless $self->do_initialize($erase); | |
| 1547 my @default = $self->default_meta_values; | |
| 1548 | |
| 1549 # this is an awkward way of uppercasing the | |
| 1550 # even-numbered values (necessary for case-insensitive SQL databases) | |
| 1551 for (my $i=0; $i<@default; $i++) { | |
| 1552 $default[$i] = uc $default[$i] if !($i % 2); | |
| 1553 } | |
| 1554 | |
| 1555 my %values = (@default,%$meta); | |
| 1556 foreach (keys %values) { | |
| 1557 $self->meta($_ => $values{$_}); | |
| 1558 } | |
| 1559 1; | |
| 1560 } | |
| 1561 | |
| 1562 | |
| 1563 =head2 load_gff | |
| 1564 | |
| 1565 Title : load_gff | |
| 1566 Usage : $db->load_gff($file|$directory|$filehandle); | |
| 1567 Function: load GFF data into database | |
| 1568 Returns : count of records loaded | |
| 1569 Args : a directory, a file, a list of files, | |
| 1570 or a filehandle | |
| 1571 Status : Public | |
| 1572 | |
| 1573 This method takes a single overloaded argument, which can be any of: | |
| 1574 | |
| 1575 =over 4 | |
| 1576 | |
| 1577 =item 1. a scalar corresponding to a GFF file on the system | |
| 1578 | |
| 1579 A pathname to a local GFF file. Any files ending with the .gz, .Z, or | |
| 1580 .bz2 suffixes will be transparently decompressed with the appropriate | |
| 1581 command-line utility. | |
| 1582 | |
| 1583 =item 2. an array reference containing a list of GFF files on the system | |
| 1584 | |
| 1585 For example ['/home/gff/gff1.gz','/home/gff/gff2.gz'] | |
| 1586 | |
| 1587 =item 3. directory path | |
| 1588 | |
| 1589 The indicated directory will be searched for all files ending in the | |
| 1590 suffixes .gff, .gff.gz, .gff.Z or .gff.bz2. | |
| 1591 | |
| 1592 =item 4. filehandle | |
| 1593 | |
| 1594 An open filehandle from which to read the GFF data. Tied filehandles | |
| 1595 now work as well. | |
| 1596 | |
| 1597 =item 5. a pipe expression | |
| 1598 | |
| 1599 A pipe expression will also work. For example, a GFF file on a remote | |
| 1600 web server can be loaded with an expression like this: | |
| 1601 | |
| 1602 $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |"); | |
| 1603 | |
| 1604 =back | |
| 1605 | |
| 1606 If successful, the method will return the number of GFF lines | |
| 1607 successfully loaded. | |
| 1608 | |
| 1609 NOTE:this method used to be called load(), but has been changed. The | |
| 1610 old method name is also recognized. | |
| 1611 | |
| 1612 =cut | |
| 1613 | |
| 1614 sub load_gff { | |
| 1615 my $self = shift; | |
| 1616 my $file_or_directory = shift || '.'; | |
| 1617 return $self->do_load_gff($file_or_directory) if ref($file_or_directory) && | |
| 1618 tied *$file_or_directory; | |
| 1619 | |
| 1620 my $tied_stdin = tied(*STDIN); | |
| 1621 open SAVEIN,"<&STDIN" unless $tied_stdin; | |
| 1622 local @ARGV = $self->setup_argv($file_or_directory,'gff') or return; # to play tricks with reader | |
| 1623 my $result = $self->do_load_gff('ARGV'); | |
| 1624 open STDIN,"<&SAVEIN" unless $tied_stdin; # restore STDIN | |
| 1625 return $result; | |
| 1626 } | |
| 1627 | |
| 1628 *load = \&load_gff; | |
| 1629 | |
| 1630 =head2 load_fasta | |
| 1631 | |
| 1632 Title : load_fasta | |
| 1633 Usage : $db->load_fasta($file|$directory|$filehandle); | |
| 1634 Function: load FASTA data into database | |
| 1635 Returns : count of records loaded | |
| 1636 Args : a directory, a file, a list of files, | |
| 1637 or a filehandle | |
| 1638 Status : Public | |
| 1639 | |
| 1640 This method takes a single overloaded argument, which can be any of: | |
| 1641 | |
| 1642 =over 4 | |
| 1643 | |
| 1644 =item 1. scalar corresponding to a FASTA file on the system | |
| 1645 | |
| 1646 A pathname to a local FASTA file. Any files ending with the .gz, .Z, or | |
| 1647 .bz2 suffixes will be transparently decompressed with the appropriate | |
| 1648 command-line utility. | |
| 1649 | |
| 1650 =item 2. array reference containing a list of FASTA files on the | |
| 1651 system | |
| 1652 | |
| 1653 For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz'] | |
| 1654 | |
| 1655 =item 3. path to a directory | |
| 1656 | |
| 1657 The indicated directory will be searched for all files ending in the | |
| 1658 suffixes .fa, .fa.gz, .fa.Z or .fa.bz2. | |
| 1659 | |
| 1660 a=item 4. filehandle | |
| 1661 | |
| 1662 An open filehandle from which to read the FASTA data. | |
| 1663 | |
| 1664 =item 5. pipe expression | |
| 1665 | |
| 1666 A pipe expression will also work. For example, a FASTA file on a remote | |
| 1667 web server can be loaded with an expression like this: | |
| 1668 | |
| 1669 $db->load_gff("lynx -dump -source http://stein.cshl.org/fasta_test.fa |"); | |
| 1670 | |
| 1671 =back | |
| 1672 | |
| 1673 =cut | |
| 1674 | |
| 1675 sub load_fasta { | |
| 1676 my $self = shift; | |
| 1677 my $file_or_directory = shift || '.'; | |
| 1678 return $self->load_sequence($file_or_directory) if ref($file_or_directory) && | |
| 1679 tied *$file_or_directory; | |
| 1680 | |
| 1681 my $tied = tied(*STDIN); | |
| 1682 open SAVEIN,"<&STDIN" unless $tied; | |
| 1683 local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader | |
| 1684 my $result = $self->load_sequence('ARGV'); | |
| 1685 open STDIN,"<&SAVEIN" unless $tied; # restore STDIN | |
| 1686 return $result; | |
| 1687 } | |
| 1688 | |
| 1689 =head2 load_sequence_string | |
| 1690 | |
| 1691 Title : load_sequence_string | |
| 1692 Usage : $db->load_sequence_string($id,$dna) | |
| 1693 Function: load a single DNA entry | |
| 1694 Returns : true if successfully loaded | |
| 1695 Args : a raw sequence string (DNA, RNA, protein) | |
| 1696 Status : Public | |
| 1697 | |
| 1698 =cut | |
| 1699 | |
| 1700 sub load_sequence_string { | |
| 1701 my $self = shift; | |
| 1702 my ($acc,$seq) = @_; | |
| 1703 my $offset = 0; | |
| 1704 $self->insert_sequence_chunk($acc,\$offset,\$seq) or return; | |
| 1705 $self->insert_sequence($acc,$offset,$seq) or return; | |
| 1706 1; | |
| 1707 } | |
| 1708 | |
| 1709 sub setup_argv { | |
| 1710 my $self = shift; | |
| 1711 my $file_or_directory = shift; | |
| 1712 my @suffixes = @_; | |
| 1713 no strict 'refs'; # so that we can call fileno() on the argument | |
| 1714 | |
| 1715 my @argv; | |
| 1716 | |
| 1717 if (-d $file_or_directory) { | |
| 1718 @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes; | |
| 1719 }elsif (my $fd = fileno($file_or_directory)) { | |
| 1720 open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN"); | |
| 1721 @argv = '-'; | |
| 1722 } elsif (ref $file_or_directory) { | |
| 1723 @argv = @$file_or_directory; | |
| 1724 } else { | |
| 1725 @argv = $file_or_directory; | |
| 1726 } | |
| 1727 | |
| 1728 foreach (@argv) { | |
| 1729 if (/\.gz$/) { | |
| 1730 $_ = "gunzip -c $_ |"; | |
| 1731 } elsif (/\.Z$/) { | |
| 1732 $_ = "uncompress -c $_ |"; | |
| 1733 } elsif (/\.bz2$/) { | |
| 1734 $_ = "bunzip2 -c $_ |"; | |
| 1735 } | |
| 1736 } | |
| 1737 @argv; | |
| 1738 } | |
| 1739 | |
| 1740 =head2 lock_on_load | |
| 1741 | |
| 1742 Title : lock_on_load | |
| 1743 Usage : $lock = $db->lock_on_load([$lock]) | |
| 1744 Function: set write locking during load | |
| 1745 Returns : current value of lock-on-load flag | |
| 1746 Args : new value of lock-on-load-flag | |
| 1747 Status : Public | |
| 1748 | |
| 1749 This method is honored by some of the adaptors. If the value is true, | |
| 1750 the tables used by the GFF modules will be locked for writing during | |
| 1751 loads and inaccessible to other processes. | |
| 1752 | |
| 1753 =cut | |
| 1754 | |
| 1755 sub lock_on_load { | |
| 1756 my $self = shift; | |
| 1757 my $d = $self->{lock}; | |
| 1758 $self->{lock} = shift if @_; | |
| 1759 $d; | |
| 1760 } | |
| 1761 | |
| 1762 =head2 meta | |
| 1763 | |
| 1764 Title : meta | |
| 1765 Usage : $value = $db->meta($name [,$newval]) | |
| 1766 Function: get or set a meta variable | |
| 1767 Returns : a string | |
| 1768 Args : meta variable name and optionally value | |
| 1769 Status : abstract | |
| 1770 | |
| 1771 Get or set a named metavalues for the database. Metavalues can be | |
| 1772 used for database-specific settings. | |
| 1773 | |
| 1774 By default, this method does nothing! | |
| 1775 | |
| 1776 =cut | |
| 1777 | |
| 1778 sub meta { | |
| 1779 my $self = shift; | |
| 1780 my ($name,$value) = @_; | |
| 1781 return; | |
| 1782 } | |
| 1783 | |
| 1784 =head2 default_meta_values | |
| 1785 | |
| 1786 Title : default_meta_values | |
| 1787 Usage : %values = $db->default_meta_values | |
| 1788 Function: empty the database | |
| 1789 Returns : a list of tag=>value pairs | |
| 1790 Args : none | |
| 1791 Status : protected | |
| 1792 | |
| 1793 This method returns a list of tag=E<gt>value pairs that contain default | |
| 1794 meta information about the database. It is invoked by initialize() to | |
| 1795 write out the default meta values. The base class version returns an | |
| 1796 empty list. | |
| 1797 | |
| 1798 For things to work properly, meta value names must be UPPERCASE. | |
| 1799 | |
| 1800 =cut | |
| 1801 | |
| 1802 sub default_meta_values { | |
| 1803 my $self = shift; | |
| 1804 return (); | |
| 1805 } | |
| 1806 | |
| 1807 | |
| 1808 =head2 error | |
| 1809 | |
| 1810 Title : error | |
| 1811 Usage : $db->error( [$new error] ); | |
| 1812 Function: read or set error message | |
| 1813 Returns : error message | |
| 1814 Args : an optional argument to set the error message | |
| 1815 Status : Public | |
| 1816 | |
| 1817 This method can be used to retrieve the last error message. Errors | |
| 1818 are not reset to empty by successful calls, so contents are only valid | |
| 1819 immediately after an error condition has been detected. | |
| 1820 | |
| 1821 =cut | |
| 1822 | |
| 1823 sub error { | |
| 1824 my $self = shift; | |
| 1825 my $g = $self->{error}; | |
| 1826 $self->{error} = join '',@_ if @_; | |
| 1827 $g; | |
| 1828 } | |
| 1829 | |
| 1830 =head2 debug | |
| 1831 | |
| 1832 Title : debug | |
| 1833 Usage : $db->debug( [$flag] ); | |
| 1834 Function: read or set debug flag | |
| 1835 Returns : current value of debug flag | |
| 1836 Args : new debug flag (optional) | |
| 1837 Status : Public | |
| 1838 | |
| 1839 This method can be used to turn on debug messages. The exact nature | |
| 1840 of those messages depends on the adaptor in use. | |
| 1841 | |
| 1842 =cut | |
| 1843 | |
| 1844 sub debug { | |
| 1845 my $self = shift; | |
| 1846 my $g = $self->{debug}; | |
| 1847 $self->{debug} = shift if @_; | |
| 1848 $g; | |
| 1849 } | |
| 1850 | |
| 1851 | |
| 1852 =head2 automerge | |
| 1853 | |
| 1854 Title : automerge | |
| 1855 Usage : $db->automerge( [$new automerge] ); | |
| 1856 Function: get or set automerge value | |
| 1857 Returns : current value (boolean) | |
| 1858 Args : an optional argument to set the automerge value | |
| 1859 Status : Public | |
| 1860 | |
| 1861 By default, this module will use the aggregators to merge groups into | |
| 1862 single composite objects. This default can be changed to false by | |
| 1863 calling automerge(0). | |
| 1864 | |
| 1865 =cut | |
| 1866 | |
| 1867 sub automerge { | |
| 1868 my $self = shift; | |
| 1869 my $g = $self->{automerge}; | |
| 1870 $self->{automerge} = shift if @_; | |
| 1871 $g; | |
| 1872 } | |
| 1873 | |
| 1874 =head2 attributes | |
| 1875 | |
| 1876 Title : attributes | |
| 1877 Usage : @attributes = $db->attributes($id,$name) | |
| 1878 Function: get the "attributres" on a particular feature | |
| 1879 Returns : an array of string | |
| 1880 Args : feature ID | |
| 1881 Status : public | |
| 1882 | |
| 1883 Some GFF version 2 files use the groups column to store a series of | |
| 1884 attribute/value pairs. In this interpretation of GFF, the first such | |
| 1885 pair is treated as the primary group for the feature; subsequent pairs | |
| 1886 are treated as attributes. Two attributes have special meaning: | |
| 1887 "Note" is for backward compatibility and is used for unstructured text | |
| 1888 remarks. "Alias" is considered as a synonym for the feature name. | |
| 1889 | |
| 1890 If no name is provided, then attributes() returns a flattened hash, of | |
| 1891 attribute=E<gt>value pairs. This lets you do: | |
| 1892 | |
| 1893 %attributes = $db->attributes($id); | |
| 1894 | |
| 1895 Normally, attributes() will be called by the feature: | |
| 1896 | |
| 1897 @notes = $feature->attributes('Note'); | |
| 1898 | |
| 1899 In a scalar context, attributes() returns the first value of the | |
| 1900 attribute if a tag is present, otherwise a hash reference in which the | |
| 1901 keys are attribute names and the values are anonymous arrays | |
| 1902 containing the values. | |
| 1903 | |
| 1904 =cut | |
| 1905 | |
| 1906 sub attributes { | |
| 1907 my $self = shift; | |
| 1908 my ($id,$tag) = @_; | |
| 1909 my @result = $self->do_attributes($id,$tag) or return; | |
| 1910 return @result if wantarray; | |
| 1911 | |
| 1912 # what to do in an array context | |
| 1913 return $result[0] if $tag; | |
| 1914 my %result; | |
| 1915 while (my($key,$value) = splice(@result,0,2)) { | |
| 1916 push @{$result{$key}},$value; | |
| 1917 } | |
| 1918 return \%result; | |
| 1919 } | |
| 1920 | |
| 1921 =head2 fast_queries | |
| 1922 | |
| 1923 Title : fast_queries | |
| 1924 Usage : $flag = $db->fast_queries([$flag]) | |
| 1925 Function: turn on and off the "fast queries" option | |
| 1926 Returns : a boolean | |
| 1927 Args : a boolean flag (optional) | |
| 1928 Status : public | |
| 1929 | |
| 1930 The mysql database driver (and possibly others) support a "fast" query | |
| 1931 mode that caches results on the server side. This makes queries come | |
| 1932 back faster, particularly when creating iterators. The downside is | |
| 1933 that while iterating, new queries will die with a "command synch" | |
| 1934 error. This method turns the feature on and off. | |
| 1935 | |
| 1936 For databases that do not support a fast query, this method has no | |
| 1937 effect. | |
| 1938 | |
| 1939 =cut | |
| 1940 | |
| 1941 # override this method in order to set the mysql_use_result attribute, which is an obscure | |
| 1942 # but extremely powerful optimization for both performance and memory. | |
| 1943 sub fast_queries { | |
| 1944 my $self = shift; | |
| 1945 my $d = $self->{fast_queries}; | |
| 1946 $self->{fast_queries} = shift if @_; | |
| 1947 $d; | |
| 1948 } | |
| 1949 | |
| 1950 =head2 add_aggregator | |
| 1951 | |
| 1952 Title : add_aggregator | |
| 1953 Usage : $db->add_aggregator($aggregator) | |
| 1954 Function: add an aggregator to the list | |
| 1955 Returns : nothing | |
| 1956 Args : an aggregator | |
| 1957 Status : public | |
| 1958 | |
| 1959 This method will append an aggregator to the end of the list of | |
| 1960 registered aggregators. Three different argument types are accepted: | |
| 1961 | |
| 1962 1) a Bio::DB::GFF::Aggregator object -- will be added | |
| 1963 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}" | |
| 1964 -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method | |
| 1965 part is optional). | |
| 1966 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator | |
| 1967 subclass, where the token corresponds to the subclass name. | |
| 1968 | |
| 1969 =cut | |
| 1970 | |
| 1971 sub add_aggregator { | |
| 1972 my $self = shift; | |
| 1973 my $aggregator = shift; | |
| 1974 my $list = $self->{aggregators} ||= []; | |
| 1975 if (ref $aggregator) { # an object | |
| 1976 @$list = grep {$_->get_method ne $aggregator->get_method} @$list; | |
| 1977 push @$list,$aggregator; | |
| 1978 } | |
| 1979 | |
| 1980 elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) { | |
| 1981 my($agg_name,$subparts,$mainpart) = ($1,$2,$3); | |
| 1982 my @subparts = split /,\s*/,$subparts; | |
| 1983 my @args = (-method => $agg_name, | |
| 1984 -sub_parts => \@subparts); | |
| 1985 push @args,(-main_method => $mainpart) if $mainpart; | |
| 1986 warn "making an aggregator with (@args), subparts = @subparts" if $self->debug; | |
| 1987 push @$list,Bio::DB::GFF::Aggregator->new(@args); | |
| 1988 } | |
| 1989 | |
| 1990 else { | |
| 1991 my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E"; | |
| 1992 eval "require $class"; | |
| 1993 $self->throw("Unable to load $aggregator aggregator: $@") if $@; | |
| 1994 push @$list,$class->new(); | |
| 1995 } | |
| 1996 } | |
| 1997 | |
| 1998 =head2 aggregators | |
| 1999 | |
| 2000 Title : aggregators | |
| 2001 Usage : $db->aggregators([@new_aggregators]); | |
| 2002 Function: retrieve list of aggregators | |
| 2003 Returns : list of aggregators | |
| 2004 Args : a list of aggregators to set (optional) | |
| 2005 Status : public | |
| 2006 | |
| 2007 This method will get or set the list of aggregators assigned to | |
| 2008 the database. If 1 or more arguments are passed, the existing | |
| 2009 set will be cleared. | |
| 2010 | |
| 2011 =cut | |
| 2012 | |
| 2013 sub aggregators { | |
| 2014 my $self = shift; | |
| 2015 my $d = $self->{aggregators}; | |
| 2016 if (@_) { | |
| 2017 $self->clear_aggregators; | |
| 2018 $self->add_aggregator($_) foreach @_; | |
| 2019 } | |
| 2020 return unless $d; | |
| 2021 return @$d; | |
| 2022 } | |
| 2023 | |
| 2024 =head2 clear_aggregators | |
| 2025 | |
| 2026 Title : clear_aggregators | |
| 2027 Usage : $db->clear_aggregators | |
| 2028 Function: clears list of aggregators | |
| 2029 Returns : nothing | |
| 2030 Args : none | |
| 2031 Status : public | |
| 2032 | |
| 2033 This method will clear the aggregators stored in the database object. | |
| 2034 Use aggregators() or add_aggregator() to add some back. | |
| 2035 | |
| 2036 =cut | |
| 2037 | |
| 2038 sub clear_aggregators { shift->{aggregators} = [] } | |
| 2039 | |
| 2040 =head1 Methods for use by Subclasses | |
| 2041 | |
| 2042 The following methods are chiefly of interest to subclasses and are | |
| 2043 not intended for use by end programmers. | |
| 2044 | |
| 2045 =head2 abscoords | |
| 2046 | |
| 2047 Title : abscoords | |
| 2048 Usage : $db->abscoords($name,$class,$refseq) | |
| 2049 Function: finds position of a landmark in reference coordinates | |
| 2050 Returns : ($ref,$class,$start,$stop,$strand) | |
| 2051 Args : name and class of landmark | |
| 2052 Status : public | |
| 2053 | |
| 2054 This method is called by Bio::DB::GFF::RelSegment to obtain the | |
| 2055 absolute coordinates of a sequence landmark. The arguments are the | |
| 2056 name and class of the landmark. If successful, abscoords() returns | |
| 2057 the ID of the reference sequence, its class, its start and stop | |
| 2058 positions, and the orientation of the reference sequence's coordinate | |
| 2059 system ("+" for forward strand, "-" for reverse strand). | |
| 2060 | |
| 2061 If $refseq is present in the argument list, it forces the query to | |
| 2062 search for the landmark in a particular reference sequence. | |
| 2063 | |
| 2064 =cut | |
| 2065 | |
| 2066 sub abscoords { | |
| 2067 my $self = shift; | |
| 2068 my ($name,$class,$refseq) = @_; | |
| 2069 $class ||= $self->{default_class}; | |
| 2070 $self->get_abscoords($name,$class,$refseq); | |
| 2071 } | |
| 2072 | |
| 2073 =head1 Protected API | |
| 2074 | |
| 2075 The following methods are not intended for public consumption, but are | |
| 2076 intended to be overridden/implemented by adaptors. | |
| 2077 | |
| 2078 =head2 default_aggregators | |
| 2079 | |
| 2080 Title : default_aggregators | |
| 2081 Usage : $db->default_aggregators; | |
| 2082 Function: retrieve list of aggregators | |
| 2083 Returns : array reference containing list of aggregator names | |
| 2084 Args : none | |
| 2085 Status : protected | |
| 2086 | |
| 2087 This method (which is intended to be overridden by adaptors) returns a | |
| 2088 list of standard aggregators to be applied when no aggregators are | |
| 2089 specified in the constructor. | |
| 2090 | |
| 2091 =cut | |
| 2092 | |
| 2093 sub default_aggregators { | |
| 2094 my $self = shift; | |
| 2095 return ['processed_transcript','alignment']; | |
| 2096 } | |
| 2097 | |
| 2098 =head2 do_load_gff | |
| 2099 | |
| 2100 Title : do_load_gff | |
| 2101 Usage : $db->do_load_gff($handle) | |
| 2102 Function: load a GFF input stream | |
| 2103 Returns : number of features loaded | |
| 2104 Args : A filehandle. | |
| 2105 Status : protected | |
| 2106 | |
| 2107 This method is called to load a GFF data stream. The method will read | |
| 2108 GFF features from E<lt>E<gt> and load them into the database. On exit the | |
| 2109 method must return the number of features loaded. | |
| 2110 | |
| 2111 Note that the method is responsible for parsing the GFF lines. This | |
| 2112 is to allow for differences in the interpretation of the "group" | |
| 2113 field, which are legion. | |
| 2114 | |
| 2115 You probably want to use load_gff() instead. It is more flexible | |
| 2116 about the arguments it accepts. | |
| 2117 | |
| 2118 =cut | |
| 2119 | |
| 2120 # load from <> | |
| 2121 sub do_load_gff { | |
| 2122 my $self = shift; | |
| 2123 my $io_handle = shift; | |
| 2124 | |
| 2125 local $self->{gff3_flag} = 0; | |
| 2126 $self->setup_load(); | |
| 2127 | |
| 2128 my $fasta_sequence_id; | |
| 2129 | |
| 2130 while (<$io_handle>) { | |
| 2131 chomp; | |
| 2132 $self->{gff3_flag}++ if /^\#\#gff-version\s+3/; | |
| 2133 if (/^>(\S+)/) { # uh oh, sequence coming | |
| 2134 $fasta_sequence_id = $1; | |
| 2135 last; | |
| 2136 } | |
| 2137 if (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line | |
| 2138 $self->load_gff_line( | |
| 2139 { | |
| 2140 ref => $1, | |
| 2141 class => 'Sequence', | |
| 2142 source => 'reference', | |
| 2143 method => 'Component', | |
| 2144 start => $2, | |
| 2145 stop => $3, | |
| 2146 score => undef, | |
| 2147 strand => undef, | |
| 2148 phase => undef, | |
| 2149 gclass => 'Sequence', | |
| 2150 gname => $1, | |
| 2151 tstart => undef, | |
| 2152 tstop => undef, | |
| 2153 attributes => [], | |
| 2154 } | |
| 2155 ); | |
| 2156 next; | |
| 2157 } | |
| 2158 | |
| 2159 next if /^\#/; | |
| 2160 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t"; | |
| 2161 next unless defined($ref) && defined($method) && defined($start) && defined($stop); | |
| 2162 foreach (\$score,\$strand,\$phase) { | |
| 2163 undef $$_ if $$_ eq '.'; | |
| 2164 } | |
| 2165 | |
| 2166 my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{gff3_flag}); | |
| 2167 | |
| 2168 # no standard way in the GFF file to denote the class of the reference sequence -- drat! | |
| 2169 # so we invoke the factory to do it | |
| 2170 my $class = $self->refclass($ref); | |
| 2171 | |
| 2172 # call subclass to do the dirty work | |
| 2173 if ($start > $stop) { | |
| 2174 ($start,$stop) = ($stop,$start); | |
| 2175 if ($strand eq '+') { | |
| 2176 $strand = '-'; | |
| 2177 } elsif ($strand eq '-') { | |
| 2178 $strand = '+'; | |
| 2179 } | |
| 2180 } | |
| 2181 $self->load_gff_line({ref => $ref, | |
| 2182 class => $class, | |
| 2183 source => $source, | |
| 2184 method => $method, | |
| 2185 start => $start, | |
| 2186 stop => $stop, | |
| 2187 score => $score, | |
| 2188 strand => $strand, | |
| 2189 phase => $phase, | |
| 2190 gclass => $gclass, | |
| 2191 gname => $gname, | |
| 2192 tstart => $tstart, | |
| 2193 tstop => $tstop, | |
| 2194 attributes => $attributes} | |
| 2195 ); | |
| 2196 } | |
| 2197 | |
| 2198 my $result = $self->finish_load(); | |
| 2199 $result += $self->load_sequence($io_handle,$fasta_sequence_id) | |
| 2200 if defined $fasta_sequence_id; | |
| 2201 $result; | |
| 2202 | |
| 2203 } | |
| 2204 | |
| 2205 =head2 load_sequence | |
| 2206 | |
| 2207 Title : load_sequence | |
| 2208 Usage : $db->load_sequence($handle [,$id]) | |
| 2209 Function: load a FASTA data stream | |
| 2210 Returns : number of sequences | |
| 2211 Args : a filehandle and optionally the ID of | |
| 2212 the first sequence in the stream. | |
| 2213 Status : protected | |
| 2214 | |
| 2215 You probably want to use load_fasta() instead. The $id argument is a | |
| 2216 hack used to switch from GFF loading to FASTA loading when load_gff() | |
| 2217 discovers FASTA data hiding at the bottom of the GFF file (as Artemis | |
| 2218 does). | |
| 2219 | |
| 2220 =cut | |
| 2221 | |
| 2222 sub load_sequence { | |
| 2223 my $self = shift; | |
| 2224 my $io_handle = shift; | |
| 2225 my $id = shift; # hack for GFF files that contain fasta data | |
| 2226 | |
| 2227 # read fasta file(s) from ARGV | |
| 2228 my ($seq,$offset,$loaded) = (undef,0,0); | |
| 2229 while (<$io_handle>) { | |
| 2230 chomp; | |
| 2231 if (/^>(\S+)/) { | |
| 2232 $self->insert_sequence($id,$offset,$seq) if $id; | |
| 2233 $id = $1; | |
| 2234 $offset = 0; | |
| 2235 $seq = ''; | |
| 2236 $loaded++; | |
| 2237 } else { | |
| 2238 $seq .= $_; | |
| 2239 $self->insert_sequence_chunk($id,\$offset,\$seq); | |
| 2240 } | |
| 2241 } | |
| 2242 $self->insert_sequence($id,$offset,$seq) if $id; | |
| 2243 $loaded+0; | |
| 2244 } | |
| 2245 | |
| 2246 sub insert_sequence_chunk { | |
| 2247 my $self = shift; | |
| 2248 my ($id,$offsetp,$seqp) = @_; | |
| 2249 if (my $cs = $self->dna_chunk_size) { | |
| 2250 while (length($$seqp) >= $cs) { | |
| 2251 my $chunk = substr($$seqp,0,$cs); | |
| 2252 $self->insert_sequence($id,$$offsetp,$chunk); | |
| 2253 $$offsetp += length($chunk); | |
| 2254 substr($$seqp,0,$cs) = ''; | |
| 2255 } | |
| 2256 } | |
| 2257 return 1; # the calling routine may expect success or failure | |
| 2258 } | |
| 2259 | |
| 2260 # used to store big pieces of DNA in itty bitty pieces | |
| 2261 sub dna_chunk_size { | |
| 2262 return 0; | |
| 2263 } | |
| 2264 | |
| 2265 sub insert_sequence { | |
| 2266 my $self = shift; | |
| 2267 my($id,$offset,$seq) = @_; | |
| 2268 $self->throw('insert_sequence(): must be defined in subclass'); | |
| 2269 } | |
| 2270 | |
| 2271 # This is the default class for reference points. Defaults to Sequence. | |
| 2272 sub default_class { | |
| 2273 my $self = shift; | |
| 2274 my $d = exists($self->{default_class}) ? $self->{default_class} : 'Sequence'; | |
| 2275 $self->{default_class} = shift if @_; | |
| 2276 $d; | |
| 2277 } | |
| 2278 | |
| 2279 # gets name of the reference sequence, and returns its class | |
| 2280 # currently just calls default_class | |
| 2281 sub refclass { | |
| 2282 my $self = shift; | |
| 2283 my $name = shift; | |
| 2284 return $self->default_class; | |
| 2285 } | |
| 2286 | |
| 2287 =head2 setup_load | |
| 2288 | |
| 2289 Title : setup_load | |
| 2290 Usage : $db->setup_load | |
| 2291 Function: called before load_gff_line() | |
| 2292 Returns : void | |
| 2293 Args : none | |
| 2294 Status : abstract | |
| 2295 | |
| 2296 This abstract method gives subclasses a chance to do any | |
| 2297 schema-specific initialization prior to loading a set of GFF records. | |
| 2298 It must be implemented by a subclass. | |
| 2299 | |
| 2300 =cut | |
| 2301 | |
| 2302 sub setup_load { | |
| 2303 # default, do nothing | |
| 2304 } | |
| 2305 | |
| 2306 =head2 finish_load | |
| 2307 | |
| 2308 Title : finish_load | |
| 2309 Usage : $db->finish_load | |
| 2310 Function: called after load_gff_line() | |
| 2311 Returns : number of records loaded | |
| 2312 Args : none | |
| 2313 Status :abstract | |
| 2314 | |
| 2315 This method gives subclasses a chance to do any schema-specific | |
| 2316 cleanup after loading a set of GFF records. | |
| 2317 | |
| 2318 =cut | |
| 2319 | |
| 2320 sub finish_load { | |
| 2321 # default, do nothing | |
| 2322 } | |
| 2323 | |
| 2324 =head2 load_gff_line | |
| 2325 | |
| 2326 Title : load_gff_line | |
| 2327 Usage : $db->load_gff_line(@args) | |
| 2328 Function: called to load one parsed line of GFF | |
| 2329 Returns : true if successfully inserted | |
| 2330 Args : see below | |
| 2331 Status : abstract | |
| 2332 | |
| 2333 This abstract method is called once per line of the GFF and passed a | |
| 2334 hashref containing parsed GFF fields. The fields are: | |
| 2335 | |
| 2336 {ref => $ref, | |
| 2337 class => $class, | |
| 2338 source => $source, | |
| 2339 method => $method, | |
| 2340 start => $start, | |
| 2341 stop => $stop, | |
| 2342 score => $score, | |
| 2343 strand => $strand, | |
| 2344 phase => $phase, | |
| 2345 gclass => $gclass, | |
| 2346 gname => $gname, | |
| 2347 tstart => $tstart, | |
| 2348 tstop => $tstop, | |
| 2349 attributes => $attributes} | |
| 2350 | |
| 2351 =cut | |
| 2352 | |
| 2353 sub load_gff_line { | |
| 2354 shift->throw("load_gff_line(): must be implemented by an adaptor"); | |
| 2355 } | |
| 2356 | |
| 2357 | |
| 2358 =head2 do_initialize | |
| 2359 | |
| 2360 Title : do_initialize | |
| 2361 Usage : $db->do_initialize([$erase]) | |
| 2362 Function: initialize and possibly erase database | |
| 2363 Returns : true if successful | |
| 2364 Args : optional erase flag | |
| 2365 Status : protected | |
| 2366 | |
| 2367 This method implements the initialize() method described above, and | |
| 2368 takes the same arguments. | |
| 2369 | |
| 2370 =cut | |
| 2371 | |
| 2372 sub do_initialize { | |
| 2373 shift->throw('do_initialize(): must be implemented by an adaptor'); | |
| 2374 } | |
| 2375 | |
| 2376 =head2 dna | |
| 2377 | |
| 2378 Title : dna | |
| 2379 Usage : $db->dna($id,$start,$stop,$class) | |
| 2380 Function: return the raw DNA string for a segment | |
| 2381 Returns : a raw DNA string | |
| 2382 Args : id of the sequence, its class, start and stop positions | |
| 2383 Status : public | |
| 2384 | |
| 2385 This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA | |
| 2386 sequence. | |
| 2387 | |
| 2388 Arguments: -name sequence name | |
| 2389 -start start position | |
| 2390 -stop stop position | |
| 2391 -class sequence class | |
| 2392 | |
| 2393 If start and stop are both undef, then the entire DNA is retrieved. | |
| 2394 So to fetch the whole dna, call like this: | |
| 2395 | |
| 2396 $db->dna($name_of_sequence); | |
| 2397 | |
| 2398 or like this: | |
| 2399 | |
| 2400 $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence); | |
| 2401 | |
| 2402 NOTE: you will probably prefer to create a Segment and then invoke its | |
| 2403 dna() method. | |
| 2404 | |
| 2405 =cut | |
| 2406 | |
| 2407 # call to return the DNA string for the indicated region | |
| 2408 # real work is done by get_dna() | |
| 2409 sub dna { | |
| 2410 my $self = shift; | |
| 2411 my ($id,$start,$stop,$class) = rearrange([ | |
| 2412 [qw(NAME ID REF REFSEQ)], | |
| 2413 qw(START), | |
| 2414 [qw(STOP END)], | |
| 2415 'CLASS', | |
| 2416 ],@_); | |
| 2417 # return unless defined $start && defined $stop; | |
| 2418 $self->get_dna($id,$start,$stop,$class); | |
| 2419 } | |
| 2420 | |
| 2421 sub features_in_range { | |
| 2422 my $self = shift; | |
| 2423 my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) = | |
| 2424 rearrange([ | |
| 2425 [qw(RANGE_TYPE)], | |
| 2426 [qw(REF REFSEQ)], | |
| 2427 qw(CLASS), | |
| 2428 qw(START), | |
| 2429 [qw(STOP END)], | |
| 2430 [qw(TYPE TYPES)], | |
| 2431 qw(PARENT), | |
| 2432 [qw(RARE SPARSE)], | |
| 2433 [qw(MERGE AUTOMERGE)], | |
| 2434 'ITERATOR' | |
| 2435 ],@_); | |
| 2436 $other ||= {}; | |
| 2437 $automerge = $types && $self->automerge unless defined $automerge; | |
| 2438 $self->throw("range type must be one of {". | |
| 2439 join(',',keys %valid_range_types). | |
| 2440 "}\n") | |
| 2441 unless $valid_range_types{lc $range_type}; | |
| 2442 $self->_features({ | |
| 2443 rangetype => lc $range_type, | |
| 2444 refseq => $refseq, | |
| 2445 refclass => $class, | |
| 2446 start => $start, | |
| 2447 stop => $stop, | |
| 2448 types => $types }, | |
| 2449 { | |
| 2450 sparse => $sparse, | |
| 2451 automerge => $automerge, | |
| 2452 iterator => $iterator, | |
| 2453 %$other, | |
| 2454 }, | |
| 2455 $parent); | |
| 2456 } | |
| 2457 | |
| 2458 =head2 get_dna | |
| 2459 | |
| 2460 Title : get_dna | |
| 2461 Usage : $db->get_dna($id,$start,$stop,$class) | |
| 2462 Function: get DNA for indicated segment | |
| 2463 Returns : the dna string | |
| 2464 Args : sequence ID, start, stop and class | |
| 2465 Status : protected | |
| 2466 | |
| 2467 If start E<gt> stop and the sequence is nucleotide, then this method | |
| 2468 should return the reverse complement. The sequence class may be | |
| 2469 ignored by those databases that do not recognize different object | |
| 2470 types. | |
| 2471 | |
| 2472 =cut | |
| 2473 | |
| 2474 sub get_dna { | |
| 2475 my $self = shift; | |
| 2476 my ($id,$start,$stop,$class,) = @_; | |
| 2477 $self->throw("get_dna() must be implemented by an adaptor"); | |
| 2478 } | |
| 2479 | |
| 2480 =head2 get_features | |
| 2481 | |
| 2482 Title : get_features | |
| 2483 Usage : $db->get_features($search,$options,$callback) | |
| 2484 Function: get list of features for a region | |
| 2485 Returns : count of number of features retrieved | |
| 2486 Args : see below | |
| 2487 Status : protected | |
| 2488 | |
| 2489 The first argument is a hash reference containing search criteria for | |
| 2490 retrieving features. It contains the following keys: | |
| 2491 | |
| 2492 rangetype One of "overlaps", "contains" or "contained_in". Indicates | |
| 2493 the type of range query requested. | |
| 2494 | |
| 2495 refseq ID of the landmark that establishes the absolute | |
| 2496 coordinate system. | |
| 2497 | |
| 2498 refclass Class of this landmark. Can be ignored by implementations | |
| 2499 that don't recognize such distinctions. | |
| 2500 | |
| 2501 start Start of the range, inclusive. | |
| 2502 | |
| 2503 stop Stop of the range, inclusive. | |
| 2504 | |
| 2505 types Array reference containing the list of annotation types | |
| 2506 to fetch from the database. Each annotation type is an | |
| 2507 array reference consisting of [source,method]. | |
| 2508 | |
| 2509 The second argument is a hash reference containing certain options | |
| 2510 that affect the way information is retrieved: | |
| 2511 | |
| 2512 sort_by_group | |
| 2513 A flag. If true, means that the returned features should be | |
| 2514 sorted by the group that they're in. | |
| 2515 | |
| 2516 sparse A flag. If true, means that the expected density of the | |
| 2517 features is such that it will be more efficient to search | |
| 2518 by type rather than by range. If it is taking a long | |
| 2519 time to fetch features, give this a try. | |
| 2520 | |
| 2521 binsize A true value will create a set of artificial features whose | |
| 2522 start and stop positions indicate bins of the given size, and | |
| 2523 whose scores are the number of features in the bin. The | |
| 2524 class of the feature will be set to "bin", and its name to | |
| 2525 "method:source". This is a handy way of generating histograms | |
| 2526 of feature density. | |
| 2527 | |
| 2528 The third argument, the $callback, is a code reference to which | |
| 2529 retrieved features are passed. It is described in more detail below. | |
| 2530 | |
| 2531 This routine is responsible for getting arrays of GFF data out of the | |
| 2532 database and passing them to the callback subroutine. The callback | |
| 2533 does the work of constructing a Bio::DB::GFF::Feature object out of | |
| 2534 that data. The callback expects a list of 13 fields: | |
| 2535 | |
| 2536 $refseq The reference sequence | |
| 2537 $start feature start | |
| 2538 $stop feature stop | |
| 2539 $source feature source | |
| 2540 $method feature method | |
| 2541 $score feature score | |
| 2542 $strand feature strand | |
| 2543 $phase feature phase | |
| 2544 $groupclass group class (may be undef) | |
| 2545 $groupname group ID (may be undef) | |
| 2546 $tstart target start for similarity hits (may be undef) | |
| 2547 $tstop target stop for similarity hits (may be undef) | |
| 2548 $feature_id A unique feature ID (may be undef) | |
| 2549 | |
| 2550 These fields are in the same order as the raw GFF file, with the | |
| 2551 exception that the group column has been parsed into group class and | |
| 2552 group name fields. | |
| 2553 | |
| 2554 The feature ID, if provided, is a unique identifier of the feature | |
| 2555 line. The module does not depend on this ID in any way, but it is | |
| 2556 available via Bio::DB::GFF-E<gt>id() if wanted. In the dbi::mysql and | |
| 2557 dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb | |
| 2558 adaptor it is not used. | |
| 2559 | |
| 2560 =cut | |
| 2561 | |
| 2562 sub get_features{ | |
| 2563 my $self = shift; | |
| 2564 my ($search,$options,$callback) = @_; | |
| 2565 $self->throw("get_features() must be implemented by an adaptor"); | |
| 2566 } | |
| 2567 | |
| 2568 | |
| 2569 =head2 _feature_by_name | |
| 2570 | |
| 2571 Title : _feature_by_name | |
| 2572 Usage : $db->_feature_by_name($class,$name,$location,$callback) | |
| 2573 Function: get a list of features by name and class | |
| 2574 Returns : count of number of features retrieved | |
| 2575 Args : name of feature, class of feature, and a callback | |
| 2576 Status : abstract | |
| 2577 | |
| 2578 This method is used internally. The callback arguments are the same | |
| 2579 as those used by make_feature(). This method must be overidden by | |
| 2580 subclasses. | |
| 2581 | |
| 2582 =cut | |
| 2583 | |
| 2584 sub _feature_by_name { | |
| 2585 my $self = shift; | |
| 2586 my ($class,$name,$location,$callback) = @_; | |
| 2587 $self->throw("_feature_by_name() must be implemented by an adaptor"); | |
| 2588 } | |
| 2589 | |
| 2590 sub _feature_by_attribute { | |
| 2591 my $self = shift; | |
| 2592 my ($attributes,$callback) = @_; | |
| 2593 $self->throw("_feature_by_name() must be implemented by an adaptor"); | |
| 2594 } | |
| 2595 | |
| 2596 =head2 _feature_by_id | |
| 2597 | |
| 2598 Title : _feature_by_id | |
| 2599 Usage : $db->_feature_by_id($ids,$type,$callback) | |
| 2600 Function: get a feature based | |
| 2601 Returns : count of number of features retrieved | |
| 2602 Args : arrayref to feature IDs to fetch | |
| 2603 Status : abstract | |
| 2604 | |
| 2605 This method is used internally to fetch features either by their ID or | |
| 2606 their group ID. $ids is a arrayref containing a list of IDs, $type is | |
| 2607 one of "feature" or "group", and $callback is a callback. The | |
| 2608 callback arguments are the same as those used by make_feature(). This | |
| 2609 method must be overidden by subclasses. | |
| 2610 | |
| 2611 =cut | |
| 2612 | |
| 2613 sub _feature_by_id { | |
| 2614 my $self = shift; | |
| 2615 my ($ids,$type,$callback) = @_; | |
| 2616 $self->throw("_feature_by_id() must be implemented by an adaptor"); | |
| 2617 } | |
| 2618 | |
| 2619 =head2 overlapping_features | |
| 2620 | |
| 2621 Title : overlapping_features | |
| 2622 Usage : $db->overlapping_features(@args) | |
| 2623 Function: get features that overlap the indicated range | |
| 2624 Returns : a list of Bio::DB::GFF::Feature objects | |
| 2625 Args : see below | |
| 2626 Status : public | |
| 2627 | |
| 2628 This method is invoked by Bio::DB::GFF::Segment-E<gt>features() to find | |
| 2629 the list of features that overlap a given range. It is generally | |
| 2630 preferable to create the Segment first, and then fetch the features. | |
| 2631 | |
| 2632 This method takes set of named arguments: | |
| 2633 | |
| 2634 -refseq ID of the reference sequence | |
| 2635 -class Class of the reference sequence | |
| 2636 -start Start of the desired range in refseq coordinates | |
| 2637 -stop Stop of the desired range in refseq coordinates | |
| 2638 -types List of feature types to return. Argument is an array | |
| 2639 reference containing strings of the format "method:source" | |
| 2640 -parent A parent Bio::DB::GFF::Segment object, used to create | |
| 2641 relative coordinates in the generated features. | |
| 2642 -rare Turn on an optimization suitable for a relatively rare feature type, | |
| 2643 where it will be faster to filter by feature type first | |
| 2644 and then by position, rather than vice versa. | |
| 2645 -merge Whether to apply aggregators to the generated features. | |
| 2646 -iterator Whether to return an iterator across the features. | |
| 2647 | |
| 2648 If -iterator is true, then the method returns a single scalar value | |
| 2649 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly | |
| 2650 on this object to fetch each of the features in turn. If iterator is | |
| 2651 false or absent, then all the features are returned as a list. | |
| 2652 | |
| 2653 Currently aggregation is disabled when iterating over a series of | |
| 2654 features. | |
| 2655 | |
| 2656 Types are indicated using the nomenclature "method:source". Either of | |
| 2657 these fields can be omitted, in which case a wildcard is used for the | |
| 2658 missing field. Type names without the colon (e.g. "exon") are | |
| 2659 interpreted as the method name and a source wild card. Regular | |
| 2660 expressions are allowed in either field, as in: "similarity:BLAST.*". | |
| 2661 | |
| 2662 =cut | |
| 2663 | |
| 2664 # call to return the features that overlap the named region | |
| 2665 # real work is done by get_features | |
| 2666 sub overlapping_features { | |
| 2667 my $self = shift; | |
| 2668 $self->features_in_range(-range_type=>'overlaps',@_); | |
| 2669 } | |
| 2670 | |
| 2671 =head2 contained_features | |
| 2672 | |
| 2673 Title : contained_features | |
| 2674 Usage : $db->contained_features(@args) | |
| 2675 Function: get features that are contained within the indicated range | |
| 2676 Returns : a list of Bio::DB::GFF::Feature objects | |
| 2677 Args : see overlapping_features() | |
| 2678 Status : public | |
| 2679 | |
| 2680 This call is similar to overlapping_features(), except that it only | |
| 2681 retrieves features whose end points are completely contained within | |
| 2682 the specified range. | |
| 2683 | |
| 2684 Generally you will want to fetch a Bio::DB::GFF::Segment object and | |
| 2685 call its contained_features() method rather than call this directly. | |
| 2686 | |
| 2687 =cut | |
| 2688 | |
| 2689 # The same, except that it only returns features that are completely contained within the | |
| 2690 # range (much faster usually) | |
| 2691 sub contained_features { | |
| 2692 my $self = shift; | |
| 2693 $self->features_in_range(-range_type=>'contains',@_); | |
| 2694 } | |
| 2695 | |
| 2696 =head2 contained_in | |
| 2697 | |
| 2698 Title : contained_in | |
| 2699 Usage : @features = $s->contained_in(@args) | |
| 2700 Function: get features that contain this segment | |
| 2701 Returns : a list of Bio::DB::GFF::Feature objects | |
| 2702 Args : see features() | |
| 2703 Status : Public | |
| 2704 | |
| 2705 This is identical in behavior to features() except that it returns | |
| 2706 only those features that completely contain the segment. | |
| 2707 | |
| 2708 =cut | |
| 2709 | |
| 2710 sub contained_in { | |
| 2711 my $self = shift; | |
| 2712 $self->features_in_range(-range_type=>'contained_in',@_); | |
| 2713 } | |
| 2714 | |
| 2715 =head2 get_abscoords | |
| 2716 | |
| 2717 Title : get_abscoords | |
| 2718 Usage : $db->get_abscoords($name,$class,$refseq) | |
| 2719 Function: get the absolute coordinates of sequence with name & class | |
| 2720 Returns : ($absref,$absstart,$absstop,$absstrand) | |
| 2721 Args : name and class of the landmark | |
| 2722 Status : protected | |
| 2723 | |
| 2724 Given the name and class of a genomic landmark, this function returns | |
| 2725 a four-element array consisting of: | |
| 2726 | |
| 2727 $absref the ID of the reference sequence that contains this landmark | |
| 2728 $absstart the position at which the landmark starts | |
| 2729 $absstop the position at which the landmark stops | |
| 2730 $absstrand the strand of the landmark, relative to the reference sequence | |
| 2731 | |
| 2732 If $refseq is provided, the function searches only within the | |
| 2733 specified reference sequence. | |
| 2734 | |
| 2735 =cut | |
| 2736 | |
| 2737 sub get_abscoords { | |
| 2738 my $self = shift; | |
| 2739 my ($name,$class,$refseq) = @_; | |
| 2740 $self->throw("get_abscoords() must be implemented by an adaptor"); | |
| 2741 } | |
| 2742 | |
| 2743 =head2 get_types | |
| 2744 | |
| 2745 Title : get_types | |
| 2746 Usage : $db->get_types($absref,$class,$start,$stop,$count) | |
| 2747 Function: get list of all feature types on the indicated segment | |
| 2748 Returns : list or hash of Bio::DB::GFF::Typename objects | |
| 2749 Args : see below | |
| 2750 Status : protected | |
| 2751 | |
| 2752 Arguments are: | |
| 2753 | |
| 2754 $absref the ID of the reference sequence | |
| 2755 $class the class of the reference sequence | |
| 2756 $start the position to start counting | |
| 2757 $stop the position to end counting | |
| 2758 $count a boolean indicating whether to count the number | |
| 2759 of occurrences of each feature type | |
| 2760 | |
| 2761 If $count is true, then a hash is returned. The keys of the hash are | |
| 2762 feature type names in the format "method:source" and the values are | |
| 2763 the number of times a feature of this type overlaps the indicated | |
| 2764 segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename | |
| 2765 objects. If $start or $stop are undef, then all features on the | |
| 2766 indicated segment are enumerated. If $absref is undef, then the call | |
| 2767 returns all feature types in the database. | |
| 2768 | |
| 2769 =cut | |
| 2770 | |
| 2771 sub get_types { | |
| 2772 my $self = shift; | |
| 2773 my ($refseq,$class,$start,$stop,$count,$types) = @_; | |
| 2774 $self->throw("get_types() must be implemented by an adaptor"); | |
| 2775 } | |
| 2776 | |
| 2777 =head2 make_feature | |
| 2778 | |
| 2779 Title : make_feature | |
| 2780 Usage : $db->make_feature(@args) | |
| 2781 Function: Create a Bio::DB::GFF::Feature object from string data | |
| 2782 Returns : a Bio::DB::GFF::Feature object | |
| 2783 Args : see below | |
| 2784 Status : internal | |
| 2785 | |
| 2786 This takes 14 arguments (really!): | |
| 2787 | |
| 2788 $parent A Bio::DB::GFF::RelSegment object | |
| 2789 $group_hash A hashref containing unique list of GFF groups | |
| 2790 $refname The name of the reference sequence for this feature | |
| 2791 $refclass The class of the reference sequence for this feature | |
| 2792 $start Start of feature | |
| 2793 $stop Stop of feature | |
| 2794 $source Feature source field | |
| 2795 $method Feature method field | |
| 2796 $score Feature score field | |
| 2797 $strand Feature strand | |
| 2798 $phase Feature phase | |
| 2799 $group_class Class of feature group | |
| 2800 $group_name Name of feature group | |
| 2801 $tstart For homologies, start of hit on target | |
| 2802 $tstop Stop of hit on target | |
| 2803 | |
| 2804 The $parent argument, if present, is used to establish relative | |
| 2805 coordinates in the resulting Bio::DB::Feature object. This allows one | |
| 2806 feature to generate a list of other features that are relative to its | |
| 2807 coordinate system (for example, finding the coordinates of the second | |
| 2808 exon relative to the coordinates of the first). | |
| 2809 | |
| 2810 The $group_hash allows the group_class/group_name strings to be turned | |
| 2811 into rich database objects via the make_obect() method (see above). | |
| 2812 Because these objects may be expensive to create, $group_hash is used | |
| 2813 to uniquefy them. The index of this hash is the composite key | |
| 2814 {$group_class,$group_name,$tstart,$tstop}. Values are whatever object | |
| 2815 is returned by the make_object() method. | |
| 2816 | |
| 2817 The remainder of the fields are taken from the GFF line, with the | |
| 2818 exception that "Target" features, which contain information about the | |
| 2819 target of a homology search, are parsed into their components. | |
| 2820 | |
| 2821 =cut | |
| 2822 | |
| 2823 # This call is responsible for turning a line of GFF into a | |
| 2824 # feature object. | |
| 2825 # The $parent argument is a Bio::DB::GFF::Segment object and is used | |
| 2826 # to establish the coordinate system for the new feature. | |
| 2827 # The $group_hash argument is an hash ref that holds previously- | |
| 2828 # generated group objects. | |
| 2829 # Other arguments are taken right out of the GFF table. | |
| 2830 sub make_feature { | |
| 2831 my $self = shift; | |
| 2832 my ($parent,$group_hash, # these arguments provided by generic mechanisms | |
| 2833 $srcseq, # the rest is provided by adaptor | |
| 2834 $start,$stop, | |
| 2835 $source,$method, | |
| 2836 $score,$strand,$phase, | |
| 2837 $group_class,$group_name, | |
| 2838 $tstart,$tstop, | |
| 2839 $db_id,$group_id) = @_; | |
| 2840 | |
| 2841 return unless $srcseq; # return undef if called with no arguments. This behavior is used for | |
| 2842 # on-the-fly aggregation. | |
| 2843 | |
| 2844 my $group; # undefined | |
| 2845 if (defined $group_class && defined $group_name) { | |
| 2846 $tstart ||= ''; | |
| 2847 $tstop ||= ''; | |
| 2848 if ($group_hash) { | |
| 2849 $group = $group_hash->{$group_class,$group_name,$tstart,$tstop} | |
| 2850 ||= $self->make_object($group_class,$group_name,$tstart,$tstop); | |
| 2851 } else { | |
| 2852 $group = $self->make_object($group_class,$group_name,$tstart,$tstop); | |
| 2853 } | |
| 2854 } | |
| 2855 | |
| 2856 # fix for some broken GFF files | |
| 2857 # unfortunately - has undesired side effects | |
| 2858 # if (defined $tstart && defined $tstop && !defined $strand) { | |
| 2859 # $strand = $tstart <= $tstop ? '+' : '-'; | |
| 2860 # } | |
| 2861 | |
| 2862 if (ref $parent) { # note that the src sequence is ignored | |
| 2863 return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop, | |
| 2864 $method,$source, | |
| 2865 $score,$strand,$phase, | |
| 2866 $group,$db_id,$group_id, | |
| 2867 $tstart,$tstop); | |
| 2868 } else { | |
| 2869 return Bio::DB::GFF::Feature->new($self,$srcseq, | |
| 2870 $start,$stop, | |
| 2871 $method,$source, | |
| 2872 $score,$strand,$phase, | |
| 2873 $group,$db_id,$group_id, | |
| 2874 $tstart,$tstop); | |
| 2875 } | |
| 2876 } | |
| 2877 | |
| 2878 sub make_aggregated_feature { | |
| 2879 my $self = shift; | |
| 2880 my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3); | |
| 2881 my $feature = $self->make_feature($parent,undef,@_); | |
| 2882 return [$feature] if $feature && !$feature->group; | |
| 2883 | |
| 2884 # if we have accumulated features and either: | |
| 2885 # (1) make_feature() returned undef, indicated very end or | |
| 2886 # (2) the current group is different from the previous one | |
| 2887 | |
| 2888 local $^W = 0; # irritating uninitialized value warning in next statement | |
| 2889 if (@$accumulated_features && | |
| 2890 (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) { | |
| 2891 foreach my $a (@$aggregators) { # last aggregator gets first shot | |
| 2892 $a->aggregate($accumulated_features,$self) or next; | |
| 2893 } | |
| 2894 my @result = @$accumulated_features; | |
| 2895 @$accumulated_features = $feature ? ($feature) : (); | |
| 2896 return unless @result; | |
| 2897 return \@result ; | |
| 2898 } | |
| 2899 push @$accumulated_features,$feature; | |
| 2900 return; | |
| 2901 } | |
| 2902 | |
| 2903 =head2 parse_types | |
| 2904 | |
| 2905 Title : parse_types | |
| 2906 Usage : $db->parse_types(@args) | |
| 2907 Function: parses list of types | |
| 2908 Returns : an array ref containing ['method','source'] pairs | |
| 2909 Args : a list of types in 'method:source' form | |
| 2910 Status : internal | |
| 2911 | |
| 2912 This method takes an array of type names in the format "method:source" | |
| 2913 and returns an array reference of ['method','source'] pairs. It will | |
| 2914 also accept a single argument consisting of an array reference with | |
| 2915 the list of type names. | |
| 2916 | |
| 2917 =cut | |
| 2918 | |
| 2919 # turn feature types in the format "method:source" into a list of [method,source] refs | |
| 2920 sub parse_types { | |
| 2921 my $self = shift; | |
| 2922 return [] if !@_ or !defined($_[0]); | |
| 2923 return $_[0] if ref $_[0] eq 'ARRAY' && ref $_[0][0]; | |
| 2924 my @types = ref($_[0]) ? @{$_[0]} : @_; | |
| 2925 my @type_list = map { [split(':',$_,2)] } @types; | |
| 2926 return \@type_list; | |
| 2927 } | |
| 2928 | |
| 2929 =head2 make_match_sub | |
| 2930 | |
| 2931 Title : make_match_sub | |
| 2932 Usage : $db->make_match_sub($types) | |
| 2933 Function: creates a subroutine used for filtering features | |
| 2934 Returns : a code reference | |
| 2935 Args : a list of parsed type names | |
| 2936 Status : protected | |
| 2937 | |
| 2938 This method is used internally to generate a code subroutine that will | |
| 2939 accept or reject a feature based on its method and source. It takes | |
| 2940 an array of parsed type names in the format returned by parse_types(), | |
| 2941 and generates an anonymous subroutine. The subroutine takes a single | |
| 2942 Bio::DB::GFF::Feature object and returns true if the feature matches | |
| 2943 one of the desired feature types, and false otherwise. | |
| 2944 | |
| 2945 =cut | |
| 2946 | |
| 2947 # a subroutine that matches features indicated by list of types | |
| 2948 sub make_match_sub { | |
| 2949 my $self = shift; | |
| 2950 my $types = shift; | |
| 2951 | |
| 2952 return sub { 1 } unless ref $types && @$types; | |
| 2953 | |
| 2954 my @expr; | |
| 2955 for my $type (@$types) { | |
| 2956 my ($method,$source) = @$type; | |
| 2957 $method ||= '.*'; | |
| 2958 $source = $source ? ":$source" : "(?::.+)?"; | |
| 2959 push @expr,"${method}${source}"; | |
| 2960 } | |
| 2961 my $expr = join '|',@expr; | |
| 2962 return $self->{match_subs}{$expr} if $self->{match_subs}{$expr}; | |
| 2963 | |
| 2964 my $sub =<<END; | |
| 2965 sub { | |
| 2966 my \$feature = shift or return; | |
| 2967 return \$feature->type =~ /^($expr)\$/i; | |
| 2968 } | |
| 2969 END | |
| 2970 warn "match sub: $sub\n" if $self->debug; | |
| 2971 my $compiled_sub = eval $sub; | |
| 2972 $self->throw($@) if $@; | |
| 2973 return $self->{match_subs}{$expr} = $compiled_sub; | |
| 2974 } | |
| 2975 | |
| 2976 =head2 make_object | |
| 2977 | |
| 2978 Title : make_object | |
| 2979 Usage : $db->make_object($class,$name,$start,$stop) | |
| 2980 Function: creates a feature object | |
| 2981 Returns : a feature object | |
| 2982 Args : see below | |
| 2983 Status : protected | |
| 2984 | |
| 2985 This method is called to make an object from the GFF "group" field. | |
| 2986 By default, all Target groups are turned into Bio::DB::GFF::Homol | |
| 2987 objects, and everything else becomes a Bio::DB::GFF::Featname. | |
| 2988 However, adaptors are free to override this method to generate more | |
| 2989 interesting objects, such as true BioPerl objects, or Acedb objects. | |
| 2990 | |
| 2991 Arguments are: | |
| 2992 | |
| 2993 $name database ID for object | |
| 2994 $class class of object | |
| 2995 $start for similarities, start of match inside object | |
| 2996 $stop for similarities, stop of match inside object | |
| 2997 | |
| 2998 =cut | |
| 2999 | |
| 3000 # abstract call to turn a feature into an object, given its class and name | |
| 3001 sub make_object { | |
| 3002 my $self = shift; | |
| 3003 my ($class,$name,$start,$stop) = @_; | |
| 3004 return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop) | |
| 3005 if defined $start and length $start; | |
| 3006 return Bio::DB::GFF::Featname->new($class,$name); | |
| 3007 } | |
| 3008 | |
| 3009 | |
| 3010 =head2 do_attributes | |
| 3011 | |
| 3012 Title : do_attributes | |
| 3013 Usage : $db->do_attributes($id [,$tag]); | |
| 3014 Function: internal method to retrieve attributes given an id and tag | |
| 3015 Returns : a list of Bio::DB::GFF::Feature objects | |
| 3016 Args : a feature id and a attribute tag (optional) | |
| 3017 Status : protected | |
| 3018 | |
| 3019 This method is overridden by subclasses in order to return a list of | |
| 3020 attributes. If called with a tag, returns the value of attributes of | |
| 3021 that tag type. If called without a tag, returns a flattened array of | |
| 3022 (tag=E<gt>value) pairs. A particular tag can be present multiple times. | |
| 3023 | |
| 3024 =cut | |
| 3025 | |
| 3026 sub do_attributes { | |
| 3027 my $self = shift; | |
| 3028 my ($id,$tag) = @_; | |
| 3029 return (); | |
| 3030 } | |
| 3031 | |
| 3032 | |
| 3033 | |
| 3034 =head1 Internal Methods | |
| 3035 | |
| 3036 The following methods are internal to Bio::DB::GFF and are not | |
| 3037 guaranteed to remain the same. | |
| 3038 | |
| 3039 =head2 _features | |
| 3040 | |
| 3041 Title : _features | |
| 3042 Usage : $db->_features($search,$options,$parent) | |
| 3043 Function: internal method | |
| 3044 Returns : a list of Bio::DB::GFF::Feature objects | |
| 3045 Args : see below | |
| 3046 Status : internal | |
| 3047 | |
| 3048 This is an internal method that is called by overlapping_features(), | |
| 3049 contained_features() and features() to create features based on a | |
| 3050 parent segment's coordinate system. It takes three arguments, a | |
| 3051 search options hashref, an options hashref, and a parent segment. | |
| 3052 | |
| 3053 The search hashref contains the following keys: | |
| 3054 | |
| 3055 rangetype One of "overlaps", "contains" or "contained_in". Indicates | |
| 3056 the type of range query requested. | |
| 3057 refseq reference sequence ID | |
| 3058 refclass reference sequence class | |
| 3059 start start of range | |
| 3060 stop stop of range | |
| 3061 types arrayref containing list of types in "method:source" form | |
| 3062 | |
| 3063 The options hashref contains zero or more of the following keys: | |
| 3064 | |
| 3065 sparse turn on optimizations for a rare feature | |
| 3066 automerge if true, invoke aggregators to merge features | |
| 3067 iterator if true, return an iterator | |
| 3068 | |
| 3069 The $parent argument is a scalar object containing a | |
| 3070 Bio::DB::GFF::RelSegment object or descendent. | |
| 3071 | |
| 3072 =cut | |
| 3073 | |
| 3074 #' | |
| 3075 | |
| 3076 sub _features { | |
| 3077 my $self = shift; | |
| 3078 my ($search,$options,$parent) = @_; | |
| 3079 (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)}) | |
| 3080 if defined($search->{start}) && $search->{start} > $search->{stop}; | |
| 3081 | |
| 3082 my $types = $self->parse_types($search->{types}); # parse out list of types | |
| 3083 my @aggregated_types = @$types; # keep a copy | |
| 3084 | |
| 3085 # allow the aggregators to operate on the original | |
| 3086 my @aggregators; | |
| 3087 if ($options->{automerge}) { | |
| 3088 for my $a ($self->aggregators) { | |
| 3089 $a = $a->clone if $options->{iterator}; | |
| 3090 unshift @aggregators,$a | |
| 3091 if $a->disaggregate(\@aggregated_types,$self); | |
| 3092 } | |
| 3093 } | |
| 3094 | |
| 3095 if ($options->{iterator}) { | |
| 3096 my @accumulated_features; | |
| 3097 my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) } | |
| 3098 : sub { [$self->make_feature($parent,undef,@_)] }; | |
| 3099 return $self->get_features_iterator({ %$search, | |
| 3100 types => \@aggregated_types }, | |
| 3101 { %$options, | |
| 3102 sort_by_group => $options->{automerge} }, | |
| 3103 $callback | |
| 3104 ); | |
| 3105 } | |
| 3106 | |
| 3107 my %groups; # cache the groups we create to avoid consuming too much unecessary memory | |
| 3108 my $features = []; | |
| 3109 | |
| 3110 my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) }; | |
| 3111 $self->get_features({ %$search, | |
| 3112 types => \@aggregated_types }, | |
| 3113 $options, | |
| 3114 $callback); | |
| 3115 | |
| 3116 if ($options->{automerge}) { | |
| 3117 warn "aggregating...\n" if $self->debug; | |
| 3118 foreach my $a (@aggregators) { # last aggregator gets first shot | |
| 3119 warn "Aggregator $a:\n" if $self->debug; | |
| 3120 $a->aggregate($features,$self); | |
| 3121 } | |
| 3122 } | |
| 3123 | |
| 3124 @$features; | |
| 3125 } | |
| 3126 | |
| 3127 =head2 get_features_iterator | |
| 3128 | |
| 3129 Title : get_features_iterator | |
| 3130 Usage : $db->get_features_iterator($search,$options,$callback) | |
| 3131 Function: get an iterator on a features query | |
| 3132 Returns : a Bio::SeqIO object | |
| 3133 Args : as per get_features() | |
| 3134 Status : Public | |
| 3135 | |
| 3136 This method takes the same arguments as get_features(), but returns an | |
| 3137 iterator that can be used to fetch features sequentially, as per | |
| 3138 Bio::SeqIO. | |
| 3139 | |
| 3140 Internally, this method is simply a front end to range_query(). | |
| 3141 The latter method constructs and executes the query, returning a | |
| 3142 statement handle. This routine passes the statement handle to the | |
| 3143 constructor for the iterator, along with the callback. | |
| 3144 | |
| 3145 =cut | |
| 3146 | |
| 3147 sub get_features_iterator { | |
| 3148 my $self = shift; | |
| 3149 my ($search,$options,$callback) = @_; | |
| 3150 $self->throw('feature iteration is not implemented in this adaptor'); | |
| 3151 } | |
| 3152 | |
| 3153 =head2 split_group | |
| 3154 | |
| 3155 Title : split_group | |
| 3156 Usage : $db->split_group($group_field,$gff3_flag) | |
| 3157 Function: parse GFF group field | |
| 3158 Returns : ($gclass,$gname,$tstart,$tstop,$attributes) | |
| 3159 Args : the gff group column and a flag indicating gff3 compatibility | |
| 3160 Status : internal | |
| 3161 | |
| 3162 This is a method that is called by load_gff_line to parse out the | |
| 3163 contents of one or more group fields. It returns the class of the | |
| 3164 group, its name, the start and stop of the target, if any, and an | |
| 3165 array reference containing any attributes that were stuck into the | |
| 3166 group field, in [attribute_name,attribute_value] format. | |
| 3167 | |
| 3168 =cut | |
| 3169 | |
| 3170 sub split_group { | |
| 3171 my $self = shift; | |
| 3172 my ($group,$gff3) = @_; | |
| 3173 if ($gff3) { | |
| 3174 my @groups = split /[;&]/,$group; # so easy! | |
| 3175 return $self->_split_gff3_group(@groups); | |
| 3176 } else { | |
| 3177 # handle group parsing | |
| 3178 # protect embedded semicolons in the group; there must be faster/more elegant way | |
| 3179 # to do this. | |
| 3180 $group =~ s/\\;/$;/g; | |
| 3181 while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 } | |
| 3182 my @groups = split(/\s*;\s*/,$group); | |
| 3183 foreach (@groups) { s/$;/;/g } | |
| 3184 return $self->_split_gff2_group(@groups); | |
| 3185 } | |
| 3186 } | |
| 3187 | |
| 3188 =head2 _split_gff2_group | |
| 3189 | |
| 3190 This is an internal method called by split_group(). | |
| 3191 | |
| 3192 =cut | |
| 3193 | |
| 3194 sub _split_gff2_group { | |
| 3195 my $self = shift; | |
| 3196 my @groups = @_; | |
| 3197 | |
| 3198 my ($gclass,$gname,$tstart,$tstop,@attributes); | |
| 3199 | |
| 3200 for (@groups) { | |
| 3201 | |
| 3202 my ($tag,$value) = /^(\S+)(?:\s+(.+))?/; | |
| 3203 $value ||= ''; | |
| 3204 if ($value =~ /^\"(.+)\"$/) { #remove quotes | |
| 3205 $value = $1; | |
| 3206 } | |
| 3207 $value =~ s/\\t/\t/g; | |
| 3208 $value =~ s/\\r/\r/g; | |
| 3209 | |
| 3210 # Any additional groups become part of the attributes hash | |
| 3211 # For historical reasons, the tag "Note" is treated as an | |
| 3212 # attribute, even if it is the only group. | |
| 3213 $tag ||= ''; | |
| 3214 if ($tag eq 'Note' or ($gclass && $gname)) { | |
| 3215 push @attributes,[$tag => $value]; | |
| 3216 } | |
| 3217 | |
| 3218 # if the tag eq 'Target' then the class name is embedded in the ID | |
| 3219 # (the GFF format is obviously screwed up here) | |
| 3220 elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) { | |
| 3221 ($gclass,$gname) = ($1,$2); | |
| 3222 ($tstart,$tstop) = / (\d+) (\d+)/; | |
| 3223 } | |
| 3224 | |
| 3225 elsif (!$value) { | |
| 3226 push @attributes,[Note => $tag]; # e.g. "Confirmed_by_EST" | |
| 3227 } | |
| 3228 | |
| 3229 # otherwise, the tag and value correspond to the | |
| 3230 # group class and name | |
| 3231 else { | |
| 3232 ($gclass,$gname) = ($tag,$value); | |
| 3233 } | |
| 3234 } | |
| 3235 | |
| 3236 return ($gclass,$gname,$tstart,$tstop,\@attributes); | |
| 3237 } | |
| 3238 | |
| 3239 =head2 _split_gff3_group | |
| 3240 | |
| 3241 This is called internally from split_group(). | |
| 3242 | |
| 3243 =cut | |
| 3244 | |
| 3245 sub _split_gff3_group { | |
| 3246 my $self = shift; | |
| 3247 my @groups = @_; | |
| 3248 my ($gclass,$gname,$tstart,$tstop,@attributes); | |
| 3249 | |
| 3250 for my $group (@groups) { | |
| 3251 my ($tag,$value) = split /=/,$group; | |
| 3252 $tag = unescape($tag); | |
| 3253 my @values = map {unescape($_)} split /,/,$value; | |
| 3254 if ($tag eq 'Parent') { | |
| 3255 $gclass = 'Sequence'; | |
| 3256 $gname = shift @values; | |
| 3257 } | |
| 3258 elsif ($tag eq 'ID') { | |
| 3259 $gclass = 'Sequence'; | |
| 3260 $gname = shift @values; | |
| 3261 } | |
| 3262 elsif ($tag eq 'Target') { | |
| 3263 $gclass = 'Sequence'; | |
| 3264 ($gname,$tstart,$tstop) = split /\s+/,shift @values; | |
| 3265 } | |
| 3266 push @attributes,[$tag=>$_] foreach @values; | |
| 3267 } | |
| 3268 return ($gclass,$gname,$tstart,$tstop,\@attributes); | |
| 3269 } | |
| 3270 | |
| 3271 =head2 _delete_features(), _delete_groups(),_delete() | |
| 3272 | |
| 3273 Title : _delete_features(), _delete_groups(),_delete() | |
| 3274 Usage : $count = $db->_delete_features(@feature_ids) | |
| 3275 $count = $db->_delete_groups(@group_ids) | |
| 3276 $count = $db->_delete(\%delete_spec) | |
| 3277 Function: low-level feature/group deleter | |
| 3278 Returns : count of groups removed | |
| 3279 Args : list of feature or group ids removed | |
| 3280 Status : for implementation by subclasses | |
| 3281 | |
| 3282 These methods need to be implemented in adaptors. For | |
| 3283 _delete_features and _delete_groups, the arguments are a list of | |
| 3284 feature or group IDs to remove. For _delete(), the argument is a | |
| 3285 hashref with the three keys 'segments', 'types' and 'force'. The | |
| 3286 first contains an arrayref of Bio::DB::GFF::RelSegment objects to | |
| 3287 delete (all FEATURES within the segment are deleted). The second | |
| 3288 contains an arrayref of [method,source] feature types to delete. The | |
| 3289 two are ANDed together. If 'force' has a true value, this forces the | |
| 3290 operation to continue even if it would delete all features. | |
| 3291 | |
| 3292 =cut | |
| 3293 | |
| 3294 sub _delete_features { | |
| 3295 my $self = shift; | |
| 3296 my @feature_ids = @_; | |
| 3297 $self->throw('_delete_features is not implemented in this adaptor'); | |
| 3298 } | |
| 3299 | |
| 3300 sub _delete_groups { | |
| 3301 my $self = shift; | |
| 3302 my @group_ids = @_; | |
| 3303 $self->throw('_delete_groups is not implemented in this adaptor'); | |
| 3304 } | |
| 3305 | |
| 3306 sub _delete { | |
| 3307 my $self = shift; | |
| 3308 my $delete_options = shift; | |
| 3309 $self->throw('_delete is not implemented in this adaptor'); | |
| 3310 } | |
| 3311 | |
| 3312 sub unescape { | |
| 3313 my $v = shift; | |
| 3314 $v =~ tr/+/ /; | |
| 3315 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; | |
| 3316 return $v; | |
| 3317 } | |
| 3318 | |
| 3319 | |
| 3320 package Bio::DB::GFF::ID_Iterator; | |
| 3321 use strict; | |
| 3322 | |
| 3323 use Bio::Root::Root; | |
| 3324 use vars '@ISA'; | |
| 3325 @ISA = 'Bio::Root::Root'; | |
| 3326 | |
| 3327 sub new { | |
| 3328 my $class = shift; | |
| 3329 my ($db,$ids,$type) = @_; | |
| 3330 return bless {ids=>$ids,db=>$db,type=>$type},$class; | |
| 3331 } | |
| 3332 | |
| 3333 sub next_seq { | |
| 3334 my $self = shift; | |
| 3335 my $next = shift @{$self->{ids}}; | |
| 3336 return unless $next; | |
| 3337 my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next; | |
| 3338 my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name) | |
| 3339 : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name) | |
| 3340 : $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name) | |
| 3341 : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier"); | |
| 3342 $self->throw("id does not exist") unless $segment; | |
| 3343 return $segment; | |
| 3344 } | |
| 3345 | |
| 3346 1; | |
| 3347 | |
| 3348 __END__ | |
| 3349 | |
| 3350 =head1 BUGS | |
| 3351 | |
| 3352 Features can only belong to a single group at a time. This must be | |
| 3353 addressed soon. | |
| 3354 | |
| 3355 Start coordinate can be greater than stop coordinate for relative | |
| 3356 addressing. This breaks strict BioPerl compatibility and must be | |
| 3357 fixed. | |
| 3358 | |
| 3359 =head1 SEE ALSO | |
| 3360 | |
| 3361 L<bioperl>, | |
| 3362 L<Bio::DB::GFF::RelSegment>, | |
| 3363 L<Bio::DB::GFF::Aggregator>, | |
| 3364 L<Bio::DB::GFF::Feature>, | |
| 3365 L<Bio::DB::GFF::Adaptor::dbi::mysqlopt>, | |
| 3366 L<Bio::DB::GFF::Adaptor::dbi::oracle>, | |
| 3367 L<Bio::DB::GFF::Adaptor::memory> | |
| 3368 | |
| 3369 =head1 AUTHOR | |
| 3370 | |
| 3371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>. | |
| 3372 | |
| 3373 Copyright (c) 2001 Cold Spring Harbor Laboratory. | |
| 3374 | |
| 3375 This library is free software; you can redistribute it and/or modify | |
| 3376 it under the same terms as Perl itself. | |
| 3377 | |
| 3378 =cut | |
| 3379 |
