1 # $Id: GFF.pm,v 1.71.2.2 2003/09/12 13:29:32 lstein Exp $ | |
2 | |
3 =head1 NAME | |
4 | |
5 Bio::DB::GFF -- Storage and retrieval of sequence annotation data | |
6 | |
7 =head1 SYNOPSIS | |
8 | |
9 use Bio::DB::GFF; | |
10 | |
11 # Open the sequence database | |
12 my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt', | |
13 -dsn => 'dbi:mysql:elegans', | |
14 -fasta => '/usr/local/fasta_files' | |
15 ); | |
16 | |
17 # fetch a 1 megabase segment of sequence starting at landmark "ZK909" | |
18 my $segment = $db->segment('ZK909', 1 => 1000000); | |
19 | |
20 # pull out all transcript features | |
21 my @transcripts = $segment->features('transcript'); | |
22 | |
23 # for each transcript, total the length of the introns | |
24 my %totals; | |
25 for my $t (@transcripts) { | |
26 my @introns = $t->Intron; | |
27 $totals{$t->name} += $_->length foreach @introns; | |
28 } | |
29 | |
30 # Sort the exons of the first transcript by position | |
31 my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon; | |
32 | |
33 # Get a region 1000 bp upstream of first exon | |
34 my $upstream = $exons[0]->segment(-1000,0); | |
35 | |
36 # get its DNA | |
37 my $dna = $upstream->dna; | |
38 | |
39 # and get all curated polymorphisms inside it | |
40 my @polymorphisms = $upstream->contained_features('polymorphism:curated'); | |
41 | |
42 # get all feature types in the database | |
43 my @types = $db->types; | |
44 | |
45 # count all feature types in the segment | |
46 my %type_counts = $segment->types(-enumerate=>1); | |
47 | |
48 # get an iterator on all curated features of type 'exon' or 'intron' | |
49 my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']); | |
50 | |
51 while (my $s = $iterator->next_seq) { | |
52 print $s,"\n"; | |
53 } | |
54 | |
55 # find all transcripts annotated as having function 'kinase' | |
56 my $iterator = $db->get_seq_stream(-type=>'transcript', | |
57 -attributes=>{Function=>'kinase'}); | |
58 while (my $s = $iterator->next_seq) { | |
59 print $s,"\n"; | |
60 } | |
61 | |
62 =head1 DESCRIPTION | |
63 | |
64 Bio::DB::GFF provides fast indexed access to a sequence annotation | |
65 database. It supports multiple database types (ACeDB, relational), | |
66 and multiple schemas through a system of adaptors and aggregators. | |
67 | |
68 The following operations are supported by this module: | |
69 | |
70 - retrieving a segment of sequence based on the ID of a landmark | |
71 - retrieving the DNA from that segment | |
72 - finding all annotations that overlap with the segment | |
73 - finding all annotations that are completely contained within the | |
74 segment | |
75 - retrieving all annotations of a particular type, either within a | |
76 segment, or globally | |
77 - conversion from absolute to relative coordinates and back again, | |
78 using any arbitrary landmark for the relative coordinates | |
79 - using a sequence segment to create new segments based on relative | |
80 offsets | |
81 | |
82 The data model used by Bio::DB::GFF is compatible with the GFF flat | |
83 file format (http://www.sanger.ac.uk/software/GFF). The module can | |
84 load a set of GFF files into the database, and serves objects that | |
85 have methods corresponding to GFF fields. | |
86 | |
87 The objects returned by Bio::DB::GFF are compatible with the | |
88 SeqFeatureI interface, allowing their use by the Bio::Graphics and | |
89 Bio::DAS modules. | |
90 | |
91 =head2 Auxiliary Scripts | |
92 | |
93 The bioperl distribution includes several scripts that make it easier | |
94 to work with Bio::DB::GFF databases. They are located in the scripts | |
95 directory under a subdirectory named Bio::DB::GFF: | |
96 | |
97 =over 4 | |
98 | |
99 =item bp_load_gff.pl | |
100 | |
101 This script will load a Bio::DB::GFF database from a flat GFF file of | |
102 sequence annotations. Only the relational database version of | |
103 Bio::DB::GFF is supported. It can be used to create the database from | |
104 scratch, as well as to incrementally load new data. | |
105 | |
106 This script takes a --fasta argument to load raw DNA into the database | |
107 as well. However, GFF databases do not require access to the raw DNA | |
108 for most of their functionality. | |
109 | |
110 bp_load_gff.pl also has a --upgrade option, which will perform a | |
111 non-destructive upgrade of older schemas to newer ones. | |
112 | |
113 =item bp_bulk_load_gff.pl | |
114 | |
115 This script will populate a Bio::DB::GFF database from a flat GFF file | |
116 of sequence annotations. Only the MySQL database version of | |
117 Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in | |
118 order to accelerate loading considerably; however, it can only be used | |
119 for the initial load, and not for updates. | |
120 | |
121 This script takes a --fasta argument to load raw DNA into the database | |
122 as well. However, GFF databases do not require access to the raw DNA | |
123 for most of their functionality. | |
124 | |
125 =item bp_fast_load_gff.pl | |
126 | |
127 This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe | |
128 tricks to allow for incremental updates. It only supports the MySQL | |
129 database version of Bio::DB::GFF and is guaranteed not to work on | |
130 non-Unix platforms. | |
131 | |
132 Arguments are the same as for bp_load_gff.pl. | |
133 | |
134 =item gadfly_to_gff.pl | |
135 | |
136 This script will convert the GFF-like format used by the Berkeley | |
137 Drosophila Sequencing project into a format suitable for use with this | |
138 module. | |
139 | |
140 =item sgd_to_gff.pl | |
141 | |
142 This script will convert the tab-delimited feature files used by the | |
143 Saccharomyces Genome Database into a format suitable for use with this | |
144 module. | |
145 | |
146 =back | |
147 | |
148 =head2 GFF Fundamentals | |
149 | |
150 The GFF format is a flat tab-delimited file, each line of which | |
151 corresponds to an annotation, or feature. Each line has nine columns | |
152 and looks like this: | |
153 | |
154 Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7" | |
155 | |
156 The 9 columns are as follows: | |
157 | |
158 =over 4 | |
159 | |
160 =item 1. reference sequence | |
161 | |
162 This is the ID of the sequence that is used to establish the | |
163 coordinate system of the annotation. In the example above, the | |
164 reference sequence is "Chr1". | |
165 | |
166 =item 2. source | |
167 | |
168 The source of the annotation. This field describes how the annotation | |
169 was derived. In the example above, the source is "curated" to | |
170 indicate that the feature is the result of human curation. The names | |
171 and versions of software programs are often used for the source field, | |
172 as in "tRNAScan-SE/1.2". | |
173 | |
174 =item 3. method | |
175 | |
176 The annotation method. This field describes the type of the | |
177 annotation, such as "CDS". Together the method and source describe | |
178 the annotation type. | |
179 | |
180 =item 4. start position | |
181 | |
182 The start of the annotation relative to the reference sequence. | |
183 | |
184 =item 5. stop position | |
185 | |
186 The stop of the annotation relative to the reference sequence. Start | |
187 is always less than or equal to stop. | |
188 | |
189 =item 6. score | |
190 | |
191 For annotations that are associated with a numeric score (for example, | |
192 a sequence similarity), this field describes the score. The score | |
193 units are completely unspecified, but for sequence similarities, it is | |
194 typically percent identity. Annotations that don't have a score can | |
195 use "." | |
196 | |
197 =item 7. strand | |
198 | |
199 For those annotations which are strand-specific, this field is the | |
200 strand on which the annotation resides. It is "+" for the forward | |
201 strand, "-" for the reverse strand, or "." for annotations that are | |
202 not stranded. | |
203 | |
204 =item 8. phase | |
205 | |
206 For annotations that are linked to proteins, this field describes the | |
207 phase of the annotation on the codons. It is a number from 0 to 2, or | |
208 "." for features that have no phase\. | |
209 | |
210 =item 9. group | |
211 | |
212 GFF provides a simple way of generating annotation hierarchies ("is | |
213 composed of" relationships) by providing a group field. The group | |
214 field contains the class and ID of an annotation which is the logical | |
215 parent of the current one. In the example given above, the group is | |
216 the Transcript named "R119.7". | |
217 | |
218 The group field is also used to store information about the target of | |
219 sequence similarity hits, and miscellaneous notes. See the next | |
220 section for a description of how to describe similarity targets. | |
221 | |
222 The format of the group fields is "Class ID" with a single space (not | |
223 a tab) separating the class from the ID. It is VERY IMPORTANT to | |
224 follow this format, or grouping will not work properly. | |
225 | |
226 =back | |
227 | |
228 The sequences used to establish the coordinate system for annotations | |
229 can correspond to sequenced clones, clone fragments, contigs or | |
230 super-contigs. Thus, this module can be used throughout the lifecycle | |
231 of a sequencing project. | |
232 | |
233 In addition to a group ID, the GFF format allows annotations to have a | |
234 group class. For example, in the ACeDB representation, RNA | |
235 interference experiments have a class of "RNAi" and an ID that is | |
236 unique among the RNAi experiments. Since not all databases support | |
237 this notion, the class is optional in all calls to this module, and | |
238 defaults to "Sequence" when not provided. | |
239 | |
240 Double-quotes are sometimes used in GFF files around components of the | |
241 group field. Strictly, this is only necessary if the group name or | |
242 class contains whitespace. | |
243 | |
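For example, a hypothetical two-exon transcript (the coordinates are
invented for illustration) can be grouped under a single Transcript
object, with double-quotes around the name:

 Chr1 curated transcript 365647 365963 . + . Transcript "R119.7"
 Chr1 curated exon       365647 365712 . + . Transcript "R119.7"
 Chr1 curated exon       365801 365963 . + . Transcript "R119.7"
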
244 =head2 Making GFF files work with this module | |
245 | |
246 Some annotations do not need to be individually named. For example, | |
247 it is probably not useful to assign a unique name to each ALU repeat | |
248 in a vertebrate genome. Others, such as predicted genes, correspond | |
249 to named biological objects; you probably want to be able to fetch the | |
250 positions of these objects by referring to them by name. | |
251 | |
252 To accommodate named annotations, the GFF format places the object | |
253 class and name in the group field. The name identifies the object, | |
254 and the class prevents similarly-named objects, for example clones and | |
255 sequences, from colliding. | |
256 | |
257 A named object is shown in the following excerpt from a GFF file: | |
258 | |
259 Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2 | |
260 | |
261 This object is a predicted transcript named Y95B8A.2. In this case, | |
262 the group field is used to identify the class and name of the object, | |
263 even though no other annotation belongs to that group. | |
264 | |
265 It now becomes possible to retrieve the region of the genome covered | |
266 by transcript Y95B8A.2 using the segment() method: | |
267 | |
268 $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2'); | |
269 | |
270 It is not necessary for the annotation's method to correspond to the | |
271 object class, although this is commonly the case. | |
272 | |
273 As explained above, each annotation in a GFF file refers to a | |
274 reference sequence. It is important that each reference sequence also | |
275 be identified by a line in the GFF file. This allows the Bio::DB::GFF | |
276 module to determine the length and class of the reference sequence, | |
277 and makes it possible to do relative arithmetic. | |
278 | |
279 For example, if "Chr1" is used as a reference sequence, then it should | |
280 have an entry in the GFF file similar to this one: | |
281 | |
282 Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1 | |
283 | |
284 This indicates that the reference sequence named "Chr1" has length | |
285 14972282 bp, method "chromosome" and source "assembly". In addition, | |
286 as indicated by the group field, Chr1 has class "Sequence" and name | |
287 "Chr1". | |
288 | |
289 The object class "Sequence" is used by default when the class is not | |
290 specified in the segment() call. This allows you to use a shortcut | |
291 form of the segment() method: | |
292 | |
293 $segment = $db->segment('Chr1'); # whole chromosome | |
294 $segment = $db->segment('Chr1',1=>1000); # first 1000 bp | |
295 | |
296 For your convenience, if, during loading a GFF file, Bio::DB::GFF | |
297 encounters a line like the following: | |
298 | |
299 ##sequence-region Chr1 1 14972282 | |
300 | |
301 It will automatically generate the following entry: | |
302 | |
303 Chr1 reference Component 1 14972282 . + . Sequence Chr1 | |
304 | |
305 This is sufficient to use Chr1 as a reference point. | |
306 The ##sequence-region line is frequently found in the GFF files | |
307 distributed by annotation groups. | |
308 | |
309 =head2 Sequence alignments | |
310 | |
311 There are two cases in which an annotation indicates the relationship | |
312 between two sequences. The first case is a similarity hit, where the | |
313 annotation indicates an alignment. The second case is a map assembly, | |
314 in which the annotation indicates that a portion of a larger sequence | |
315 is built up from one or more smaller ones. | |
316 | |
317 Both cases are indicated by using the B<Target> tag in the group | |
318 field. For example, a typical similarity hit will look like this: | |
319 | |
320 Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544 | |
321 | |
322 The group field contains the Target tag, followed by an identifier for | |
323 the biological object referred to. The GFF format uses the notation | |
324 I<Class>:I<Name> for the biological object, and even though this is | |
325 stylistically inconsistent, that's the way it's done. The object | |
326 identifier is followed by two integers indicating the start and stop | |
327 of the alignment on the target sequence. | |
328 | |
329 Unlike the main start and stop columns, it is possible for the target | |
330 start to be greater than the target end. The previous example | |
331 indicates that the section of Chr1 from 76,953 to 77,108 aligns to | |
332 the protein SW:ABL_DROME starting at position 493 and extending to | |
333 position 544. | |
334 | |
335 A similar notation is used for sequence assembly information as shown | |
336 in this example: | |
337 | |
338 Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826 | |
339 LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742 | |
340 | |
341 This indicates that the region between bases 10922906 and 11177731 of | |
342 Chr1 are composed of LINK_H06O01 from bp 1 to bp 254826. The region | |
343 of LINK_H06O01 between 32386 and 64122 is, in turn, composed of the | |
344 bases 6 to 31742 of cosmid F49B2. | |
345 | |
346 =head2 Attributes | |
347 | |
348 While not intended to serve as a general-purpose sequence database | |
349 (see bioperl-db for that), GFF allows you to tag features with | |
350 arbitrary attributes. Attributes appear in the Group field following | |
351 the initial class/name pair. For example: | |
352 | |
353 Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3 | |
354 | |
355 This line tags the feature named Transcript Y95B8A.2 as being "Gene" | |
356 named sma-3 and having the Alias "sma3". Features having these | |
357 attributes can be looked up using the fetch_feature_by_attribute() method. | |
358 | |
359 Two attributes have special meaning: "Note" is for backward | |
360 compatibility and is used for unstructured text remarks. "Alias" is | |
361 considered as a synonym for the feature name and will be consulted | |
362 when looking up a feature by its name. | |
363 | |
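For example, the sma-3 transcript shown above could be fetched by
attribute with a call like this (a sketch; substitute whatever
attribute names are actually loaded into your database):

 my @features = $db->fetch_feature_by_attribute(Gene => 'sma-3');
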
364 =head2 Adaptors and Aggregators | |
365 | |
366 This module uses a system of adaptors and aggregators in order to make | |
367 it adaptable to use with a variety of databases. | |
368 | |
369 =over 4 | |
370 | |
371 =item Adaptors | |
372 | |
373 The core of the module handles the user API, annotation coordinate | |
374 arithmetic, and other common issues. The details of fetching | |
375 information from databases is handled by an adaptor, which is | |
376 specified during Bio::DB::GFF construction. The adaptor encapsulates | |
377 database-specific information such as the schema, user authentication | |
378 and access methods. | |
379 | |
380 Currently there are two adaptors: 'dbi::mysql' and 'dbi::mysqlopt'. | |
381 The former is an interface to a simple Mysql schema. The latter is an | |
382 optimized version of dbi::mysql which uses a binning scheme to | |
383 accelerate range queries and the Bio::DB::Fasta module for rapid | |
384 retrieval of sequences. Note the double-colon between the words. | |
385 | |
386 =item Aggregators | |
387 | |
388 The GFF format uses a "group" field to indicate aggregation properties | |
389 of individual features. For example, a set of exons and introns may | |
390 share a common transcript group, and multiple transcripts may share | |
391 the same gene group. | |
392 | |
393 Aggregators are small modules that use the group information to | |
394 rebuild the hierarchy. When a Bio::DB::GFF object is created, you | |
395 indicate that it should use a set of one or more aggregators. Each | |
396 aggregator provides a new composite annotation type. Before the | |
397 database query is generated each aggregator is called to | |
398 "disaggregate" its annotation type into list of component types | |
399 contained in the database. After the query is generated, each | |
400 aggregator is called again in order to build composite annotations | |
401 from the returned components. | |
402 | |
403 For example, during disaggregation, the standard | |
404 "processed_transcript" aggregator generates a list of component | |
405 feature types including "UTR", "CDS", and "polyA_site". Later, it | |
406 aggregates these features into a set of annotations of type | |
407 "processed_transcript". | |
408 | |
409 During aggregation, the list of aggregators is called in reverse | |
410 order. This allows aggregators to collaborate to create multi-level | |
411 structures: the transcript aggregator assembles transcripts from | |
412 introns and exons; the gene aggregator then assembles genes from sets | |
413 of transcripts. | |
414 | |
415 Three default aggregators are provided: | |
416 | |
417 transcript assembles transcripts from features of type | |
418 exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA | |
419 clone assembles clones from Clone_left_end, Clone_right_end | |
420 and Sequence features. | |
421 alignment assembles gapped alignments from features of type | |
422 "similarity". | |
423 | |
424 In addition, this module provides the optional "wormbase_gene" | |
425 aggregator, which accommodates the WormBase representation of genes. | |
426 This aggregator aggregates features of method "exon", "CDS", "5'UTR", | |
427 "3'UTR", "polyA" and "TSS" into a single object. It also expects to | |
428 find a single feature of type "Sequence" that spans the entire gene. | |
429 | |
430 The existing aggregators are easily customized. | |
431 | |
432 Note that aggregation will not occur unless you specifically request | |
433 the aggregation type. For example, this call: | |
434 | |
435 @features = $segment->features('alignment'); | |
436 | |
437 will generate an array of aggregated alignment features. However, | |
438 this call: | |
439 | |
440 @features = $segment->features(); | |
441 | |
442 will return a list of unaggregated similarity segments. | |
443 | |
444 For more information, see the manual pages for | |
445 Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone, | |
446 etc. | |
447 | |
448 =back | |
449 | |
450 =head1 API | |
451 | |
452 The following is the API for Bio::DB::GFF. | |
453 | |
454 =cut | |
455 | |
456 package Bio::DB::GFF; | |
457 | |
458 use strict; | |
459 | |
460 use Bio::DB::GFF::Util::Rearrange; | |
461 use Bio::DB::GFF::RelSegment; | |
462 use Bio::DB::GFF::Feature; | |
463 use Bio::DB::GFF::Aggregator; | |
464 use Bio::DasI; | |
465 use Bio::Root::Root; | |
466 | |
467 use vars qw(@ISA $VERSION); | |
468 @ISA = qw(Bio::Root::Root Bio::DasI); | |
469 | |
470 $VERSION = '1.2003'; | |
471 my %valid_range_types = (overlaps => 1, | |
472 contains => 1, | |
473 contained_in => 1); | |
474 | |
475 =head1 Querying GFF Databases | |
476 | |
477 =head2 new | |
478 | |
479 Title : new | |
480 Usage : my $db = new Bio::DB::GFF(@args); | |
481 Function: create a new Bio::DB::GFF object | |
482 Returns : new Bio::DB::GFF object | |
483 Args : lists of adaptors and aggregators | |
484 Status : Public | |
485 | |
486 These are the arguments: | |
487 | |
488 -adaptor Name of the adaptor module to use. If none | |
489 provided, defaults to "dbi::mysqlopt". | |
490 | |
491 -aggregator Array reference to a list of aggregators | |
492 to apply to the database. If none provided, | |
493 defaults to ['processed_transcript','alignment']. | |
494 | |
495 <other> Any other named argument pairs are passed to | |
496 the adaptor for processing. | |
497 | |
498 The adaptor argument must correspond to a module contained within the | |
499 Bio::DB::GFF::Adaptor namespace. For example, the | |
500 Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying | |
501 'dbi::mysql'. By Perl convention, the adaptor names are lower case | |
502 because they are loaded at run time. | |
503 | |
504 The aggregator array may contain a list of aggregator names, a list of | |
505 initialized aggregator objects, or a string in the form | |
506 "aggregator_name{subpart1,subpart2,subpart3/main_method}" (the | |
507 /main_method part is optional). For example, if you wish to change | |
508 the components aggregated by the transcript aggregator, you could pass | |
509 it to the GFF constructor this way: | |
510 | |
511 my $transcript = | |
512 Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr | |
513 polyA spliced_leader)]); | |
514 | |
515 my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment'], | |
516 -adaptor => 'dbi::mysql', | |
517 -dsn => 'dbi:mysql:elegans42'); | |
518 | |
519 Alternatively, you could create an entirely new transcript aggregator | |
520 this way: | |
521 | |
522 my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}'; | |
523 my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment'], | |
524 -adaptor => 'dbi::mysql', | |
525 -dsn => 'dbi:mysql:elegans42'); | |
526 | |
527 See L<Bio::DB::GFF::Aggregator> for more details. | |
528 | |
529 The commonly used 'dbi::mysql' adaptor recognizes the following | |
530 adaptor-specific arguments: | |
531 | |
532 Argument Description | |
533 -------- ----------- | |
534 | |
535 -dsn the DBI data source, e.g. 'dbi:mysql:ens0040' | |
536 If a partial name is given, such as "ens0040", the | |
537 "dbi:mysql:" prefix will be added automatically. | |
538 | |
539 -user username for authentication | |
540 | |
541 -pass the password for authentication | |
542 | |
543 -refclass landmark Class; defaults to "Sequence" | |
544 | |
545 The commonly used 'dbi::mysqlopt' adaptor also recognizes the following | |
546 arguments. | |
547 | |
548 Argument Description | |
549 -------- ----------- | |
550 | |
551 -fasta path to a directory containing FASTA files for the DNA | |
552 contained in this database (e.g. "/usr/local/share/fasta") | |
553 | |
554 -acedb an acedb URL to use when converting features into ACEDB | |
555 objects (e.g. sace://localhost:2005) | |
556 | |
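Putting these together, a typical dbi::mysqlopt connection that uses
authentication and a local FASTA directory might look like this (the
DSN, user name, password and path are placeholders):

 my $db = Bio::DB::GFF->new(-adaptor => 'dbi::mysqlopt',
                            -dsn     => 'dbi:mysql:elegans',
                            -user    => 'nobody',
                            -pass    => 'secret',
                            -fasta   => '/usr/local/share/fasta');
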
557 =cut | |
558 | |
559 #' | |
560 | |
561 sub new { | |
562 my $package = shift; | |
563 my ($adaptor,$aggregators,$args,$refclass); | |
564 | |
565 if (@_ == 1) { # special case, default to dbi::mysqlopt | |
566 $adaptor = 'dbi::mysqlopt'; | |
567 $args = {DSN => shift}; | |
568 } else { | |
569 ($adaptor,$aggregators,$refclass,$args) = rearrange([ | |
570 [qw(ADAPTOR FACTORY)], | |
571 [qw(AGGREGATOR AGGREGATORS)], | |
572 'REFCLASS', | |
573 ],@_); | |
574 } | |
575 | |
576 $adaptor ||= 'dbi::mysqlopt'; | |
577 my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E"; | |
578 eval "require $class" unless $class->can('new'); | |
579 $package->throw("Unable to load $adaptor adaptor: $@") if $@; | |
580 | |
581 my $self = $class->new($args); | |
582 $self->default_class($refclass) if defined $refclass; | |
583 | |
584 # handle the aggregators. | |
585 # aggregators are responsible for creating complex multi-part features | |
586 # from the GFF "group" field. If none are provided, then we provide a | |
587 # list of the two used in WormBase. | |
588 # Each aggregator can be a scalar or a ref. In the former case | |
589 # it is treated as a class name to call new() on. In the latter | |
590 # the aggregator is treated as a ready-made object. | |
591 $aggregators = $self->default_aggregators unless defined $aggregators; | |
592 my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators; | |
593 for my $a (@a) { | |
594 $self->add_aggregator($a); | |
595 } | |
596 | |
597 # default settings go here..... | |
598 $self->automerge(1); # set automerge to true | |
599 | |
600 $self; | |
601 } | |
602 | |
603 | |
604 =head2 types | |
605 | |
606 Title : types | |
607 Usage : $db->types(@args) | |
608 Function: return list of feature types in range or database | |
609 Returns : a list of Bio::DB::GFF::Typename objects | |
610 Args : see below | |
611 Status : public | |
612 | |
613 This routine returns a list of feature types known to the database. | |
614 The list can be database-wide or restricted to a region. It is also | |
615 possible to find out how many times each feature occurs. | |
616 | |
617 For range queries, it is usually more convenient to create a | |
618 Bio::DB::GFF::Segment object, and then invoke its types() method. | |
619 | |
620 Arguments are as follows: | |
621 | |
622 -ref ID of reference sequence | |
623 -class class of reference sequence | |
624 -start start of segment | |
625 -stop stop of segment | |
626 -enumerate if true, count the features | |
627 | |
628 The returned value will be a list of Bio::DB::GFF::Typename objects, | |
629 which if evaluated in a string context will return the feature type in | |
630 "method:source" format. This object class also has method() and | |
631 source() methods for retrieving the like-named fields. | |
632 | |
633 If -enumerate is true, then the function returns a hash (not a hash | |
634 reference) in which the keys are type names in "method:source" format | |
635 and the values are the number of times each feature appears in the | |
636 database or segment. | |
637 | |
638 The argument -end is a synonym for -stop, and -count is a synonym for | |
639 -enumerate. | |
640 | |
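For example, to count the features of each type on part of a landmark
(a sketch; "Chr1" stands in for whatever reference sequences your
database actually contains):

 my %counts = $db->types(-ref       => 'Chr1',
                         -start     => 1,
                         -stop      => 100_000,
                         -enumerate => 1);
 print "$_: $counts{$_}\n" foreach sort keys %counts;
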
641 =cut | |
642 | |
643 sub types { | |
644 my $self = shift; | |
645 my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([ | |
646 [qw(REF REFSEQ)], | |
647 qw(START), | |
648 [qw(STOP END)], | |
649 [qw(ENUMERATE COUNT)], | |
650 [qw(CLASS SEQCLASS)], | |
651 [qw(TYPE TYPES)], | |
652 ],@_); | |
653 $types = $self->parse_types($types) if defined $types; | |
654 $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types); | |
655 } | |
656 | |
657 =head2 classes | |
658 | |
659 Title : classes | |
660 Usage : $db->classes | |
661 Function: return list of landmark classes in database | |
662 Returns : a list of classes | |
663 Args : none | |
664 Status : public | |
665 | |
666 This routine returns the list of reference classes known to the | |
667 database, or empty if classes are not used by the database. Classes | |
668 are distinct from types, being essentially qualifiers on the reference | |
669 namespaces. | |
670 | |
671 =cut | |
672 | |
673 sub classes { | |
674 my $self = shift; | |
675 return (); | |
676 } | |
677 | |
678 =head2 segment | |
679 | |
680 Title : segment | |
681 Usage : $db->segment(@args); | |
682 Function: create a segment object | |
683 Returns : segment object(s) | |
684 Args : numerous, see below | |
685 Status : public | |
686 | |
687 This method generates a segment object, which is a Perl object | |
688 subclassed from Bio::DB::GFF::Segment. The segment can be used to | |
689 find overlapping features and the raw DNA. | |
690 | |
691 When making the segment() call, you specify the ID of a sequence | |
692 landmark (e.g. an accession number, a clone or contig), and a | |
693 positional range relative to the landmark. If no range is specified, | |
694 then the entire extent of the landmark is used to generate the | |
695 segment. | |
696 | |
697 You may also provide the ID of a "reference" sequence, which will set | |
698 the coordinate system and orientation used for all features contained | |
699 within the segment. The reference sequence can be changed later. If | |
700 no reference sequence is provided, then the coordinate system is based | |
701 on the landmark. | |
702 | |
703 Arguments: | |
704 | |
705 -name ID of the landmark sequence. | |
706 | |
707 -class Database object class for the landmark sequence. | |
708 "Sequence" assumed if not specified. This is | |
709 irrelevant for databases which do not recognize | |
710 object classes. | |
711 | |
712 -start Start of the segment relative to landmark. Positions | |
713 follow standard 1-based sequence rules. If not specified, | |
714 defaults to the beginning of the landmark. | |
715 | |
716 -end Stop of the segment relative to the landmark. If not specified, | |
717 defaults to the end of the landmark. | |
718 | |
719 -stop Same as -end. | |
720 | |
721 -offset For those who prefer 0-based indexing, the offset specifies the | |
722 position of the new segment relative to the start of the landmark. | |
723 | |
724 -length For those who prefer 0-based indexing, the length specifies the | |
725 length of the new segment. | |
726 | |
727 -refseq Specifies the ID of the reference landmark used to establish the | |
728 coordinate system for the newly-created segment. | |
729 | |
730 -refclass Specifies the class of the reference landmark, for those databases | |
731 that distinguish different object classes. Defaults to "Sequence". | |
732 | |
733 -absolute | |
734 Return features in absolute coordinates rather than relative to the | |
735 parent segment. | |
736 | |
737 -nocheck Don't check the database for the coordinates and length of this | |
738 feature. Construct a segment using the indicated name as the | |
739 reference, a start coordinate of 1, an undefined end coordinate, | |
740 and a strand of +1. | |
741 | |
742 -force Same as -nocheck. | |
743 | |
744 -seq,-sequence,-sourceseq Aliases for -name. | |
745 | |
746 -begin,-end Aliases for -start and -stop | |
747 | |
748 -off,-len Aliases for -offset and -length | |
749 | |
750 -seqclass Alias for -class | |
751 | |
752 Here's an example to explain how this works: | |
753 | |
754 my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql'); | |
755 | |
756 If successful, $db will now hold the database accessor object. We now | |
757 try to fetch the fragment of sequence whose ID is A0000182 and class | |
758 is "Accession." | |
759 | |
760 my $segment = $db->segment(-name=>'A0000182',-class=>'Accession'); | |
761 | |
762 If successful, $segment now holds the entire segment corresponding to | |
763 this accession number. By default, the sequence is used as its own | |
764 reference sequence, so its first base will be 1 and its last base will | |
765 be the length of the accession. | |
766 | |
767 Assuming that this sequence belongs to a longer stretch of DNA, say a | |
768 contig, we can fetch this information like so: | |
769 | |
770 my $sourceseq = $segment->sourceseq; | |
771 | |
772 and find the start and stop on the source like this: | |
773 | |
774 my $start = $segment->abs_start; | |
775 my $stop = $segment->abs_stop; | |
776 | |
777 If we had another segment, say $s2, which is on the same contiguous | |
778 piece of DNA, we can pass that to the refseq() method in order to | |
779 establish it as the coordinate reference point: | |
780 | |
781 $segment->refseq($s2); | |
782 | |
783 Now calling start() will return the start of the segment relative to | |
784 the beginning of $s2, accounting for differences in strandedness: | |
785 | |
786 my $rel_start = $segment->start; | |
787 | |
788 IMPORTANT NOTE: This method can be used to return the segment spanned | |
789 by an arbitrary named annotation. However, if the annotation appears | |
790 at multiple locations on the genome, for example an EST that maps to | |
791 multiple locations, then, provided that all locations reside on the | |
792 same physical segment, the method will return a segment that spans the | |
793 minimum and maximum positions. If the reference sequence occupies | |
794 ranges on different physical segments, then it returns them all in an | |
795 array context, and raises a "multiple segment exception" exception in | |
796 a scalar context. | |
797 | |
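For example, to retrieve a named annotation that may reside on more
than one reference sequence, call segment() in a list context so that
all of its locations are returned (the EST class and name here are
hypothetical):

 my @locations = $db->segment(-class => 'EST', -name => 'yk1234.5');
 foreach my $s (@locations) {
   print $s->sourceseq,': ',$s->start,'..',$s->stop,"\n";
 }
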
798 =cut | |
799 | |
800 #' | |
801 | |
802 sub segment { | |
803 my $self = shift; | |
804 my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self, | |
805 $self->setup_segment_args(@_)); | |
806 foreach (@segments) { | |
807 $_->absolute(1) if $self->absolute; | |
808 } | |
809 | |
810 $self->_multiple_return_args(@segments); | |
811 } | |
812 | |
813 sub _multiple_return_args { | |
814 my $self = shift; | |
815 my @args = @_; | |
816 if (@args == 0) { | |
817 return; | |
818 } elsif (@args == 1) { | |
819 return $args[0]; | |
820 } elsif (wantarray) { # more than one reference sequence | |
821 return @args; | |
822 } else { | |
823 $self->error($args[0]->name, | |
824 " has more than one reference sequence in database. Please call in a list context to retrieve them all."); | |
825 $self->throw('multiple segment exception'); | |
826 return; | |
827 } | |
828 | |
829 } | |
830 | |
831 # backward compatibility -- don't use! | |
832 # (deliberately undocumented too) | |
833 sub abs_segment { | |
834 my $self = shift; | |
835 return $self->segment($self->setup_segment_args(@_),-absolute=>1); | |
836 } | |
837 | |
838 sub setup_segment_args { | |
839 my $self = shift; | |
840 return @_ if defined $_[0] && $_[0] =~ /^-/; | |
841 return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3; | |
842 return (-class=>$_[0],-name=>$_[1]) if @_ == 2; | |
843 return (-name=>$_[0]) if @_ == 1; | |
844 } | |
845 | |
846 =head2 features | |
847 | |
848 Title : features | |
849 Usage : $db->features(@args) | |
850 Function: get all features, possibly filtered by type | |
851 Returns : a list of Bio::DB::GFF::Feature objects | |
852 Args : see below | |
853 Status : public | |
854 | |
855 This routine will retrieve features in the database regardless of | |
856 position. It can be used to return all features, or a subset based on | |
857 their method and source. | |
858 | |
859 Arguments are as follows: | |
860 | |
861 -types List of feature types to return. Argument is an array | |
862 reference containing strings of the format "method:source" | |
863 | |
864 -merge Whether to apply aggregators to the generated features. | |
865 | |
866 -rare Turn on optimizations suitable for a relatively rare feature type, | |
867 where it makes more sense to filter by feature type first, | |
868 and then by position. | |
869 | |
870 -attributes A hash reference containing attributes to match. | |
871 | |
872 -iterator Whether to return an iterator across the features. | |
873 | |
874 -binsize A true value will create a set of artificial features whose | |
875 start and stop positions indicate bins of the given size, and | |
876 whose scores are the number of features in the bin. The | |
877 class and method of the feature will be set to "bin", | |
878 its source to "method:source", and its group to "bin:method:source". | |
879 This is a handy way of generating histograms of feature density. | |
880 | |
881 If -iterator is true, then the method returns a single scalar value | |
882 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly | |
883 on this object to fetch each of the features in turn. If iterator is | |
884 false or absent, then all the features are returned as a list. | |
885 | |
886 Currently aggregation is disabled when iterating over a series of | |
887 features. | |
888 | |
889 Types are indicated using the nomenclature "method:source". Either of | |
890 these fields can be omitted, in which case a wildcard is used for the | |
891 missing field. Type names without the colon (e.g. "exon") are | |
892 interpreted as the method name and a source wild card. Regular | |
893 expressions are allowed in either field, as in: "similarity:BLAST.*". | |
894 | |
895 The -attributes argument is a hashref containing one or more attributes | |
896 to match against: | |
897 | |
898 -attributes => { Gene => 'abc-1', | |
899 Note => 'confirmed' } | |
900 | |
901 Attribute matching is simple string matching, and multiple attributes | |
902 are ANDed together. | |
903 | |
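For example, to iterate over all curated exons that carry a confirmed
Note (a sketch; the type and attribute values are illustrative):

 my $iterator = $db->features(-types      => ['exon:curated'],
                              -attributes => {Note => 'confirmed'},
                              -iterator   => 1);
 while (my $feature = $iterator->next_seq) {
   print $feature,"\n";
 }
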
904 =cut | |
905 | |
906 sub features { | |
907 my $self = shift; | |
908 my ($types,$automerge,$sparse,$iterator,$other); | |
909 if (defined $_[0] && | |
910 $_[0] =~ /^-/) { | |
911 ($types,$automerge,$sparse,$iterator,$other) = rearrange([ | |
912 [qw(TYPE TYPES)], | |
913 [qw(MERGE AUTOMERGE)], | |
914 [qw(RARE SPARSE)], | |
915 'ITERATOR' | |
916 ],@_); | |
917 } else { | |
918 $types = \@_; | |
919 } | |
920 | |
921 # for whole database retrievals, we probably don't want to automerge! | |
922 $automerge = $self->automerge unless defined $automerge; | |
923 $other ||= {}; | |
924 $self->_features({ | |
925 rangetype => 'contains', | |
926 types => $types, | |
927 }, | |
928 { sparse => $sparse, | |
929 automerge => $automerge, | |
930 iterator =>$iterator, | |
931 %$other, | |
932 } | |
933 ); | |
934 } | |
935 | |
936 =head2 get_seq_stream | |
937 | |
938 Title : get_seq_stream | |
939 Usage : my $seqio = $self->get_seq_stream(@args) | |
940 Function: Performs a query and returns an iterator over it | |
941 Returns : a Bio::SeqIO stream capable of producing sequence | |
942 Args : As in features() | |
943 Status : public | |
944 | |
945 This routine takes the same arguments as features(), but returns a | |
946 Bio::SeqIO::Stream-compliant object. Use it like this: | |
947 | |
948 $stream = $db->get_seq_stream('exon'); | |
949 while (my $exon = $stream->next_seq) { | |
950 print $exon,"\n"; | |
951 } | |
952 | |
953 NOTE: This is also called get_feature_stream(), since that's what it | |
954 really does. | |
955 | |
956 =cut | |
957 | |
958 sub get_seq_stream { | |
959 my $self = shift; | |
960 my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1) | |
961 : (-types=>\@_,-iterator=>1); | |
962 $self->features(@args); | |
963 } | |
964 | |
965 *get_feature_stream = \&get_seq_stream; | |
966 | |
967 =head2 get_feature_by_name | |
968 | |
969 Title : get_feature_by_name | |
970 Usage : $db->get_feature_by_name($class => $name) | |
971 Function: fetch features by their name | |
972 Returns : a list of Bio::DB::GFF::Feature objects | |
973 Args : the class and name of the desired feature | |
974 Status : public | |
975 | |
976 This method can be used to fetch a named feature from the database. | |
977 GFF annotations are named using the group class and name fields, so | |
978 for features that belong to a group of size one, this method can be | |
979 used to retrieve that group (and is equivalent to the segment() | |
980 method). Any Alias attributes are also searched for matching names. | |
981 | |
982 An alternative syntax allows you to search for features by name within | |
983 a circumscribed region: | |
984 | |
985 @f = $db->get_feature_by_name(-class => $class,-name=>$name, | |
986 -ref => $sequence_name, | |
987 -start => $start, | |
988 -end => $end); | |
989 | |
990 This method may return zero, one, or several Bio::DB::GFF::Feature | |
991 objects. | |
992 | |
993 Aggregation is performed on features as usual. | |
994 | |
995 NOTE: At various times, this function was called fetch_group(), | |
996 fetch_feature(), fetch_feature_by_name() and segments(). These names | |
997 are preserved for backward compatibility. | |
998 | |
999 =cut | |
1000 | |
1001 sub get_feature_by_name { | |
1002 my $self = shift; | |
1003 my ($gclass,$gname,$automerge,$ref,$start,$end); | |
1004 if (@_ == 1) { | |
1005 $gclass = $self->default_class; | |
1006 $gname = shift; | |
1007 } else { | |
1008 ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE', | |
1009 ['REF','REFSEQ'], | |
1010 'START',['STOP','END'] | |
1011 ],@_); | |
1012 $gclass ||= $self->default_class; | |
1013 } | |
1014 $automerge = $self->automerge unless defined $automerge; | |
1015 | |
1016 # we need to refactor this... It's repeated code (see below)... | |
1017 my @aggregators; | |
1018 if ($automerge) { | |
1019 for my $a ($self->aggregators) { | |
1020 push @aggregators,$a if $a->disaggregate([],$self); | |
1021 } | |
1022 } | |
1023 | |
1024 my %groups; # cache the groups we create to avoid consuming unnecessary memory | |
1025 my $features = []; | |
1026 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
1027 my $location = defined $ref ? [$ref,$start,$end] : undef; | |
1028 $self->_feature_by_name($gclass,$gname,$location,$callback); | |
1029 | |
1030 warn "aggregating...\n" if $self->debug; | |
1031 foreach my $a (@aggregators) { # last aggregator gets first shot | |
1032 $a->aggregate($features,$self) or next; | |
1033 } | |
1034 | |
1035 @$features; | |
1036 } | |
1037 | |
1038 # horrible indecision regarding proper names! | |
1039 *fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name; | |
1040 *segments = \&segment; | |
1041 | |
1042 =head2 get_feature_by_target | |
1043 | |
1044 Title : get_feature_by_target | |
1045 Usage : $db->get_feature_by_target($class => $name) | |
1046 Function: fetch features by their similarity target | |
1047 Returns : a list of Bio::DB::GFF::Feature objects | |
1048 Args : the class and name of the desired feature | |
1049 Status : public | |
1050 | |
1051 This method can be used to fetch a named feature from the database | |
1052 based on its similarity hit. | |
1053 | |
1054 =cut | |
1055 | |
1056 sub get_feature_by_target { | |
1057 shift->get_feature_by_name(@_); | |
1058 } | |
1059 | |
1060 =head2 get_feature_by_attribute | |
1061 | |
1062 Title : get_feature_by_attribute | |
1063 Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2) | |
1064 Function: fetch segments by combinations of attribute values | |
1065 Returns : a list of Bio::DB::GFF::Feature objects | |
1066 Args : the class and name of the desired feature | |
1067 Status : public | |
1068 | |
1069 This method can be used to fetch a set of features from the database. | |
1070 Attributes are a list of name=E<gt>value pairs. They will be logically | |
1071 ANDed together. | |
1072 | |
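For example (the attribute names and values are illustrative):

 my @features = $db->get_feature_by_attribute(Gene => 'abc-1',
                                               Note => 'confirmed');
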
1073 =cut | |
1074 | |
1075 sub get_feature_by_attribute { | |
1076 my $self = shift; | |
1077 my %attributes = ref($_[0]) ? %{$_[0]} : @_; | |
1078 | |
1079 # we need to refactor this... It's repeated code (see above)... | |
1080 my @aggregators; | |
1081 if ($self->automerge) { | |
1082 for my $a ($self->aggregators) { | |
1083 unshift @aggregators,$a if $a->disaggregate([],$self); | |
1084 } | |
1085 } | |
1086 | |
1087 my %groups; # cache the groups we create to avoid consuming unnecessary memory | |
1088 my $features = []; | |
1089 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
1090 $self->_feature_by_attribute(\%attributes,$callback); | |
1091 | |
1092 warn "aggregating...\n" if $self->debug; | |
1093 foreach my $a (@aggregators) { # last aggregator gets first shot | |
1094 $a->aggregate($features,$self) or next; | |
1095 } | |
1096 | |
1097 @$features; | |
1098 } | |
1099 | |
1100 # more indecision... | |
1101 *fetch_feature_by_attribute = \&get_feature_by_attribute; | |
1102 | |
1103 =head2 get_feature_by_id | |
1104 | |
1105 Title : get_feature_by_id | |
1106 Usage : $db->get_feature_by_id($id) | |
1107 Function: fetch segments by feature ID | |
1108 Returns : a Bio::DB::GFF::Feature object | |
1109 Args : the feature ID | |
1110 Status : public | |
1111 | |
1112 This method can be used to fetch a feature from the database using its | |
1113 ID. Not all GFF databases support IDs, so be careful with this. | |
1114 | |
1115 =cut | |
1116 | |
1117 sub get_feature_by_id { | |
1118 my $self = shift; | |
1119 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; | |
1120 my %groups; # cache the groups we create to avoid consuming unnecessary memory | |
1121 my $features = []; | |
1122 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
1123 $self->_feature_by_id($id,'feature',$callback); | |
1124 return wantarray ? @$features : $features->[0]; | |
1125 } | |
1126 *fetch_feature_by_id = \&get_feature_by_id; | |
1127 | |
1128 =head2 get_feature_by_gid | |
1129 | |
1130 Title : get_feature_by_gid | |
1131 Usage : $db->get_feature_by_gid($id) | |
1132 Function: fetch segments by feature ID | |
1133 Returns : a Bio::DB::GFF::Feature object | |
1134 Args : the feature ID | |
1135 Status : public | |
1136 | |
1137 This method can be used to fetch a feature from the database using its | |
1138 group ID. Not all GFF databases support IDs, so be careful with this. | |
1139 | |
1140 The group ID is often more interesting than the feature ID, since | |
1141 groups can be complex objects containing subobjects. | |
1142 | |
1143 =cut | |
1144 | |
1145 sub get_feature_by_gid { | |
1146 my $self = shift; | |
1147 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; | |
1148 my %groups; # cache the groups we create to avoid consuming unnecessary memory | |
1149 my $features = []; | |
1150 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; | |
1151 $self->_feature_by_id($id,'group',$callback); | |
1152 return wantarray ? @$features : $features->[0]; | |
1153 } | |
1154 *fetch_feature_by_gid = \&get_feature_by_gid; | |
1155 | |
1156 =head2 delete_features | |
1157 | |
1158 Title : delete_features | |
1159 Usage : $db->delete_features(@ids_or_features) | |
1160 Function: delete one or more features | |
1161 Returns : count of features deleted | |
1162 Args : list of features or feature ids | |
1163 Status : public | |
1164 | |
1165 Pass this method a list of numeric feature ids or a set of features. | |
1166 It will attempt to remove the features from the database and return a | |
1167 count of the features removed. | |
1168 | |
1169 NOTE: This method is also called delete_feature(). Also see | |
1170 delete_groups(). | |
1171 | |
1172 =cut | |
1173 | |
1174 *delete_feature = \&delete_features; | |
1175 | |
1176 sub delete_features { | |
1177 my $self = shift; | |
1178 my @features_or_ids = @_; | |
1179 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids; | |
1180 return unless @ids; | |
1181 $self->_delete_features(@ids); | |
1182 } | |
1183 | |
1184 =head2 delete_groups | |
1185 | |
1186 Title : delete_groups | |
1187 Usage : $db->delete_groups(@ids_or_features) | |
1188 Function: delete one or more feature groups | |
1189 Returns : count of features deleted | |
1190 Args : list of features or feature group ids | |
1191 Status : public | |
1192 | |
1193 Pass this method a list of numeric group ids or a set of features. It | |
1194 will attempt to recursively remove the features and ALL members of | |
1195 their group from the database. It returns a count of the number of | |
1196 features (not groups) deleted. | |
1197 | |
1198 NOTE: This method is also called delete_group(). Also see | |
1199 delete_features(). | |
1200 | |
1201 =cut | |
1202 | |
1203 *delete_group = \&delete_groups; | |
1204 | |
1205 sub delete_groups { | |
1206 my $self = shift; | |
1207 my @features_or_ids = @_; | |
1208 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids; | |
1209 return unless @ids; | |
1210 $self->_delete_groups(@ids); | |
1211 } | |
1212 | |
1213 =head2 delete | |
1214 | |
1215 Title : delete | |
1216 Usage : $db->delete(@args) | |
1217 Function: delete features | |
1218 Returns : count of features deleted -- if available | |
1219 Args : numerous, see below | |
1220 Status : public | |
1221 | |
1222 This method deletes all features that overlap the specified region or | |
1223 are of a particular type. If no arguments are provided and the -force | |
1224 argument is true, then deletes ALL features. | |
1225 | |
1226 Arguments: | |
1227 | |
1228 -name ID of the landmark sequence. | |
1229 | |
1230 -ref ID of the landmark sequence (synonym for -name). | |
1231 | |
1232 -class Database object class for the landmark sequence. | |
1233 "Sequence" assumed if not specified. This is | |
1234 irrelevant for databases which do not recognize | |
1235 object classes. | |
1236 | |
1237 -start Start of the segment relative to landmark. Positions | |
1238 follow standard 1-based sequence rules. If not specified, | |
1239 defaults to the beginning of the landmark. | |
1240 | |
1241 -end Stop of the segment relative to the landmark. If not specified, | |
1242 defaults to the end of the landmark. | |
1243 | |
1244 -offset Zero-based addressing | |
1245 | |
1246 -length Length of region | |
1247 | |
1248 -type,-types Either a single scalar type to be deleted, or a | |
1249 reference to an array of types. | |
1250 | |
1251 -force Force operation to be performed even if it would delete | |
1252 entire feature table. | |
1253 | |
1254 -range_type Control the range type of the deletion. One of "overlaps" (default) | |
1255 "contains" or "contained_in" | |
1256 | |
1257 Examples: | |
1258 | |
1259 $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats | |
1260 $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000 | |
1261 $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3 | |
1262 | |
1263 The short form of this call, as described in segment() is also allowed: | |
1264 | |
1265 $db->delete("chr3",1=>1000); | |
1266 $db->delete("chr3"); | |
1267 | |
1268 IMPORTANT NOTE: This method only deletes features. It does *NOT* | |
1269 delete the names of groups that contain the deleted features. Group | |
1270 IDs will be reused if you later load a feature with the same group | |
1271 name as one that was previously deleted. | |
1272 | |
1273 NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the | |
1274 result code from the SQL DELETE operation. Some dbd drivers return the | |
1275 count of rows deleted, while others return 0E0. Caveat emptor. | |
1276 | |
1277 =cut | |
1278 | |
1279 sub delete { | |
1280 my $self = shift; | |
1281 my @args = $self->setup_segment_args(@_); | |
1282 my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) = | |
1283 rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET', | |
1284 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args); | |
1285 $offset = 0 unless defined $offset; | |
1286 $start = $offset+1 unless defined $start; | |
1287 $end = $start+$length-1 if !defined $end and $length; | |
1288 $class ||= $self->default_class; | |
1289 | |
1290 my $types = $self->parse_types($type); # parse out list of types | |
1291 | |
1292 $range_type ||= 'overlaps'; | |
1293 $self->throw("range type must be one of {". | |
1294 join(',',keys %valid_range_types). | |
1295 "}\n") | |
1296 unless $valid_range_types{lc $range_type}; | |
1297 | |
1298 | |
1299 my @segments; | |
1300 if (defined $name && $name ne '') { | |
1301 my @args = (-name=>$name,-class=>$class); | |
1302 push @args,(-start=>$start) if defined $start; | |
1303 push @args,(-end =>$end) if defined $end; | |
1304 @segments = $self->segment(@args); | |
1305 return unless @segments; | |
1306 } | |
1307 $self->_delete({segments => \@segments, | |
1308 types => $types, | |
1309 range_type => $range_type, | |
1310 force => $force} | |
1311 ); | |
1312 } | |
1313 | |
1314 =head2 absolute | |
1315 | |
1316 Title : absolute | |
1317 Usage : $abs = $db->absolute([$abs]); | |
1318 Function: gets/sets absolute mode | |
1319 Returns : current setting of absolute mode boolean | |
1320 Args : new setting for absolute mode boolean | |
1321 Status : public | |
1322 | |
1323 $db-E<gt>absolute(1) will turn on absolute mode for the entire database. | |
1324 All segments retrieved will use absolute coordinates by default, | |
1325 rather than relative coordinates. You can still set them to use | |
1326 relative coordinates by calling $segment-E<gt>absolute(0). | |
1327 | |
1328 Note that this is not the same as calling abs_segment(); it continues | |
1329 to allow you to look up groups that are not used directly as reference | |
1330 sequences. | |
1331 | |
1332 =cut | |
1333 | |
1334 sub absolute { | |
1335 my $self = shift; | |
1336 my $d = $self->{absolute}; | |
1337 $self->{absolute} = shift if @_; | |
1338 $d; | |
1339 } | |
1340 | |
1341 =head2 strict_bounds_checking | |
1342 | |
1343 Title : strict_bounds_checking | |
1344 Usage : $flag = $db->strict_bounds_checking([$flag]) | |
1345 Function: gets/sets strict bounds checking | |
1346 Returns : current setting of bounds checking flag | |
1347 Args : new setting for bounds checking flag | |
1348 Status : public | |
1349 | |
1350 This flag enables extra checks for segment requests that go beyond the | |
1351 ends of their reference sequences. If bounds checking is enabled, | |
1352 then retrieved segments will be truncated to their physical length, | |
1353 and their truncated() methods will return true. | |
1354 | |
1355 If the flag is off (the default), then the module will return segments | |
1356 that appear to extend beyond their physical boundaries. Requests for | |
1357 features beyond the end of the segment will, however, return empty. | |
1358 | |
1359 =cut | |
1360 | |
1361 sub strict_bounds_checking { | |
1362 my $self = shift; | |
1363 my $d = $self->{strict}; | |
1364 $self->{strict} = shift if @_; | |
1365 $d; | |
1366 } | |
1367 | |
1368 =head2 get_Seq_by_id | |
1369 | |
1370 Title : get_Seq_by_id | |
1371 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') | |
1372 Function: Gets a Bio::Seq object by its name | |
1373 Returns : a Bio::Seq object | |
1374 Args : the id (as a string) of a sequence | |
1375 Throws : "id does not exist" exception | |
1376 | |
1377 NOTE: Bio::DB::RandomAccessI compliant method | |
1378 | |
1379 =cut | |
1380 | |
1381 sub get_Seq_by_id { | |
1382 my $self = shift; | |
1383 my $id = shift; | |
1384 my $stream = $self->get_Stream_by_id($id); | |
1385 return $stream->next_seq; | |
1386 } | |
1387 | |
1388 | |
1389 =head2 get_Seq_by_accession | |
1390 | |
1391 Title : get_Seq_by_accession | |
1392 Usage : $seq = $db->get_Seq_by_accession('AL12234') | |
1393 Function: Gets a Bio::Seq object by its accession | |
1394 Returns : a Bio::Seq object | |
1395 Args : the id (as a string) of a sequence | |
1396 Throws : "id does not exist" exception | |
1397 | |
1398 NOTE: Bio::DB::RandomAccessI compliant method | |
1399 | |
1400 =cut | |
1401 | |
1402 sub get_Seq_by_accession { | |
1403 my $self = shift; | |
1404 my $id = shift; | |
1405 my $stream = $self->get_Stream_by_accession($id); | |
1406 return $stream->next_seq; | |
1407 } | |
1408 | |
1409 =head2 get_Stream_by_acc () | |
1410 | |
1411 =cut | |
1412 | |
1413 =head2 get_Seq_by_acc | |
1414 | |
1415 Title : get_Seq_by_acc | |
1416 Usage : $seq = $db->get_Seq_by_acc('X77802'); | |
1417 Function: Gets a Bio::Seq object by accession number | |
1418 Returns : A Bio::Seq object | |
1419 Args : accession number (as a string) | |
1420 Throws : "acc does not exist" exception | |
1421 | |
1422 NOTE: Bio::DB::RandomAccessI compliant method | |
1423 | |
1424 =cut | |
1425 | |
1426 sub get_Stream_by_name { | |
1427 my $self = shift; | |
1428 my @ids = @_; | |
1429 my $id = ref($ids[0]) ? $ids[0] : \@ids; | |
1430 Bio::DB::GFF::ID_Iterator->new($self,$id,'name'); | |
1431 } | |
1432 | |
1433 =head2 get_Stream_by_id | |
1434 | |
1435 Title : get_Stream_by_id | |
1436 Usage : $seq = $db->get_Stream_by_id(@ids); | |
1437 Function: Retrieves a stream of Seq objects given their ids | |
1438 Returns : a Bio::SeqIO stream object | |
1439 Args : an array of unique ids/accession numbers, or | |
1440 an array reference | |
1441 | |
1442 NOTE: This is also called get_Stream_by_batch() | |
1443 | |
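For example (the IDs here are invented):

  my $stream = $db->get_Stream_by_id('feature_1','feature_2');
  while (my $feature = $stream->next_seq) {
    print $feature->name,"\t",$feature->type,"\n";
  }
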
1444 =cut | |
1445 | |
1446 sub get_Stream_by_id { | |
1447 my $self = shift; | |
1448 my @ids = @_; | |
1449 my $id = ref($ids[0]) ? $ids[0] : \@ids; | |
1450 Bio::DB::GFF::ID_Iterator->new($self,$id,'feature'); | |
1451 } | |
1452 | |
1453 =head2 get_Stream_by_batch () | |
1454 | |
1455 Title : get_Stream_by_batch | |
1456 Usage : $seq = $db->get_Stream_by_batch(@ids); | |
1457 Function: Retrieves a stream of Seq objects given their ids | |
1458 Returns : a Bio::SeqIO stream object | |
1459 Args : an array of unique ids/accession numbers, or | |
1460 an array reference | |
1461 | |
1462 NOTE: This is the same as get_Stream_by_id(). | |
1463 | |
1464 =cut | |
1465 | |
1466 *get_Stream_by_batch = \&get_Stream_by_id; | |
1467 | |
1468 | |
1469 =head2 get_Stream_by_group () | |
1470 | |
1471 Bioperl compatibility. | |
1472 | |
1473 =cut | |
1474 | |
1475 sub get_Stream_by_group { | |
1476 my $self = shift; | |
1477 my @ids = @_; | |
1478 my $id = ref($ids[0]) ? $ids[0] : \@ids; | |
1479 Bio::DB::GFF::ID_Iterator->new($self,$id,'group'); | |
1480 } | |
1481 | |
1482 =head2 all_seqfeatures | |
1483 | |
1484 Title : all_seqfeatures | |
1485 Usage : @features = $db->all_seqfeatures(@args) | |
1486 Function: fetch all the features in the database | |
1487 Returns : an array of features, or an iterator | |
1488 Args : See below | |
1489 Status : public | |
1490 | |
1491 This is equivalent to calling $db-E<gt>features() without any types, and | |
1492 will return all the features in the database. The -merge and | |
1493 -iterator arguments are recognized, and behave the same as described | |
1494 for features(). | |
1495 | |
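For example, to walk over everything in the database without holding it all in
memory at once (a sketch):

  my $iterator = $db->all_seqfeatures(-iterator => 1);
  while (my $feature = $iterator->next_seq) {
    # do something with $feature
  }
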
1496 =cut | |
1497 | |
1498 sub all_seqfeatures { | |
1499 my $self = shift; | |
1500 my ($automerge,$iterator)= rearrange([ | |
1501 [qw(MERGE AUTOMERGE)], | |
1502 'ITERATOR' | |
1503 ],@_); | |
1504 my @args; | |
1505 push @args,(-merge=>$automerge) if defined $automerge; | |
1506 push @args,(-iterator=>$iterator) if defined $iterator; | |
1507 $self->features(@args); | |
1508 } | |
1509 | |
1510 =head1 Creating and Loading GFF Databases | |
1511 | |
1512 =head2 initialize | |
1513 | |
1514 Title : initialize | |
1515 Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2); | |
1516 Function: initialize a GFF database | |
1517 Returns : true if initialization successful | |
1518 Args : a set of named parameters | |
1519 Status : Public | |
1520 | |
1521 This method can be used to initialize an empty database. It takes the following | |
1522 named arguments: | |
1523 | |
1524 -erase A boolean value. If true the database will be wiped clean if it | |
1525 already contains data. | |
1526 | |
1527 Other named arguments may be recognized by subclasses. They become database | |
1528 meta values that control various settable options. | |
1529 | |
1530 As a shortcut (and for backward compatibility) a single true argument | |
1531 is the same as initialize(-erase=E<gt>1). | |
1532 | |
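For example (assuming $db is a freshly opened database handle):

  $db->initialize(-erase => 1) or die "unable to initialize database";
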
1533 =cut | |
1534 | |
1535 sub initialize { | |
1536 my $self = shift; | |
1537 #$self->do_initialize(1) if @_ == 1 && $_[0]; | |
1538 #why was this line (^) here? I can't see that it actually does anything | |
1539 #one option would be to execute the line and return, but I don't know | |
1540 #why you would want to do that either. | |
1541 | |
1542 my ($erase,$meta) = rearrange(['ERASE'],@_); | |
1543 $meta ||= {}; | |
1544 | |
1545 # initialize (possibly erasing) | |
1546 return unless $self->do_initialize($erase); | |
1547 my @default = $self->default_meta_values; | |
1548 | |
1549 # uppercase the even-indexed elements (the meta value names), as | |
1550 # required by case-insensitive SQL databases | |
1551 for (my $i=0; $i<@default; $i++) { | |
1552 $default[$i] = uc $default[$i] if !($i % 2); | |
1553 } | |
1554 | |
1555 my %values = (@default,%$meta); | |
1556 foreach (keys %values) { | |
1557 $self->meta($_ => $values{$_}); | |
1558 } | |
1559 1; | |
1560 } | |
1561 | |
1562 | |
1563 =head2 load_gff | |
1564 | |
1565 Title : load_gff | |
1566 Usage : $db->load_gff($file|$directory|$filehandle); | |
1567 Function: load GFF data into database | |
1568 Returns : count of records loaded | |
1569 Args : a directory, a file, a list of files, | |
1570 or a filehandle | |
1571 Status : Public | |
1572 | |
1573 This method takes a single overloaded argument, which can be any of: | |
1574 | |
1575 =over 4 | |
1576 | |
1577 =item 1. a scalar corresponding to a GFF file on the system | |
1578 | |
1579 A pathname to a local GFF file. Any files ending with the .gz, .Z, or | |
1580 .bz2 suffixes will be transparently decompressed with the appropriate | |
1581 command-line utility. | |
1582 | |
1583 =item 2. an array reference containing a list of GFF files on the system | |
1584 | |
1585 For example ['/home/gff/gff1.gz','/home/gff/gff2.gz'] | |
1586 | |
1587 =item 3. directory path | |
1588 | |
1589 The indicated directory will be searched for all files ending in the | |
1590 suffixes .gff, .gff.gz, .gff.Z or .gff.bz2. | |
1591 | |
1592 =item 4. filehandle | |
1593 | |
1594 An open filehandle from which to read the GFF data. Tied filehandles | |
1595 now work as well. | |
1596 | |
1597 =item 5. a pipe expression | |
1598 | |
1599 A pipe expression will also work. For example, a GFF file on a remote | |
1600 web server can be loaded with an expression like this: | |
1601 | |
1602 $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |"); | |
1603 | |
1604 =back | |
1605 | |
1606 If successful, the method will return the number of GFF lines | |
1607 successfully loaded. | |
1608 | |
1609 NOTE: this method used to be called load(), but has been renamed. The | |
1610 old method name is also recognized. | |
1611 | |
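For example (the paths are invented):

  my $count  = $db->load_gff('/usr/local/gff_files');   # a directory of .gff files
  $count    += $db->load_gff('updates.gff.gz');         # a single compressed file
  print "$count records loaded\n";
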
1612 =cut | |
1613 | |
1614 sub load_gff { | |
1615 my $self = shift; | |
1616 my $file_or_directory = shift || '.'; | |
1617 return $self->do_load_gff($file_or_directory) if ref($file_or_directory) && | |
1618 tied *$file_or_directory; | |
1619 | |
1620 my $tied_stdin = tied(*STDIN); | |
1621 open SAVEIN,"<&STDIN" unless $tied_stdin; | |
1622 local @ARGV = $self->setup_argv($file_or_directory,'gff') or return; # to play tricks with reader | |
1623 my $result = $self->do_load_gff('ARGV'); | |
1624 open STDIN,"<&SAVEIN" unless $tied_stdin; # restore STDIN | |
1625 return $result; | |
1626 } | |
1627 | |
1628 *load = \&load_gff; | |
1629 | |
1630 =head2 load_fasta | |
1631 | |
1632 Title : load_fasta | |
1633 Usage : $db->load_fasta($file|$directory|$filehandle); | |
1634 Function: load FASTA data into database | |
1635 Returns : count of records loaded | |
1636 Args : a directory, a file, a list of files, | |
1637 or a filehandle | |
1638 Status : Public | |
1639 | |
1640 This method takes a single overloaded argument, which can be any of: | |
1641 | |
1642 =over 4 | |
1643 | |
1644 =item 1. scalar corresponding to a FASTA file on the system | |
1645 | |
1646 A pathname to a local FASTA file. Any files ending with the .gz, .Z, or | |
1647 .bz2 suffixes will be transparently decompressed with the appropriate | |
1648 command-line utility. | |
1649 | |
1650 =item 2. array reference containing a list of FASTA files on the | |
1651 system | |
1652 | |
1653 For example ['/home/fasta/genomic.fa.gz','/home/fasta/ests.fa.gz'] | |
1654 | |
1655 =item 3. path to a directory | |
1656 | |
1657 The indicated directory will be searched for all files ending in the | |
1658 suffixes .fa, .fa.gz, .fa.Z or .fa.bz2. | |
1659 | |
1660 =item 4. filehandle | |
1661 | |
1662 An open filehandle from which to read the FASTA data. | |
1663 | |
1664 =item 5. pipe expression | |
1665 | |
1666 A pipe expression will also work. For example, a FASTA file on a remote | |
1667 web server can be loaded with an expression like this: | |
1668 | |
1669 $db->load_fasta("lynx -dump -source http://stein.cshl.org/fasta_test.fa |"); | |
1670 | |
1671 =back | |
1672 | |
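For example (again, the paths are invented):

  my $count = $db->load_fasta('/usr/local/fasta_files');  # a directory of .fa files
  $count   += $db->load_fasta('extra_contigs.fa.gz');     # one compressed FASTA file
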
1673 =cut | |
1674 | |
1675 sub load_fasta { | |
1676 my $self = shift; | |
1677 my $file_or_directory = shift || '.'; | |
1678 return $self->load_sequence($file_or_directory) if ref($file_or_directory) && | |
1679 tied *$file_or_directory; | |
1680 | |
1681 my $tied = tied(*STDIN); | |
1682 open SAVEIN,"<&STDIN" unless $tied; | |
1683 local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader | |
1684 my $result = $self->load_sequence('ARGV'); | |
1685 open STDIN,"<&SAVEIN" unless $tied; # restore STDIN | |
1686 return $result; | |
1687 } | |
1688 | |
1689 =head2 load_sequence_string | |
1690 | |
1691 Title : load_sequence_string | |
1692 Usage : $db->load_sequence_string($id,$dna) | |
1693 Function: load a single DNA entry | |
1694 Returns : true if successfully loaded | |
1695 Args : the ID of the sequence and a raw sequence string (DNA, RNA, protein) | |
1696 Status : Public | |
1697 | |
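For example (the identifier and sequence are invented):

  $db->load_sequence_string('ctg123','gattacagattacagattaca')
      or warn "sequence load failed";
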
1698 =cut | |
1699 | |
1700 sub load_sequence_string { | |
1701 my $self = shift; | |
1702 my ($acc,$seq) = @_; | |
1703 my $offset = 0; | |
1704 $self->insert_sequence_chunk($acc,\$offset,\$seq) or return; | |
1705 $self->insert_sequence($acc,$offset,$seq) or return; | |
1706 1; | |
1707 } | |
1708 | |
1709 sub setup_argv { | |
1710 my $self = shift; | |
1711 my $file_or_directory = shift; | |
1712 my @suffixes = @_; | |
1713 no strict 'refs'; # so that we can call fileno() on the argument | |
1714 | |
1715 my @argv; | |
1716 | |
1717 if (-d $file_or_directory) { | |
1718 @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes; | |
1719 }elsif (my $fd = fileno($file_or_directory)) { | |
1720 open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN"); | |
1721 @argv = '-'; | |
1722 } elsif (ref $file_or_directory) { | |
1723 @argv = @$file_or_directory; | |
1724 } else { | |
1725 @argv = $file_or_directory; | |
1726 } | |
1727 | |
1728 foreach (@argv) { | |
1729 if (/\.gz$/) { | |
1730 $_ = "gunzip -c $_ |"; | |
1731 } elsif (/\.Z$/) { | |
1732 $_ = "uncompress -c $_ |"; | |
1733 } elsif (/\.bz2$/) { | |
1734 $_ = "bunzip2 -c $_ |"; | |
1735 } | |
1736 } | |
1737 @argv; | |
1738 } | |
1739 | |
1740 =head2 lock_on_load | |
1741 | |
1742 Title : lock_on_load | |
1743 Usage : $lock = $db->lock_on_load([$lock]) | |
1744 Function: set write locking during load | |
1745 Returns : current value of lock-on-load flag | |
1746 Args : new value of lock-on-load-flag | |
1747 Status : Public | |
1748 | |
1749 This method is honored by some of the adaptors. If the value is true, | |
1750 the tables used by the GFF modules will be locked for writing during | |
1751 loads and inaccessible to other processes. | |
1752 | |
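A typical loading script might do the following (sketch; the filename is invented):

  $db->lock_on_load(1);                   # lock tables for writing during loads
  $db->load_gff('annotations.gff.gz');
  $db->lock_on_load(0);                   # turn the behavior off again
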
1753 =cut | |
1754 | |
1755 sub lock_on_load { | |
1756 my $self = shift; | |
1757 my $d = $self->{lock}; | |
1758 $self->{lock} = shift if @_; | |
1759 $d; | |
1760 } | |
1761 | |
1762 =head2 meta | |
1763 | |
1764 Title : meta | |
1765 Usage : $value = $db->meta($name [,$newval]) | |
1766 Function: get or set a meta variable | |
1767 Returns : a string | |
1768 Args : meta variable name and optionally value | |
1769 Status : abstract | |
1770 | |
1771 Get or set a named metavalue for the database. Metavalues can be | |
1772 used for database-specific settings. | |
1773 | |
1774 By default, this method does nothing! | |
1775 | |
1776 =cut | |
1777 | |
1778 sub meta { | |
1779 my $self = shift; | |
1780 my ($name,$value) = @_; | |
1781 return; | |
1782 } | |
1783 | |
1784 =head2 default_meta_values | |
1785 | |
1786 Title : default_meta_values | |
1787 Usage : %values = $db->default_meta_values | |
1788 Function: return default meta values for the database | |
1789 Returns : a list of tag=>value pairs | |
1790 Args : none | |
1791 Status : protected | |
1792 | |
1793 This method returns a list of tag=E<gt>value pairs that contain default | |
1794 meta information about the database. It is invoked by initialize() to | |
1795 write out the default meta values. The base class version returns an | |
1796 empty list. | |
1797 | |
1798 For things to work properly, meta value names must be UPPERCASE. | |
1799 | |
1800 =cut | |
1801 | |
1802 sub default_meta_values { | |
1803 my $self = shift; | |
1804 return (); | |
1805 } | |
1806 | |
1807 | |
1808 =head2 error | |
1809 | |
1810 Title : error | |
1811 Usage : $db->error( [$new error] ); | |
1812 Function: read or set error message | |
1813 Returns : error message | |
1814 Args : an optional argument to set the error message | |
1815 Status : Public | |
1816 | |
1817 This method can be used to retrieve the last error message. Errors | |
1818 are not reset to empty by successful calls, so contents are only valid | |
1819 immediately after an error condition has been detected. | |
1820 | |
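For example (a sketch; it assumes the adaptor in use records lookup failures via
error()):

  my $segment = $db->segment('CHROMOSOME_I')
      or warn "lookup failed: ", $db->error;
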
1821 =cut | |
1822 | |
1823 sub error { | |
1824 my $self = shift; | |
1825 my $g = $self->{error}; | |
1826 $self->{error} = join '',@_ if @_; | |
1827 $g; | |
1828 } | |
1829 | |
1830 =head2 debug | |
1831 | |
1832 Title : debug | |
1833 Usage : $db->debug( [$flag] ); | |
1834 Function: read or set debug flag | |
1835 Returns : current value of debug flag | |
1836 Args : new debug flag (optional) | |
1837 Status : Public | |
1838 | |
1839 This method can be used to turn on debug messages. The exact nature | |
1840 of those messages depends on the adaptor in use. | |
1841 | |
1842 =cut | |
1843 | |
1844 sub debug { | |
1845 my $self = shift; | |
1846 my $g = $self->{debug}; | |
1847 $self->{debug} = shift if @_; | |
1848 $g; | |
1849 } | |
1850 | |
1851 | |
1852 =head2 automerge | |
1853 | |
1854 Title : automerge | |
1855 Usage : $db->automerge( [$new automerge] ); | |
1856 Function: get or set automerge value | |
1857 Returns : current value (boolean) | |
1858 Args : an optional argument to set the automerge value | |
1859 Status : Public | |
1860 | |
1861 By default, this module will use the aggregators to merge groups into | |
1862 single composite objects. This default can be changed to false by | |
1863 calling automerge(0). | |
1864 | |
1865 =cut | |
1866 | |
1867 sub automerge { | |
1868 my $self = shift; | |
1869 my $g = $self->{automerge}; | |
1870 $self->{automerge} = shift if @_; | |
1871 $g; | |
1872 } | |
1873 | |
1874 =head2 attributes | |
1875 | |
1876 Title : attributes | |
1877 Usage : @attributes = $db->attributes($id,$name) | |
1878 Function: get the attributes of a particular feature | |
1879 Returns : an array of strings | |
1880 Args : feature ID and optionally an attribute name | |
1881 Status : public | |
1882 | |
1883 Some GFF version 2 files use the groups column to store a series of | |
1884 attribute/value pairs. In this interpretation of GFF, the first such | |
1885 pair is treated as the primary group for the feature; subsequent pairs | |
1886 are treated as attributes. Two attributes have special meaning: | |
1887 "Note" is for backward compatibility and is used for unstructured text | |
1888 remarks. "Alias" is considered as a synonym for the feature name. | |
1889 | |
1890 If no name is provided, then attributes() returns a flattened list of | |
1891 attribute=E<gt>value pairs. This lets you do: | |
1892 | |
1893 %attributes = $db->attributes($id); | |
1894 | |
1895 Normally, attributes() will be called by the feature: | |
1896 | |
1897 @notes = $feature->attributes('Note'); | |
1898 | |
1899 In a scalar context, attributes() returns the first value of the | |
1900 attribute if a tag is present, otherwise a hash reference in which the | |
1901 keys are attribute names and the values are anonymous arrays | |
1902 containing the values. | |
1903 | |
1904 =cut | |
1905 | |
1906 sub attributes { | |
1907 my $self = shift; | |
1908 my ($id,$tag) = @_; | |
1909 my @result = $self->do_attributes($id,$tag) or return; | |
1910 return @result if wantarray; | |
1911 | |
1912 # what to do in a scalar context | |
1913 return $result[0] if $tag; | |
1914 my %result; | |
1915 while (my($key,$value) = splice(@result,0,2)) { | |
1916 push @{$result{$key}},$value; | |
1917 } | |
1918 return \%result; | |
1919 } | |
1920 | |
1921 =head2 fast_queries | |
1922 | |
1923 Title : fast_queries | |
1924 Usage : $flag = $db->fast_queries([$flag]) | |
1925 Function: turn on and off the "fast queries" option | |
1926 Returns : a boolean | |
1927 Args : a boolean flag (optional) | |
1928 Status : public | |
1929 | |
1930 The mysql database driver (and possibly others) support a "fast" query | |
1931 mode that caches results on the server side. This makes queries come | |
1932 back faster, particularly when creating iterators. The downside is | |
1933 that while iterating, new queries will die with a "commands out of sync" | |
1934 error. This method turns the feature on and off. | |
1935 | |
1936 For databases that do not support a fast query, this method has no | |
1937 effect. | |
1938 | |
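For example (a sketch):

  $db->fast_queries(1);   # turn on the fast query mode described above
  my $iterator = $db->get_seq_stream(-type => 'exon');
  while (my $feature = $iterator->next_seq) {
    # process one feature at a time; avoid issuing new queries here
  }
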
1939 =cut | |
1940 | |
1941 # override this method in order to set the mysql_use_result attribute, which is an obscure | |
1942 # but extremely powerful optimization for both performance and memory. | |
1943 sub fast_queries { | |
1944 my $self = shift; | |
1945 my $d = $self->{fast_queries}; | |
1946 $self->{fast_queries} = shift if @_; | |
1947 $d; | |
1948 } | |
1949 | |
1950 =head2 add_aggregator | |
1951 | |
1952 Title : add_aggregator | |
1953 Usage : $db->add_aggregator($aggregator) | |
1954 Function: add an aggregator to the list | |
1955 Returns : nothing | |
1956 Args : an aggregator | |
1957 Status : public | |
1958 | |
1959 This method will append an aggregator to the end of the list of | |
1960 registered aggregators. Three different argument types are accepted: | |
1961 | |
1962 1) a Bio::DB::GFF::Aggregator object -- will be added | |
1963 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}" | |
1964 -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method | |
1965 part is optional). | |
1966 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator | |
1967 subclass, where the token corresponds to the subclass name. | |
1968 | |
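For example, the three forms look like this (the aggregator names and subparts are
illustrative):

  # 1. a ready-made aggregator object
  $db->add_aggregator(Bio::DB::GFF::Aggregator->new(-method    => 'gene',
                                                    -sub_parts => ['exon','CDS']));

  # 2. a string specification, with an optional main method after the slash
  $db->add_aggregator('gene{exon,CDS/mRNA}');

  # 3. the name of a Bio::DB::GFF::Aggregator subclass
  $db->add_aggregator('alignment');
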
1969 =cut | |
1970 | |
1971 sub add_aggregator { | |
1972 my $self = shift; | |
1973 my $aggregator = shift; | |
1974 my $list = $self->{aggregators} ||= []; | |
1975 if (ref $aggregator) { # an object | |
1976 @$list = grep {$_->get_method ne $aggregator->get_method} @$list; | |
1977 push @$list,$aggregator; | |
1978 } | |
1979 | |
1980 elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) { | |
1981 my($agg_name,$subparts,$mainpart) = ($1,$2,$3); | |
1982 my @subparts = split /,\s*/,$subparts; | |
1983 my @args = (-method => $agg_name, | |
1984 -sub_parts => \@subparts); | |
1985 push @args,(-main_method => $mainpart) if $mainpart; | |
1986 warn "making an aggregator with (@args), subparts = @subparts" if $self->debug; | |
1987 push @$list,Bio::DB::GFF::Aggregator->new(@args); | |
1988 } | |
1989 | |
1990 else { | |
1991 my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E"; | |
1992 eval "require $class"; | |
1993 $self->throw("Unable to load $aggregator aggregator: $@") if $@; | |
1994 push @$list,$class->new(); | |
1995 } | |
1996 } | |
1997 | |
1998 =head2 aggregators | |
1999 | |
2000 Title : aggregators | |
2001 Usage : $db->aggregators([@new_aggregators]); | |
2002 Function: retrieve list of aggregators | |
2003 Returns : list of aggregators | |
2004 Args : a list of aggregators to set (optional) | |
2005 Status : public | |
2006 | |
2007 This method will get or set the list of aggregators assigned to | |
2008 the database. If 1 or more arguments are passed, the existing | |
2009 set will be cleared. | |
2010 | |
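For example (a sketch):

  my @current = $db->aggregators;                         # inspect the current list
  $db->aggregators('processed_transcript','alignment');   # replace it outright
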
2011 =cut | |
2012 | |
2013 sub aggregators { | |
2014 my $self = shift; | |
2015 my $d = $self->{aggregators}; | |
2016 if (@_) { | |
2017 $self->clear_aggregators; | |
2018 $self->add_aggregator($_) foreach @_; | |
2019 } | |
2020 return unless $d; | |
2021 return @$d; | |
2022 } | |
2023 | |
2024 =head2 clear_aggregators | |
2025 | |
2026 Title : clear_aggregators | |
2027 Usage : $db->clear_aggregators | |
2028 Function: clears list of aggregators | |
2029 Returns : nothing | |
2030 Args : none | |
2031 Status : public | |
2032 | |
2033 This method will clear the aggregators stored in the database object. | |
2034 Use aggregators() or add_aggregator() to add some back. | |
2035 | |
2036 =cut | |
2037 | |
2038 sub clear_aggregators { shift->{aggregators} = [] } | |
2039 | |
2040 =head1 Methods for use by Subclasses | |
2041 | |
2042 The following methods are chiefly of interest to subclasses and are | |
2043 not intended for use by end programmers. | |
2044 | |
2045 =head2 abscoords | |
2046 | |
2047 Title : abscoords | |
2048 Usage : $db->abscoords($name,$class,$refseq) | |
2049 Function: finds position of a landmark in reference coordinates | |
2050 Returns : ($ref,$class,$start,$stop,$strand) | |
2051 Args : name and class of landmark | |
2052 Status : public | |
2053 | |
2054 This method is called by Bio::DB::GFF::RelSegment to obtain the | |
2055 absolute coordinates of a sequence landmark. The arguments are the | |
2056 name and class of the landmark. If successful, abscoords() returns | |
2057 the ID of the reference sequence, its class, its start and stop | |
2058 positions, and the orientation of the reference sequence's coordinate | |
2059 system ("+" for forward strand, "-" for reverse strand). | |
2060 | |
2061 If $refseq is present in the argument list, it forces the query to | |
2062 search for the landmark in a particular reference sequence. | |
2063 | |
2064 =cut | |
2065 | |
2066 sub abscoords { | |
2067 my $self = shift; | |
2068 my ($name,$class,$refseq) = @_; | |
2069 $class ||= $self->{default_class}; | |
2070 $self->get_abscoords($name,$class,$refseq); | |
2071 } | |
2072 | |
2073 =head1 Protected API | |
2074 | |
2075 The following methods are not intended for public consumption, but are | |
2076 intended to be overridden/implemented by adaptors. | |
2077 | |
2078 =head2 default_aggregators | |
2079 | |
2080 Title : default_aggregators | |
2081 Usage : $db->default_aggregators; | |
2082 Function: retrieve list of aggregators | |
2083 Returns : array reference containing list of aggregator names | |
2084 Args : none | |
2085 Status : protected | |
2086 | |
2087 This method (which is intended to be overridden by adaptors) returns a | |
2088 list of standard aggregators to be applied when no aggregators are | |
2089 specified in the constructor. | |
2090 | |
2091 =cut | |
2092 | |
2093 sub default_aggregators { | |
2094 my $self = shift; | |
2095 return ['processed_transcript','alignment']; | |
2096 } | |
2097 | |
2098 =head2 do_load_gff | |
2099 | |
2100 Title : do_load_gff | |
2101 Usage : $db->do_load_gff($handle) | |
2102 Function: load a GFF input stream | |
2103 Returns : number of features loaded | |
2104 Args : A filehandle. | |
2105 Status : protected | |
2106 | |
2107 This method is called to load a GFF data stream. The method will read | |
2108 GFF features from E<lt>E<gt> and load them into the database. On exit the | |
2109 method must return the number of features loaded. | |
2110 | |
2111 Note that the method is responsible for parsing the GFF lines. This | |
2112 is to allow for differences in the interpretation of the "group" | |
2113 field, which are legion. | |
2114 | |
2115 You probably want to use load_gff() instead. It is more flexible | |
2116 about the arguments it accepts. | |
2117 | |
2118 =cut | |
2119 | |
2120 # load from <> | |
2121 sub do_load_gff { | |
2122 my $self = shift; | |
2123 my $io_handle = shift; | |
2124 | |
2125 local $self->{gff3_flag} = 0; | |
2126 $self->setup_load(); | |
2127 | |
2128 my $fasta_sequence_id; | |
2129 | |
2130 while (<$io_handle>) { | |
2131 chomp; | |
2132 $self->{gff3_flag}++ if /^\#\#gff-version\s+3/; | |
2133 if (/^>(\S+)/) { # uh oh, sequence coming | |
2134 $fasta_sequence_id = $1; | |
2135 last; | |
2136 } | |
2137 if (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line | |
2138 $self->load_gff_line( | |
2139 { | |
2140 ref => $1, | |
2141 class => 'Sequence', | |
2142 source => 'reference', | |
2143 method => 'Component', | |
2144 start => $2, | |
2145 stop => $3, | |
2146 score => undef, | |
2147 strand => undef, | |
2148 phase => undef, | |
2149 gclass => 'Sequence', | |
2150 gname => $1, | |
2151 tstart => undef, | |
2152 tstop => undef, | |
2153 attributes => [], | |
2154 } | |
2155 ); | |
2156 next; | |
2157 } | |
2158 | |
2159 next if /^\#/; | |
2160 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t"; | |
2161 next unless defined($ref) && defined($method) && defined($start) && defined($stop); | |
2162 foreach (\$score,\$strand,\$phase) { | |
2163 undef $$_ if $$_ eq '.'; | |
2164 } | |
2165 | |
2166 my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{gff3_flag}); | |
2167 | |
2168 # no standard way in the GFF file to denote the class of the reference sequence -- drat! | |
2169 # so we invoke the factory to do it | |
2170 my $class = $self->refclass($ref); | |
2171 | |
2172 # call subclass to do the dirty work | |
2173 if ($start > $stop) { | |
2174 ($start,$stop) = ($stop,$start); | |
2175 if ($strand eq '+') { | |
2176 $strand = '-'; | |
2177 } elsif ($strand eq '-') { | |
2178 $strand = '+'; | |
2179 } | |
2180 } | |
2181 $self->load_gff_line({ref => $ref, | |
2182 class => $class, | |
2183 source => $source, | |
2184 method => $method, | |
2185 start => $start, | |
2186 stop => $stop, | |
2187 score => $score, | |
2188 strand => $strand, | |
2189 phase => $phase, | |
2190 gclass => $gclass, | |
2191 gname => $gname, | |
2192 tstart => $tstart, | |
2193 tstop => $tstop, | |
2194 attributes => $attributes} | |
2195 ); | |
2196 } | |
2197 | |
2198 my $result = $self->finish_load(); | |
2199 $result += $self->load_sequence($io_handle,$fasta_sequence_id) | |
2200 if defined $fasta_sequence_id; | |
2201 $result; | |
2202 | |
2203 } | |
2204 | |
2205 =head2 load_sequence | |
2206 | |
2207 Title : load_sequence | |
2208 Usage : $db->load_sequence($handle [,$id]) | |
2209 Function: load a FASTA data stream | |
2210 Returns : number of sequences | |
2211 Args : a filehandle and optionally the ID of | |
2212 the first sequence in the stream. | |
2213 Status : protected | |
2214 | |
2215 You probably want to use load_fasta() instead. The $id argument is a | |
2216 hack used to switch from GFF loading to FASTA loading when load_gff() | |
2217 discovers FASTA data hiding at the bottom of the GFF file (as Artemis | |
2218 does). | |
2219 | |
2220 =cut | |
2221 | |
2222 sub load_sequence { | |
2223 my $self = shift; | |
2224 my $io_handle = shift; | |
2225 my $id = shift; # hack for GFF files that contain fasta data | |
2226 | |
2227 # read fasta file(s) from ARGV | |
2228 my ($seq,$offset,$loaded) = (undef,0,0); | |
2229 while (<$io_handle>) { | |
2230 chomp; | |
2231 if (/^>(\S+)/) { | |
2232 $self->insert_sequence($id,$offset,$seq) if $id; | |
2233 $id = $1; | |
2234 $offset = 0; | |
2235 $seq = ''; | |
2236 $loaded++; | |
2237 } else { | |
2238 $seq .= $_; | |
2239 $self->insert_sequence_chunk($id,\$offset,\$seq); | |
2240 } | |
2241 } | |
2242 $self->insert_sequence($id,$offset,$seq) if $id; | |
2243 $loaded+0; | |
2244 } | |
2245 | |
2246 sub insert_sequence_chunk { | |
2247 my $self = shift; | |
2248 my ($id,$offsetp,$seqp) = @_; | |
2249 if (my $cs = $self->dna_chunk_size) { | |
2250 while (length($$seqp) >= $cs) { | |
2251 my $chunk = substr($$seqp,0,$cs); | |
2252 $self->insert_sequence($id,$$offsetp,$chunk); | |
2253 $$offsetp += length($chunk); | |
2254 substr($$seqp,0,$cs) = ''; | |
2255 } | |
2256 } | |
2257 return 1; # the calling routine may expect success or failure | |
2258 } | |
2259 | |
2260 # used to store big pieces of DNA in itty bitty pieces | |
2261 sub dna_chunk_size { | |
2262 return 0; | |
2263 } | |
2264 | |
2265 sub insert_sequence { | |
2266 my $self = shift; | |
2267 my($id,$offset,$seq) = @_; | |
2268 $self->throw('insert_sequence(): must be defined in subclass'); | |
2269 } | |
2270 | |
2271 # This is the default class for reference points. Defaults to Sequence. | |
2272 sub default_class { | |
2273 my $self = shift; | |
2274 my $d = exists($self->{default_class}) ? $self->{default_class} : 'Sequence'; | |
2275 $self->{default_class} = shift if @_; | |
2276 $d; | |
2277 } | |
2278 | |
2279 # gets name of the reference sequence, and returns its class | |
2280 # currently just calls default_class | |
2281 sub refclass { | |
2282 my $self = shift; | |
2283 my $name = shift; | |
2284 return $self->default_class; | |
2285 } | |
2286 | |
2287 =head2 setup_load | |
2288 | |
2289 Title : setup_load | |
2290 Usage : $db->setup_load | |
2291 Function: called before load_gff_line() | |
2292 Returns : void | |
2293 Args : none | |
2294 Status : abstract | |
2295 | |
2296 This abstract method gives subclasses a chance to do any | |
2297 schema-specific initialization prior to loading a set of GFF records. | |
2298 It must be implemented by a subclass. | |
2299 | |
2300 =cut | |
2301 | |
2302 sub setup_load { | |
2303 # default, do nothing | |
2304 } | |
2305 | |
2306 =head2 finish_load | |
2307 | |
2308 Title : finish_load | |
2309 Usage : $db->finish_load | |
2310 Function: called after load_gff_line() | |
2311 Returns : number of records loaded | |
2312 Args : none | |
2313 Status : abstract | |
2314 | |
2315 This method gives subclasses a chance to do any schema-specific | |
2316 cleanup after loading a set of GFF records. | |
2317 | |
2318 =cut | |
2319 | |
2320 sub finish_load { | |
2321 # default, do nothing | |
2322 } | |
2323 | |
2324 =head2 load_gff_line | |
2325 | |
2326 Title : load_gff_line | |
2327 Usage : $db->load_gff_line(@args) | |
2328 Function: called to load one parsed line of GFF | |
2329 Returns : true if successfully inserted | |
2330 Args : see below | |
2331 Status : abstract | |
2332 | |
2333 This abstract method is called once per line of the GFF and passed a | |
2334 hashref containing parsed GFF fields. The fields are: | |
2335 | |
2336 {ref => $ref, | |
2337 class => $class, | |
2338 source => $source, | |
2339 method => $method, | |
2340 start => $start, | |
2341 stop => $stop, | |
2342 score => $score, | |
2343 strand => $strand, | |
2344 phase => $phase, | |
2345 gclass => $gclass, | |
2346 gname => $gname, | |
2347 tstart => $tstart, | |
2348 tstop => $tstop, | |
2349 attributes => $attributes} | |
2350 | |
2351 =cut | |
2352 | |
2353 sub load_gff_line { | |
2354 shift->throw("load_gff_line(): must be implemented by an adaptor"); | |
2355 } | |
2356 | |
2357 | |
2358 =head2 do_initialize | |
2359 | |
2360 Title : do_initialize | |
2361 Usage : $db->do_initialize([$erase]) | |
2362 Function: initialize and possibly erase database | |
2363 Returns : true if successful | |
2364 Args : optional erase flag | |
2365 Status : protected | |
2366 | |
2367 This method implements the initialize() method described above, and | |
2368 takes the same arguments. | |
2369 | |
2370 =cut | |
2371 | |
2372 sub do_initialize { | |
2373 shift->throw('do_initialize(): must be implemented by an adaptor'); | |
2374 } | |
2375 | |
2376 =head2 dna | |
2377 | |
2378 Title : dna | |
2379 Usage : $db->dna($id,$start,$stop,$class) | |
2380 Function: return the raw DNA string for a segment | |
2381 Returns : a raw DNA string | |
2382 Args : id of the sequence, its class, start and stop positions | |
2383 Status : public | |
2384 | |
2385 This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA | |
2386 sequence. | |
2387 | |
2388 Arguments: -name sequence name | |
2389 -start start position | |
2390 -stop stop position | |
2391 -class sequence class | |
2392 | |
2393 If start and stop are both undef, then the entire DNA is retrieved. | |
2394 So to fetch the whole dna, call like this: | |
2395 | |
2396 $db->dna($name_of_sequence); | |
2397 | |
2398 or like this: | |
2399 | |
2400 $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence); | |
2401 | |
2402 NOTE: you will probably prefer to create a Segment and then invoke its | |
2403 dna() method. | |
2404 | |
2405 =cut | |
2406 | |
2407 # call to return the DNA string for the indicated region | |
2408 # real work is done by get_dna() | |
2409 sub dna { | |
2410 my $self = shift; | |
2411 my ($id,$start,$stop,$class) = rearrange([ | |
2412 [qw(NAME ID REF REFSEQ)], | |
2413 qw(START), | |
2414 [qw(STOP END)], | |
2415 'CLASS', | |
2416 ],@_); | |
2417 # return unless defined $start && defined $stop; | |
2418 $self->get_dna($id,$start,$stop,$class); | |
2419 } | |
2420 | |
2421 sub features_in_range { | |
2422 my $self = shift; | |
2423 my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) = | |
2424 rearrange([ | |
2425 [qw(RANGE_TYPE)], | |
2426 [qw(REF REFSEQ)], | |
2427 qw(CLASS), | |
2428 qw(START), | |
2429 [qw(STOP END)], | |
2430 [qw(TYPE TYPES)], | |
2431 qw(PARENT), | |
2432 [qw(RARE SPARSE)], | |
2433 [qw(MERGE AUTOMERGE)], | |
2434 'ITERATOR' | |
2435 ],@_); | |
2436 $other ||= {}; | |
2437 $automerge = $types && $self->automerge unless defined $automerge; | |
2438 $self->throw("range type must be one of {". | |
2439 join(',',keys %valid_range_types). | |
2440 "}\n") | |
2441 unless $valid_range_types{lc $range_type}; | |
2442 $self->_features({ | |
2443 rangetype => lc $range_type, | |
2444 refseq => $refseq, | |
2445 refclass => $class, | |
2446 start => $start, | |
2447 stop => $stop, | |
2448 types => $types }, | |
2449 { | |
2450 sparse => $sparse, | |
2451 automerge => $automerge, | |
2452 iterator => $iterator, | |
2453 %$other, | |
2454 }, | |
2455 $parent); | |
2456 } | |
2457 | |
2458 =head2 get_dna | |
2459 | |
2460 Title : get_dna | |
2461 Usage : $db->get_dna($id,$start,$stop,$class) | |
2462 Function: get DNA for indicated segment | |
2463 Returns : the dna string | |
2464 Args : sequence ID, start, stop and class | |
2465 Status : protected | |
2466 | |
2467 If start E<gt> stop and the sequence is nucleotide, then this method | |
2468 should return the reverse complement. The sequence class may be | |
2469 ignored by those databases that do not recognize different object | |
2470 types. | |
2471 | |
2472 =cut | |
2473 | |
2474 sub get_dna { | |
2475 my $self = shift; | |
2476 my ($id,$start,$stop,$class,) = @_; | |
2477 $self->throw("get_dna() must be implemented by an adaptor"); | |
2478 } | |
2479 | |
2480 =head2 get_features | |
2481 | |
2482 Title : get_features | |
2483 Usage : $db->get_features($search,$options,$callback) | |
2484 Function: get list of features for a region | |
2485 Returns : count of number of features retrieved | |
2486 Args : see below | |
2487 Status : protected | |
2488 | |
2489 The first argument is a hash reference containing search criteria for | |
2490 retrieving features. It contains the following keys: | |
2491 | |
2492 rangetype One of "overlaps", "contains" or "contained_in". Indicates | |
2493 the type of range query requested. | |
2494 | |
2495 refseq ID of the landmark that establishes the absolute | |
2496 coordinate system. | |
2497 | |
2498 refclass Class of this landmark. Can be ignored by implementations | |
2499 that don't recognize such distinctions. | |
2500 | |
2501 start Start of the range, inclusive. | |
2502 | |
2503 stop Stop of the range, inclusive. | |
2504 | |
2505 types Array reference containing the list of annotation types | |
2506 to fetch from the database. Each annotation type is an | |
2507 array reference consisting of [source,method]. | |
2508 | |
2509 The second argument is a hash reference containing certain options | |
2510 that affect the way information is retrieved: | |
2511 | |
2512 sort_by_group | |
2513 A flag. If true, means that the returned features should be | |
2514 sorted by the group that they're in. | |
2515 | |
2516 sparse A flag. If true, means that the expected density of the | |
2517 features is such that it will be more efficient to search | |
2518 by type rather than by range. If it is taking a long | |
2519 time to fetch features, give this a try. | |
2520 | |
2521 binsize A true value will create a set of artificial features whose | |
2522 start and stop positions indicate bins of the given size, and | |
2523 whose scores are the number of features in the bin. The | |
2524 class of the feature will be set to "bin", and its name to | |
2525 "method:source". This is a handy way of generating histograms | |
2526 of feature density. | |
2527 | |
2528 The third argument, the $callback, is a code reference to which | |
2529 retrieved features are passed. It is described in more detail below. | |
2530 | |
2531 This routine is responsible for getting arrays of GFF data out of the | |
2532 database and passing them to the callback subroutine. The callback | |
2533 does the work of constructing a Bio::DB::GFF::Feature object out of | |
2534 that data. The callback expects a list of 13 fields: | |
2535 | |
2536 $refseq The reference sequence | |
2537 $start feature start | |
2538 $stop feature stop | |
2539 $source feature source | |
2540 $method feature method | |
2541 $score feature score | |
2542 $strand feature strand | |
2543 $phase feature phase | |
2544 $groupclass group class (may be undef) | |
2545 $groupname group ID (may be undef) | |
2546 $tstart target start for similarity hits (may be undef) | |
2547 $tstop target stop for similarity hits (may be undef) | |
2548 $feature_id A unique feature ID (may be undef) | |
2549 | |
2550 These fields are in the same order as the raw GFF file, with the | |
2551 exception that the group column has been parsed into group class and | |
2552 group name fields. | |
2553 | |
2554 The feature ID, if provided, is a unique identifier of the feature | |
2555 line. The module does not depend on this ID in any way, but it is | |
2556 available via Bio::DB::GFF-E<gt>id() if wanted. In the dbi::mysql and | |
2557 dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb | |
2558 adaptor it is not used. | |
2559 | |
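As a rough sketch of how an adaptor might drive the callback (the DBI statement
handle $sth and its query are purely hypothetical):

  # inside a hypothetical adaptor's get_features() implementation
  my $count = 0;
  while (my @row = $sth->fetchrow_array) {   # one row per GFF feature line
    $callback->(@row[0..12]);                # the 13 fields described above
    $count++;
  }
  return $count;
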
2560 =cut | |
2561 | |
2562 sub get_features{ | |
2563 my $self = shift; | |
2564 my ($search,$options,$callback) = @_; | |
2565 $self->throw("get_features() must be implemented by an adaptor"); | |
2566 } | |
2567 | |
2568 | |
2569 =head2 _feature_by_name | |
2570 | |
2571 Title : _feature_by_name | |
2572 Usage : $db->_feature_by_name($class,$name,$location,$callback) | |
2573 Function: get a list of features by name and class | |
2574 Returns : count of number of features retrieved | |
2575 Args : name of feature, class of feature, and a callback | |
2576 Status : abstract | |
2577 | |
2578 This method is used internally. The callback arguments are the same | |
2579 as those used by make_feature(). This method must be overridden by | |
2580 subclasses. | |
2581 | |
2582 =cut | |
2583 | |
2584 sub _feature_by_name { | |
2585 my $self = shift; | |
2586 my ($class,$name,$location,$callback) = @_; | |
2587 $self->throw("_feature_by_name() must be implemented by an adaptor"); | |
2588 } | |
2589 | |
2590 sub _feature_by_attribute { | |
2591 my $self = shift; | |
2592 my ($attributes,$callback) = @_; | |
2593 $self->throw("_feature_by_attribute() must be implemented by an adaptor"); | |
2594 } | |
2595 | |
2596 =head2 _feature_by_id | |
2597 | |
2598 Title : _feature_by_id | |
2599 Usage : $db->_feature_by_id($ids,$type,$callback) | |
2600 Function: get features based on their IDs or group IDs | |
2601 Returns : count of number of features retrieved | |
2602 Args : arrayref to feature IDs to fetch | |
2603 Status : abstract | |
2604 | |
2605 This method is used internally to fetch features either by their ID or | |
2606 their group ID. $ids is a arrayref containing a list of IDs, $type is | |
2607 one of "feature" or "group", and $callback is a callback. The | |
2608 callback arguments are the same as those used by make_feature(). This | |
2609 method must be overridden by subclasses. | |
2610 | |
2611 =cut | |
2612 | |
2613 sub _feature_by_id { | |
2614 my $self = shift; | |
2615 my ($ids,$type,$callback) = @_; | |
2616 $self->throw("_feature_by_id() must be implemented by an adaptor"); | |
2617 } | |
2618 | |
2619 =head2 overlapping_features | |
2620 | |
2621 Title : overlapping_features | |
2622 Usage : $db->overlapping_features(@args) | |
2623 Function: get features that overlap the indicated range | |
2624 Returns : a list of Bio::DB::GFF::Feature objects | |
2625 Args : see below | |
2626 Status : public | |
2627 | |
2628 This method is invoked by Bio::DB::GFF::Segment-E<gt>features() to find | |
2629 the list of features that overlap a given range. It is generally | |
2630 preferable to create the Segment first, and then fetch the features. | |
2631 | |
2632 This method takes set of named arguments: | |
2633 | |
2634 -refseq ID of the reference sequence | |
2635 -class Class of the reference sequence | |
2636 -start Start of the desired range in refseq coordinates | |
2637 -stop Stop of the desired range in refseq coordinates | |
2638 -types List of feature types to return. Argument is an array | |
2639 reference containing strings of the format "method:source" | |
2640 -parent A parent Bio::DB::GFF::Segment object, used to create | |
2641 relative coordinates in the generated features. | |
2642 -rare Turn on an optimization suitable for a relatively rare feature type, | |
2643 where it will be faster to filter by feature type first | |
2644 and then by position, rather than vice versa. | |
2645 -merge Whether to apply aggregators to the generated features. | |
2646 -iterator Whether to return an iterator across the features. | |
2647 | |
2648 If -iterator is true, then the method returns a single scalar value | |
2649 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly | |
2650 on this object to fetch each of the features in turn. If iterator is | |
2651 false or absent, then all the features are returned as a list. | |
2652 | |
2653 Currently aggregation is disabled when iterating over a series of | |
2654 features. | |
2655 | |
2656 Types are indicated using the nomenclature "method:source". Either of | |
2657 these fields can be omitted, in which case a wildcard is used for the | |
2658 missing field. Type names without the colon (e.g. "exon") are | |
2659 interpreted as the method name and a source wild card. Regular | |
2660 expressions are allowed in either field, as in: "similarity:BLAST.*". | |
2661 | |
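For example (the reference sequence name is invented):

  my @features = $db->overlapping_features(-refseq => 'CHROMOSOME_I',
                                           -start  => 100_000,
                                           -stop   => 120_000,
                                           -types  => ['exon:curated','similarity:BLAST.*']);
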
2662 =cut | |
2663 | |
2664 # call to return the features that overlap the named region | |
2665 # real work is done by get_features | |
2666 sub overlapping_features { | |
2667 my $self = shift; | |
2668 $self->features_in_range(-range_type=>'overlaps',@_); | |
2669 } | |
2670 | |
2671 =head2 contained_features | |
2672 | |
2673 Title : contained_features | |
2674 Usage : $db->contained_features(@args) | |
2675 Function: get features that are contained within the indicated range | |
2676 Returns : a list of Bio::DB::GFF::Feature objects | |
2677 Args : see overlapping_features() | |
2678 Status : public | |
2679 | |
2680 This call is similar to overlapping_features(), except that it only | |
2681 retrieves features whose end points are completely contained within | |
2682 the specified range. | |
2683 | |
2684 Generally you will want to fetch a Bio::DB::GFF::Segment object and | |
2685 call its contained_features() method rather than call this directly. | |
2686 | |
2687 =cut | |
2688 | |
2689 # The same, except that it only returns features that are completely contained within the | |
2690 # range (much faster usually) | |
2691 sub contained_features { | |
2692 my $self = shift; | |
2693 $self->features_in_range(-range_type=>'contains',@_); | |
2694 } | |
2695 | |
2696 =head2 contained_in | |
2697 | |
2698 Title : contained_in | |
2699 Usage : @features = $s->contained_in(@args) | |
2700 Function: get features that contain this segment | |
2701 Returns : a list of Bio::DB::GFF::Feature objects | |
2702 Args : see features() | |
2703 Status : Public | |
2704 | |
2705 This is identical in behavior to features() except that it returns | |
2706 only those features that completely contain the segment. | |
2707 | |
2708 =cut | |
2709 | |
2710 sub contained_in { | |
2711 my $self = shift; | |
2712 $self->features_in_range(-range_type=>'contained_in',@_); | |
2713 } | |
2714 | |
2715 =head2 get_abscoords | |
2716 | |
2717 Title : get_abscoords | |
2718 Usage : $db->get_abscoords($name,$class,$refseq) | |
2719 Function: get the absolute coordinates of sequence with name & class | |
2720 Returns : ($absref,$absstart,$absstop,$absstrand) | |
2721 Args : name and class of the landmark | |
2722 Status : protected | |
2723 | |
2724 Given the name and class of a genomic landmark, this function returns | |
2725 a four-element array consisting of: | |
2726 | |
2727 $absref the ID of the reference sequence that contains this landmark | |
2728 $absstart the position at which the landmark starts | |
2729 $absstop the position at which the landmark stops | |
2730 $absstrand the strand of the landmark, relative to the reference sequence | |
2731 | |
2732 If $refseq is provided, the function searches only within the | |
2733 specified reference sequence. | |
2734 | |
2735 =cut | |
2736 | |
2737 sub get_abscoords { | |
2738 my $self = shift; | |
2739 my ($name,$class,$refseq) = @_; | |
2740 $self->throw("get_abscoords() must be implemented by an adaptor"); | |
2741 } | |
2742 | |
2743 =head2 get_types | |
2744 | |
2745 Title : get_types | |
2746 Usage : $db->get_types($absref,$class,$start,$stop,$count) | |
2747 Function: get list of all feature types on the indicated segment | |
2748 Returns : list or hash of Bio::DB::GFF::Typename objects | |
2749 Args : see below | |
2750 Status : protected | |
2751 | |
2752 Arguments are: | |
2753 | |
2754 $absref the ID of the reference sequence | |
2755 $class the class of the reference sequence | |
2756 $start the position to start counting | |
2757 $stop the position to end counting | |
2758 $count a boolean indicating whether to count the number | |
2759 of occurrences of each feature type | |
2760 | |
2761 If $count is true, then a hash is returned. The keys of the hash are | |
2762 feature type names in the format "method:source" and the values are | |
2763 the number of times a feature of this type overlaps the indicated | |
2764 segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename | |
2765 objects. If $start or $stop are undef, then all features on the | |
2766 indicated segment are enumerated. If $absref is undef, then the call | |
2767 returns all feature types in the database. | |
2768 | |
2769 =cut | |
2770 | |
2771 sub get_types { | |
2772 my $self = shift; | |
2773 my ($refseq,$class,$start,$stop,$count,$types) = @_; | |
2774 $self->throw("get_types() must be implemented by an adaptor"); | |
2775 } | |
2776 | |
2777 =head2 make_feature | |
2778 | |
2779 Title : make_feature | |
2780 Usage : $db->make_feature(@args) | |
2781 Function: Create a Bio::DB::GFF::Feature object from string data | |
2782 Returns : a Bio::DB::GFF::Feature object | |
2783 Args : see below | |
2784 Status : internal | |
2785 | |
2786 This takes the following arguments (really!): | |
2787 | |
2788 $parent A Bio::DB::GFF::RelSegment object | |
2789 $group_hash A hashref containing unique list of GFF groups | |
2790 $refname The name of the reference sequence for this feature | |
2791 $refclass The class of the reference sequence for this feature | |
2792 $start Start of feature | |
2793 $stop Stop of feature | |
2794 $source Feature source field | |
2795 $method Feature method field | |
2796 $score Feature score field | |
2797 $strand Feature strand | |
2798 $phase Feature phase | |
2799 $group_class Class of feature group | |
2800 $group_name Name of feature group | |
2801 $tstart For homologies, start of hit on target | |
2802 $tstop Stop of hit on target | |
2803 | |
2804 The $parent argument, if present, is used to establish relative | |
2805 coordinates in the resulting Bio::DB::GFF::Feature object. This allows one | |
2806 feature to generate a list of other features that are relative to its | |
2807 coordinate system (for example, finding the coordinates of the second | |
2808 exon relative to the coordinates of the first). | |
2809 | |
2810 The $group_hash allows the group_class/group_name strings to be turned | |
2811 into rich database objects via the make_object() method (see above). | |
2812 Because these objects may be expensive to create, $group_hash is used | |
2813 to uniquefy them. The index of this hash is the composite key | |
2814 {$group_class,$group_name,$tstart,$tstop}. Values are whatever object | |
2815 is returned by the make_object() method. | |
2816 | |
2817 The remainder of the fields are taken from the GFF line, with the | |
2818 exception that "Target" features, which contain information about the | |
2819 target of a homology search, are parsed into their components. | |
2820 | |
2821 =cut | |
2822 | |
2823 # This call is responsible for turning a line of GFF into a | |
2824 # feature object. | |
2825 # The $parent argument is a Bio::DB::GFF::Segment object and is used | |
2826 # to establish the coordinate system for the new feature. | |
2827 # The $group_hash argument is an hash ref that holds previously- | |
2828 # generated group objects. | |
2829 # Other arguments are taken right out of the GFF table. | |
2830 sub make_feature { | |
2831 my $self = shift; | |
2832 my ($parent,$group_hash, # these arguments provided by generic mechanisms | |
2833 $srcseq, # the rest is provided by adaptor | |
2834 $start,$stop, | |
2835 $source,$method, | |
2836 $score,$strand,$phase, | |
2837 $group_class,$group_name, | |
2838 $tstart,$tstop, | |
2839 $db_id,$group_id) = @_; | |
2840 | |
2841 return unless $srcseq; # return undef if called with no arguments. This behavior is used for | |
2842 # on-the-fly aggregation. | |
2843 | |
2844 my $group; # undefined | |
2845 if (defined $group_class && defined $group_name) { | |
2846 $tstart ||= ''; | |
2847 $tstop ||= ''; | |
2848 if ($group_hash) { | |
2849 $group = $group_hash->{$group_class,$group_name,$tstart,$tstop} | |
2850 ||= $self->make_object($group_class,$group_name,$tstart,$tstop); | |
2851 } else { | |
2852 $group = $self->make_object($group_class,$group_name,$tstart,$tstop); | |
2853 } | |
2854 } | |
2855 | |
2856 # fix for some broken GFF files | |
2857 # unfortunately - has undesired side effects | |
2858 # if (defined $tstart && defined $tstop && !defined $strand) { | |
2859 # $strand = $tstart <= $tstop ? '+' : '-'; | |
2860 # } | |
2861 | |
2862 if (ref $parent) { # note that the src sequence is ignored | |
2863 return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop, | |
2864 $method,$source, | |
2865 $score,$strand,$phase, | |
2866 $group,$db_id,$group_id, | |
2867 $tstart,$tstop); | |
2868 } else { | |
2869 return Bio::DB::GFF::Feature->new($self,$srcseq, | |
2870 $start,$stop, | |
2871 $method,$source, | |
2872 $score,$strand,$phase, | |
2873 $group,$db_id,$group_id, | |
2874 $tstart,$tstop); | |
2875 } | |
2876 } | |
2877 | |
2878 sub make_aggregated_feature { | |
2879 my $self = shift; | |
2880 my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3); | |
2881 my $feature = $self->make_feature($parent,undef,@_); | |
2882 return [$feature] if $feature && !$feature->group; | |
2883 | |
2884 # if we have accumulated features and either: | |
2885 # (1) make_feature() returned undef, indicating the very end, or | |
2886 # (2) the current group is different from the previous one | |
2887 | |
2888 local $^W = 0; # irritating uninitialized value warning in next statement | |
2889 if (@$accumulated_features && | |
2890 (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) { | |
2891 foreach my $a (@$aggregators) { # last aggregator gets first shot | |
2892 $a->aggregate($accumulated_features,$self) or next; | |
2893 } | |
2894 my @result = @$accumulated_features; | |
2895 @$accumulated_features = $feature ? ($feature) : (); | |
2896 return unless @result; | |
2897 return \@result ; | |
2898 } | |
2899 push @$accumulated_features,$feature; | |
2900 return; | |
2901 } | |
2902 | |
2903 =head2 parse_types | |
2904 | |
2905 Title : parse_types | |
2906 Usage : $db->parse_types(@args) | |
2907 Function: parses list of types | |
2908 Returns : an array ref containing ['method','source'] pairs | |
2909 Args : a list of types in 'method:source' form | |
2910 Status : internal | |
2911 | |
2912 This method takes an array of type names in the format "method:source" | |
2913 and returns an array reference of ['method','source'] pairs. It will | |
2914 also accept a single argument consisting of an array reference with | |
2915 the list of type names. | |
2916 | |
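For example:

  my $pairs = $db->parse_types('exon:curated','intron');
  # $pairs is now [ ['exon','curated'], ['intron'] ]
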
2917 =cut | |
2918 | |
2919 # turn feature types in the format "method:source" into a list of [method,source] refs | |
2920 sub parse_types { | |
2921 my $self = shift; | |
2922 return [] if !@_ or !defined($_[0]); | |
2923 return $_[0] if ref $_[0] eq 'ARRAY' && ref $_[0][0]; | |
2924 my @types = ref($_[0]) ? @{$_[0]} : @_; | |
2925 my @type_list = map { [split(':',$_,2)] } @types; | |
2926 return \@type_list; | |
2927 } | |
2928 | |
2929 =head2 make_match_sub | |
2930 | |
2931 Title : make_match_sub | |
2932 Usage : $db->make_match_sub($types) | |
2933 Function: creates a subroutine used for filtering features | |
2934 Returns : a code reference | |
2935 Args : a list of parsed type names | |
2936 Status : protected | |
2937 | |
2938 This method is used internally to generate a code subroutine that will | |
2939 accept or reject a feature based on its method and source. It takes | |
2940 an array of parsed type names in the format returned by parse_types(), | |
2941 and generates an anonymous subroutine. The subroutine takes a single | |
2942 Bio::DB::GFF::Feature object and returns true if the feature matches | |
2943 one of the desired feature types, and false otherwise. | |
2944 | |
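For example (a sketch; @features is assumed to hold Bio::DB::GFF::Feature objects):

  my $match   = $db->make_match_sub($db->parse_types('exon:curated'));
  my @curated = grep { $match->($_) } @features;
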
2945 =cut | |
2946 | |
2947 # a subroutine that matches features indicated by list of types | |
2948 sub make_match_sub { | |
2949 my $self = shift; | |
2950 my $types = shift; | |
2951 | |
2952 return sub { 1 } unless ref $types && @$types; | |
2953 | |
2954 my @expr; | |
2955 for my $type (@$types) { | |
2956 my ($method,$source) = @$type; | |
2957 $method ||= '.*'; | |
2958 $source = $source ? ":$source" : "(?::.+)?"; | |
2959 push @expr,"${method}${source}"; | |
2960 } | |
2961 my $expr = join '|',@expr; | |
2962 return $self->{match_subs}{$expr} if $self->{match_subs}{$expr}; | |
2963 | |
2964 my $sub =<<END; | |
2965 sub { | |
2966 my \$feature = shift or return; | |
2967 return \$feature->type =~ /^($expr)\$/i; | |
2968 } | |
2969 END | |
2970 warn "match sub: $sub\n" if $self->debug; | |
2971 my $compiled_sub = eval $sub; | |
2972 $self->throw($@) if $@; | |
2973 return $self->{match_subs}{$expr} = $compiled_sub; | |
2974 } | |
2975 | |
2976 =head2 make_object | |
2977 | |
2978 Title : make_object | |
2979 Usage : $db->make_object($class,$name,$start,$stop) | |
2980 Function: creates a feature object | |
2981 Returns : a feature object | |
2982 Args : see below | |
2983 Status : protected | |
2984 | |
2985 This method is called to make an object from the GFF "group" field. | |
2986 By default, all Target groups are turned into Bio::DB::GFF::Homol | |
2987 objects, and everything else becomes a Bio::DB::GFF::Featname. | |
2988 However, adaptors are free to override this method to generate more | |
2989 interesting objects, such as true BioPerl objects, or Acedb objects. | |
2990 | |
2991 Arguments are: | |
2992 | |
2993 $name database ID for object | |
2994 $class class of object | |
2995 $start for similarities, start of match inside object | |
2996 $stop for similarities, stop of match inside object | |
2997 | |
2998 =cut | |
2999 | |
3000 # abstract call to turn a feature into an object, given its class and name | |
3001 sub make_object { | |
3002 my $self = shift; | |
3003 my ($class,$name,$start,$stop) = @_; | |
3004 return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop) | |
3005 if defined $start and length $start; | |
3006 return Bio::DB::GFF::Featname->new($class,$name); | |
3007 } | |
3008 | |
3009 | |
3010 =head2 do_attributes | |
3011 | |
3012 Title : do_attributes | |
3013 Usage : $db->do_attributes($id [,$tag]); | |
3014 Function: internal method to retrieve attributes given an id and tag | |
3015 Returns : a list of attribute values, or a flattened list of (tag=>value) pairs | |
3016 Args : a feature id and an attribute tag (optional) | |
3017 Status : protected | |
3018 | |
3019 This method is overridden by subclasses in order to return a list of | |
3020 attributes. If called with a tag, returns the value of attributes of | |
3021 that tag type. If called without a tag, returns a flattened array of | |
3022 (tag=E<gt>value) pairs. A particular tag can be present multiple times. | |
3023 | |
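For example, an adaptor that implements attribute storage might be used
like this (the feature ID and tag are illustrative):

  my @notes = $db->do_attributes($feature_id,'Note');  # values of the Note tag
  my @pairs = $db->do_attributes($feature_id);         # flattened (tag=>value) list
  while (my ($tag,$value) = splice(@pairs,0,2)) {
    print "$tag: $value\n";
  }
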
3024 =cut | |
3025 | |
3026 sub do_attributes { | |
3027 my $self = shift; | |
3028 my ($id,$tag) = @_; | |
3029 return (); | |
3030 } | |
3031 | |
3032 | |
3033 | |
3034 =head1 Internal Methods | |
3035 | |
3036 The following methods are internal to Bio::DB::GFF and are not | |
3037 guaranteed to remain the same. | |
3038 | |
3039 =head2 _features | |
3040 | |
3041 Title : _features | |
3042 Usage : $db->_features($search,$options,$parent) | |
3043 Function: internal method | |
3044 Returns : a list of Bio::DB::GFF::Feature objects | |
3045 Args : see below | |
3046 Status : internal | |
3047 | |
3048 This is an internal method that is called by overlapping_features(), | |
3049 contained_features() and features() to create features based on a | |
3050 parent segment's coordinate system. It takes three arguments: a | |
3051 search hashref, an options hashref, and a parent segment. | |
3052 | |
3053 The search hashref contains the following keys: | |
3054 | |
3055 rangetype One of "overlaps", "contains" or "contained_in". Indicates | |
3056 the type of range query requested. | |
3057 refseq reference sequence ID | |
3058 refclass reference sequence class | |
3059 start start of range | |
3060 stop stop of range | |
3061 types arrayref containing list of types in "method:source" form | |
3062 | |
3063 The options hashref contains zero or more of the following keys: | |
3064 | |
3065 sparse turn on optimizations for a rare feature | |
3066 automerge if true, invoke aggregators to merge features | |
3067 iterator if true, return an iterator | |
3068 | |
3069 The $parent argument is a Bio::DB::GFF::RelSegment object | |
3070 or a descendant of that class. | |
3071 | |
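A sketch of the sort of call that features() makes internally (all values
are illustrative):

  my @features = $db->_features( { rangetype => 'overlaps',
                                   refseq    => 'CHROMOSOME_I',
                                   refclass  => 'Sequence',
                                   start     => 1,
                                   stop      => 50_000,
                                   types     => ['exon:curated'] },
                                 { automerge => 1 },
                                 $segment );
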
3072 =cut | |
3073 | |
3074 #' | |
3075 | |
3076 sub _features { | |
3077 my $self = shift; | |
3078 my ($search,$options,$parent) = @_; | |
3079 (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)}) | |
3080 if defined($search->{start}) && $search->{start} > $search->{stop}; | |
3081 | |
3082 my $types = $self->parse_types($search->{types}); # parse out list of types | |
3083 my @aggregated_types = @$types; # keep a copy | |
3084 | |
3085 # allow the aggregators to operate on the original | |
3086 my @aggregators; | |
3087 if ($options->{automerge}) { | |
3088 for my $a ($self->aggregators) { | |
3089 $a = $a->clone if $options->{iterator}; | |
3090 unshift @aggregators,$a | |
3091 if $a->disaggregate(\@aggregated_types,$self); | |
3092 } | |
3093 } | |
3094 | |
3095 if ($options->{iterator}) { | |
3096 my @accumulated_features; | |
3097 my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) } | |
3098 : sub { [$self->make_feature($parent,undef,@_)] }; | |
3099 return $self->get_features_iterator({ %$search, | |
3100 types => \@aggregated_types }, | |
3101 { %$options, | |
3102 sort_by_group => $options->{automerge} }, | |
3103 $callback | |
3104 ); | |
3105 } | |
3106 | |
3107 my %groups; # cache the groups we create to avoid consuming unnecessary memory | |
3108 my $features = []; | |
3109 | |
3110 my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) }; | |
3111 $self->get_features({ %$search, | |
3112 types => \@aggregated_types }, | |
3113 $options, | |
3114 $callback); | |
3115 | |
3116 if ($options->{automerge}) { | |
3117 warn "aggregating...\n" if $self->debug; | |
3118 foreach my $a (@aggregators) { # last aggregator gets first shot | |
3119 warn "Aggregator $a:\n" if $self->debug; | |
3120 $a->aggregate($features,$self); | |
3121 } | |
3122 } | |
3123 | |
3124 @$features; | |
3125 } | |
3126 | |
3127 =head2 get_features_iterator | |
3128 | |
3129 Title : get_features_iterator | |
3130 Usage : $db->get_features_iterator($search,$options,$callback) | |
3131 Function: get an iterator on a features query | |
3132 Returns : an iterator object with a Bio::SeqIO-like next_seq() method | |
3133 Args : as per get_features() | |
3134 Status : Public | |
3135 | |
3136 This method takes the same arguments as get_features(), but returns an | |
3137 iterator that can be used to fetch features sequentially, as per | |
3138 Bio::SeqIO. | |
3139 | |
3140 Internally, this method is simply a front end to range_query(). | |
3141 The latter method constructs and executes the query, returning a | |
3142 statement handle. This routine passes the statement handle to the | |
3143 constructor for the iterator, along with the callback. | |
3144 | |
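Typical use from calling code looks like this (a sketch only; the base
class version simply throws, so a real adaptor must implement the method):

  my $iterator = $db->get_features_iterator($search,$options,$callback);
  while (my $feature = $iterator->next_seq) {
    # do something with $feature
  }
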
3145 =cut | |
3146 | |
3147 sub get_features_iterator { | |
3148 my $self = shift; | |
3149 my ($search,$options,$callback) = @_; | |
3150 $self->throw('feature iteration is not implemented in this adaptor'); | |
3151 } | |
3152 | |
3153 =head2 split_group | |
3154 | |
3155 Title : split_group | |
3156 Usage : $db->split_group($group_field,$gff3_flag) | |
3157 Function: parse GFF group field | |
3158 Returns : ($gclass,$gname,$tstart,$tstop,$attributes) | |
3159 Args : the GFF group column and a flag indicating GFF3 compatibility | |
3160 Status : internal | |
3161 | |
3162 This method is called by load_gff_line() to parse out the | |
3163 contents of one or more group fields. It returns the class of the | |
3164 group, its name, the start and stop of the target (if any), and an | |
3165 array reference containing any additional attributes found in the | |
3166 group field, in [attribute_name,attribute_value] format. | |
3167 | |
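For example, a GFF2-style group column might be parsed like this (the
group string is illustrative):

  my ($class,$name,$tstart,$tstop,$attributes) =
      $db->split_group('Transcript B0511.1 ; Note "confirmed"', 0);
  # $class = 'Transcript', $name = 'B0511.1'
  # $attributes = [ ['Note' => 'confirmed'] ]
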
3168 =cut | |
3169 | |
3170 sub split_group { | |
3171 my $self = shift; | |
3172 my ($group,$gff3) = @_; | |
3173 if ($gff3) { | |
3174 my @groups = split /[;&]/,$group; # so easy! | |
3175 return $self->_split_gff3_group(@groups); | |
3176 } else { | |
3177 # handle group parsing | |
3178 # protect embedded semicolons in the group; there must be a faster/more elegant way | |
3179 # to do this. | |
3180 $group =~ s/\\;/$;/g; | |
3181 while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 } | |
3182 my @groups = split(/\s*;\s*/,$group); | |
3183 foreach (@groups) { s/$;/;/g } | |
3184 return $self->_split_gff2_group(@groups); | |
3185 } | |
3186 } | |
3187 | |
3188 =head2 _split_gff2_group | |
3189 | |
3190 This is an internal method called by split_group(). | |
3191 | |
3192 =cut | |
3193 | |
3194 sub _split_gff2_group { | |
3195 my $self = shift; | |
3196 my @groups = @_; | |
3197 | |
3198 my ($gclass,$gname,$tstart,$tstop,@attributes); | |
3199 | |
3200 for (@groups) { | |
3201 | |
3202 my ($tag,$value) = /^(\S+)(?:\s+(.+))?/; | |
3203 $value ||= ''; | |
3204 if ($value =~ /^\"(.+)\"$/) { #remove quotes | |
3205 $value = $1; | |
3206 } | |
3207 $value =~ s/\\t/\t/g; | |
3208 $value =~ s/\\r/\r/g; | |
3209 | |
3210 # Any additional groups become part of the attributes hash | |
3211 # For historical reasons, the tag "Note" is treated as an | |
3212 # attribute, even if it is the only group. | |
3213 $tag ||= ''; | |
3214 if ($tag eq 'Note' or ($gclass && $gname)) { | |
3215 push @attributes,[$tag => $value]; | |
3216 } | |
3217 | |
3218 # if the tag eq 'Target' then the class name is embedded in the ID | |
3219 # (the GFF format is obviously screwed up here) | |
3220 elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) { | |
3221 ($gclass,$gname) = ($1,$2); | |
3222 ($tstart,$tstop) = / (\d+) (\d+)/; | |
3223 } | |
3224 | |
3225 elsif (!$value) { | |
3226 push @attributes,[Note => $tag]; # e.g. "Confirmed_by_EST" | |
3227 } | |
3228 | |
3229 # otherwise, the tag and value correspond to the | |
3230 # group class and name | |
3231 else { | |
3232 ($gclass,$gname) = ($tag,$value); | |
3233 } | |
3234 } | |
3235 | |
3236 return ($gclass,$gname,$tstart,$tstop,\@attributes); | |
3237 } | |
3238 | |
3239 =head2 _split_gff3_group | |
3240 | |
3241 This is called internally from split_group(). | |
3242 | |
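For example (an illustrative GFF3 attribute string, already split on
semicolons by split_group()):

  my @groups = ('ID=mRNA1','Note=confirmed');
  my ($class,$name,undef,undef,$attributes) = $db->_split_gff3_group(@groups);
  # $class = 'Sequence', $name = 'mRNA1'
  # $attributes = [ ['Note' => 'confirmed'] ]
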
3243 =cut | |
3244 | |
3245 sub _split_gff3_group { | |
3246 my $self = shift; | |
3247 my @groups = @_; | |
3248 my ($gclass,$gname,$tstart,$tstop,@attributes); | |
3249 | |
3250 for my $group (@groups) { | |
3251 my ($tag,$value) = split /=/,$group; | |
3252 $tag = unescape($tag); | |
3253 my @values = map {unescape($_)} split /,/,$value; | |
3254 if ($tag eq 'Parent') { | |
3255 $gclass = 'Sequence'; | |
3256 $gname = shift @values; | |
3257 } | |
3258 elsif ($tag eq 'ID') { | |
3259 $gclass = 'Sequence'; | |
3260 $gname = shift @values; | |
3261 } | |
3262 elsif ($tag eq 'Target') { | |
3263 $gclass = 'Sequence'; | |
3264 ($gname,$tstart,$tstop) = split /\s+/,shift @values; | |
3265 } | |
3266 push @attributes,[$tag=>$_] foreach @values; | |
3267 } | |
3268 return ($gclass,$gname,$tstart,$tstop,\@attributes); | |
3269 } | |
3270 | |
3271 =head2 _delete_features(), _delete_groups(), _delete() | |
3272 | |
3273 Title : _delete_features(), _delete_groups(), _delete() | |
3274 Usage : $count = $db->_delete_features(@feature_ids) | |
3275 $count = $db->_delete_groups(@group_ids) | |
3276 $count = $db->_delete(\%delete_spec) | |
3277 Function: low-level feature/group deleter | |
3278 Returns : count of features or groups removed | |
3279 Args : a list of feature or group IDs to remove (or a hashref for _delete) | |
3280 Status : for implementation by subclasses | |
3281 | |
3282 These methods need to be implemented in adaptors. For | |
3283 _delete_features and _delete_groups, the arguments are a list of | |
3284 feature or group IDs to remove. For _delete(), the argument is a | |
3285 hashref with the three keys 'segments', 'types' and 'force'. The | |
3286 first contains an arrayref of Bio::DB::GFF::RelSegment objects to | |
3287 delete (all FEATURES within the segment are deleted). The second | |
3288 contains an arrayref of [method,source] feature types to delete. The | |
3289 two are ANDed together. If 'force' has a true value, this forces the | |
3290 operation to continue even if it would delete all features. | |
3291 | |
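A sketch of how an implementing adaptor might be called (the segment and
type shown are illustrative):

  my $count = $db->_delete( { segments => [ $segment ],
                              types    => [ ['exon','curated'] ],
                              force    => 0 } );
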
3292 =cut | |
3293 | |
3294 sub _delete_features { | |
3295 my $self = shift; | |
3296 my @feature_ids = @_; | |
3297 $self->throw('_delete_features is not implemented in this adaptor'); | |
3298 } | |
3299 | |
3300 sub _delete_groups { | |
3301 my $self = shift; | |
3302 my @group_ids = @_; | |
3303 $self->throw('_delete_groups is not implemented in this adaptor'); | |
3304 } | |
3305 | |
3306 sub _delete { | |
3307 my $self = shift; | |
3308 my $delete_options = shift; | |
3309 $self->throw('_delete is not implemented in this adaptor'); | |
3310 } | |
3311 | |
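# unescape(): decode URL-style escapes used in GFF3 attribute values;
# '+' becomes a space and %XX becomes the corresponding character.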
3312 sub unescape { | |
3313 my $v = shift; | |
3314 $v =~ tr/+/ /; | |
3315 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; | |
3316 return $v; | |
3317 } | |
3318 | |
3319 | |
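# Bio::DB::GFF::ID_Iterator -- a small Bio::SeqIO-style iterator that walks a
# list of landmark names, feature IDs, or group IDs and fetches the
# corresponding object from the database on each call to next_seq().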
3320 package Bio::DB::GFF::ID_Iterator; | |
3321 use strict; | |
3322 | |
3323 use Bio::Root::Root; | |
3324 use vars '@ISA'; | |
3325 @ISA = 'Bio::Root::Root'; | |
3326 | |
3327 sub new { | |
3328 my $class = shift; | |
3329 my ($db,$ids,$type) = @_; | |
3330 return bless {ids=>$ids,db=>$db,type=>$type},$class; | |
3331 } | |
3332 | |
3333 sub next_seq { | |
3334 my $self = shift; | |
3335 my $next = shift @{$self->{ids}}; | |
3336 return unless $next; | |
3337 my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next; | |
3338 my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name) | |
3339 : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name) | |
3340 : $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name) | |
3341 : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier"); | |
3342 $self->throw("id does not exist") unless $segment; | |
3343 return $segment; | |
3344 } | |
3345 | |
3346 1; | |
3347 | |
3348 __END__ | |
3349 | |
3350 =head1 BUGS | |
3351 | |
3352 Features can only belong to a single group at a time. This must be | |
3353 addressed soon. | |
3354 | |
3355 Start coordinate can be greater than stop coordinate for relative | |
3356 addressing. This breaks strict BioPerl compatibility and must be | |
3357 fixed. | |
3358 | |
3359 =head1 SEE ALSO | |
3360 | |
3361 L<bioperl>, | |
3362 L<Bio::DB::GFF::RelSegment>, | |
3363 L<Bio::DB::GFF::Aggregator>, | |
3364 L<Bio::DB::GFF::Feature>, | |
3365 L<Bio::DB::GFF::Adaptor::dbi::mysqlopt>, | |
3366 L<Bio::DB::GFF::Adaptor::dbi::oracle>, | |
3367 L<Bio::DB::GFF::Adaptor::memory> | |
3368 | |
3369 =head1 AUTHOR | |
3370 | |
3371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>. | |
3372 | |
3373 Copyright (c) 2001 Cold Spring Harbor Laboratory. | |
3374 | |
3375 This library is free software; you can redistribute it and/or modify | |
3376 it under the same terms as Perl itself. | |
3377 | |
3378 =cut | |
3379 |