0
|
1 # $Id: GFF.pm,v 1.71.2.2 2003/09/12 13:29:32 lstein Exp $
|
|
2
|
|
3 =head1 NAME
|
|
4
|
|
5 Bio::DB::GFF -- Storage and retrieval of sequence annotation data
|
|
6
|
|
7 =head1 SYNOPSIS
|
|
8
|
|
9 use Bio::DB::GFF;
|
|
10
|
|
11 # Open the sequence database
|
|
12 my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt',
|
|
13 -dsn => 'dbi:mysql:elegans',
|
|
14 -fasta => '/usr/local/fasta_files'
|
|
15 );
|
|
16
|
|
17 # fetch a 1 megabase segment of sequence starting at landmark "ZK909"
|
|
18 my $segment = $db->segment('ZK909', 1 => 1000000);
|
|
19
|
|
20 # pull out all transcript features
|
|
21 my @transcripts = $segment->features('transcript');
|
|
22
|
|
23 # for each transcript, total the length of the introns
|
|
24 my %totals;
|
|
25 for my $t (@transcripts) {
|
|
26 my @introns = $t->Intron;
|
|
27 $totals{$t->name} += $_->length foreach @introns;
|
|
28 }
|
|
29
|
|
30 # Sort the exons of the first transcript by position
|
|
31 my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon;
|
|
32
|
|
33 # Get a region 1000 bp upstream of first exon
|
|
34 my $upstream = $exons[0]->segment(-1000,0);
|
|
35
|
|
36 # get its DNA
|
|
37 my $dna = $upstream->dna;
|
|
38
|
|
39 # and get all curated polymorphisms inside it
|
|
40 @polymorphisms = $upstream->contained_features('polymorphism:curated');
|
|
41
|
|
42 # get all feature types in the database
|
|
43 my @types = $db->types;
|
|
44
|
|
45 # count all feature types in the segment
|
|
46 my %type_counts = $segment->types(-enumerate=>1);
|
|
47
|
|
48 # get an iterator on all curated features of type 'exon' or 'intron'
|
|
49 my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']);
|
|
50
|
|
51 while (my $s = $iterator->next_seq) {
|
|
52 print $s,"\n";
|
|
53 }
|
|
54
|
|
55 # find all transcripts annotated as having function 'kinase'
|
|
56 my $iterator = $db->get_seq_stream(-type=>'transcript',
|
|
57 -attributes=>{Function=>'kinase'});
|
|
58 while (my $s = $iterator->next_seq) {
|
|
59 print $s,"\n";
|
|
60 }
|
|
61
|
|
62 =head1 DESCRIPTION
|
|
63
|
|
64 Bio::DB::GFF provides fast indexed access to a sequence annotation
|
|
65 database. It supports multiple database types (ACeDB, relational),
|
|
66 and multiple schemas through a system of adaptors and aggregators.
|
|
67
|
|
68 The following operations are supported by this module:
|
|
69
|
|
70 - retrieving a segment of sequence based on the ID of a landmark
|
|
71 - retrieving the DNA from that segment
|
|
72 - finding all annotations that overlap with the segment
|
|
73 - finding all annotations that are completely contained within the
|
|
74 segment
|
|
75 - retrieving all annotations of a particular type, either within a
|
|
76 segment, or globally
|
|
77 - conversion from absolute to relative coordinates and back again,
|
|
78 using any arbitrary landmark for the relative coordinates
|
|
79 - using a sequence segment to create new segments based on relative
|
|
80 offsets
|
|
81
|
|
82 The data model used by Bio::DB::GFF is compatible with the GFF flat
|
|
83 file format (http://www.sanger.ac.uk/software/GFF). The module can
|
|
84 load a set of GFF files into the database, and serves objects that
|
|
85 have methods corresponding to GFF fields.
|
|
86
|
|
87 The objects returned by Bio::DB::GFF are compatible with the
|
|
88 SeqFeatureI interface, allowing their use by the Bio::Graphics and
|
|
89 Bio::DAS modules.
|
|
90
|
|
91 =head2 Auxiliary Scripts
|
|
92
|
|
93 The bioperl distribution includes several scripts that make it easier
|
|
94 to work with Bio::DB::GFF databases. They are located in the scripts
|
|
95 directory under a subdirectory named Bio::DB::GFF:
|
|
96
|
|
97 =over 4
|
|
98
|
|
99 =item bp_load_gff.pl
|
|
100
|
|
101 This script will load a Bio::DB::GFF database from a flat GFF file of
|
|
102 sequence annotations. Only the relational database version of
|
|
103 Bio::DB::GFF is supported. It can be used to create the database from
|
|
104 scratch, as well as to incrementally load new data.
|
|
105
|
|
106 This script takes a --fasta argument to load raw DNA into the database
|
|
107 as well. However, GFF databases do not require access to the raw DNA
|
|
108 for most of their functionality.
|
|
109
|
|
110 load_gff.pl also has a --upgrade option, which will perform a
|
|
111 non-destructive upgrade of older schemas to newer ones.
|
|
112
|
|
113 =item bp_bulk_load_gff.pl
|
|
114
|
|
115 This script will populate a Bio::DB::GFF database from a flat GFF file
|
|
116 of sequence annotations. Only the MySQL database version of
|
|
117 Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in
|
|
118 order to accelerate loading considerably; however, it can only be used
|
|
119 for the initial load, and not for updates.
|
|
120
|
|
121 This script takes a --fasta argument to load raw DNA into the database
|
|
122 as well. However, GFF databases do not require access to the raw DNA
|
|
123 for most of their functionality.
|
|
124
|
|
125 =item bp_fast_load_gff.pl
|
|
126
|
|
127 This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe
|
|
128 tricks to allow for incremental updates. It only supports the MySQL
|
|
129 database version of Bio::DB::GFF and is guaranteed not to work on
|
|
130 non-Unix platforms.
|
|
131
|
|
132 Arguments are the same as bp_load_gff.pl
|
|
133
|
|
134 =item gadfly_to_gff.pl
|
|
135
|
|
136 This script will convert the GFF-like format used by the Berkeley
|
|
137 Drosophila Sequencing project into a format suitable for use with this
|
|
138 module.
|
|
139
|
|
140 =item sgd_to_gff.pl
|
|
141
|
|
142 This script will convert the tab-delimited feature files used by the
|
|
143 Saccharomyces Genome Database into a format suitable for use with this
|
|
144 module.
|
|
145
|
|
146 =back
|
|
147
|
|
148 =head2 GFF Fundamentals
|
|
149
|
|
150 The GFF format is a flat tab-delimited file, each line of which
|
|
151 corresponds to an annotation, or feature. Each line has nine columns
|
|
152 and looks like this:
|
|
153
|
|
154 Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7"
|
|
155
|
|
156 The 9 columns are as follows:
|
|
157
|
|
158 =over 4
|
|
159
|
|
160 =item 1. reference sequence
|
|
161
|
|
162 This is the ID of the sequence that is used to establish the
|
|
163 coordinate system of the annotation. In the example above, the
|
|
164 reference sequence is "Chr1".
|
|
165
|
|
166 =item 2. source
|
|
167
|
|
168 The source of the annotation. This field describes how the annotation
|
|
169 was derived. In the example above, the source is "curated" to
|
|
170 indicate that the feature is the result of human curation. The names
|
|
171 and versions of software programs are often used for the source field,
|
|
172 as in "tRNAScan-SE/1.2".
|
|
173
|
|
174 =item 3. method
|
|
175
|
|
176 The annotation method. This field describes the type of the
|
|
177 annotation, such as "CDS". Together the method and source describe
|
|
178 the annotation type.
|
|
179
|
|
180 =item 4. start position
|
|
181
|
|
182 The start of the annotation relative to the reference sequence.
|
|
183
|
|
184 =item 5. stop position
|
|
185
|
|
186 The stop of the annotation relative to the reference sequence. Start
|
|
187 is always less than or equal to stop.
|
|
188
|
|
189 =item 6. score
|
|
190
|
|
191 For annotations that are associated with a numeric score (for example,
|
|
192 a sequence similarity), this field describes the score. The score
|
|
193 units are completely unspecified, but for sequence similarities, it is
|
|
194 typically percent identity. Annotations that don't have a score can
|
|
195 use "."
|
|
196
|
|
197 =item 7. strand
|
|
198
|
|
199 For those annotations which are strand-specific, this field is the
|
|
200 strand on which the annotation resides. It is "+" for the forward
|
|
201 strand, "-" for the reverse strand, or "." for annotations that are
|
|
202 not stranded.
|
|
203
|
|
204 =item 8. phase
|
|
205
|
|
206 For annotations that are linked to proteins, this field describes the
|
|
207 phase of the annotation on the codons. It is a number from 0 to 2, or
|
|
208 "." for features that have no phase\.
|
|
209
|
|
210 =item 9. group
|
|
211
|
|
212 GFF provides a simple way of generating annotation hierarchies ("is
|
|
213 composed of" relationships) by providing a group field. The group
|
|
214 field contains the class and ID of an annotation which is the logical
|
|
215 parent of the current one. In the example given above, the group is
|
|
216 the Transcript named "R119.7".
|
|
217
|
|
218 The group field is also used to store information about the target of
|
|
219 sequence similarity hits, and miscellaneous notes. See the next
|
|
220 section for a description of how to describe similarity targets.
|
|
221
|
|
222 The format of the group fields is "Class ID" with a single space (not
|
|
223 a tab) separating the class from the ID. It is VERY IMPORTANT to
|
|
224 follow this format, or grouping will not work properly.
|
|
225
|
|
226 =back
|
|
227
|
|
228 The sequences used to establish the coordinate system for annotations
|
|
229 can correspond to sequenced clones, clone fragments, contigs or
|
|
230 super-contigs. Thus, this module can be used throughout the lifecycle
|
|
231 of a sequencing project.
|
|
232
|
|
233 In addition to a group ID, the GFF format allows annotations to have a
|
|
234 group class. For example, in the ACeDB representation, RNA
|
|
235 interference experiments have a class of "RNAi" and an ID that is
|
|
236 unique among the RNAi experiments. Since not all databases support
|
|
237 this notion, the class is optional in all calls to this module, and
|
|
238 defaults to "Sequence" when not provided.
|
|
239
|
|
240 Double-quotes are sometimes used in GFF files around components of the
|
|
241 group field. Strictly, this is only necessary if the group name or
|
|
242 class contains whitespace.
|
|
243
|
|
244 =head2 Making GFF files work with this module
|
|
245
|
|
246 Some annotations do not need to be individually named. For example,
|
|
247 it is probably not useful to assign a unique name to each ALU repeat
|
|
248 in a vertebrate genome. Others, such as predicted genes, correspond
|
|
249 to named biological objects; you probably want to be able to fetch the
|
|
250 positions of these objects by referring to them by name.
|
|
251
|
|
252 To accomodate named annotations, the GFF format places the object
|
|
253 class and name in the group field. The name identifies the object,
|
|
254 and the class prevents similarly-named objects, for example clones and
|
|
255 sequences, from collding.
|
|
256
|
|
257 A named object is shown in the following excerpt from a GFF file:
|
|
258
|
|
259 Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2
|
|
260
|
|
261 This object is a predicted transcript named Y95BA.2. In this case,
|
|
262 the group field is used to identify the class and name of the object,
|
|
263 even though no other annotation belongs to that group.
|
|
264
|
|
265 It now becomes possible to retrieve the region of the genome covered
|
|
266 by transcript Y95B8A.2 using the segment() method:
|
|
267
|
|
268 $segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2');
|
|
269
|
|
270 It is not necessary for the annotation's method to correspond to the
|
|
271 object class, although this is commonly the case.
|
|
272
|
|
273 As explained above, each annotation in a GFF file refers to a
|
|
274 reference sequence. It is important that each reference sequence also
|
|
275 be identified by a line in the GFF file. This allows the Bio::DB::GFF
|
|
276 module to determine the length and class of the reference sequence,
|
|
277 and makes it possible to do relative arithmetic.
|
|
278
|
|
279 For example, if "Chr1" is used as a reference sequence, then it should
|
|
280 have an entry in the GFF file similar to this one:
|
|
281
|
|
282 Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1
|
|
283
|
|
284 This indicates that the reference sequence named "Chr1" has length
|
|
285 14972282 bp, method "chromosome" and source "assembly". In addition,
|
|
286 as indicated by the group field, Chr1 has class "Sequence" and name
|
|
287 "Chr1".
|
|
288
|
|
289 The object class "Sequence" is used by default when the class is not
|
|
290 specified in the segment() call. This allows you to use a shortcut
|
|
291 form of the segment() method:
|
|
292
|
|
293 $segment = $db->segment('Chr1'); # whole chromosome
|
|
294 $segment = $db->segment('Chr1',1=>1000); # first 1000 bp
|
|
295
|
|
296 For your convenience, if, during loading a GFF file, Bio::DB::GFF
|
|
297 encounters a line like the following:
|
|
298
|
|
299 ##sequence-region Chr1 1 14972282
|
|
300
|
|
301 It will automatically generate the following entry:
|
|
302
|
|
303 Chr1 reference Component 1 14972282 . + . Sequence Chr1
|
|
304
|
|
305 This is sufficient to use Chr1 as a reference point.
|
|
306 The ##sequence-region line is frequently found in the GFF files
|
|
307 distributed by annotation groups.
|
|
308
|
|
309 =head2 Sequence alignments
|
|
310
|
|
311 There are two cases in which an annotation indicates the relationship
|
|
312 between two sequences. The first case is a similarity hit, where the
|
|
313 annotation indicates an alignment. The second case is a map assembly,
|
|
314 in which the annotation indicates that a portion of a larger sequence
|
|
315 is built up from one or more smaller ones.
|
|
316
|
|
317 Both cases are indicated by using the B<Target> tag in the group
|
|
318 field. For example, a typical similarity hit will look like this:
|
|
319
|
|
320 Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544
|
|
321
|
|
322 The group field contains the Target tag, followed by an identifier for
|
|
323 the biological object referred to. The GFF format uses the notation
|
|
324 I<Class>:I<Name> for the biological object, and even though this is
|
|
325 stylistically inconsistent, that's the way it's done. The object
|
|
326 identifier is followed by two integers indicating the start and stop
|
|
327 of the alignment on the target sequence.
|
|
328
|
|
329 Unlike the main start and stop columns, it is possible for the target
|
|
330 start to be greater than the target end. The previous example
|
|
331 indicates that the the section of Chr1 from 76,953 to 77,108 aligns to
|
|
332 the protein SW:ABL_DROME starting at position 493 and extending to
|
|
333 position 544.
|
|
334
|
|
335 A similar notation is used for sequence assembly information as shown
|
|
336 in this example:
|
|
337
|
|
338 Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826
|
|
339 LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742
|
|
340
|
|
341 This indicates that the region between bases 10922906 and 11177731 of
|
|
342 Chr1 are composed of LINK_H06O01 from bp 1 to bp 254826. The region
|
|
343 of LINK_H0601 between 32386 and 64122 is, in turn, composed of the
|
|
344 bases 5 to 31742 of cosmid F49B2.
|
|
345
|
|
346 =head2 Attributes
|
|
347
|
|
348 While not intended to serve as a general-purpose sequence database
|
|
349 (see bioperl-db for that), GFF allows you to tag features with
|
|
350 arbitrary attributes. Attributes appear in the Group field following
|
|
351 the initial class/name pair. For example:
|
|
352
|
|
353 Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3
|
|
354
|
|
355 This line tags the feature named Transcript Y95B8A.2 as being "Gene"
|
|
356 named sma-3 and having the Alias "sma3". Features having these
|
|
357 attributes can be looked up using the fetch_feature_by_attribute() method.
|
|
358
|
|
359 Two attributes have special meaning: "Note" is for backward
|
|
360 compatibility and is used for unstructured text remarks. "Alias" is
|
|
361 considered as a synonym for the feature name and will be consulted
|
|
362 when looking up a feature by its name.
|
|
363
|
|
364 =head2 Adaptors and Aggregators
|
|
365
|
|
366 This module uses a system of adaptors and aggregators in order to make
|
|
367 it adaptable to use with a variety of databases.
|
|
368
|
|
369 =over 4
|
|
370
|
|
371 =item Adaptors
|
|
372
|
|
373 The core of the module handles the user API, annotation coordinate
|
|
374 arithmetic, and other common issues. The details of fetching
|
|
375 information from databases is handled by an adaptor, which is
|
|
376 specified during Bio::DB::GFF construction. The adaptor encapsulates
|
|
377 database-specific information such as the schema, user authentication
|
|
378 and access methods.
|
|
379
|
|
380 Currently there are two adaptors: 'dbi::mysql' and 'dbi::mysqlopt'.
|
|
381 The former is an interface to a simple Mysql schema. The latter is an
|
|
382 optimized version of dbi::mysql which uses a binning scheme to
|
|
383 accelerate range queries and the Bio::DB::Fasta module for rapid
|
|
384 retrieval of sequences. Note the double-colon between the words.
|
|
385
|
|
386 =item Aggregators
|
|
387
|
|
388 The GFF format uses a "group" field to indicate aggregation properties
|
|
389 of individual features. For example, a set of exons and introns may
|
|
390 share a common transcript group, and multiple transcripts may share
|
|
391 the same gene group.
|
|
392
|
|
393 Aggregators are small modules that use the group information to
|
|
394 rebuild the hierarchy. When a Bio::DB::GFF object is created, you
|
|
395 indicate that it use a set of one or more aggregators. Each
|
|
396 aggregator provides a new composite annotation type. Before the
|
|
397 database query is generated each aggregator is called to
|
|
398 "disaggregate" its annotation type into list of component types
|
|
399 contained in the database. After the query is generated, each
|
|
400 aggregator is called again in order to build composite annotations
|
|
401 from the returned components.
|
|
402
|
|
403 For example, during disaggregation, the standard
|
|
404 "processed_transcript" aggregator generates a list of component
|
|
405 feature types including "UTR", "CDS", and "polyA_site". Later, it
|
|
406 aggregates these features into a set of annotations of type
|
|
407 "processed_transcript".
|
|
408
|
|
409 During aggregation, the list of aggregators is called in reverse
|
|
410 order. This allows aggregators to collaborate to create multi-level
|
|
411 structures: the transcript aggregator assembles transcripts from
|
|
412 introns and exons; the gene aggregator then assembles genes from sets
|
|
413 of transcripts.
|
|
414
|
|
415 Three default aggregators are provided:
|
|
416
|
|
417 transcript assembles transcripts from features of type
|
|
418 exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA
|
|
419 clone assembles clones from Clone_left_end, Clone_right_end
|
|
420 and Sequence features.
|
|
421 alignment assembles gapped alignments from features of type
|
|
422 "similarity".
|
|
423
|
|
424 In addition, this module provides the optional "wormbase_gene"
|
|
425 aggregator, which accomodates the WormBase representation of genes.
|
|
426 This aggregator aggregates features of method "exon", "CDS", "5'UTR",
|
|
427 "3'UTR", "polyA" and "TSS" into a single object. It also expects to
|
|
428 find a single feature of type "Sequence" that spans the entire gene.
|
|
429
|
|
430 The existing aggregators are easily customized.
|
|
431
|
|
432 Note that aggregation will not occur unless you specifically request
|
|
433 the aggregation type. For example, this call:
|
|
434
|
|
435 @features = $segment->features('alignment');
|
|
436
|
|
437 will generate an array of aggregated alignment features. However,
|
|
438 this call:
|
|
439
|
|
440 @features = $segment->features();
|
|
441
|
|
442 will return a list of unaggregated similarity segments.
|
|
443
|
|
444 For more informnation, see the manual pages for
|
|
445 Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone,
|
|
446 etc.
|
|
447
|
|
448 =back
|
|
449
|
|
450 =head1 API
|
|
451
|
|
452 The following is the API for Bio::DB::GFF.
|
|
453
|
|
454 =cut
|
|
455
|
|
456 package Bio::DB::GFF;
|
|
457
|
|
458 use strict;
|
|
459
|
|
460 use Bio::DB::GFF::Util::Rearrange;
|
|
461 use Bio::DB::GFF::RelSegment;
|
|
462 use Bio::DB::GFF::Feature;
|
|
463 use Bio::DB::GFF::Aggregator;
|
|
464 use Bio::DasI;
|
|
465 use Bio::Root::Root;
|
|
466
|
|
467 use vars qw(@ISA $VERSION);
|
|
468 @ISA = qw(Bio::Root::Root Bio::DasI);
|
|
469
|
|
470 $VERSION = '1.2003';
|
|
471 my %valid_range_types = (overlaps => 1,
|
|
472 contains => 1,
|
|
473 contained_in => 1);
|
|
474
|
|
475 =head1 Querying GFF Databases
|
|
476
|
|
477 =head2 new
|
|
478
|
|
479 Title : new
|
|
480 Usage : my $db = new Bio::DB::GFF(@args);
|
|
481 Function: create a new Bio::DB::GFF object
|
|
482 Returns : new Bio::DB::GFF object
|
|
483 Args : lists of adaptors and aggregators
|
|
484 Status : Public
|
|
485
|
|
486 These are the arguments:
|
|
487
|
|
488 -adaptor Name of the adaptor module to use. If none
|
|
489 provided, defaults to "dbi::mysqlopt".
|
|
490
|
|
491 -aggregator Array reference to a list of aggregators
|
|
492 to apply to the database. If none provided,
|
|
493 defaults to ['processed_transcript','alignment'].
|
|
494
|
|
495 <other> Any other named argument pairs are passed to
|
|
496 the adaptor for processing.
|
|
497
|
|
498 The adaptor argument must correspond to a module contained within the
|
|
499 Bio::DB::GFF::Adaptor namespace. For example, the
|
|
500 Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying
|
|
501 'dbi::mysql'. By Perl convention, the adaptors names are lower case
|
|
502 because they are loaded at run time.
|
|
503
|
|
504 The aggregator array may contain a list of aggregator names, a list of
|
|
505 initialized aggregator objects, or a string in the form
|
|
506 "aggregator_name{subpart1,subpart2,subpart3/main_method}" (the
|
|
507 /main_method part is optional). For example, if you wish to change
|
|
508 the components aggregated by the transcript aggregator, you could pass
|
|
509 it to the GFF constructor this way:
|
|
510
|
|
511 my $transcript =
|
|
512 Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr
|
|
513 polyA spliced_leader)]);
|
|
514
|
|
515 my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment],
|
|
516 -adaptor => 'dbi::mysql',
|
|
517 -dsn => 'dbi:mysql:elegans42');
|
|
518
|
|
519 Alternatively, you could create an entirely new transcript aggregator
|
|
520 this way:
|
|
521
|
|
522 my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}';
|
|
523 my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment],
|
|
524 -adaptor => 'dbi::mysql',
|
|
525 -dsn => 'dbi:mysql:elegans42');
|
|
526
|
|
527 See L<Bio::DB::GFF::Aggregator> for more details.
|
|
528
|
|
529 The commonly used 'dbi::mysql' adaptor recognizes the following
|
|
530 adaptor-specific arguments:
|
|
531
|
|
532 Argument Description
|
|
533 -------- -----------
|
|
534
|
|
535 -dsn the DBI data source, e.g. 'dbi:mysql:ens0040'
|
|
536 If a partial name is given, such as "ens0040", the
|
|
537 "dbi:mysql:" prefix will be added automatically.
|
|
538
|
|
539 -user username for authentication
|
|
540
|
|
541 -pass the password for authentication
|
|
542
|
|
543 -refclass landmark Class; defaults to "Sequence"
|
|
544
|
|
545 The commonly used 'dbi::mysqlopt' adaptor also recogizes the following
|
|
546 arguments.
|
|
547
|
|
548 Argument Description
|
|
549 -------- -----------
|
|
550
|
|
551 -fasta path to a directory containing FASTA files for the DNA
|
|
552 contained in this database (e.g. "/usr/local/share/fasta")
|
|
553
|
|
554 -acedb an acedb URL to use when converting features into ACEDB
|
|
555 objects (e.g. sace://localhost:2005)
|
|
556
|
|
557 =cut
|
|
558
|
|
559 #'
|
|
560
|
|
561 sub new {
|
|
562 my $package = shift;
|
|
563 my ($adaptor,$aggregators,$args,$refclass);
|
|
564
|
|
565 if (@_ == 1) { # special case, default to dbi::mysqlopt
|
|
566 $adaptor = 'dbi::mysqlopt';
|
|
567 $args = {DSN => shift};
|
|
568 } else {
|
|
569 ($adaptor,$aggregators,$refclass,$args) = rearrange([
|
|
570 [qw(ADAPTOR FACTORY)],
|
|
571 [qw(AGGREGATOR AGGREGATORS)],
|
|
572 'REFCLASS',
|
|
573 ],@_);
|
|
574 }
|
|
575
|
|
576 $adaptor ||= 'dbi::mysqlopt';
|
|
577 my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E";
|
|
578 eval "require $class" unless $class->can('new');
|
|
579 $package->throw("Unable to load $adaptor adaptor: $@") if $@;
|
|
580
|
|
581 my $self = $class->new($args);
|
|
582 $self->default_class($refclass) if defined $refclass;
|
|
583
|
|
584 # handle the aggregators.
|
|
585 # aggregators are responsible for creating complex multi-part features
|
|
586 # from the GFF "group" field. If none are provided, then we provide a
|
|
587 # list of the two used in WormBase.
|
|
588 # Each aggregator can be a scalar or a ref. In the former case
|
|
589 # it is treated as a class name to call new() on. In the latter
|
|
590 # the aggreator is treated as a ready made object.
|
|
591 $aggregators = $self->default_aggregators unless defined $aggregators;
|
|
592 my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators;
|
|
593 for my $a (@a) {
|
|
594 $self->add_aggregator($a);
|
|
595 }
|
|
596
|
|
597 # default settings go here.....
|
|
598 $self->automerge(1); # set automerge to true
|
|
599
|
|
600 $self;
|
|
601 }
|
|
602
|
|
603
|
|
604 =head2 types
|
|
605
|
|
606 Title : types
|
|
607 Usage : $db->types(@args)
|
|
608 Function: return list of feature types in range or database
|
|
609 Returns : a list of Bio::DB::GFF::Typename objects
|
|
610 Args : see below
|
|
611 Status : public
|
|
612
|
|
613 This routine returns a list of feature types known to the database.
|
|
614 The list can be database-wide or restricted to a region. It is also
|
|
615 possible to find out how many times each feature occurs.
|
|
616
|
|
617 For range queries, it is usually more convenient to create a
|
|
618 Bio::DB::GFF::Segment object, and then invoke it's types() method.
|
|
619
|
|
620 Arguments are as follows:
|
|
621
|
|
622 -ref ID of reference sequence
|
|
623 -class class of reference sequence
|
|
624 -start start of segment
|
|
625 -stop stop of segment
|
|
626 -enumerate if true, count the features
|
|
627
|
|
628 The returned value will be a list of Bio::DB::GFF::Typename objects,
|
|
629 which if evaluated in a string context will return the feature type in
|
|
630 "method:source" format. This object class also has method() and
|
|
631 source() methods for retrieving the like-named fields.
|
|
632
|
|
633 If -enumerate is true, then the function returns a hash (not a hash
|
|
634 reference) in which the keys are type names in "method:source" format
|
|
635 and the values are the number of times each feature appears in the
|
|
636 database or segment.
|
|
637
|
|
638 The argument -end is a synonum for -stop, and -count is a synonym for
|
|
639 -enumerate.
|
|
640
|
|
641 =cut
|
|
642
|
|
643 sub types {
|
|
644 my $self = shift;
|
|
645 my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([
|
|
646 [qw(REF REFSEQ)],
|
|
647 qw(START),
|
|
648 [qw(STOP END)],
|
|
649 [qw(ENUMERATE COUNT)],
|
|
650 [qw(CLASS SEQCLASS)],
|
|
651 [qw(TYPE TYPES)],
|
|
652 ],@_);
|
|
653 $types = $self->parse_types($types) if defined $types;
|
|
654 $self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types);
|
|
655 }
|
|
656
|
|
657 =head2 classes
|
|
658
|
|
659 Title : classes
|
|
660 Usage : $db->classes
|
|
661 Function: return list of landmark classes in database
|
|
662 Returns : a list of classes
|
|
663 Args : none
|
|
664 Status : public
|
|
665
|
|
666 This routine returns the list of reference classes known to the
|
|
667 database, or empty if classes are not used by the database. Classes
|
|
668 are distinct from types, being essentially qualifiers on the reference
|
|
669 namespaces.
|
|
670
|
|
671 =cut
|
|
672
|
|
673 sub classes {
|
|
674 my $self = shift;
|
|
675 return ();
|
|
676 }
|
|
677
|
|
678 =head2 segment
|
|
679
|
|
680 Title : segment
|
|
681 Usage : $db->segment(@args);
|
|
682 Function: create a segment object
|
|
683 Returns : segment object(s)
|
|
684 Args : numerous, see below
|
|
685 Status : public
|
|
686
|
|
687 This method generates a segment object, which is a Perl object
|
|
688 subclassed from Bio::DB::GFF::Segment. The segment can be used to
|
|
689 find overlapping features and the raw DNA.
|
|
690
|
|
691 When making the segment() call, you specify the ID of a sequence
|
|
692 landmark (e.g. an accession number, a clone or contig), and a
|
|
693 positional range relative to the landmark. If no range is specified,
|
|
694 then the entire extent of the landmark is used to generate the
|
|
695 segment.
|
|
696
|
|
697 You may also provide the ID of a "reference" sequence, which will set
|
|
698 the coordinate system and orientation used for all features contained
|
|
699 within the segment. The reference sequence can be changed later. If
|
|
700 no reference sequence is provided, then the coordinate system is based
|
|
701 on the landmark.
|
|
702
|
|
703 Arguments:
|
|
704
|
|
705 -name ID of the landmark sequence.
|
|
706
|
|
707 -class Database object class for the landmark sequence.
|
|
708 "Sequence" assumed if not specified. This is
|
|
709 irrelevant for databases which do not recognize
|
|
710 object classes.
|
|
711
|
|
712 -start Start of the segment relative to landmark. Positions
|
|
713 follow standard 1-based sequence rules. If not specified,
|
|
714 defaults to the beginning of the landmark.
|
|
715
|
|
716 -end Stop of the segment relative to the landmark. If not specified,
|
|
717 defaults to the end of the landmark.
|
|
718
|
|
719 -stop Same as -end.
|
|
720
|
|
721 -offset For those who prefer 0-based indexing, the offset specifies the
|
|
722 position of the new segment relative to the start of the landmark.
|
|
723
|
|
724 -length For those who prefer 0-based indexing, the length specifies the
|
|
725 length of the new segment.
|
|
726
|
|
727 -refseq Specifies the ID of the reference landmark used to establish the
|
|
728 coordinate system for the newly-created segment.
|
|
729
|
|
730 -refclass Specifies the class of the reference landmark, for those databases
|
|
731 that distinguish different object classes. Defaults to "Sequence".
|
|
732
|
|
733 -absolute
|
|
734 Return features in absolute coordinates rather than relative to the
|
|
735 parent segment.
|
|
736
|
|
737 -nocheck Don't check the database for the coordinates and length of this
|
|
738 feature. Construct a segment using the indicated name as the
|
|
739 reference, a start coordinate of 1, an undefined end coordinate,
|
|
740 and a strand of +1.
|
|
741
|
|
742 -force Same as -nocheck.
|
|
743
|
|
744 -seq,-sequence,-sourceseq Aliases for -name.
|
|
745
|
|
746 -begin,-end Aliases for -start and -stop
|
|
747
|
|
748 -off,-len Aliases for -offset and -length
|
|
749
|
|
750 -seqclass Alias for -class
|
|
751
|
|
752 Here's an example to explain how this works:
|
|
753
|
|
754 my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql');
|
|
755
|
|
756 If successful, $db will now hold the database accessor object. We now
|
|
757 try to fetch the fragment of sequence whose ID is A0000182 and class
|
|
758 is "Accession."
|
|
759
|
|
760 my $segment = $db->segment(-name=>'A0000182',-class=>'Accession');
|
|
761
|
|
762 If successful, $segment now holds the entire segment corresponding to
|
|
763 this accession number. By default, the sequence is used as its own
|
|
764 reference sequence, so its first base will be 1 and its last base will
|
|
765 be the length of the accession.
|
|
766
|
|
767 Assuming that this sequence belongs to a longer stretch of DNA, say a
|
|
768 contig, we can fetch this information like so:
|
|
769
|
|
770 my $sourceseq = $segment->sourceseq;
|
|
771
|
|
772 and find the start and stop on the source like this:
|
|
773
|
|
774 my $start = $segment->abs_start;
|
|
775 my $stop = $segment->abs_stop;
|
|
776
|
|
777 If we had another segment, say $s2, which is on the same contiguous
|
|
778 piece of DNA, we can pass that to the refseq() method in order to
|
|
779 establish it as the coordinate reference point:
|
|
780
|
|
781 $segment->refseq($s2);
|
|
782
|
|
783 Now calling start() will return the start of the segment relative to
|
|
784 the beginning of $s2, accounting for differences in strandedness:
|
|
785
|
|
786 my $rel_start = $segment->start;
|
|
787
|
|
788 IMPORTANT NOTE: This method can be used to return the segment spanned
|
|
789 by an arbitrary named annotation. However, if the annotation appears
|
|
790 at multiple locations on the genome, for example an EST that maps to
|
|
791 multiple locations, then, provided that all locations reside on the
|
|
792 same physical segment, the method will return a segment that spans the
|
|
793 minimum and maximum positions. If the reference sequence occupies
|
|
794 ranges on different physical segments, then it returns them all in an
|
|
795 array context, and raises a "multiple segment exception" exception in
|
|
796 a scalar context.
|
|
797
|
|
798 =cut
|
|
799
|
|
800 #'
|
|
801
|
|
802 sub segment {
|
|
803 my $self = shift;
|
|
804 my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self,
|
|
805 $self->setup_segment_args(@_));
|
|
806 foreach (@segments) {
|
|
807 $_->absolute(1) if $self->absolute;
|
|
808 }
|
|
809
|
|
810 $self->_multiple_return_args(@segments);
|
|
811 }
|
|
812
|
|
813 sub _multiple_return_args {
|
|
814 my $self = shift;
|
|
815 my @args = @_;
|
|
816 if (@args == 0) {
|
|
817 return;
|
|
818 } elsif (@args == 1) {
|
|
819 return $args[0];
|
|
820 } elsif (wantarray) { # more than one reference sequence
|
|
821 return @args;
|
|
822 } else {
|
|
823 $self->error($args[0]->name,
|
|
824 " has more than one reference sequence in database. Please call in a list context to retrieve them all.");
|
|
825 $self->throw('multiple segment exception');
|
|
826 return;
|
|
827 }
|
|
828
|
|
829 }
|
|
830
|
|
831 # backward compatibility -- don't use!
|
|
832 # (deliberately undocumented too)
|
|
833 sub abs_segment {
|
|
834 my $self = shift;
|
|
835 return $self->segment($self->setup_segment_args(@_),-absolute=>1);
|
|
836 }
|
|
837
|
|
838 sub setup_segment_args {
|
|
839 my $self = shift;
|
|
840 return @_ if defined $_[0] && $_[0] =~ /^-/;
|
|
841 return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3;
|
|
842 return (-class=>$_[0],-name=>$_[1]) if @_ == 2;
|
|
843 return (-name=>$_[0]) if @_ == 1;
|
|
844 }
|
|
845
|
|
846 =head2 features
|
|
847
|
|
848 Title : features
|
|
849 Usage : $db->features(@args)
|
|
850 Function: get all features, possibly filtered by type
|
|
851 Returns : a list of Bio::DB::GFF::Feature objects
|
|
852 Args : see below
|
|
853 Status : public
|
|
854
|
|
855 This routine will retrieve features in the database regardless of
|
|
856 position. It can be used to return all features, or a subset based on
|
|
857 their method and source.
|
|
858
|
|
859 Arguments are as follows:
|
|
860
|
|
861 -types List of feature types to return. Argument is an array
|
|
862 reference containing strings of the format "method:source"
|
|
863
|
|
864 -merge Whether to apply aggregators to the generated features.
|
|
865
|
|
866 -rare Turn on optimizations suitable for a relatively rare feature type,
|
|
867 where it makes more sense to filter by feature type first,
|
|
868 and then by position.
|
|
869
|
|
870 -attributes A hash reference containing attributes to match.
|
|
871
|
|
872 -iterator Whether to return an iterator across the features.
|
|
873
|
|
874 -binsize A true value will create a set of artificial features whose
|
|
875 start and stop positions indicate bins of the given size, and
|
|
876 whose scores are the number of features in the bin. The
|
|
877 class and method of the feature will be set to "bin",
|
|
878 its source to "method:source", and its group to "bin:method:source".
|
|
879 This is a handy way of generating histograms of feature density.
|
|
880
|
|
881 If -iterator is true, then the method returns a single scalar value
|
|
882 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
|
|
883 on this object to fetch each of the features in turn. If iterator is
|
|
884 false or absent, then all the features are returned as a list.
|
|
885
|
|
886 Currently aggregation is disabled when iterating over a series of
|
|
887 features.
|
|
888
|
|
889 Types are indicated using the nomenclature "method:source". Either of
|
|
890 these fields can be omitted, in which case a wildcard is used for the
|
|
891 missing field. Type names without the colon (e.g. "exon") are
|
|
892 interpreted as the method name and a source wild card. Regular
|
|
893 expressions are allowed in either field, as in: "similarity:BLAST.*".
|
|
894
|
|
895 The -attributes argument is a hashref containing one or more attributes
|
|
896 to match against:
|
|
897
|
|
898 -attributes => { Gene => 'abc-1',
|
|
899 Note => 'confirmed' }
|
|
900
|
|
901 Attribute matching is simple string matching, and multiple attributes
|
|
902 are ANDed together.
|
|
903
|
|
904 =cut
|
|
905
|
|
906 sub features {
|
|
907 my $self = shift;
|
|
908 my ($types,$automerge,$sparse,$iterator,$other);
|
|
909 if (defined $_[0] &&
|
|
910 $_[0] =~ /^-/) {
|
|
911 ($types,$automerge,$sparse,$iterator,$other) = rearrange([
|
|
912 [qw(TYPE TYPES)],
|
|
913 [qw(MERGE AUTOMERGE)],
|
|
914 [qw(RARE SPARSE)],
|
|
915 'ITERATOR'
|
|
916 ],@_);
|
|
917 } else {
|
|
918 $types = \@_;
|
|
919 }
|
|
920
|
|
921 # for whole database retrievals, we probably don't want to automerge!
|
|
922 $automerge = $self->automerge unless defined $automerge;
|
|
923 $other ||= {};
|
|
924 $self->_features({
|
|
925 rangetype => 'contains',
|
|
926 types => $types,
|
|
927 },
|
|
928 { sparse => $sparse,
|
|
929 automerge => $automerge,
|
|
930 iterator =>$iterator,
|
|
931 %$other,
|
|
932 }
|
|
933 );
|
|
934 }
|
|
935
|
|
936 =head2 get_seq_stream
|
|
937
|
|
938 Title : get_seq_stream
|
|
939 Usage : my $seqio = $self->get_seq_sream(@args)
|
|
940 Function: Performs a query and returns an iterator over it
|
|
941 Returns : a Bio::SeqIO stream capable of producing sequence
|
|
942 Args : As in features()
|
|
943 Status : public
|
|
944
|
|
945 This routine takes the same arguments as features(), but returns a
|
|
946 Bio::SeqIO::Stream-compliant object. Use it like this:
|
|
947
|
|
948 $stream = $db->get_seq_stream('exon');
|
|
949 while (my $exon = $stream->next_seq) {
|
|
950 print $exon,"\n";
|
|
951 }
|
|
952
|
|
953 NOTE: This is also called get_feature_stream(), since that's what it
|
|
954 really does.
|
|
955
|
|
956 =cut
|
|
957
|
|
958 sub get_seq_stream {
|
|
959 my $self = shift;
|
|
960 my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1)
|
|
961 : (-types=>\@_,-iterator=>1);
|
|
962 $self->features(@args);
|
|
963 }
|
|
964
|
|
965 *get_feature_stream = \&get_seq_stream;
|
|
966
|
|
967 =head2 get_feature_by_name
|
|
968
|
|
969 Title : get_feature_by_name
|
|
970 Usage : $db->get_feature_by_name($class => $name)
|
|
971 Function: fetch features by their name
|
|
972 Returns : a list of Bio::DB::GFF::Feature objects
|
|
973 Args : the class and name of the desired feature
|
|
974 Status : public
|
|
975
|
|
976 This method can be used to fetch a named feature from the database.
|
|
977 GFF annotations are named using the group class and name fields, so
|
|
978 for features that belong to a group of size one, this method can be
|
|
979 used to retrieve that group (and is equivalent to the segment()
|
|
980 method). Any Alias attributes are also searched for matching names.
|
|
981
|
|
982 An alternative syntax allows you to search for features by name within
|
|
983 a circumscribed region:
|
|
984
|
|
985 @f = $db->get_feature_by_name(-class => $class,-name=>$name,
|
|
986 -ref => $sequence_name,
|
|
987 -start => $start,
|
|
988 -end => $end);
|
|
989
|
|
990 This method may return zero, one, or several Bio::DB::GFF::Feature
|
|
991 objects.
|
|
992
|
|
993 Aggregation is performed on features as usual.
|
|
994
|
|
995 NOTE: At various times, this function was called fetch_group(),
|
|
996 fetch_feature(), fetch_feature_by_name() and segments(). These names
|
|
997 are preserved for backward compatibility.
|
|
998
|
|
999 =cut
|
|
1000
|
|
1001 sub get_feature_by_name {
|
|
1002 my $self = shift;
|
|
1003 my ($gclass,$gname,$automerge,$ref,$start,$end);
|
|
1004 if (@_ == 1) {
|
|
1005 $gclass = $self->default_class;
|
|
1006 $gname = shift;
|
|
1007 } else {
|
|
1008 ($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE',
|
|
1009 ['REF','REFSEQ'],
|
|
1010 'START',['STOP','END']
|
|
1011 ],@_);
|
|
1012 $gclass ||= $self->default_class;
|
|
1013 }
|
|
1014 $automerge = $self->automerge unless defined $automerge;
|
|
1015
|
|
1016 # we need to refactor this... It's repeated code (see below)...
|
|
1017 my @aggregators;
|
|
1018 if ($automerge) {
|
|
1019 for my $a ($self->aggregators) {
|
|
1020 push @aggregators,$a if $a->disaggregate([],$self);
|
|
1021 }
|
|
1022 }
|
|
1023
|
|
1024 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
|
|
1025 my $features = [];
|
|
1026 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
|
|
1027 my $location = [$ref,$start,$end] if defined $ref;
|
|
1028 $self->_feature_by_name($gclass,$gname,$location,$callback);
|
|
1029
|
|
1030 warn "aggregating...\n" if $self->debug;
|
|
1031 foreach my $a (@aggregators) { # last aggregator gets first shot
|
|
1032 $a->aggregate($features,$self) or next;
|
|
1033 }
|
|
1034
|
|
1035 @$features;
|
|
1036 }
|
|
1037
|
|
1038 # horrible indecision regarding proper names!
|
|
1039 *fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name;
|
|
1040 *segments = \&segment;
|
|
1041
|
|
1042 =head2 get_feature_by_target
|
|
1043
|
|
1044 Title : get_feature_by_target
|
|
1045 Usage : $db->get_feature_by_target($class => $name)
|
|
1046 Function: fetch features by their similarity target
|
|
1047 Returns : a list of Bio::DB::GFF::Feature objects
|
|
1048 Args : the class and name of the desired feature
|
|
1049 Status : public
|
|
1050
|
|
1051 This method can be used to fetch a named feature from the database
|
|
1052 based on its similarity hit.
|
|
1053
|
|
1054 =cut
|
|
1055
|
|
1056 sub get_feature_by_target {
|
|
1057 shift->get_feature_by_name(@_);
|
|
1058 }
|
|
1059
|
|
1060 =head2 get_feature_by_attribute
|
|
1061
|
|
1062 Title : get_feature_by_attribute
|
|
1063 Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2)
|
|
1064 Function: fetch segments by combinations of attribute values
|
|
1065 Returns : a list of Bio::DB::GFF::Feature objects
|
|
1066 Args : the class and name of the desired feature
|
|
1067 Status : public
|
|
1068
|
|
1069 This method can be used to fetch a set of features from the database.
|
|
1070 Attributes are a list of name=E<gt>value pairs. They will be logically
|
|
1071 ANDED together.
|
|
1072
|
|
1073 =cut
|
|
1074
|
|
1075 sub get_feature_by_attribute {
|
|
1076 my $self = shift;
|
|
1077 my %attributes = ref($_[0]) ? %{$_[0]} : @_;
|
|
1078
|
|
1079 # we need to refactor this... It's repeated code (see above)...
|
|
1080 my @aggregators;
|
|
1081 if ($self->automerge) {
|
|
1082 for my $a ($self->aggregators) {
|
|
1083 unshift @aggregators,$a if $a->disaggregate([],$self);
|
|
1084 }
|
|
1085 }
|
|
1086
|
|
1087 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
|
|
1088 my $features = [];
|
|
1089 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
|
|
1090 $self->_feature_by_attribute(\%attributes,$callback);
|
|
1091
|
|
1092 warn "aggregating...\n" if $self->debug;
|
|
1093 foreach my $a (@aggregators) { # last aggregator gets first shot
|
|
1094 $a->aggregate($features,$self) or next;
|
|
1095 }
|
|
1096
|
|
1097 @$features;
|
|
1098 }
|
|
1099
|
|
1100 # more indecision...
|
|
1101 *fetch_feature_by_attribute = \&get_feature_by_attribute;
|
|
1102
|
|
1103 =head2 get_feature_by_id
|
|
1104
|
|
1105 Title : get_feature_by_id
|
|
1106 Usage : $db->get_feature_by_id($id)
|
|
1107 Function: fetch segments by feature ID
|
|
1108 Returns : a Bio::DB::GFF::Feature object
|
|
1109 Args : the feature ID
|
|
1110 Status : public
|
|
1111
|
|
1112 This method can be used to fetch a feature from the database using its
|
|
1113 ID. Not all GFF databases support IDs, so be careful with this.
|
|
1114
|
|
1115 =cut
|
|
1116
|
|
1117 sub get_feature_by_id {
|
|
1118 my $self = shift;
|
|
1119 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
|
|
1120 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
|
|
1121 my $features = [];
|
|
1122 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
|
|
1123 $self->_feature_by_id($id,'feature',$callback);
|
|
1124 return wantarray ? @$features : $features->[0];
|
|
1125 }
|
|
1126 *fetch_feature_by_id = \&get_feature_by_id;
|
|
1127
|
|
1128 =head2 get_feature_by_gid
|
|
1129
|
|
1130 Title : get_feature_by_gid
|
|
1131 Usage : $db->get_feature_by_gid($id)
|
|
1132 Function: fetch segments by feature ID
|
|
1133 Returns : a Bio::DB::GFF::Feature object
|
|
1134 Args : the feature ID
|
|
1135 Status : public
|
|
1136
|
|
1137 This method can be used to fetch a feature from the database using its
|
|
1138 group ID. Not all GFF databases support IDs, so be careful with this.
|
|
1139
|
|
1140 The group ID is often more interesting than the feature ID, since
|
|
1141 groups can be complex objects containing subobjects.
|
|
1142
|
|
1143 =cut
|
|
1144
|
|
1145 sub get_feature_by_gid {
|
|
1146 my $self = shift;
|
|
1147 my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_;
|
|
1148 my %groups; # cache the groups we create to avoid consuming too much unecessary memory
|
|
1149 my $features = [];
|
|
1150 my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) };
|
|
1151 $self->_feature_by_id($id,'group',$callback);
|
|
1152 return wantarray ? @$features : $features->[0];
|
|
1153 }
|
|
1154 *fetch_feature_by_gid = \&get_feature_by_gid;
|
|
1155
|
|
1156 =head2 delete_features
|
|
1157
|
|
1158 Title : delete_features
|
|
1159 Usage : $db->delete_features(@ids_or_features)
|
|
1160 Function: delete one or more features
|
|
1161 Returns : count of features deleted
|
|
1162 Args : list of features or feature ids
|
|
1163 Status : public
|
|
1164
|
|
1165 Pass this method a list of numeric feature ids or a set of features.
|
|
1166 It will attempt to remove the features from the database and return a
|
|
1167 count of the features removed.
|
|
1168
|
|
1169 NOTE: This method is also called delete_feature(). Also see
|
|
1170 delete_groups().
|
|
1171
|
|
1172 =cut
|
|
1173
|
|
1174 *delete_feature = \&delete_features;
|
|
1175
|
|
1176 sub delete_features {
|
|
1177 my $self = shift;
|
|
1178 my @features_or_ids = @_;
|
|
1179 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids;
|
|
1180 return unless @ids;
|
|
1181 $self->_delete_features(@ids);
|
|
1182 }
|
|
1183
|
|
1184 =head2 delete_groups
|
|
1185
|
|
1186 Title : delete_groups
|
|
1187 Usage : $db->delete_groups(@ids_or_features)
|
|
1188 Function: delete one or more feature groups
|
|
1189 Returns : count of features deleted
|
|
1190 Args : list of features or feature group ids
|
|
1191 Status : public
|
|
1192
|
|
1193 Pass this method a list of numeric group ids or a set of features. It
|
|
1194 will attempt to recursively remove the features and ALL members of
|
|
1195 their group from the database. It returns a count of the number of
|
|
1196 features (not groups) returned.
|
|
1197
|
|
1198 NOTE: This method is also called delete_group(). Also see
|
|
1199 delete_features().
|
|
1200
|
|
1201 =cut
|
|
1202
|
|
1203 *delete_group = \&delete_groupss;
|
|
1204
|
|
1205 sub delete_groups {
|
|
1206 my $self = shift;
|
|
1207 my @features_or_ids = @_;
|
|
1208 my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids;
|
|
1209 return unless @ids;
|
|
1210 $self->_delete_groups(@ids);
|
|
1211 }
|
|
1212
|
|
1213 =head2 delete
|
|
1214
|
|
1215 Title : delete
|
|
1216 Usage : $db->delete(@args)
|
|
1217 Function: delete features
|
|
1218 Returns : count of features deleted -- if available
|
|
1219 Args : numerous, see below
|
|
1220 Status : public
|
|
1221
|
|
1222 This method deletes all features that overlap the specified region or
|
|
1223 are of a particular type. If no arguments are provided and the -force
|
|
1224 argument is true, then deletes ALL features.
|
|
1225
|
|
1226 Arguments:
|
|
1227
|
|
1228 -name ID of the landmark sequence.
|
|
1229
|
|
1230 -ref ID of the landmark sequence (synonym for -name).
|
|
1231
|
|
1232 -class Database object class for the landmark sequence.
|
|
1233 "Sequence" assumed if not specified. This is
|
|
1234 irrelevant for databases which do not recognize
|
|
1235 object classes.
|
|
1236
|
|
1237 -start Start of the segment relative to landmark. Positions
|
|
1238 follow standard 1-based sequence rules. If not specified,
|
|
1239 defaults to the beginning of the landmark.
|
|
1240
|
|
1241 -end Stop of the segment relative to the landmark. If not specified,
|
|
1242 defaults to the end of the landmark.
|
|
1243
|
|
1244 -offset Zero-based addressing
|
|
1245
|
|
1246 -length Length of region
|
|
1247
|
|
1248 -type,-types Either a single scalar type to be deleted, or an
|
|
1249 reference to an array of types.
|
|
1250
|
|
1251 -force Force operation to be performed even if it would delete
|
|
1252 entire feature table.
|
|
1253
|
|
1254 -range_type Control the range type of the deletion. One of "overlaps" (default)
|
|
1255 "contains" or "contained_in"
|
|
1256
|
|
1257 Examples:
|
|
1258
|
|
1259 $db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats
|
|
1260 $db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000
|
|
1261 $db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3
|
|
1262
|
|
1263 The short form of this call, as described in segment() is also allowed:
|
|
1264
|
|
1265 $db->delete("chr3",1=>1000);
|
|
1266 $db->delete("chr3");
|
|
1267
|
|
1268 IMPORTANT NOTE: This method only deletes features. It does *NOT*
|
|
1269 delete the names of groups that contain the deleted features. Group
|
|
1270 IDs will be reused if you later load a feature with the same group
|
|
1271 name as one that was previously deleted.
|
|
1272
|
|
1273 NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the
|
|
1274 result code from the SQL DELETE operation. Some dbd drivers return the
|
|
1275 count of rows deleted, while others return 0E0. Caveat emptor.
|
|
1276
|
|
1277 =cut
|
|
1278
|
|
1279 sub delete {
|
|
1280 my $self = shift;
|
|
1281 my @args = $self->setup_segment_args(@_);
|
|
1282 my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) =
|
|
1283 rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET',
|
|
1284 'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args);
|
|
1285 $offset = 0 unless defined $offset;
|
|
1286 $start = $offset+1 unless defined $start;
|
|
1287 $end = $start+$length-1 if !defined $end and $length;
|
|
1288 $class ||= $self->default_class;
|
|
1289
|
|
1290 my $types = $self->parse_types($type); # parse out list of types
|
|
1291
|
|
1292 $range_type ||= 'overlaps';
|
|
1293 $self->throw("range type must be one of {".
|
|
1294 join(',',keys %valid_range_types).
|
|
1295 "}\n")
|
|
1296 unless $valid_range_types{lc $range_type};
|
|
1297
|
|
1298
|
|
1299 my @segments;
|
|
1300 if (defined $name && $name ne '') {
|
|
1301 my @args = (-name=>$name,-class=>$class);
|
|
1302 push @args,(-start=>$start) if defined $start;
|
|
1303 push @args,(-end =>$end) if defined $end;
|
|
1304 @segments = $self->segment(@args);
|
|
1305 return unless @segments;
|
|
1306 }
|
|
1307 $self->_delete({segments => \@segments,
|
|
1308 types => $types,
|
|
1309 range_type => $range_type,
|
|
1310 force => $force}
|
|
1311 );
|
|
1312 }
|
|
1313
|
|
1314 =head2 absolute
|
|
1315
|
|
1316 Title : absolute
|
|
1317 Usage : $abs = $db->absolute([$abs]);
|
|
1318 Function: gets/sets absolute mode
|
|
1319 Returns : current setting of absolute mode boolean
|
|
1320 Args : new setting for absolute mode boolean
|
|
1321 Status : public
|
|
1322
|
|
1323 $db-E<gt>absolute(1) will turn on absolute mode for the entire database.
|
|
1324 All segments retrieved will use absolute coordinates by default,
|
|
1325 rather than relative coordinates. You can still set them to use
|
|
1326 relative coordinates by calling $segment-E<gt>absolute(0).
|
|
1327
|
|
1328 Note that this is not the same as calling abs_segment(); it continues
|
|
1329 to allow you to look up groups that are not used directly as reference
|
|
1330 sequences.
|
|
1331
|
|
1332 =cut
|
|
1333
|
|
1334 sub absolute {
|
|
1335 my $self = shift;
|
|
1336 my $d = $self->{absolute};
|
|
1337 $self->{absolute} = shift if @_;
|
|
1338 $d;
|
|
1339 }
|
|
1340
|
|
1341 =head2 strict_bounds_checking
|
|
1342
|
|
1343 Title : strict_bounds_checking
|
|
1344 Usage : $flag = $db->strict_bounds_checking([$flag])
|
|
1345 Function: gets/sets strict bounds checking
|
|
1346 Returns : current setting of bounds checking flag
|
|
1347 Args : new setting for bounds checking flag
|
|
1348 Status : public
|
|
1349
|
|
1350 This flag enables extra checks for segment requests that go beyond the
|
|
1351 ends of their reference sequences. If bounds checking is enabled,
|
|
1352 then retrieved segments will be truncated to their physical length,
|
|
1353 and their truncated() methods will return true.
|
|
1354
|
|
1355 If the flag is off (the default), then the module will return segments
|
|
1356 that appear to extend beyond their physical boundaries. Requests for
|
|
1357 features beyond the end of the segment will, however, return empty.
|
|
1358
|
|
1359 =cut
|
|
1360
|
|
1361 sub strict_bounds_checking {
|
|
1362 my $self = shift;
|
|
1363 my $d = $self->{strict};
|
|
1364 $self->{strict} = shift if @_;
|
|
1365 $d;
|
|
1366 }
|
|
1367
|
|
1368 =head2 get_Seq_by_id
|
|
1369
|
|
1370 Title : get_Seq_by_id
|
|
1371 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
|
|
1372 Function: Gets a Bio::Seq object by its name
|
|
1373 Returns : a Bio::Seq object
|
|
1374 Args : the id (as a string) of a sequence
|
|
1375 Throws : "id does not exist" exception
|
|
1376
|
|
1377 NOTE: Bio::DB::RandomAccessI compliant method
|
|
1378
|
|
1379 =cut
|
|
1380
|
|
1381 sub get_Seq_by_id {
|
|
1382 my $self = shift;
|
|
1383 my $id = shift;
|
|
1384 my $stream = $self->get_Stream_by_id($id);
|
|
1385 return $stream->next_seq;
|
|
1386 }
|
|
1387
|
|
1388
|
|
1389 =head2 get_Seq_by_accession
|
|
1390
|
|
1391 Title : get_Seq_by_accession
|
|
1392 Usage : $seq = $db->get_Seq_by_accession('AL12234')
|
|
1393 Function: Gets a Bio::Seq object by its accession
|
|
1394 Returns : a Bio::Seq object
|
|
1395 Args : the id (as a string) of a sequence
|
|
1396 Throws : "id does not exist" exception
|
|
1397
|
|
1398 NOTE: Bio::DB::RandomAccessI compliant method
|
|
1399
|
|
1400 =cut
|
|
1401
|
|
1402 sub get_Seq_by_accession {
|
|
1403 my $self = shift;
|
|
1404 my $id = shift;
|
|
1405 my $stream = $self->get_Stream_by_accession($id);
|
|
1406 return $stream->next_seq;
|
|
1407 }
|
|
1408
|
|
1409 =head2 get_Stream_by_acc ()
|
|
1410
|
|
1411 =cut
|
|
1412
|
|
1413 =head2 get_Seq_by_acc
|
|
1414
|
|
1415 Title : get_Seq_by_acc
|
|
1416 Usage : $seq = $db->get_Seq_by_acc('X77802');
|
|
1417 Function: Gets a Bio::Seq object by accession number
|
|
1418 Returns : A Bio::Seq object
|
|
1419 Args : accession number (as a string)
|
|
1420 Throws : "acc does not exist" exception
|
|
1421
|
|
1422 NOTE: Bio::DB::RandomAccessI compliant method
|
|
1423
|
|
1424 =cut
|
|
1425
|
|
1426 sub get_Stream_by_name {
|
|
1427 my $self = shift;
|
|
1428 my @ids = @_;
|
|
1429 my $id = ref($ids[0]) ? $ids[0] : \@ids;
|
|
1430 Bio::DB::GFF::ID_Iterator->new($self,$id,'name');
|
|
1431 }
|
|
1432
|
|
1433 =head2 get_Stream_by_id
|
|
1434
|
|
1435 Title : get_Stream_by_id
|
|
1436 Usage : $seq = $db->get_Stream_by_id(@ids);
|
|
1437 Function: Retrieves a stream of Seq objects given their ids
|
|
1438 Returns : a Bio::SeqIO stream object
|
|
1439 Args : an array of unique ids/accession numbers, or
|
|
1440 an array reference
|
|
1441
|
|
1442 NOTE: This is also called get_Stream_by_batch()
|
|
1443
|
|
1444 =cut
|
|
1445
|
|
1446 sub get_Stream_by_id {
|
|
1447 my $self = shift;
|
|
1448 my @ids = @_;
|
|
1449 my $id = ref($ids[0]) ? $ids[0] : \@ids;
|
|
1450 Bio::DB::GFF::ID_Iterator->new($self,$id,'feature');
|
|
1451 }
|
|
1452
|
|
1453 =head2 get_Stream_by_batch ()
|
|
1454
|
|
1455 Title : get_Stream_by_batch
|
|
1456 Usage : $seq = $db->get_Stream_by_batch(@ids);
|
|
1457 Function: Retrieves a stream of Seq objects given their ids
|
|
1458 Returns : a Bio::SeqIO stream object
|
|
1459 Args : an array of unique ids/accession numbers, or
|
|
1460 an array reference
|
|
1461
|
|
1462 NOTE: This is the same as get_Stream_by_id().
|
|
1463
|
|
1464 =cut
|
|
1465
|
|
1466 *get_Stream_by_batch = \&get_Stream_by_id;
|
|
1467
|
|
1468
|
|
1469 =head2 get_Stream_by_group ()
|
|
1470
|
|
1471 Bioperl compatibility.
|
|
1472
|
|
1473 =cut
|
|
1474
|
|
1475 sub get_Stream_by_group {
|
|
1476 my $self = shift;
|
|
1477 my @ids = @_;
|
|
1478 my $id = ref($ids[0]) ? $ids[0] : \@ids;
|
|
1479 Bio::DB::GFF::ID_Iterator->new($self,$id,'group');
|
|
1480 }
|
|
1481
|
|
1482 =head2 all_seqfeatures
|
|
1483
|
|
1484 Title : all_seqfeatures
|
|
1485 Usage : @features = $db->all_seqfeatures(@args)
|
|
1486 Function: fetch all the features in the database
|
|
1487 Returns : an array of features, or an iterator
|
|
1488 Args : See below
|
|
1489 Status : public
|
|
1490
|
|
1491 This is equivalent to calling $db-E<gt>features() without any types, and
|
|
1492 will return all the features in the database. The -merge and
|
|
1493 -iterator arguments are recognized, and behave the same as described
|
|
1494 for features().
|
|
1495
|
|
1496 =cut
|
|
1497
|
|
1498 sub all_seqfeatures {
|
|
1499 my $self = shift;
|
|
1500 my ($automerge,$iterator)= rearrange([
|
|
1501 [qw(MERGE AUTOMERGE)],
|
|
1502 'ITERATOR'
|
|
1503 ],@_);
|
|
1504 my @args;
|
|
1505 push @args,(-merge=>$automerge) if defined $automerge;
|
|
1506 push @args,(-iterator=>$iterator) if defined $iterator;
|
|
1507 $self->features(@args);
|
|
1508 }
|
|
1509
|
|
1510 =head1 Creating and Loading GFF Databases
|
|
1511
|
|
1512 =head2 initialize
|
|
1513
|
|
1514 Title : initialize
|
|
1515 Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2);
|
|
1516 Function: initialize a GFF database
|
|
1517 Returns : true if initialization successful
|
|
1518 Args : a set of named parameters
|
|
1519 Status : Public
|
|
1520
|
|
1521 This method can be used to initialize an empty database. It takes the following
|
|
1522 named arguments:
|
|
1523
|
|
1524 -erase A boolean value. If true the database will be wiped clean if it
|
|
1525 already contains data.
|
|
1526
|
|
1527 Other named arguments may be recognized by subclasses. They become database
|
|
1528 meta values that control various settable options.
|
|
1529
|
|
1530 As a shortcut (and for backward compatibility) a single true argument
|
|
1531 is the same as initialize(-erase=E<gt>1).
|
|
1532
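For example (a sketch only; which meta arguments, if any, are honored depends on the adaptor):

  $db->initialize(-erase => 1);   # wipe the database and re-create the schema
  $db->initialize(1);             # backward-compatible shortcut for the same thing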
|
|
1533 =cut
|
|
1534
|
|
1535 sub initialize {
|
|
1536 my $self = shift;
|
|
1537   #$self->do_initialize(1) if @_ == 1 && $_[0];
|
|
1538   # the commented-out line above appears redundant: rearrange() below already
|
|
1539   # treats a single positional true argument as the erase flag, so the
|
|
1540   # initialize(1) shortcut still works without it
|
|
1541
|
|
1542 my ($erase,$meta) = rearrange(['ERASE'],@_);
|
|
1543 $meta ||= {};
|
|
1544
|
|
1545 # initialize (possibly erasing)
|
|
1546 return unless $self->do_initialize($erase);
|
|
1547 my @default = $self->default_meta_values;
|
|
1548
|
|
1549   # this is an awkward way of uppercasing the tag names (the
|
|
1550   # even-indexed elements), needed for case-insensitive SQL databases
|
|
1551 for (my $i=0; $i<@default; $i++) {
|
|
1552 $default[$i] = uc $default[$i] if !($i % 2);
|
|
1553 }
|
|
1554
|
|
1555 my %values = (@default,%$meta);
|
|
1556 foreach (keys %values) {
|
|
1557 $self->meta($_ => $values{$_});
|
|
1558 }
|
|
1559 1;
|
|
1560 }
|
|
1561
|
|
1562
|
|
1563 =head2 load_gff
|
|
1564
|
|
1565 Title : load_gff
|
|
1566 Usage : $db->load_gff($file|$directory|$filehandle);
|
|
1567 Function: load GFF data into database
|
|
1568 Returns : count of records loaded
|
|
1569 Args : a directory, a file, a list of files,
|
|
1570 or a filehandle
|
|
1571 Status : Public
|
|
1572
|
|
1573 This method takes a single overloaded argument, which can be any of:
|
|
1574
|
|
1575 =over 4
|
|
1576
|
|
1577 =item 1. a scalar corresponding to a GFF file on the system
|
|
1578
|
|
1579 A pathname to a local GFF file. Any files ending with the .gz, .Z, or
|
|
1580 .bz2 suffixes will be transparently decompressed with the appropriate
|
|
1581 command-line utility.
|
|
1582
|
|
1583 =item 2. an array reference containing a list of GFF files on the system
|
|
1584
|
|
1585 For example ['/home/gff/gff1.gz','/home/gff/gff2.gz']
|
|
1586
|
|
1587 =item 3. directory path
|
|
1588
|
|
1589 The indicated directory will be searched for all files ending in the
|
|
1590 suffixes .gff, .gff.gz, .gff.Z or .gff.bz2.
|
|
1591
|
|
1592 =item 4. filehandle
|
|
1593
|
|
1594 An open filehandle from which to read the GFF data. Tied filehandles
|
|
1595 now work as well.
|
|
1596
|
|
1597 =item 5. a pipe expression
|
|
1598
|
|
1599 A pipe expression will also work. For example, a GFF file on a remote
|
|
1600 web server can be loaded with an expression like this:
|
|
1601
|
|
1602 $db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |");
|
|
1603
|
|
1604 =back
|
|
1605
|
|
1606 If successful, the method will return the number of GFF lines
|
|
1607 successfully loaded.
|
|
1608
|
|
1609 NOTE:this method used to be called load(), but has been changed. The
|
|
1610 old method name is also recognized.
|
|
1611
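A typical loading session might look like this (the path below is hypothetical):

  my $count = $db->load_gff('/usr/local/gff_files');
  print "loaded ", ($count || 0), " GFF lines\n";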
|
|
1612 =cut
|
|
1613
|
|
1614 sub load_gff {
|
|
1615 my $self = shift;
|
|
1616 my $file_or_directory = shift || '.';
|
|
1617 return $self->do_load_gff($file_or_directory) if ref($file_or_directory) &&
|
|
1618 tied *$file_or_directory;
|
|
1619
|
|
1620 my $tied_stdin = tied(*STDIN);
|
|
1621 open SAVEIN,"<&STDIN" unless $tied_stdin;
|
|
1622 local @ARGV = $self->setup_argv($file_or_directory,'gff') or return; # to play tricks with reader
|
|
1623 my $result = $self->do_load_gff('ARGV');
|
|
1624 open STDIN,"<&SAVEIN" unless $tied_stdin; # restore STDIN
|
|
1625 return $result;
|
|
1626 }
|
|
1627
|
|
1628 *load = \&load_gff;
|
|
1629
|
|
1630 =head2 load_fasta
|
|
1631
|
|
1632 Title : load_fasta
|
|
1633 Usage : $db->load_fasta($file|$directory|$filehandle);
|
|
1634 Function: load FASTA data into database
|
|
1635 Returns : count of records loaded
|
|
1636 Args : a directory, a file, a list of files,
|
|
1637 or a filehandle
|
|
1638 Status : Public
|
|
1639
|
|
1640 This method takes a single overloaded argument, which can be any of:
|
|
1641
|
|
1642 =over 4
|
|
1643
|
|
1644 =item 1. scalar corresponding to a FASTA file on the system
|
|
1645
|
|
1646 A pathname to a local FASTA file. Any files ending with the .gz, .Z, or
|
|
1647 .bz2 suffixes will be transparently decompressed with the appropriate
|
|
1648 command-line utility.
|
|
1649
|
|
1650 =item 2. array reference containing a list of FASTA files on the
|
|
1651 system
|
|
1652
|
|
1653 For example ['/home/fasta/genomic1.fa.gz','/home/fasta/genomic2.fa.gz']
|
|
1654
|
|
1655 =item 3. path to a directory
|
|
1656
|
|
1657 The indicated directory will be searched for all files ending in the
|
|
1658 suffixes .fa, .fa.gz, .fa.Z or .fa.bz2.
|
|
1659
|
|
1660 =item 4. filehandle
|
|
1661
|
|
1662 An open filehandle from which to read the FASTA data.
|
|
1663
|
|
1664 =item 5. pipe expression
|
|
1665
|
|
1666 A pipe expression will also work. For example, a FASTA file on a remote
|
|
1667 web server can be loaded with an expression like this:
|
|
1668
|
|
1669   $db->load_fasta("lynx -dump -source http://stein.cshl.org/fasta_test.fa |");
|
|
1670
|
|
1671 =back
|
|
1672
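Usage mirrors load_gff(). A minimal sketch (the path below is hypothetical):

  my $count = $db->load_fasta('/usr/local/fasta_files');
  print "loaded ", ($count || 0), " FASTA entries\n";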
|
|
1673 =cut
|
|
1674
|
|
1675 sub load_fasta {
|
|
1676 my $self = shift;
|
|
1677 my $file_or_directory = shift || '.';
|
|
1678 return $self->load_sequence($file_or_directory) if ref($file_or_directory) &&
|
|
1679 tied *$file_or_directory;
|
|
1680
|
|
1681 my $tied = tied(*STDIN);
|
|
1682 open SAVEIN,"<&STDIN" unless $tied;
|
|
1683 local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader
|
|
1684 my $result = $self->load_sequence('ARGV');
|
|
1685 open STDIN,"<&SAVEIN" unless $tied; # restore STDIN
|
|
1686 return $result;
|
|
1687 }
|
|
1688
|
|
1689 =head2 load_sequence_string
|
|
1690
|
|
1691 Title : load_sequence_string
|
|
1692 Usage : $db->load_sequence_string($id,$dna)
|
|
1693 Function: load a single DNA entry
|
|
1694 Returns : true if successfully loaded
|
|
1695  Args    : an ID and a raw sequence string (DNA, RNA, protein)
|
|
1696 Status : Public
|
|
1697
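A minimal sketch (the sequence ID and DNA below are made up):

  $db->load_sequence_string('ctg123', 'GATTACA' x 100)
      or warn "sequence load failed\n";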
|
|
1698 =cut
|
|
1699
|
|
1700 sub load_sequence_string {
|
|
1701 my $self = shift;
|
|
1702 my ($acc,$seq) = @_;
|
|
1703 my $offset = 0;
|
|
1704 $self->insert_sequence_chunk($acc,\$offset,\$seq) or return;
|
|
1705 $self->insert_sequence($acc,$offset,$seq) or return;
|
|
1706 1;
|
|
1707 }
|
|
1708
|
|
1709 sub setup_argv {
|
|
1710 my $self = shift;
|
|
1711 my $file_or_directory = shift;
|
|
1712 my @suffixes = @_;
|
|
1713 no strict 'refs'; # so that we can call fileno() on the argument
|
|
1714
|
|
1715 my @argv;
|
|
1716
|
|
1717 if (-d $file_or_directory) {
|
|
1718 @argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes;
|
|
1719 }elsif (my $fd = fileno($file_or_directory)) {
|
|
1720 open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN");
|
|
1721 @argv = '-';
|
|
1722 } elsif (ref $file_or_directory) {
|
|
1723 @argv = @$file_or_directory;
|
|
1724 } else {
|
|
1725 @argv = $file_or_directory;
|
|
1726 }
|
|
1727
|
|
1728 foreach (@argv) {
|
|
1729 if (/\.gz$/) {
|
|
1730 $_ = "gunzip -c $_ |";
|
|
1731 } elsif (/\.Z$/) {
|
|
1732 $_ = "uncompress -c $_ |";
|
|
1733 } elsif (/\.bz2$/) {
|
|
1734 $_ = "bunzip2 -c $_ |";
|
|
1735 }
|
|
1736 }
|
|
1737 @argv;
|
|
1738 }
|
|
1739
|
|
1740 =head2 lock_on_load
|
|
1741
|
|
1742 Title : lock_on_load
|
|
1743 Usage : $lock = $db->lock_on_load([$lock])
|
|
1744 Function: set write locking during load
|
|
1745 Returns : current value of lock-on-load flag
|
|
1746 Args : new value of lock-on-load-flag
|
|
1747 Status : Public
|
|
1748
|
|
1749 This method is honored by some of the adaptors. If the value is true,
|
|
1750 the tables used by the GFF modules will be locked for writing during
|
|
1751 loads and inaccessible to other processes.
|
|
1752
|
|
1753 =cut
|
|
1754
|
|
1755 sub lock_on_load {
|
|
1756 my $self = shift;
|
|
1757 my $d = $self->{lock};
|
|
1758 $self->{lock} = shift if @_;
|
|
1759 $d;
|
|
1760 }
|
|
1761
|
|
1762 =head2 meta
|
|
1763
|
|
1764 Title : meta
|
|
1765 Usage : $value = $db->meta($name [,$newval])
|
|
1766 Function: get or set a meta variable
|
|
1767 Returns : a string
|
|
1768 Args : meta variable name and optionally value
|
|
1769 Status : abstract
|
|
1770
|
|
1771 Get or set a named meta value for the database. Meta values can be
|
|
1772 used for database-specific settings.
|
|
1773
|
|
1774 By default, this method does nothing!
|
|
1775
|
|
1776 =cut
|
|
1777
|
|
1778 sub meta {
|
|
1779 my $self = shift;
|
|
1780 my ($name,$value) = @_;
|
|
1781 return;
|
|
1782 }
|
|
1783
|
|
1784 =head2 default_meta_values
|
|
1785
|
|
1786 Title : default_meta_values
|
|
1787 Usage : %values = $db->default_meta_values
|
|
1788  Function: return the default meta values for a newly initialized database
|
|
1789 Returns : a list of tag=>value pairs
|
|
1790 Args : none
|
|
1791 Status : protected
|
|
1792
|
|
1793 This method returns a list of tag=E<gt>value pairs that contain default
|
|
1794 meta information about the database. It is invoked by initialize() to
|
|
1795 write out the default meta values. The base class version returns an
|
|
1796 empty list.
|
|
1797
|
|
1798 For things to work properly, meta value names must be UPPERCASE.
|
|
1799
|
|
1800 =cut
|
|
1801
|
|
1802 sub default_meta_values {
|
|
1803 my $self = shift;
|
|
1804 return ();
|
|
1805 }
|
|
1806
|
|
1807
|
|
1808 =head2 error
|
|
1809
|
|
1810 Title : error
|
|
1811 Usage : $db->error( [$new error] );
|
|
1812 Function: read or set error message
|
|
1813 Returns : error message
|
|
1814 Args : an optional argument to set the error message
|
|
1815 Status : Public
|
|
1816
|
|
1817 This method can be used to retrieve the last error message. Errors
|
|
1818 are not reset to empty by successful calls, so contents are only valid
|
|
1819 immediately after an error condition has been detected.
|
|
1820
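For example (a sketch only; whether a failed lookup sets an error message depends on the adaptor in use):

  my $segment = $db->segment('no-such-landmark');
  warn $db->error unless defined $segment;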
|
|
1821 =cut
|
|
1822
|
|
1823 sub error {
|
|
1824 my $self = shift;
|
|
1825 my $g = $self->{error};
|
|
1826 $self->{error} = join '',@_ if @_;
|
|
1827 $g;
|
|
1828 }
|
|
1829
|
|
1830 =head2 debug
|
|
1831
|
|
1832 Title : debug
|
|
1833 Usage : $db->debug( [$flag] );
|
|
1834 Function: read or set debug flag
|
|
1835 Returns : current value of debug flag
|
|
1836 Args : new debug flag (optional)
|
|
1837 Status : Public
|
|
1838
|
|
1839 This method can be used to turn on debug messages. The exact nature
|
|
1840 of those messages depends on the adaptor in use.
|
|
1841
|
|
1842 =cut
|
|
1843
|
|
1844 sub debug {
|
|
1845 my $self = shift;
|
|
1846 my $g = $self->{debug};
|
|
1847 $self->{debug} = shift if @_;
|
|
1848 $g;
|
|
1849 }
|
|
1850
|
|
1851
|
|
1852 =head2 automerge
|
|
1853
|
|
1854 Title : automerge
|
|
1855 Usage : $db->automerge( [$new automerge] );
|
|
1856 Function: get or set automerge value
|
|
1857 Returns : current value (boolean)
|
|
1858 Args : an optional argument to set the automerge value
|
|
1859 Status : Public
|
|
1860
|
|
1861 By default, this module will use the aggregators to merge groups into
|
|
1862 single composite objects. This default can be changed to false by
|
|
1863 calling automerge(0).
|
|
1864
|
|
1865 =cut
|
|
1866
|
|
1867 sub automerge {
|
|
1868 my $self = shift;
|
|
1869 my $g = $self->{automerge};
|
|
1870 $self->{automerge} = shift if @_;
|
|
1871 $g;
|
|
1872 }
|
|
1873
|
|
1874 =head2 attributes
|
|
1875
|
|
1876 Title : attributes
|
|
1877 Usage : @attributes = $db->attributes($id,$name)
|
|
1878  Function: get the "attributes" of a particular feature
|
|
1879  Returns : an array of strings
|
|
1880  Args    : feature ID and, optionally, an attribute name
|
|
1881 Status : public
|
|
1882
|
|
1883 Some GFF version 2 files use the groups column to store a series of
|
|
1884 attribute/value pairs. In this interpretation of GFF, the first such
|
|
1885 pair is treated as the primary group for the feature; subsequent pairs
|
|
1886 are treated as attributes. Two attributes have special meaning:
|
|
1887 "Note" is for backward compatibility and is used for unstructured text
|
|
1888 remarks. "Alias" is considered as a synonym for the feature name.
|
|
1889
|
|
1890 If no name is provided, then attributes() returns a flattened list of
|
|
1891 attribute=E<gt>value pairs suitable for assignment to a hash. This lets you do:
|
|
1892
|
|
1893 %attributes = $db->attributes($id);
|
|
1894
|
|
1895 Normally, attributes() will be called by the feature:
|
|
1896
|
|
1897 @notes = $feature->attributes('Note');
|
|
1898
|
|
1899 In a scalar context, attributes() returns the first value of the
|
|
1900 attribute if a tag is present, otherwise a hash reference in which the
|
|
1901 keys are attribute names and the values are anonymous arrays
|
|
1902 containing the values.
|
|
1903
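For example, in scalar context (the feature ID below is hypothetical):

  my $id         = 12345;                          # hypothetical feature ID
  my $first_note = $db->attributes($id, 'Note');   # first Note value, if any
  my $hashref    = $db->attributes($id);           # { tag => [values...] }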
|
|
1904 =cut
|
|
1905
|
|
1906 sub attributes {
|
|
1907 my $self = shift;
|
|
1908 my ($id,$tag) = @_;
|
|
1909 my @result = $self->do_attributes($id,$tag) or return;
|
|
1910 return @result if wantarray;
|
|
1911
|
|
1912   # what to do in a scalar context
|
|
1913 return $result[0] if $tag;
|
|
1914 my %result;
|
|
1915 while (my($key,$value) = splice(@result,0,2)) {
|
|
1916 push @{$result{$key}},$value;
|
|
1917 }
|
|
1918 return \%result;
|
|
1919 }
|
|
1920
|
|
1921 =head2 fast_queries
|
|
1922
|
|
1923 Title : fast_queries
|
|
1924 Usage : $flag = $db->fast_queries([$flag])
|
|
1925 Function: turn on and off the "fast queries" option
|
|
1926 Returns : a boolean
|
|
1927 Args : a boolean flag (optional)
|
|
1928 Status : public
|
|
1929
|
|
1930 The mysql database driver (and possibly others) support a "fast" query
|
|
1931 mode that caches results on the server side. This makes queries come
|
|
1932 back faster, particularly when creating iterators. The downside is
|
|
1933 that while iterating, new queries will die with a "commands out of
|
|
1934 sync" error. This method turns the feature on and off.
|
|
1935
|
|
1936 For databases that do not support a fast query, this method has no
|
|
1937 effect.
|
|
1938
|
|
1939 =cut
|
|
1940
|
|
1941 # override this method in order to set the mysql_use_result attribute, which is an obscure
|
|
1942 # but extremely powerful optimization for both performance and memory.
|
|
1943 sub fast_queries {
|
|
1944 my $self = shift;
|
|
1945 my $d = $self->{fast_queries};
|
|
1946 $self->{fast_queries} = shift if @_;
|
|
1947 $d;
|
|
1948 }
|
|
1949
|
|
1950 =head2 add_aggregator
|
|
1951
|
|
1952 Title : add_aggregator
|
|
1953 Usage : $db->add_aggregator($aggregator)
|
|
1954 Function: add an aggregator to the list
|
|
1955 Returns : nothing
|
|
1956 Args : an aggregator
|
|
1957 Status : public
|
|
1958
|
|
1959 This method will append an aggregator to the end of the list of
|
|
1960 registered aggregators. Three different argument types are accepted:
|
|
1961
|
|
1962 1) a Bio::DB::GFF::Aggregator object -- will be added
|
|
1963 2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}"
|
|
1964 -- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method
|
|
1965 part is optional).
|
|
1966 3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator
|
|
1967 subclass, where the token corresponds to the subclass name.
|
|
1968
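For example (the aggregator names and subparts below are purely illustrative):

  $db->add_aggregator('transcript');                 # form 3: aggregator subclass by name
  $db->add_aggregator('gene{exon,CDS/transcript}');  # form 2: "name{subparts/main_method}"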
|
|
1969 =cut
|
|
1970
|
|
1971 sub add_aggregator {
|
|
1972 my $self = shift;
|
|
1973 my $aggregator = shift;
|
|
1974 my $list = $self->{aggregators} ||= [];
|
|
1975 if (ref $aggregator) { # an object
|
|
1976 @$list = grep {$_->get_method ne $aggregator->get_method} @$list;
|
|
1977 push @$list,$aggregator;
|
|
1978 }
|
|
1979
|
|
1980 elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) {
|
|
1981 my($agg_name,$subparts,$mainpart) = ($1,$2,$3);
|
|
1982 my @subparts = split /,\s*/,$subparts;
|
|
1983 my @args = (-method => $agg_name,
|
|
1984 -sub_parts => \@subparts);
|
|
1985 push @args,(-main_method => $mainpart) if $mainpart;
|
|
1986 warn "making an aggregator with (@args), subparts = @subparts" if $self->debug;
|
|
1987 push @$list,Bio::DB::GFF::Aggregator->new(@args);
|
|
1988 }
|
|
1989
|
|
1990 else {
|
|
1991 my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E";
|
|
1992 eval "require $class";
|
|
1993 $self->throw("Unable to load $aggregator aggregator: $@") if $@;
|
|
1994 push @$list,$class->new();
|
|
1995 }
|
|
1996 }
|
|
1997
|
|
1998 =head2 aggregators
|
|
1999
|
|
2000 Title : aggregators
|
|
2001 Usage : $db->aggregators([@new_aggregators]);
|
|
2002 Function: retrieve list of aggregators
|
|
2003 Returns : list of aggregators
|
|
2004 Args : a list of aggregators to set (optional)
|
|
2005 Status : public
|
|
2006
|
|
2007 This method will get or set the list of aggregators assigned to
|
|
2008 the database. If one or more arguments are passed, the existing
|
|
2009 set will be cleared and replaced with the new list.
|
|
2010
|
|
2011 =cut
|
|
2012
|
|
2013 sub aggregators {
|
|
2014 my $self = shift;
|
|
2015 my $d = $self->{aggregators};
|
|
2016 if (@_) {
|
|
2017 $self->clear_aggregators;
|
|
2018 $self->add_aggregator($_) foreach @_;
|
|
2019 }
|
|
2020 return unless $d;
|
|
2021 return @$d;
|
|
2022 }
|
|
2023
|
|
2024 =head2 clear_aggregators
|
|
2025
|
|
2026 Title : clear_aggregators
|
|
2027 Usage : $db->clear_aggregators
|
|
2028 Function: clears list of aggregators
|
|
2029 Returns : nothing
|
|
2030 Args : none
|
|
2031 Status : public
|
|
2032
|
|
2033 This method will clear the aggregators stored in the database object.
|
|
2034 Use aggregators() or add_aggregator() to add some back.
|
|
2035
|
|
2036 =cut
|
|
2037
|
|
2038 sub clear_aggregators { shift->{aggregators} = [] }
|
|
2039
|
|
2040 =head1 Methods for use by Subclasses
|
|
2041
|
|
2042 The following methods are chiefly of interest to subclasses and are
|
|
2043 not intended for use by end programmers.
|
|
2044
|
|
2045 =head2 abscoords
|
|
2046
|
|
2047 Title : abscoords
|
|
2048 Usage : $db->abscoords($name,$class,$refseq)
|
|
2049 Function: finds position of a landmark in reference coordinates
|
|
2050 Returns : ($ref,$class,$start,$stop,$strand)
|
|
2051  Args    : name and class of landmark, and an optional reference sequence
|
|
2052 Status : public
|
|
2053
|
|
2054 This method is called by Bio::DB::GFF::RelSegment to obtain the
|
|
2055 absolute coordinates of a sequence landmark. The arguments are the
|
|
2056 name and class of the landmark. If successful, abscoords() returns
|
|
2057 the ID of the reference sequence, its class, its start and stop
|
|
2058 positions, and the orientation of the reference sequence's coordinate
|
|
2059 system ("+" for forward strand, "-" for reverse strand).
|
|
2060
|
|
2061 If $refseq is present in the argument list, it forces the query to
|
|
2062 search for the landmark in a particular reference sequence.
|
|
2063
|
|
2064 =cut
|
|
2065
|
|
2066 sub abscoords {
|
|
2067 my $self = shift;
|
|
2068 my ($name,$class,$refseq) = @_;
|
|
2069 $class ||= $self->{default_class};
|
|
2070 $self->get_abscoords($name,$class,$refseq);
|
|
2071 }
|
|
2072
|
|
2073 =head1 Protected API
|
|
2074
|
|
2075 The following methods are not intended for public consumption, but are
|
|
2076 intended to be overridden/implemented by adaptors.
|
|
2077
|
|
2078 =head2 default_aggregators
|
|
2079
|
|
2080 Title : default_aggregators
|
|
2081 Usage : $db->default_aggregators;
|
|
2082 Function: retrieve list of aggregators
|
|
2083 Returns : array reference containing list of aggregator names
|
|
2084 Args : none
|
|
2085 Status : protected
|
|
2086
|
|
2087 This method (which is intended to be overridden by adaptors) returns a
|
|
2088 list of standard aggregators to be applied when no aggregators are
|
|
2089 specified in the constructor.
|
|
2090
|
|
2091 =cut
|
|
2092
|
|
2093 sub default_aggregators {
|
|
2094 my $self = shift;
|
|
2095 return ['processed_transcript','alignment'];
|
|
2096 }
|
|
2097
|
|
2098 =head2 do_load_gff
|
|
2099
|
|
2100 Title : do_load_gff
|
|
2101 Usage : $db->do_load_gff($handle)
|
|
2102 Function: load a GFF input stream
|
|
2103 Returns : number of features loaded
|
|
2104 Args : A filehandle.
|
|
2105 Status : protected
|
|
2106
|
|
2107 This method is called to load a GFF data stream. The method will read
|
|
2108 GFF features from the provided filehandle and load them into the database. On exit the
|
|
2109 method must return the number of features loaded.
|
|
2110
|
|
2111 Note that the method is responsible for parsing the GFF lines. This
|
|
2112 is to allow for differences in the interpretation of the "group"
|
|
2113 field, which are legion.
|
|
2114
|
|
2115 You probably want to use load_gff() instead. It is more flexible
|
|
2116 about the arguments it accepts.
|
|
2117
|
|
2118 =cut
|
|
2119
|
|
2120 # load from <>
|
|
2121 sub do_load_gff {
|
|
2122 my $self = shift;
|
|
2123 my $io_handle = shift;
|
|
2124
|
|
2125 local $self->{gff3_flag} = 0;
|
|
2126 $self->setup_load();
|
|
2127
|
|
2128 my $fasta_sequence_id;
|
|
2129
|
|
2130 while (<$io_handle>) {
|
|
2131 chomp;
|
|
2132 $self->{gff3_flag}++ if /^\#\#gff-version\s+3/;
|
|
2133 if (/^>(\S+)/) { # uh oh, sequence coming
|
|
2134 $fasta_sequence_id = $1;
|
|
2135 last;
|
|
2136 }
|
|
2137 if (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
|
|
2138 $self->load_gff_line(
|
|
2139 {
|
|
2140 ref => $1,
|
|
2141 class => 'Sequence',
|
|
2142 source => 'reference',
|
|
2143 method => 'Component',
|
|
2144 start => $2,
|
|
2145 stop => $3,
|
|
2146 score => undef,
|
|
2147 strand => undef,
|
|
2148 phase => undef,
|
|
2149 gclass => 'Sequence',
|
|
2150 gname => $1,
|
|
2151 tstart => undef,
|
|
2152 tstop => undef,
|
|
2153 attributes => [],
|
|
2154 }
|
|
2155 );
|
|
2156 next;
|
|
2157 }
|
|
2158
|
|
2159 next if /^\#/;
|
|
2160 my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
|
|
2161 next unless defined($ref) && defined($method) && defined($start) && defined($stop);
|
|
2162 foreach (\$score,\$strand,\$phase) {
|
|
2163 undef $$_ if $$_ eq '.';
|
|
2164 }
|
|
2165
|
|
2166 my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{gff3_flag});
|
|
2167
|
|
2168 # no standard way in the GFF file to denote the class of the reference sequence -- drat!
|
|
2169 # so we invoke the factory to do it
|
|
2170 my $class = $self->refclass($ref);
|
|
2171
|
|
2172 # call subclass to do the dirty work
|
|
2173 if ($start > $stop) {
|
|
2174 ($start,$stop) = ($stop,$start);
|
|
2175 if ($strand eq '+') {
|
|
2176 $strand = '-';
|
|
2177 } elsif ($strand eq '-') {
|
|
2178 $strand = '+';
|
|
2179 }
|
|
2180 }
|
|
2181 $self->load_gff_line({ref => $ref,
|
|
2182 class => $class,
|
|
2183 source => $source,
|
|
2184 method => $method,
|
|
2185 start => $start,
|
|
2186 stop => $stop,
|
|
2187 score => $score,
|
|
2188 strand => $strand,
|
|
2189 phase => $phase,
|
|
2190 gclass => $gclass,
|
|
2191 gname => $gname,
|
|
2192 tstart => $tstart,
|
|
2193 tstop => $tstop,
|
|
2194 attributes => $attributes}
|
|
2195 );
|
|
2196 }
|
|
2197
|
|
2198 my $result = $self->finish_load();
|
|
2199 $result += $self->load_sequence($io_handle,$fasta_sequence_id)
|
|
2200 if defined $fasta_sequence_id;
|
|
2201 $result;
|
|
2202
|
|
2203 }
|
|
2204
|
|
2205 =head2 load_sequence
|
|
2206
|
|
2207 Title : load_sequence
|
|
2208 Usage : $db->load_sequence($handle [,$id])
|
|
2209 Function: load a FASTA data stream
|
|
2210 Returns : number of sequences
|
|
2211 Args : a filehandle and optionally the ID of
|
|
2212 the first sequence in the stream.
|
|
2213 Status : protected
|
|
2214
|
|
2215 You probably want to use load_fasta() instead. The $id argument is a
|
|
2216 hack used to switch from GFF loading to FASTA loading when load_gff()
|
|
2217 discovers FASTA data hiding at the bottom of the GFF file (as Artemis
|
|
2218 does).
|
|
2219
|
|
2220 =cut
|
|
2221
|
|
2222 sub load_sequence {
|
|
2223 my $self = shift;
|
|
2224 my $io_handle = shift;
|
|
2225 my $id = shift; # hack for GFF files that contain fasta data
|
|
2226
|
|
2227 # read fasta file(s) from ARGV
|
|
2228 my ($seq,$offset,$loaded) = (undef,0,0);
|
|
2229 while (<$io_handle>) {
|
|
2230 chomp;
|
|
2231 if (/^>(\S+)/) {
|
|
2232 $self->insert_sequence($id,$offset,$seq) if $id;
|
|
2233 $id = $1;
|
|
2234 $offset = 0;
|
|
2235 $seq = '';
|
|
2236 $loaded++;
|
|
2237 } else {
|
|
2238 $seq .= $_;
|
|
2239 $self->insert_sequence_chunk($id,\$offset,\$seq);
|
|
2240 }
|
|
2241 }
|
|
2242 $self->insert_sequence($id,$offset,$seq) if $id;
|
|
2243 $loaded+0;
|
|
2244 }
|
|
2245
|
|
2246 sub insert_sequence_chunk {
|
|
2247 my $self = shift;
|
|
2248 my ($id,$offsetp,$seqp) = @_;
|
|
2249 if (my $cs = $self->dna_chunk_size) {
|
|
2250 while (length($$seqp) >= $cs) {
|
|
2251 my $chunk = substr($$seqp,0,$cs);
|
|
2252 $self->insert_sequence($id,$$offsetp,$chunk);
|
|
2253 $$offsetp += length($chunk);
|
|
2254 substr($$seqp,0,$cs) = '';
|
|
2255 }
|
|
2256 }
|
|
2257 return 1; # the calling routine may expect success or failure
|
|
2258 }
|
|
2259
|
|
2260 # used to store big pieces of DNA in itty bitty pieces
|
|
2261 sub dna_chunk_size {
|
|
2262 return 0;
|
|
2263 }
|
|
2264
|
|
2265 sub insert_sequence {
|
|
2266 my $self = shift;
|
|
2267 my($id,$offset,$seq) = @_;
|
|
2268 $self->throw('insert_sequence(): must be defined in subclass');
|
|
2269 }
|
|
2270
|
|
2271 # This is the default class for reference points. Defaults to Sequence.
|
|
2272 sub default_class {
|
|
2273 my $self = shift;
|
|
2274 my $d = exists($self->{default_class}) ? $self->{default_class} : 'Sequence';
|
|
2275 $self->{default_class} = shift if @_;
|
|
2276 $d;
|
|
2277 }
|
|
2278
|
|
2279 # gets name of the reference sequence, and returns its class
|
|
2280 # currently just calls default_class
|
|
2281 sub refclass {
|
|
2282 my $self = shift;
|
|
2283 my $name = shift;
|
|
2284 return $self->default_class;
|
|
2285 }
|
|
2286
|
|
2287 =head2 setup_load
|
|
2288
|
|
2289 Title : setup_load
|
|
2290 Usage : $db->setup_load
|
|
2291 Function: called before load_gff_line()
|
|
2292 Returns : void
|
|
2293 Args : none
|
|
2294 Status : abstract
|
|
2295
|
|
2296 This abstract method gives subclasses a chance to do any
|
|
2297 schema-specific initialization prior to loading a set of GFF records.
|
|
2298 It must be implemented by a subclass.
|
|
2299
|
|
2300 =cut
|
|
2301
|
|
2302 sub setup_load {
|
|
2303 # default, do nothing
|
|
2304 }
|
|
2305
|
|
2306 =head2 finish_load
|
|
2307
|
|
2308 Title : finish_load
|
|
2309 Usage : $db->finish_load
|
|
2310 Function: called after load_gff_line()
|
|
2311 Returns : number of records loaded
|
|
2312 Args : none
|
|
2313  Status  : abstract
|
|
2314
|
|
2315 This method gives subclasses a chance to do any schema-specific
|
|
2316 cleanup after loading a set of GFF records.
|
|
2317
|
|
2318 =cut
|
|
2319
|
|
2320 sub finish_load {
|
|
2321 # default, do nothing
|
|
2322 }
|
|
2323
|
|
2324 =head2 load_gff_line
|
|
2325
|
|
2326 Title : load_gff_line
|
|
2327 Usage : $db->load_gff_line(@args)
|
|
2328 Function: called to load one parsed line of GFF
|
|
2329 Returns : true if successfully inserted
|
|
2330 Args : see below
|
|
2331 Status : abstract
|
|
2332
|
|
2333 This abstract method is called once per line of the GFF and passed a
|
|
2334 hashref containing parsed GFF fields. The fields are:
|
|
2335
|
|
2336 {ref => $ref,
|
|
2337 class => $class,
|
|
2338 source => $source,
|
|
2339 method => $method,
|
|
2340 start => $start,
|
|
2341 stop => $stop,
|
|
2342 score => $score,
|
|
2343 strand => $strand,
|
|
2344 phase => $phase,
|
|
2345 gclass => $gclass,
|
|
2346 gname => $gname,
|
|
2347 tstart => $tstart,
|
|
2348 tstop => $tstop,
|
|
2349 attributes => $attributes}
|
|
2350
|
|
2351 =cut
|
|
2352
|
|
2353 sub load_gff_line {
|
|
2354 shift->throw("load_gff_line(): must be implemented by an adaptor");
|
|
2355 }
|
|
2356
|
|
2357
|
|
2358 =head2 do_initialize
|
|
2359
|
|
2360 Title : do_initialize
|
|
2361 Usage : $db->do_initialize([$erase])
|
|
2362 Function: initialize and possibly erase database
|
|
2363 Returns : true if successful
|
|
2364 Args : optional erase flag
|
|
2365 Status : protected
|
|
2366
|
|
2367 This method implements the initialize() method described above, and
|
|
2368 takes the same arguments.
|
|
2369
|
|
2370 =cut
|
|
2371
|
|
2372 sub do_initialize {
|
|
2373 shift->throw('do_initialize(): must be implemented by an adaptor');
|
|
2374 }
|
|
2375
|
|
2376 =head2 dna
|
|
2377
|
|
2378 Title : dna
|
|
2379 Usage : $db->dna($id,$start,$stop,$class)
|
|
2380 Function: return the raw DNA string for a segment
|
|
2381 Returns : a raw DNA string
|
|
2382 Args : id of the sequence, its class, start and stop positions
|
|
2383 Status : public
|
|
2384
|
|
2385 This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA
|
|
2386 sequence.
|
|
2387
|
|
2388 Arguments: -name sequence name
|
|
2389 -start start position
|
|
2390 -stop stop position
|
|
2391 -class sequence class
|
|
2392
|
|
2393 If start and stop are both undef, then the entire DNA is retrieved.
|
|
2394 So to fetch the whole dna, call like this:
|
|
2395
|
|
2396 $db->dna($name_of_sequence);
|
|
2397
|
|
2398 or like this:
|
|
2399
|
|
2400 $db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence);
|
|
2401
|
|
2402 NOTE: you will probably prefer to create a Segment and then invoke its
|
|
2403 dna() method.
|
|
2404
|
|
2405 =cut
|
|
2406
|
|
2407 # call to return the DNA string for the indicated region
|
|
2408 # real work is done by get_dna()
|
|
2409 sub dna {
|
|
2410 my $self = shift;
|
|
2411 my ($id,$start,$stop,$class) = rearrange([
|
|
2412 [qw(NAME ID REF REFSEQ)],
|
|
2413 qw(START),
|
|
2414 [qw(STOP END)],
|
|
2415 'CLASS',
|
|
2416 ],@_);
|
|
2417 # return unless defined $start && defined $stop;
|
|
2418 $self->get_dna($id,$start,$stop,$class);
|
|
2419 }
|
|
2420
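# General-purpose range query used by overlapping_features(), contained_features()
# and contained_in(); the -range_type argument selects one of 'overlaps',
# 'contains' or 'contained_in' and is validated against %valid_range_types.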
|
|
2421 sub features_in_range {
|
|
2422 my $self = shift;
|
|
2423 my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) =
|
|
2424 rearrange([
|
|
2425 [qw(RANGE_TYPE)],
|
|
2426 [qw(REF REFSEQ)],
|
|
2427 qw(CLASS),
|
|
2428 qw(START),
|
|
2429 [qw(STOP END)],
|
|
2430 [qw(TYPE TYPES)],
|
|
2431 qw(PARENT),
|
|
2432 [qw(RARE SPARSE)],
|
|
2433 [qw(MERGE AUTOMERGE)],
|
|
2434 'ITERATOR'
|
|
2435 ],@_);
|
|
2436 $other ||= {};
|
|
2437 $automerge = $types && $self->automerge unless defined $automerge;
|
|
2438 $self->throw("range type must be one of {".
|
|
2439 join(',',keys %valid_range_types).
|
|
2440 "}\n")
|
|
2441 unless $valid_range_types{lc $range_type};
|
|
2442 $self->_features({
|
|
2443 rangetype => lc $range_type,
|
|
2444 refseq => $refseq,
|
|
2445 refclass => $class,
|
|
2446 start => $start,
|
|
2447 stop => $stop,
|
|
2448 types => $types },
|
|
2449 {
|
|
2450 sparse => $sparse,
|
|
2451 automerge => $automerge,
|
|
2452 iterator => $iterator,
|
|
2453 %$other,
|
|
2454 },
|
|
2455 $parent);
|
|
2456 }
|
|
2457
|
|
2458 =head2 get_dna
|
|
2459
|
|
2460 Title : get_dna
|
|
2461 Usage : $db->get_dna($id,$start,$stop,$class)
|
|
2462 Function: get DNA for indicated segment
|
|
2463 Returns : the dna string
|
|
2464 Args : sequence ID, start, stop and class
|
|
2465 Status : protected
|
|
2466
|
|
2467 If start E<gt> stop and the sequence is nucleotide, then this method
|
|
2468 should return the reverse complement. The sequence class may be
|
|
2469 ignored by those databases that do not recognize different object
|
|
2470 types.
|
|
2471
|
|
2472 =cut
|
|
2473
|
|
2474 sub get_dna {
|
|
2475 my $self = shift;
|
|
2476 my ($id,$start,$stop,$class,) = @_;
|
|
2477 $self->throw("get_dna() must be implemented by an adaptor");
|
|
2478 }
|
|
2479
|
|
2480 =head2 get_features
|
|
2481
|
|
2482 Title : get_features
|
|
2483 Usage : $db->get_features($search,$options,$callback)
|
|
2484 Function: get list of features for a region
|
|
2485 Returns : count of number of features retrieved
|
|
2486 Args : see below
|
|
2487 Status : protected
|
|
2488
|
|
2489 The first argument is a hash reference containing search criteria for
|
|
2490 retrieving features. It contains the following keys:
|
|
2491
|
|
2492 rangetype One of "overlaps", "contains" or "contained_in". Indicates
|
|
2493 the type of range query requested.
|
|
2494
|
|
2495 refseq ID of the landmark that establishes the absolute
|
|
2496 coordinate system.
|
|
2497
|
|
2498 refclass Class of this landmark. Can be ignored by implementations
|
|
2499 that don't recognize such distinctions.
|
|
2500
|
|
2501 start Start of the range, inclusive.
|
|
2502
|
|
2503 stop Stop of the range, inclusive.
|
|
2504
|
|
2505 types Array reference containing the list of annotation types
|
|
2506 to fetch from the database. Each annotation type is an
|
|
2507 array reference consisting of [source,method].
|
|
2508
|
|
2509 The second argument is a hash reference containing certain options
|
|
2510 that affect the way information is retrieved:
|
|
2511
|
|
2512 sort_by_group
|
|
2513 A flag. If true, means that the returned features should be
|
|
2514 sorted by the group that they're in.
|
|
2515
|
|
2516 sparse A flag. If true, means that the expected density of the
|
|
2517 features is such that it will be more efficient to search
|
|
2518 by type rather than by range. If it is taking a long
|
|
2519 time to fetch features, give this a try.
|
|
2520
|
|
2521 binsize A true value will create a set of artificial features whose
|
|
2522 start and stop positions indicate bins of the given size, and
|
|
2523 whose scores are the number of features in the bin. The
|
|
2524 class of the feature will be set to "bin", and its name to
|
|
2525 "method:source". This is a handy way of generating histograms
|
|
2526 of feature density.
|
|
2527
|
|
2528 The third argument, the $callback, is a code reference to which
|
|
2529 retrieved features are passed. It is described in more detail below.
|
|
2530
|
|
2531 This routine is responsible for getting arrays of GFF data out of the
|
|
2532 database and passing them to the callback subroutine. The callback
|
|
2533 does the work of constructing a Bio::DB::GFF::Feature object out of
|
|
2534 that data. The callback expects a list of 13 fields:
|
|
2535
|
|
2536 $refseq The reference sequence
|
|
2537 $start feature start
|
|
2538 $stop feature stop
|
|
2539 $source feature source
|
|
2540 $method feature method
|
|
2541 $score feature score
|
|
2542 $strand feature strand
|
|
2543 $phase feature phase
|
|
2544 $groupclass group class (may be undef)
|
|
2545 $groupname group ID (may be undef)
|
|
2546 $tstart target start for similarity hits (may be undef)
|
|
2547 $tstop target stop for similarity hits (may be undef)
|
|
2548 $feature_id A unique feature ID (may be undef)
|
|
2549
|
|
2550 These fields are in the same order as the raw GFF file, with the
|
|
2551 exception that the group column has been parsed into group class and
|
|
2552 group name fields.
|
|
2553
|
|
2554 The feature ID, if provided, is a unique identifier of the feature
|
|
2555 line. The module does not depend on this ID in any way, but it is
|
|
2556 available via Bio::DB::GFF-E<gt>id() if wanted. In the dbi::mysql and
|
|
2557 dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb
|
|
2558 adaptor it is not used.
|
|
2559
|
|
2560 =cut
|
|
2561
|
|
2562 sub get_features{
|
|
2563 my $self = shift;
|
|
2564 my ($search,$options,$callback) = @_;
|
|
2565 $self->throw("get_features() must be implemented by an adaptor");
|
|
2566 }
|
|
2567
|
|
2568
|
|
2569 =head2 _feature_by_name
|
|
2570
|
|
2571 Title : _feature_by_name
|
|
2572 Usage : $db->_feature_by_name($class,$name,$location,$callback)
|
|
2573 Function: get a list of features by name and class
|
|
2574 Returns : count of number of features retrieved
|
|
2575 Args : name of feature, class of feature, and a callback
|
|
2576 Status : abstract
|
|
2577
|
|
2578 This method is used internally. The callback arguments are the same
|
|
2579 as those used by make_feature(). This method must be overidden by
|
|
2580 subclasses.
|
|
2581
|
|
2582 =cut
|
|
2583
|
|
2584 sub _feature_by_name {
|
|
2585 my $self = shift;
|
|
2586 my ($class,$name,$location,$callback) = @_;
|
|
2587 $self->throw("_feature_by_name() must be implemented by an adaptor");
|
|
2588 }
|
|
2589
|
|
2590 sub _feature_by_attribute {
|
|
2591 my $self = shift;
|
|
2592 my ($attributes,$callback) = @_;
|
|
2593   $self->throw("_feature_by_attribute() must be implemented by an adaptor");
|
|
2594 }
|
|
2595
|
|
2596 =head2 _feature_by_id
|
|
2597
|
|
2598 Title : _feature_by_id
|
|
2599 Usage : $db->_feature_by_id($ids,$type,$callback)
|
|
2600  Function: get features based on their IDs
|
|
2601  Returns : count of number of features retrieved
|
|
2602  Args    : array ref of feature IDs, the ID type, and a callback
|
|
2603 Status : abstract
|
|
2604
|
|
2605 This method is used internally to fetch features either by their ID or
|
|
2606 their group ID. $ids is a arrayref containing a list of IDs, $type is
|
|
2607 one of "feature" or "group", and $callback is a callback. The
|
|
2608 callback arguments are the same as those used by make_feature(). This
|
|
2609 method must be overidden by subclasses.
|
|
2610
|
|
2611 =cut
|
|
2612
|
|
2613 sub _feature_by_id {
|
|
2614 my $self = shift;
|
|
2615 my ($ids,$type,$callback) = @_;
|
|
2616 $self->throw("_feature_by_id() must be implemented by an adaptor");
|
|
2617 }
|
|
2618
|
|
2619 =head2 overlapping_features
|
|
2620
|
|
2621 Title : overlapping_features
|
|
2622 Usage : $db->overlapping_features(@args)
|
|
2623 Function: get features that overlap the indicated range
|
|
2624 Returns : a list of Bio::DB::GFF::Feature objects
|
|
2625 Args : see below
|
|
2626 Status : public
|
|
2627
|
|
2628 This method is invoked by Bio::DB::GFF::Segment-E<gt>features() to find
|
|
2629 the list of features that overlap a given range. It is generally
|
|
2630 preferable to create the Segment first, and then fetch the features.
|
|
2631
|
|
2632 This method takes set of named arguments:
|
|
2633
|
|
2634 -refseq ID of the reference sequence
|
|
2635 -class Class of the reference sequence
|
|
2636 -start Start of the desired range in refseq coordinates
|
|
2637 -stop Stop of the desired range in refseq coordinates
|
|
2638 -types List of feature types to return. Argument is an array
|
|
2639 reference containing strings of the format "method:source"
|
|
2640 -parent A parent Bio::DB::GFF::Segment object, used to create
|
|
2641 relative coordinates in the generated features.
|
|
2642 -rare Turn on an optimization suitable for a relatively rare feature type,
|
|
2643 where it will be faster to filter by feature type first
|
|
2644 and then by position, rather than vice versa.
|
|
2645 -merge Whether to apply aggregators to the generated features.
|
|
2646 -iterator Whether to return an iterator across the features.
|
|
2647
|
|
2648 If -iterator is true, then the method returns a single scalar value
|
|
2649 consisting of a Bio::SeqIO object. You can call next_seq() repeatedly
|
|
2650 on this object to fetch each of the features in turn. If iterator is
|
|
2651 false or absent, then all the features are returned as a list.
|
|
2652
|
|
2653 Currently aggregation is disabled when iterating over a series of
|
|
2654 features.
|
|
2655
|
|
2656 Types are indicated using the nomenclature "method:source". Either of
|
|
2657 these fields can be omitted, in which case a wildcard is used for the
|
|
2658 missing field. Type names without the colon (e.g. "exon") are
|
|
2659 interpreted as the method name and a source wild card. Regular
|
|
2660 expressions are allowed in either field, as in: "similarity:BLAST.*".
|
|
2661
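For example (the landmark and coordinates below are hypothetical):

  my @features = $db->overlapping_features(-refseq => 'CHROMOSOME_I',
                                           -start  => 100_000,
                                           -stop   => 105_000,
                                           -types  => ['exon:curated','intron']);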
|
|
2662 =cut
|
|
2663
|
|
2664 # call to return the features that overlap the named region
|
|
2665 # real work is done by get_features
|
|
2666 sub overlapping_features {
|
|
2667 my $self = shift;
|
|
2668 $self->features_in_range(-range_type=>'overlaps',@_);
|
|
2669 }
|
|
2670
|
|
2671 =head2 contained_features
|
|
2672
|
|
2673 Title : contained_features
|
|
2674 Usage : $db->contained_features(@args)
|
|
2675 Function: get features that are contained within the indicated range
|
|
2676 Returns : a list of Bio::DB::GFF::Feature objects
|
|
2677 Args : see overlapping_features()
|
|
2678 Status : public
|
|
2679
|
|
2680 This call is similar to overlapping_features(), except that it only
|
|
2681 retrieves features whose end points are completely contained within
|
|
2682 the specified range.
|
|
2683
|
|
2684 Generally you will want to fetch a Bio::DB::GFF::Segment object and
|
|
2685 call its contained_features() method rather than call this directly.
|
|
2686
|
|
2687 =cut
|
|
2688
|
|
2689 # The same, except that it only returns features that are completely contained within the
|
|
2690 # range (much faster usually)
|
|
2691 sub contained_features {
|
|
2692 my $self = shift;
|
|
2693 $self->features_in_range(-range_type=>'contains',@_);
|
|
2694 }
|
|
2695
|
|
2696 =head2 contained_in
|
|
2697
|
|
2698 Title : contained_in
|
|
2699 Usage : @features = $s->contained_in(@args)
|
|
2700 Function: get features that contain this segment
|
|
2701 Returns : a list of Bio::DB::GFF::Feature objects
|
|
2702 Args : see features()
|
|
2703 Status : Public
|
|
2704
|
|
2705 This is identical in behavior to features() except that it returns
|
|
2706 only those features that completely contain the segment.
|
|
2707
|
|
2708 =cut
|
|
2709
|
|
2710 sub contained_in {
|
|
2711 my $self = shift;
|
|
2712 $self->features_in_range(-range_type=>'contained_in',@_);
|
|
2713 }
|
|
2714
|
|
2715 =head2 get_abscoords
|
|
2716
|
|
2717 Title : get_abscoords
|
|
2718 Usage : $db->get_abscoords($name,$class,$refseq)
|
|
2719 Function: get the absolute coordinates of sequence with name & class
|
|
2720 Returns : ($absref,$absstart,$absstop,$absstrand)
|
|
2721 Args : name and class of the landmark
|
|
2722 Status : protected
|
|
2723
|
|
2724 Given the name and class of a genomic landmark, this function returns
|
|
2725 a four-element array consisting of:
|
|
2726
|
|
2727 $absref the ID of the reference sequence that contains this landmark
|
|
2728 $absstart the position at which the landmark starts
|
|
2729 $absstop the position at which the landmark stops
|
|
2730 $absstrand the strand of the landmark, relative to the reference sequence
|
|
2731
|
|
2732 If $refseq is provided, the function searches only within the
|
|
2733 specified reference sequence.
|
|
2734
|
|
2735 =cut
|
|
2736
|
|
2737 sub get_abscoords {
|
|
2738 my $self = shift;
|
|
2739 my ($name,$class,$refseq) = @_;
|
|
2740 $self->throw("get_abscoords() must be implemented by an adaptor");
|
|
2741 }
|
|
2742
|
|
2743 =head2 get_types
|
|
2744
|
|
2745 Title : get_types
|
|
2746 Usage : $db->get_types($absref,$class,$start,$stop,$count)
|
|
2747 Function: get list of all feature types on the indicated segment
|
|
2748 Returns : list or hash of Bio::DB::GFF::Typename objects
|
|
2749 Args : see below
|
|
2750 Status : protected
|
|
2751
|
|
2752 Arguments are:
|
|
2753
|
|
2754 $absref the ID of the reference sequence
|
|
2755 $class the class of the reference sequence
|
|
2756 $start the position to start counting
|
|
2757 $stop the position to end counting
|
|
2758 $count a boolean indicating whether to count the number
|
|
2759 of occurrences of each feature type
|
|
2760
|
|
2761 If $count is true, then a hash is returned. The keys of the hash are
|
|
2762 feature type names in the format "method:source" and the values are
|
|
2763 the number of times a feature of this type overlaps the indicated
|
|
2764 segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename
|
|
2765 objects. If $start or $stop are undef, then all features on the
|
|
2766 indicated segment are enumerated. If $absref is undef, then the call
|
|
2767 returns all feature types in the database.
|
|
2768
|
|
2769 =cut
|
|
2770
|
|
2771 sub get_types {
|
|
2772 my $self = shift;
|
|
2773 my ($refseq,$class,$start,$stop,$count,$types) = @_;
|
|
2774 $self->throw("get_types() must be implemented by an adaptor");
|
|
2775 }
|
|
2776
|
|
2777 =head2 make_feature
|
|
2778
|
|
2779 Title : make_feature
|
|
2780 Usage : $db->make_feature(@args)
|
|
2781 Function: Create a Bio::DB::GFF::Feature object from string data
|
|
2782 Returns : a Bio::DB::GFF::Feature object
|
|
2783 Args : see below
|
|
2784 Status : internal
|
|
2785
|
|
2786 This takes 15 arguments (really!):
|
|
2787
|
|
2788 $parent A Bio::DB::GFF::RelSegment object
|
|
2789 $group_hash A hashref containing unique list of GFF groups
|
|
2790 $refname The name of the reference sequence for this feature
|
|
2791 $refclass The class of the reference sequence for this feature
|
|
2792 $start Start of feature
|
|
2793 $stop Stop of feature
|
|
2794 $source Feature source field
|
|
2795 $method Feature method field
|
|
2796 $score Feature score field
|
|
2797 $strand Feature strand
|
|
2798 $phase Feature phase
|
|
2799 $group_class Class of feature group
|
|
2800 $group_name Name of feature group
|
|
2801 $tstart For homologies, start of hit on target
|
|
2802 $tstop Stop of hit on target
|
|
2803
|
|
2804 The $parent argument, if present, is used to establish relative
|
|
2805 coordinates in the resulting Bio::DB::GFF::Feature object. This allows one
|
|
2806 feature to generate a list of other features that are relative to its
|
|
2807 coordinate system (for example, finding the coordinates of the second
|
|
2808 exon relative to the coordinates of the first).
|
|
2809
|
|
2810 The $group_hash allows the group_class/group_name strings to be turned
|
|
2811 into rich database objects via the make_object() method (see below).
|
|
2812 Because these objects may be expensive to create, $group_hash is used
|
|
2813 to uniquify them. The key of this hash is the composite key
|
|
2814 {$group_class,$group_name,$tstart,$tstop}. Values are whatever object
|
|
2815 is returned by the make_object() method.
|
|
2816
|
|
2817 The remainder of the fields are taken from the GFF line, with the
|
|
2818 exception that "Target" features, which contain information about the
|
|
2819 target of a homology search, are parsed into their components.
|
|
2820
|
|
2821 =cut
|
|
2822
|
|
2823 # This call is responsible for turning a line of GFF into a
|
|
2824 # feature object.
|
|
2825 # The $parent argument is a Bio::DB::GFF::Segment object and is used
|
|
2826 # to establish the coordinate system for the new feature.
|
|
2827 # The $group_hash argument is an hash ref that holds previously-
|
|
2828 # generated group objects.
|
|
2829 # Other arguments are taken right out of the GFF table.
|
|
2830 sub make_feature {
|
|
2831 my $self = shift;
|
|
2832 my ($parent,$group_hash, # these arguments provided by generic mechanisms
|
|
2833 $srcseq, # the rest is provided by adaptor
|
|
2834 $start,$stop,
|
|
2835 $source,$method,
|
|
2836 $score,$strand,$phase,
|
|
2837 $group_class,$group_name,
|
|
2838 $tstart,$tstop,
|
|
2839 $db_id,$group_id) = @_;
|
|
2840
|
|
2841 return unless $srcseq; # return undef if called with no arguments. This behavior is used for
|
|
2842 # on-the-fly aggregation.
|
|
2843
|
|
2844 my $group; # undefined
|
|
2845 if (defined $group_class && defined $group_name) {
|
|
2846 $tstart ||= '';
|
|
2847 $tstop ||= '';
|
|
2848 if ($group_hash) {
|
|
2849 $group = $group_hash->{$group_class,$group_name,$tstart,$tstop}
|
|
2850 ||= $self->make_object($group_class,$group_name,$tstart,$tstop);
|
|
2851 } else {
|
|
2852 $group = $self->make_object($group_class,$group_name,$tstart,$tstop);
|
|
2853 }
|
|
2854 }
|
|
2855
|
|
2856 # fix for some broken GFF files
|
|
2857 # unfortunately - has undesired side effects
|
|
2858 # if (defined $tstart && defined $tstop && !defined $strand) {
|
|
2859 # $strand = $tstart <= $tstop ? '+' : '-';
|
|
2860 # }
|
|
2861
|
|
2862 if (ref $parent) { # note that the src sequence is ignored
|
|
2863 return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop,
|
|
2864 $method,$source,
|
|
2865 $score,$strand,$phase,
|
|
2866 $group,$db_id,$group_id,
|
|
2867 $tstart,$tstop);
|
|
2868 } else {
|
|
2869 return Bio::DB::GFF::Feature->new($self,$srcseq,
|
|
2870 $start,$stop,
|
|
2871 $method,$source,
|
|
2872 $score,$strand,$phase,
|
|
2873 $group,$db_id,$group_id,
|
|
2874 $tstart,$tstop);
|
|
2875 }
|
|
2876 }
|
|
2877
|
|
2878 sub make_aggregated_feature {
|
|
2879 my $self = shift;
|
|
2880 my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3);
|
|
2881 my $feature = $self->make_feature($parent,undef,@_);
|
|
2882 return [$feature] if $feature && !$feature->group;
|
|
2883
|
|
2884 # if we have accumulated features and either:
|
|
2885 # (1) make_feature() returned undef, indicated very end or
|
|
2886 # (2) the current group is different from the previous one
|
|
2887
|
|
2888 local $^W = 0; # irritating uninitialized value warning in next statement
|
|
2889 if (@$accumulated_features &&
|
|
2890 (!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) {
|
|
2891 foreach my $a (@$aggregators) { # last aggregator gets first shot
|
|
2892 $a->aggregate($accumulated_features,$self) or next;
|
|
2893 }
|
|
2894 my @result = @$accumulated_features;
|
|
2895 @$accumulated_features = $feature ? ($feature) : ();
|
|
2896 return unless @result;
|
|
2897 return \@result ;
|
|
2898 }
|
|
2899 push @$accumulated_features,$feature;
|
|
2900 return;
|
|
2901 }
|
|
2902
|
|
2903 =head2 parse_types
|
|
2904
|
|
2905 Title : parse_types
|
|
2906 Usage : $db->parse_types(@args)
|
|
2907 Function: parses list of types
|
|
2908 Returns : an array ref containing ['method','source'] pairs
|
|
2909 Args : a list of types in 'method:source' form
|
|
2910 Status : internal
|
|
2911
|
|
2912 This method takes an array of type names in the format "method:source"
|
|
2913 and returns an array reference of ['method','source'] pairs. It will
|
|
2914 also accept a single argument consisting of an array reference with
|
|
2915 the list of type names.
|
|
2916
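For example, a sketch of the expected transformation:

  my $pairs = $db->parse_types('exon:curated', 'intron');
  # $pairs now holds [ ['exon','curated'], ['intron'] ]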
|
|
2917 =cut
|
|
2918
|
|
2919 # turn feature types in the format "method:source" into a list of [method,source] refs
|
|
2920 sub parse_types {
|
|
2921 my $self = shift;
|
|
2922 return [] if !@_ or !defined($_[0]);
|
|
2923 return $_[0] if ref $_[0] eq 'ARRAY' && ref $_[0][0];
|
|
2924 my @types = ref($_[0]) ? @{$_[0]} : @_;
|
|
2925 my @type_list = map { [split(':',$_,2)] } @types;
|
|
2926 return \@type_list;
|
|
2927 }
|
|
2928
|
|
2929 =head2 make_match_sub
|
|
2930
|
|
2931 Title : make_match_sub
|
|
2932 Usage : $db->make_match_sub($types)
|
|
2933 Function: creates a subroutine used for filtering features
|
|
2934 Returns : a code reference
|
|
2935 Args : a list of parsed type names
|
|
2936 Status : protected
|
|
2937
|
|
2938 This method is used internally to generate a code subroutine that will
|
|
2939 accept or reject a feature based on its method and source. It takes
|
|
2940 an array of parsed type names in the format returned by parse_types(),
|
|
2941 and generates an anonymous subroutine. The subroutine takes a single
|
|
2942 Bio::DB::GFF::Feature object and returns true if the feature matches
|
|
2943 one of the desired feature types, and false otherwise.
|
|
2944
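For example (a sketch; @features stands in for a list of Bio::DB::GFF::Feature objects fetched earlier):

  my $filter = $db->make_match_sub($db->parse_types('exon:curated'));
  my @kept   = grep { $filter->($_) } @features;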
|
|
2945 =cut
|
|
2946
|
|
2947 # a subroutine that matches features indicated by list of types
|
|
2948 sub make_match_sub {
|
|
2949 my $self = shift;
|
|
2950 my $types = shift;
|
|
2951
|
|
2952 return sub { 1 } unless ref $types && @$types;
|
|
2953
|
|
2954 my @expr;
|
|
2955 for my $type (@$types) {
|
|
2956 my ($method,$source) = @$type;
|
|
2957 $method ||= '.*';
|
|
2958 $source = $source ? ":$source" : "(?::.+)?";
|
|
2959 push @expr,"${method}${source}";
|
|
2960 }
|
|
2961 my $expr = join '|',@expr;
|
|
2962 return $self->{match_subs}{$expr} if $self->{match_subs}{$expr};
|
|
2963
|
|
2964 my $sub =<<END;
|
|
2965 sub {
|
|
2966 my \$feature = shift or return;
|
|
2967 return \$feature->type =~ /^($expr)\$/i;
|
|
2968 }
|
|
2969 END
|
|
2970 warn "match sub: $sub\n" if $self->debug;
|
|
2971 my $compiled_sub = eval $sub;
|
|
2972 $self->throw($@) if $@;
|
|
2973 return $self->{match_subs}{$expr} = $compiled_sub;
|
|
2974 }
|
|
2975
|
|
2976 =head2 make_object
|
|
2977
|
|
2978 Title : make_object
|
|
2979 Usage : $db->make_object($class,$name,$start,$stop)
|
|
2980 Function: creates a feature object
|
|
2981 Returns : a feature object
|
|
2982 Args : see below
|
|
2983 Status : protected
|
|
2984
|
|
2985 This method is called to make an object from the GFF "group" field.
|
|
2986 By default, all Target groups are turned into Bio::DB::GFF::Homol
|
|
2987 objects, and everything else becomes a Bio::DB::GFF::Featname.
|
|
2988 However, adaptors are free to override this method to generate more
|
|
2989 interesting objects, such as true BioPerl objects, or Acedb objects.
|
|
2990
|
|
2991 Arguments are:
|
|
2992
|
|
2993 $name database ID for object
|
|
2994 $class class of object
|
|
2995 $start for similarities, start of match inside object
|
|
2996 $stop for similarities, stop of match inside object
|
|
2997
|
|
2998 =cut
|
|
2999
|
|
3000 # abstract call to turn a feature into an object, given its class and name
|
|
3001 sub make_object {
|
|
3002 my $self = shift;
|
|
3003 my ($class,$name,$start,$stop) = @_;
|
|
3004 return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop)
|
|
3005 if defined $start and length $start;
|
|
3006 return Bio::DB::GFF::Featname->new($class,$name);
|
|
3007 }
|
|
3008
|
|
3009
|
|
3010 =head2 do_attributes
|
|
3011
|
|
3012 Title : do_attributes
|
|
3013 Usage : $db->do_attributes($id [,$tag]);
|
|
3014 Function: internal method to retrieve attributes given an id and tag
|
|
3015  Returns : a list of attribute values, or a flattened list of (tag,value) pairs
|
|
3016  Args    : a feature ID and an attribute tag (optional)
|
|
3017 Status : protected
|
|
3018
|
|
3019 This method is overridden by subclasses in order to return a list of
|
|
3020 attributes. If called with a tag, returns the value of attributes of
|
|
3021 that tag type. If called without a tag, returns a flattened array of
|
|
3022 (tag=E<gt>value) pairs. A particular tag can be present multiple times.
|
|
3023
=cut

sub do_attributes {
  my $self = shift;
  my ($id,$tag) = @_;
  return ();
}


=head1 Internal Methods

The following methods are internal to Bio::DB::GFF and are not
guaranteed to remain the same.

=head2 _features

 Title   : _features
 Usage   : $db->_features($search,$options,$parent)
 Function: internal method
 Returns : a list of Bio::DB::GFF::Feature objects
 Args    : see below
 Status  : internal

This is an internal method that is called by overlapping_features(),
contained_features() and features() to create features based on a
parent segment's coordinate system.  It takes three arguments: a
search hashref, an options hashref, and a parent segment.

The search hashref contains the following keys:

  rangetype One of "overlaps", "contains" or "contained_in".  Indicates
            the type of range query requested.
  refseq    reference sequence ID
  refclass  reference sequence class
  start     start of range
  stop      stop of range
  types     arrayref containing list of types in "method:source" form

The options hashref contains zero or more of the following keys:

  sparse     turn on optimizations suitable for rare feature types
  automerge  if true, invoke aggregators to merge features
  iterator   if true, return an iterator

The $parent argument is a Bio::DB::GFF::RelSegment object, or an
object of one of its descendant classes.

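For example (an illustrative sketch; the landmark, coordinates and type
are invented, and $segment is assumed to be the parent
Bio::DB::GFF::RelSegment), a call might look like this:

  my @features = $db->_features({ rangetype => 'overlaps',
                                  refseq    => 'CHROMOSOME_I',
                                  refclass  => 'Sequence',
                                  start     => 1,
                                  stop      => 50_000,
                                  types     => ['exon:curated'] },
                                { automerge => 1 },
                                $segment);
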
=cut

#'

sub _features {
  my $self = shift;
  my ($search,$options,$parent) = @_;
  (@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)})
    if defined($search->{start}) && $search->{start} > $search->{stop};

  my $types = $self->parse_types($search->{types});  # parse out list of types
  my @aggregated_types = @$types;                    # keep a copy

  # allow the aggregators to operate on the original
  my @aggregators;
  if ($options->{automerge}) {
    for my $a ($self->aggregators) {
      $a = $a->clone if $options->{iterator};
      unshift @aggregators,$a
        if $a->disaggregate(\@aggregated_types,$self);
    }
  }

  if ($options->{iterator}) {
    my @accumulated_features;
    my $callback = $options->{automerge}
      ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) }
      : sub { [$self->make_feature($parent,undef,@_)] };
    return $self->get_features_iterator({ %$search,
                                          types => \@aggregated_types },
                                        { %$options,
                                          sort_by_group => $options->{automerge} },
                                        $callback
                                       );
  }

  my %groups;         # cache the groups we create to avoid consuming unnecessary memory
  my $features = [];

  my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) };
  $self->get_features({ %$search,
                        types => \@aggregated_types },
                      $options,
                      $callback);

  if ($options->{automerge}) {
    warn "aggregating...\n" if $self->debug;
    foreach my $a (@aggregators) {  # last aggregator gets first shot
      warn "Aggregator $a:\n" if $self->debug;
      $a->aggregate($features,$self);
    }
  }

  @$features;
}

=head2 get_features_iterator

 Title   : get_features_iterator
 Usage   : $db->get_features_iterator($search,$options,$callback)
 Function: get an iterator on a features query
 Returns : an iterator that fetches features sequentially, as per Bio::SeqIO
 Args    : as per get_features()
 Status  : Public

This method takes the same arguments as get_features(), but returns an
iterator that can be used to fetch features sequentially, as per
Bio::SeqIO.

Internally, this method is simply a front end to range_query().
The latter method constructs and executes the query, returning a
statement handle.  This routine passes the statement handle to the
constructor for the iterator, along with the callback.

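For example (a sketch; $search, $options and $callback are assumed to be
set up as for get_features()), the iterator is drained with next_seq(),
just as in the Bio::SeqIO interface:

  my $iterator = $db->get_features_iterator($search,$options,$callback);
  while (my $feature = $iterator->next_seq) {
    print $feature,"\n";
  }
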
=cut

sub get_features_iterator {
  my $self = shift;
  my ($search,$options,$callback) = @_;
  $self->throw('feature iteration is not implemented in this adaptor');
}

=head2 split_group

 Title   : split_group
 Usage   : $db->split_group($group_field,$gff3_flag)
 Function: parse GFF group field
 Returns : ($gclass,$gname,$tstart,$tstop,$attributes)
 Args    : the gff group column and a flag indicating gff3 compatibility
 Status  : internal

This method is called by load_gff_line() to parse out the contents of
one or more group fields.  It returns the class of the group, its
name, the start and stop of the target (if any), and an array
reference containing any attributes that were found in the group
field, in [attribute_name,attribute_value] format.

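For example (a sketch; the group string is invented), a GFF2-style group
column is parsed like this:

  my ($gclass,$gname,$tstart,$tstop,$attributes) =
      $db->split_group('Transcript B0511.1 ; Note "Spliced leader"', 0);
  # $gclass     = 'Transcript', $gname = 'B0511.1'
  # $attributes = [ ['Note' => 'Spliced leader'] ]
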
=cut

sub split_group {
  my $self = shift;
  my ($group,$gff3) = @_;
  if ($gff3) {
    my @groups = split /[;&]/,$group;  # so easy!
    return $self->_split_gff3_group(@groups);
  } else {
    # handle group parsing
    # protect embedded semicolons in the group; there must be a faster/more
    # elegant way to do this.
    $group =~ s/\\;/$;/g;
    while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 }
    my @groups = split(/\s*;\s*/,$group);
    foreach (@groups) { s/$;/;/g }
    return $self->_split_gff2_group(@groups);
  }
}

=head2 _split_gff2_group

This is an internal method called by split_group().

=cut

sub _split_gff2_group {
  my $self = shift;
  my @groups = @_;

  my ($gclass,$gname,$tstart,$tstop,@attributes);

  for (@groups) {

    my ($tag,$value) = /^(\S+)(?:\s+(.+))?/;
    $value ||= '';
    if ($value =~ /^\"(.+)\"$/) {  # remove quotes
      $value = $1;
    }
    $value =~ s/\\t/\t/g;
    $value =~ s/\\r/\r/g;

    # Any additional groups become part of the attributes hash
    # For historical reasons, the tag "Note" is treated as an
    # attribute, even if it is the only group.
    $tag ||= '';
    if ($tag eq 'Note' or ($gclass && $gname)) {
      push @attributes,[$tag => $value];
    }

    # if the tag eq 'Target' then the class name is embedded in the ID
    # (the GFF format is obviously screwed up here)
    elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) {
      ($gclass,$gname) = ($1,$2);
      ($tstart,$tstop) = / (\d+) (\d+)/;
    }

    elsif (!$value) {
      push @attributes,[Note => $tag];  # e.g. "Confirmed_by_EST"
    }

    # otherwise, the tag and value correspond to the
    # group class and name
    else {
      ($gclass,$gname) = ($tag,$value);
    }
  }

  return ($gclass,$gname,$tstart,$tstop,\@attributes);
}

=head2 _split_gff3_group

This is called internally from split_group().

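For example (a sketch; the values are invented), each semicolon-separated
tag=value pair from the GFF3 group column is passed in as a separate
argument:

  my ($gclass,$gname,$tstart,$tstop,$attributes) =
      $db->_split_gff3_group('Target=EST123 1 500','Note=provisional');
  # $gclass = 'Sequence', $gname = 'EST123', $tstart = 1, $tstop = 500
  # $attributes = [ ['Note' => 'provisional'] ]
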
=cut

sub _split_gff3_group {
  my $self = shift;
  my @groups = @_;
  my ($gclass,$gname,$tstart,$tstop,@attributes);

  for my $group (@groups) {
    my ($tag,$value) = split /=/,$group;
    $tag = unescape($tag);
    my @values = map {unescape($_)} split /,/,$value;
    if ($tag eq 'Parent') {
      $gclass = 'Sequence';
      $gname  = shift @values;
    }
    elsif ($tag eq 'ID') {
      $gclass = 'Sequence';
      $gname  = shift @values;
    }
    elsif ($tag eq 'Target') {
      $gclass = 'Sequence';
      ($gname,$tstart,$tstop) = split /\s+/,shift @values;
    }
    push @attributes,[$tag=>$_] foreach @values;
  }
  return ($gclass,$gname,$tstart,$tstop,\@attributes);
}

=head2 _delete_features(), _delete_groups(), _delete()

 Title   : _delete_features(), _delete_groups(), _delete()
 Usage   : $count = $db->_delete_features(@feature_ids)
           $count = $db->_delete_groups(@group_ids)
           $count = $db->_delete(\%delete_spec)
 Function: low-level feature/group deletion routines
 Returns : the number of features or groups removed
 Args    : a list of feature or group IDs, or a deletion specification
           hashref (see below)
 Status  : for implementation by subclasses

These methods need to be implemented in adaptors.  For
_delete_features and _delete_groups, the arguments are a list of
feature or group IDs to remove.  For _delete(), the argument is a
hashref with the three keys 'segments', 'types' and 'force'.  The
first contains an arrayref of Bio::DB::GFF::RelSegment objects to
delete (all FEATURES within the segment are deleted).  The second
contains an arrayref of [method,source] feature types to delete.  The
two are ANDed together.  If 'force' has a true value, this forces the
operation to continue even if it would delete all features.

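For example (a sketch; it assumes $segment is a Bio::DB::GFF::RelSegment
and that the adaptor implements _delete()), a specification that removes
all curated exons within a single segment might look like:

  my $count = $db->_delete({ segments => [ $segment ],
                             types    => [ ['exon','curated'] ],
                             force    => 0 });
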
=cut

sub _delete_features {
  my $self = shift;
  my @feature_ids = @_;
  $self->throw('_delete_features is not implemented in this adaptor');
}

sub _delete_groups {
  my $self = shift;
  my @group_ids = @_;
  $self->throw('_delete_groups is not implemented in this adaptor');
}

sub _delete {
  my $self = shift;
  my $delete_options = shift;
  $self->throw('_delete is not implemented in this adaptor');
}

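# URL-style unescaping: '+' becomes a space and %XX hex escapes are
# decoded back into the characters they represent.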
sub unescape {
  my $v = shift;
  $v =~ tr/+/ /;
  $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  return $v;
}


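# Bio::DB::GFF::ID_Iterator is a small helper class that iterates over a
# list of IDs, returning one segment or feature per call to next_seq().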
package Bio::DB::GFF::ID_Iterator;
use strict;

use Bio::Root::Root;
use vars '@ISA';
@ISA = 'Bio::Root::Root';

sub new {
  my $class = shift;
  my ($db,$ids,$type) = @_;
  return bless {ids=>$ids,db=>$db,type=>$type},$class;
}

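# Return the next item, looking it up by landmark name, feature ID, or
# group ID according to the iterator's type; returns undef when the list
# is exhausted.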
sub next_seq {
  my $self = shift;
  my $next = shift @{$self->{ids}};
  return unless $next;
  my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next;
  my $segment = $self->{type} eq 'name'    ? $self->{db}->segment($name)
              : $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name)
              : $self->{type} eq 'group'   ? $self->{db}->fetch_feature_by_gid($name)
              : $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier");
  $self->throw("id does not exist") unless $segment;
  return $segment;
}

1;

__END__

=head1 BUGS

Features can only belong to a single group at a time.  This must be
addressed soon.

Start coordinate can be greater than stop coordinate for relative
addressing.  This breaks strict BioPerl compatibility and must be
fixed.

=head1 SEE ALSO

L<bioperl>,
L<Bio::DB::GFF::RelSegment>,
L<Bio::DB::GFF::Aggregator>,
L<Bio::DB::GFF::Feature>,
L<Bio::DB::GFF::Adaptor::dbi::mysqlopt>,
L<Bio::DB::GFF::Adaptor::dbi::oracle>,
L<Bio::DB::GFF::Adaptor::memory>

=head1 AUTHOR

Lincoln Stein E<lt>lstein@cshl.orgE<gt>.

Copyright (c) 2001 Cold Spring Harbor Laboratory.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut
