0
|
1 # $Id: DasI.pm,v 1.15 2002/11/11 18:16:29 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::DasI
|
|
4 #
|
|
5 # Cared for by Lincoln Stein <lstein@cshl.org>
|
|
6 #
|
|
7 # Copyright Lincoln Stein
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::DasI - DAS-style access to a feature database
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # Open up a feature database somehow...
|
|
20 $db = Bio::DasI->new(@args);
|
|
21
|
|
22 @segments = $db->segment(-name => 'NT_29921.4',
|
|
23 -start => 1,
|
|
24 -end => 1000000);
|
|
25
|
|
26 # segments are Bio::Das::SegmentI - compliant objects
|
|
27
|
|
28 # fetch a list of features
|
|
29 @features = $db->features(-type=>['type1','type2','type3']);
|
|
30
|
|
31 # invoke a callback over features
|
|
32 $db->features(-type=>['type1','type2','type3'],
|
|
33 -callback => sub { ... }
|
|
34 );
|
|
35
|
|
36 $stream = $db->get_seq_stream(-type=>['type1','type2','type3']);
|
|
37 while (my $feature = $stream->next_seq) {
|
|
38 # each feature is a Bio::SeqFeatureI-compliant object
|
|
39 }
|
|
40
|
|
41 # get all feature types
|
|
42 @types = $db->types;
|
|
43
|
|
44 # count types
|
|
45 %types = $db->types(-enumerate=>1);
|
|
46
|
|
47 @feature = $db->get_feature_by_name($class=>$name);
|
|
48 @feature = $db->get_feature_by_target($target_name);
|
|
49 @feature = $db->get_feature_by_attribute($att1=>$value1,$att2=>$value2);
|
|
50 $feature = $db->get_feature_by_id($id);
|
|
51
|
|
52 $error = $db->error;
|
|
53
|
|
54 =head1 DESCRIPTION
|
|
55
|
|
56 Bio::DasI is a simplified alternative interface to sequence annotation
|
|
57 databases used by the distributed annotation system (see
|
|
58 L<Bio::Das>). In this scheme, the genome is represented as a series of
|
|
59 features, a subset of which are named. Named features can be used as
|
|
60 reference points for retrieving "segments" (see L<Bio::Das::SegmentI>),
|
|
61 and these can, in turn, be used as the basis for exploring the genome
|
|
62 further.
|
|
63
|
|
64 In addition to a name, each feature has a "class", which is
|
|
65 essentially a namespace qualifier and a "type", which describes what
|
|
66 type of feature it is. Das uses the GO consortium's ontology of
|
|
67 feature types, and so the type is actually an object of class
|
|
68 Bio::Das::FeatureTypeI (see L<Bio::Das::FeatureTypeI>). Bio::DasI
|
|
69 provides methods for interrogating the database for the types it
|
|
70 contains and the counts of each type.
|
|
71
|
|
72 =head1 FEEDBACK
|
|
73
|
|
74 =head2 Mailing Lists
|
|
75
|
|
76 User feedback is an integral part of the evolution of this and other
|
|
77 Bioperl modules. Send your comments and suggestions preferably to one
|
|
78 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
79
|
|
80 bioperl-l@bio.perl.org
|
|
81
|
|
82 =head2 Reporting Bugs
|
|
83
|
|
84 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
85 of the bugs and their resolution. Bug reports can be submitted via email
|
|
86 or the web:
|
|
87
|
|
88 bioperl-bugs@bio.perl.org
|
|
89 http://bugzilla.bioperl.org/
|
|
90
|
|
91 =head1 AUTHOR - Lincoln Stein
|
|
92
|
|
93 Email lstein@cshl.org
|
|
94
|
|
95 =head1 APPENDIX
|
|
96
|
|
97 The rest of the documentation details each of the object
|
|
98 methods. Internal methods are usually preceded with a _
|
|
99
|
|
100 =cut
|
|
101
|
|
102 #'
|
|
103 # Let the code begin...
|
|
104
|
|
105 package Bio::DasI;
|
|
106 use strict;
|
|
107
|
|
108 use vars qw(@ISA);
|
|
109 use Bio::Root::RootI;
|
|
110 use Bio::Das::SegmentI;
|
|
111 use Bio::SeqFeature::CollectionI;
|
|
112 # Object preamble - inherits from Bio::Root::RootI and Bio::SeqFeature::CollectionI
|
|
113 @ISA = qw(Bio::Root::RootI Bio::SeqFeature::CollectionI);
|
|
114
|
|
115 =head2 new
|
|
116
|
|
117 Title : new
|
|
118 Usage : Bio::DasI->new(@args)
|
|
119 Function: Create new Bio::DasI object
|
|
120 Returns : a Bio::DasI object
|
|
121 Args : see below
|
|
122
|
|
123 The new() method creates a new object. The argument list is either a
|
|
124 single argument consisting of a connection string, or the following
|
|
125 list of -name=E<gt>value arguments:
|
|
126
|
|
127 Argument Description
|
|
128 -------- -----------
|
|
129
|
|
130 -dsn Connection string for database
|
|
131 -adaptor Name of an adaptor class to use when connecting
|
|
132 -aggregator Array ref containing list of aggregators
|
|
133 "semantic mappers" to apply to database
|
|
134 -user Authentication username
|
|
135 -pass Authentication password
|
|
136
|
|
137 Implementors of DasI may add other arguments.
|
|
138
|
|
139 =cut
|
|
140
|
|
# Interface stub for the constructor. It does nothing except call
# throw_not_implemented() on the invocant, so a concrete class must
# supply its own new().
# NOTE(review): throw_not_implemented() is not defined in this file;
# presumably it is inherited from Bio::Root::RootI - confirm.
sub new {
    my $invocant = shift;
    return $invocant->throw_not_implemented();
}
|
|
142
|
|
143 =head2 types
|
|
144
|
|
145 Title : types
|
|
146 Usage : $db->types(@args)
|
|
147 Function: return list of feature types in database
|
|
148 Returns : a list of Bio::Das::FeatureTypeI objects
|
|
149 Args : see below
|
|
150
|
|
151 This routine returns a list of feature types known to the database. It
|
|
152 is also possible to find out how many times each feature occurs.
|
|
153
|
|
154 Arguments are -option=E<gt>value pairs as follows:
|
|
155
|
|
156 -enumerate if true, count the features
|
|
157
|
|
158 The returned value will be a list of Bio::Das::FeatureTypeI objects
|
|
159 (see L<Bio::Das::FeatureTypeI>).
|
|
160
|
|
161 If -enumerate is true, then the function returns a hash (not a hash
|
|
162 reference) in which the keys are the stringified versions of
|
|
163 Bio::Das::FeatureTypeI and the values are the number of times each
|
|
164 feature appears in the database.
|
|
165
|
|
166 =cut
|
|
167
|
|
# Interface stub: calls throw_not_implemented() on the invocant.
# Any arguments passed by the caller are ignored here.
sub types {
    my ($self) = @_;
    return $self->throw_not_implemented();
}
|
|
169
|
|
170 =head2 segment
|
|
171
|
|
172 Title : segment
|
|
173 Usage : $db->segment(@args);
|
|
174 Function: create a segment object
|
|
175 Returns : segment object(s)
|
|
176 Args : see below
|
|
177
|
|
178 This method generates a Bio::Das::SegmentI object (see
|
|
179 L<Bio::Das::SegmentI>). The segment can be used to find overlapping
|
|
180 features and the raw sequence.
|
|
181
|
|
182 When making the segment() call, you specify the ID of a sequence
|
|
183 landmark (e.g. an accession number, a clone or contig), and a
|
|
184 positional range relative to the landmark. If no range is specified,
|
|
185 then the entire region spanned by the landmark is used to generate the
|
|
186 segment.
|
|
187
|
|
188 Arguments are -option=E<gt>value pairs as follows:
|
|
189
|
|
190 -name ID of the landmark sequence.
|
|
191
|
|
192 -class A namespace qualifier. It is not necessary for the
|
|
193 database to honor namespace qualifiers, but if it
|
|
194 does, this is where the qualifier is indicated.
|
|
195
|
|
196 -version Version number of the landmark. It is not necessary for
|
|
197 the database to honor versions, but if it does, this is
|
|
198 where the version is indicated.
|
|
199
|
|
200 -start Start of the segment relative to landmark. Positions
|
|
201 follow standard 1-based sequence rules. If not specified,
|
|
202 defaults to the beginning of the landmark.
|
|
203
|
|
204 -end End of the segment relative to the landmark. If not specified,
|
|
205 defaults to the end of the landmark.
|
|
206
|
|
207 The return value is a list of Bio::Das::SegmentI objects. If the method
|
|
208 is called in a scalar context and no more than one segment
|
|
209 satisfies the request, then it is allowed to return the segment.
|
|
210 Otherwise, the method must throw a "multiple segment exception".
|
|
211
|
|
212 =cut
|
|
213
|
|
214 #'
|
|
215
|
|
# Interface stub: calls throw_not_implemented() on the invocant.
# Any arguments passed by the caller are ignored here.
sub segment {
    my ($self) = @_;
    return $self->throw_not_implemented();
}
|
|
217
|
|
218 =head2 features
|
|
219
|
|
220 Title : features
|
|
221 Usage : $db->features(@args)
|
|
222 Function: get all features, possibly filtered by type
|
|
223 Returns : a list of Bio::SeqFeatureI objects
|
|
224 Args : see below
|
|
225 Status : public
|
|
226
|
|
227 This routine will retrieve features in the database regardless of
|
|
228 position. It can be used to return all features, or a subset based on
|
|
229 their type.
|
|
230
|
|
231 Arguments are -option=E<gt>value pairs as follows:
|
|
232
|
|
233 -types List of feature types to return. Argument is an array
|
|
234 of Bio::Das::FeatureTypeI objects or a set of strings
|
|
235 that can be converted into FeatureTypeI objects.
|
|
236
|
|
237 -callback A callback to invoke on each feature. The subroutine
|
|
238 will be passed each Bio::SeqFeatureI object in turn.
|
|
239
|
|
240 -attributes A hash reference containing attributes to match.
|
|
241
|
|
242 The -attributes argument is a hashref containing one or more attributes
|
|
243 to match against:
|
|
244
|
|
245 -attributes => { Gene => 'abc-1',
|
|
246 Note => 'confirmed' }
|
|
247
|
|
248 Attribute matching is simple exact string matching, and multiple
|
|
249 attributes are ANDed together. See L<Bio::DB::ConstraintsI> for a
|
|
250 more sophisticated take on this.
|
|
251
|
|
252 If one provides a callback, it will be invoked on each feature in
|
|
253 turn. If the callback returns a false value, iteration will be
|
|
254 interrupted. When a callback is provided, the method returns undef.
|
|
255
|
|
256 =cut
|
|
257
|
|
# Interface stub: calls throw_not_implemented() on the invocant.
# Any arguments passed by the caller are ignored here.
sub features {
    my ($self) = @_;
    return $self->throw_not_implemented();
}
|
|
259
|
|
260 =head2 get_feature_by_name
|
|
261
|
|
262 Title : get_feature_by_name
|
|
263 Usage : $db->get_feature_by_name(-class=>$class,-name=>$name)
|
|
264 Function: fetch features by their name
|
|
265 Returns : a list of Bio::SeqFeatureI objects
|
|
266 Args : the class and name of the desired feature
|
|
267 Status : public
|
|
268
|
|
269 This method can be used to fetch named feature(s) from the database.
|
|
270 The -class and -name arguments have the same meaning as in segment(),
|
|
271 and the method also accepts the following short-cut forms:
|
|
272
|
|
273 1) one argument: the argument is treated as the feature name
|
|
274 2) two arguments: the arguments are treated as the class and name
|
|
275 (note: this uses _rearrange() so the first argument must not
|
|
276 begin with a hyphen or it will be interpreted as a named
|
|
277 argument).
|
|
278
|
|
279 This method may return zero, one, or several Bio::SeqFeatureI objects.
|
|
280 The implementor may allow the name to contain wildcards, in which case
|
|
281 standard C-shell glob semantics are expected.
|
|
282
|
|
283 =cut
|
|
284
|
|
# Interface stub: calls throw_not_implemented() on the invocant.
# The defaults for get_feature_by_target() and get_feature_by_id() in
# this file forward to this method.
sub get_feature_by_name {
    my ($self) = @_;
    return $self->throw_not_implemented;
}
|
|
288
|
|
289 =head2 get_feature_by_target
|
|
290
|
|
291 Title : get_feature_by_target
|
|
292 Usage : $db->get_feature_by_target($class => $name)
|
|
293 Function: fetch features by their similarity target
|
|
294 Returns : a list of Bio::SeqFeatureI objects
|
|
295 Args : the class and name of the desired feature
|
|
296 Status : public
|
|
297
|
|
298 This method can be used to fetch a named feature from the database
|
|
299 based on its similarity hit. The arguments are the same as
|
|
300 get_feature_by_name(). If this is not implemented, the interface
|
|
301 defaults to using get_feature_by_name().
|
|
302
|
|
303 =cut
|
|
304
|
|
# Default implementation: hand every argument to get_feature_by_name()
# on the same object and return whatever it returns. The caller's
# context (list or scalar) is passed through unchanged.
sub get_feature_by_target {
    my $self = shift;
    return $self->get_feature_by_name(@_);
}
|
|
308
|
|
309 =head2 get_feature_by_id
|
|
310
|
|
311 Title : get_feature_by_id
|
|
312 Usage : $db->get_feature_by_id($id)
|
|
313 Function: fetch a feature by its ID
|
|
314 Returns : a Bio::SeqFeatureI object
|
|
315 Args : the ID of the feature
|
|
316 Status : public
|
|
317
|
|
318 If the database provides unique feature IDs, this can be used to
|
|
319 retrieve a single feature from the database. If not overridden, this
|
|
320 interface calls get_feature_by_name() and returns the first element.
|
|
321
|
|
322 =cut
|
|
323
|
|
# Default implementation: call get_feature_by_name() in list context
# with the caller's arguments and return only the first result.
# When nothing is found the result is an empty list in list context
# and undef in scalar context.
sub get_feature_by_id {
    my $self    = shift;
    my @matches = $self->get_feature_by_name(@_);
    return @matches ? $matches[0] : ();
}
|
|
327
|
|
328 =head2 get_feature_by_attribute
|
|
329
|
|
330 Title : get_feature_by_attribute
|
|
331 Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2)
|
|
332 Function: fetch features by combinations of attribute values
|
|
333 Returns : a list of Bio::SeqFeatureI objects
|
|
334 Args : a list of attribute=>value pairs to match
|
|
335 Status : public
|
|
336
|
|
337 This method can be used to fetch a set of features from the database.
|
|
338 Attributes are a list of name=E<gt>value pairs. They will be
|
|
339 logically ANDed together. If an attribute value is an array
|
|
340 reference, the list of values in the array is treated as an
|
|
341 alternative set of values to be ORed together.
|
|
342
|
|
343 =cut
|
|
344
|
|
# Interface stub: calls throw_not_implemented() on the invocant.
# Any arguments passed by the caller are ignored here.
sub get_feature_by_attribute {
    my ($self) = @_;
    return $self->throw_not_implemented;
}
|
|
348
|
|
349
|
|
350 =head2 search_notes
|
|
351
|
|
352 Title : search_notes
|
|
353 Usage : $db->search_notes($search_term,$max_results)
|
|
354 Function: full-text search on features, ENSEMBL-style
|
|
355 Returns : an array of [$name,$description,$score]
|
|
356 Args : see below
|
|
357 Status : public
|
|
358
|
|
359 This routine performs a full-text search on feature attributes (which
|
|
360 attributes depend on implementation) and returns a list of
|
|
361 [$name,$description,$score], where $name is the feature ID,
|
|
362 $description is a human-readable description such as a locus line, and
|
|
363 $score is the match strength.
|
|
364
|
|
365 Since this is a decidedly non-standard thing to do (but the generic
|
|
366 genome browser uses it), the default method returns an empty list.
|
|
367 You do not have to implement it.
|
|
368
|
|
369 =cut
|
|
370
|
|
# Default implementation: ignores its arguments and returns nothing
# (an empty list in list context, undef in scalar context).
sub search_notes {
    return ();
}
|
|
372
|
|
373 =head2 get_seq_stream
|
|
374
|
|
375 Title : get_seq_stream
|
|
376 Usage : $seqio = $db->get_seq_stream(@args)
|
|
377 Function: Performs a query and returns an iterator over it
|
|
378 Returns : a Bio::SeqIO stream capable of returning Bio::SeqFeatureI objects
|
|
379 Args : As in features()
|
|
380 Status : public
|
|
381
|
|
382 This routine takes the same arguments as features(), but returns a
|
|
383 Bio::SeqIO::Stream-compliant object. Use it like this:
|
|
384
|
|
385 $stream = $db->get_seq_stream('exon');
|
|
386 while (my $exon = $stream->next_seq) {
|
|
387 print $exon,"\n";
|
|
388 }
|
|
389
|
|
390 NOTE: In the interface this method is aliased to get_feature_stream(),
|
|
391 as the name is more descriptive.
|
|
392
|
|
393 =cut
|
|
394
|
|
# Interface stub: calls throw_not_implemented() on the invocant.
# get_feature_stream() in this file forwards to this method.
sub get_seq_stream {
    my ($self) = @_;
    return $self->throw_not_implemented();
}
|
|
# Alias for get_seq_stream(): passes all arguments to it on the same
# object and returns its result unchanged.
sub get_feature_stream {
    my $self = shift;
    return $self->get_seq_stream(@_);
}
|
|
397
|
|
398 =head2 refclass
|
|
399
|
|
400 Title : refclass
|
|
401 Usage : $class = $db->refclass
|
|
402 Function: returns the default class to use for segment() calls
|
|
403 Returns : a string
|
|
404 Args : none
|
|
405 Status : public
|
|
406
|
|
407 For data sources which use namespaces to distinguish reference
|
|
408 sequence accessions, this returns the default namespace (or "class")
|
|
409 to use. This interface defines a default of "Accession".
|
|
410
|
|
411 =cut
|
|
412
|
|
# Returns the constant string "Accession" and ignores any arguments.
sub refclass {
    return "Accession";
}
|
|
414
|
|
415 1;
|