0
|
1 # $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Index::AbstractSeq
|
|
4 #
|
|
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
|
|
6 #
|
|
7 # Copyright Ewan Birney
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Index::AbstractSeq - Base class for sequence file indexes
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # Make a new sequence file indexing package
|
|
20
|
|
21 package MyShinyNewIndexer;
|
|
22 use Bio::Index::AbstractSeq;
|
|
23
|
|
24 @ISA = ('Bio::Index::AbstractSeq');
|
|
25
|
|
26 # Now provide the necessary methods...
|
|
27
|
|
28 =head1 DESCRIPTION
|
|
29
|
|
30 Provides a common base class for multiple
|
|
31 sequence files built using the
|
|
32 Bio::Index::Abstract system, and provides a
|
|
33 Bio::DB::SeqI interface.
|
|
34
|
|
35 =head1 FEEDBACK
|
|
36
|
|
37 =head2 Mailing Lists
|
|
38
|
|
39 User feedback is an integral part of the evolution of this
|
|
40 and other Bioperl modules. Send your comments and suggestions preferably
|
|
41 to one of the Bioperl mailing lists.
|
|
42 Your participation is much appreciated.
|
|
43
|
|
44 bioperl-l@bioperl.org - General discussion
|
|
45 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
46
|
|
47 =head2 Reporting Bugs
|
|
48
|
|
49 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
50 of the bugs and their resolution.
|
|
51 Bug reports can be submitted via email or the web:
|
|
52
|
|
53 bioperl-bugs@bio.perl.org
|
|
54 http://bugzilla.bioperl.org/
|
|
55
|
|
56 =head1 AUTHOR - Ewan Birney
|
|
57
|
|
58 Email birney@ebi.ac.uk
|
|
59
|
|
60 Describe contact details here
|
|
61
|
|
62 =head1 APPENDIX
|
|
63
|
|
64 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
65
|
|
66 =head1 SEE ALSO
|
|
67
|
|
68 Bio::Index::Abstract - Module which
|
|
69 Bio::Index::AbstractSeq inherits from, which
|
|
70 provides dbm indexing for flat files (which are
|
|
71 not necessarily sequence files).
|
|
72
|
|
73 =cut
|
|
74
|
|
75 # Let's begin the code ...
|
|
76
|
|
77
|
|
78 package Bio::Index::AbstractSeq;
|
|
79 use vars qw(@ISA);
|
|
80 use strict;
|
|
81
|
|
82 use Bio::SeqIO::MultiFile;
|
|
83 use Bio::Index::Abstract;
|
|
84 use Bio::DB::SeqI;
|
|
85
|
|
86
|
|
87 @ISA = qw(Bio::Index::Abstract Bio::DB::SeqI);
|
|
88
|
|
# Constructor. Builds the object through the parent class constructor and
# then attaches an empty array ref under '_seqio_cache', which
# _get_SeqIO_object() fills lazily with one Bio::SeqIO object per file number.
sub new {
    my ( $class, @ctor_args ) = @_;

    my $self = $class->SUPER::new(@ctor_args);
    $self->{'_seqio_cache'} = [];

    return $self;
}
|
|
96
|
|
97 =head2 _file_format
|
|
98
|
|
99 Title : _file_format
|
|
100 Usage : $self->_file_format
|
|
101 Function: Derived classes should override this
|
|
102 method (it throws an exception here)
|
|
103 to give the file format of the files used
|
|
104 Example :
|
|
105 Returns :
|
|
106 Args :
|
|
107
|
|
108
|
|
109 =cut
|
|
110
|
|
# Abstract hook: derived classes override this to return the file format
# that is passed on as the format argument when SeqIO objects are built.
# This base implementation always throws, naming the class of the invocant.
sub _file_format {
    my $self = shift;

    $self->throw(
        sprintf( "Class '%s' must provide a file format method correctly",
            ref($self) )
    );
}
|
|
117
|
|
118 =head2 fetch
|
|
119
|
|
120 Title : fetch
|
|
121 Usage : $index->fetch( $id )
|
|
122 Function: Returns a Bio::Seq object from the index
|
|
123 Example : $seq = $index->fetch( 'dJ67B12' )
|
|
124 Returns : Bio::Seq object
|
|
125 Args : ID
|
|
126
|
|
127 =cut
|
|
128
|
|
# Look up $id in the index and return the sequence object parsed from the
# indexed file at the recorded byte offset. Returns undef when the index has
# no (true) record for $id.
sub fetch {
    my ( $self, $id ) = @_;

    my $seq;
    my $record = $self->db()->{$id};

    if ($record) {
        my ( $file_no, $offset ) = $self->unpack_record($record);

        # Get the (possibly cached) SeqIO object and its underlying handle
        my $seqio  = $self->_get_SeqIO_object($file_no);
        my $handle = $seqio->_fh();

        # workaround for Win DB_File bug: step the offset back by one there
        $offset-- if $^O =~ /mswin/i;

        # move to start of record (the result of seek is not checked)
        seek( $handle, $offset, 0 );

        $seq = $seqio->next_seq();
    }

    # we essentially assume that the primary_id for the database
    # is the display_id
    if ( defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI') ) {
        $seq->primary_id( $seq->display_id() );
    }

    return $seq;
}
|
|
155
|
|
156 =head2 _get_SeqIO_object
|
|
157
|
|
158 Title : _get_SeqIO_object
|
|
159 Usage : $index->_get_SeqIO_object( $file )
|
|
160 Function: Returns a Bio::SeqIO object for the file
|
|
161 Example : $seq = $index->_get_SeqIO_object( 0 )
|
|
162 Returns : Bio::SeqIO object
|
|
163 Args : File number (an integer)
|
|
164
|
|
165 =cut
|
|
166
|
|
# Return the Bio::SeqIO object for file number $i, building it on first use
# from the file's handle and this class's file format, and remembering it in
# the '_seqio_cache' array so later calls reuse the same object.
sub _get_SeqIO_object {
    my ( $self, $i ) = @_;

    $self->{'_seqio_cache'}[$i] ||= do {

        # The handle is fetched before the format is looked up
        my $fh = $self->_file_handle($i);

        # make a new SeqIO object
        Bio::SeqIO->new(
            -Format => $self->_file_format,
            -fh     => $fh
        );
    };

    return $self->{'_seqio_cache'}[$i];
}
|
|
179
|
|
180 =head2 get_Seq_by_id
|
|
181
|
|
182 Title : get_Seq_by_id
|
|
183 Usage : $seq = $db->get_Seq_by_id($id)
|
|
184 Function: retrieves a sequence object, identically to
|
|
185 ->fetch, but here behaving as a Bio::DB::SeqI
|
|
186 Returns : new Bio::Seq object
|
|
187 Args : string represents the id
|
|
188
|
|
189
|
|
190 =cut
|
|
191
|
|
# Bio::DB::SeqI-style accessor: look a sequence up by id. Simply forwards
# the id to fetch() and hands back whatever fetch() returns.
sub get_Seq_by_id {
    my $self   = shift;
    my $seq_id = shift;

    return $self->fetch($seq_id);
}
|
|
197
|
|
198 =head2 get_Seq_by_acc
|
|
199
|
|
200 Title : get_Seq_by_acc
|
|
201 Usage : $seq = $db->get_Seq_by_acc($acc)
|
|
202 Function: retrieves a sequence object, identically to
|
|
203 ->fetch, but here behaving as a Bio::DB::SeqI
|
|
204 Returns : new Bio::Seq object
|
|
205 Args : string represents the accession number
|
|
206
|
|
207
|
|
208 =cut
|
|
209
|
|
# Bio::DB::SeqI-style accessor: look a sequence up by accession number.
# Simply forwards the accession to fetch() and hands back whatever fetch()
# returns.
sub get_Seq_by_acc {
    my $self = shift;
    my $acc  = shift;

    return $self->fetch($acc);
}
|
|
215
|
|
216 =head2 get_PrimarySeq_stream
|
|
217
|
|
218 Title : get_PrimarySeq_stream
|
|
219 Usage : $stream = $db->get_PrimarySeq_stream()
|
|
220 Function: Makes a Bio::DB::SeqStreamI compliant object
|
|
221 which provides a single method, next_primary_seq
|
|
222 Returns : Bio::DB::SeqStreamI
|
|
223 Args : none
|
|
224
|
|
225
|
|
226 =cut
|
|
227
|
|
# Build a Bio::SeqIO::MultiFile stream over every file recorded in the index.
# File names are read from the "__FILE_<n>" records for n = 0 .. count-1 and
# passed, together with this class's file format, to the MultiFile
# constructor.
sub get_PrimarySeq_stream {
    my $self = shift;

    my $file_total = $self->_file_count() || 0;
    my @file_names;

    my $n = 0;
    while ( $n < $file_total ) {

        # Only the first field of the unpacked record (the file) is used
        my ($name) = $self->unpack_record( $self->db->{"__FILE_$n"} );
        push @file_names, $name;
        $n++;
    }

    my $stream = Bio::SeqIO::MultiFile->new(
        '-format' => $self->_file_format,
        -files    => \@file_names
    );

    return $stream;
}
|
|
241
|
|
242 =head2 get_all_primary_ids
|
|
243
|
|
244 Title : get_all_primary_ids
|
|
245 Usage : @ids = $seqdb->get_all_primary_ids()
|
|
246 Function: gives an array of all the primary_ids of the
|
|
247 sequence objects in the database. These
|
|
248 may be ids (display style) or accession numbers
|
|
249 or something else completely different - they
|
|
250 *are not* meaningful outside of this database
|
|
251 implementation.
|
|
252 Example :
|
|
253 Returns : an array of strings
|
|
254 Args : none
|
|
255
|
|
256
|
|
257 =cut
|
|
258
|
|
# Return one id per distinct indexed record. In scalar context the return
# statement yields the number of such ids rather than a list.
#
# The same record may be indexed under several keys (for example both an
# accession number and a name), and nothing guarantees that accession
# numbers exist at all. So the index is made unique on "file:offset": every
# key that unpacks to the same file and start offset collapses to a single
# entry, and the last key seen during iteration is the one kept.
sub get_all_primary_ids {
    my ( $self, @args ) = @_;

    my $index = $self->db;
    my %id_at;

    while ( my ( $key, $record ) = each %$index ) {

        # keys starting with "__" hold internal info, not sequence ids
        next if $key =~ /^__/;

        my ( $file_no, $offset ) = $self->unpack_record($record);
        $id_at{"$file_no:$offset"} = $key;
    }

    return values %id_at;
}
|
|
287
|
|
288
|
|
289 =head2 get_Seq_by_primary_id
|
|
290
|
|
291 Title : get_Seq_by_primary_id
|
|
292 Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string);
|
|
293 Function: Gets a Bio::Seq object by the primary id. The primary
|
|
294 id in these cases has to come from $db->get_all_primary_ids.
|
|
295 There is no other way to get (or guess) the primary_ids
|
|
296 in a database.
|
|
297
|
|
298 The other possibility is to get Bio::PrimarySeqI objects
|
|
299 via the get_PrimarySeq_stream and the primary_id field
|
|
300 on these objects are specified as the ids to use here.
|
|
301 Returns : A Bio::Seq object
|
|
302 Args : primary id (as a string)
|
|
303 Throws : nothing itself; returns undef when the id is not in the index
|
|
304
|
|
305
|
|
306 =cut
|
|
307
|
|
# Bio::DB::SeqI-style accessor: look a sequence up by primary id. Simply
# forwards the id to fetch() and hands back whatever fetch() returns.
sub get_Seq_by_primary_id {
    my $self       = shift;
    my $primary_id = shift;

    return $self->fetch($primary_id);
}
|
|
312
|
|
313 1;
|