comparison variant_effect_predictor/Bio/Index/AbstractSeq.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $
2 #
3 # BioPerl module for Bio::Index::AbstractSeq
4 #
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
6 #
7 # Copyright Ewan Birney
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Index::AbstractSeq - Base class for indexers of sequence files
16
17 =head1 SYNOPSIS
18
19 # Make a new sequence file indexing package
20
21 package MyShinyNewIndexer;
22 use Bio::Index::AbstractSeq;
23
24 @ISA = ('Bio::Index::AbstractSeq');
25
26 # Now provide the necessary methods...
27
28 =head1 DESCRIPTION
29
30 Provides a common base class for multiple
31 sequence files built using the
32 Bio::Index::Abstract system, and provides a
33 Bio::DB::SeqI interface.
34
35 =head1 FEEDBACK
36
37 =head2 Mailing Lists
38
39 User feedback is an integral part of the evolution of this
40 and other Bioperl modules. Send your comments and suggestions preferably
41 to one of the Bioperl mailing lists.
42 Your participation is much appreciated.
43
44 bioperl-l@bioperl.org - General discussion
45 http://bioperl.org/MailList.shtml - About the mailing lists
46
47 =head2 Reporting Bugs
48
49 Report bugs to the Bioperl bug tracking system to help us keep track
50 of the bugs and their resolution.
51 Bug reports can be submitted via email or the web:
52
53 bioperl-bugs@bio.perl.org
54 http://bugzilla.bioperl.org/
55
56 =head1 AUTHOR - Ewan Birney
57
58 Email birney@ebi.ac.uk
59
60 Describe contact details here
61
62 =head1 APPENDIX
63
64 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
65
66 =head1 SEE ALSO
67
68 Bio::Index::Abstract - Module which
69 Bio::Index::AbstractSeq inherits from, which
70 provides dbm indexing for flat files (which are
71 not necessarily sequence files).
72
73 =cut
74
75 # Let's begin the code ...
76
77
78 package Bio::Index::AbstractSeq;
79 use vars qw(@ISA);
80 use strict;
81
82 use Bio::SeqIO::MultiFile;
83 use Bio::Index::Abstract;
84 use Bio::DB::SeqI;
85
86
87 @ISA = qw(Bio::Index::Abstract Bio::DB::SeqI);
88
# Constructor. Delegates object creation to the parent class constructor,
# then attaches an empty cache of SeqIO readers (one slot per indexed file
# number, filled on demand by _get_SeqIO_object).
sub new {
    my $class     = shift;
    my @ctor_args = @_;

    my $self = $class->SUPER::new(@ctor_args);
    $self->{'_seqio_cache'} = [];

    return $self;
}
96
97 =head2 _file_format
98
99 Title : _file_format
100 Usage : $self->_file_format
101 Function: Derived classes should override this
102 method (it throws an exception here)
103 to give the file format of the files used
104 Example :
105 Returns :
106 Args :
107
108
109 =cut
110
# Abstract hook: the base implementation only reports, through the object's
# throw method, that the concrete class has not supplied its own
# _file_format.
sub _file_format {
    my $self = shift;

    my $pkg = ref $self;
    $self->throw("Class '$pkg' must provide a file format method correctly");
}
117
118 =head2 fetch
119
120 Title : fetch
121 Usage : $index->fetch( $id )
122 Function: Returns a Bio::Seq object from the index
123 Example : $seq = $index->fetch( 'dJ67B12' )
124 Returns : Bio::Seq object
125 Args : ID
126
127 =cut
128
# Look up $id in the index and return the sequence stored at the recorded
# file offset.
#
#   Args    : $id - key to look up in the index database
#   Returns : whatever next_seq() yields on the (possibly cached) SeqIO
#             reader once it is positioned at the record, or undef when
#             $id has no record in the index
#   Throws  : via $self->throw when the file handle cannot be positioned
#             at the recorded offset
sub fetch {
    my ($self, $id) = @_;
    my $db = $self->db();
    my $seq;

    if (my $rec = $db->{$id}) {
        my ($file, $begin) = $self->unpack_record($rec);

        # Get the (possibly cached) SeqIO object
        my $seqio = $self->_get_SeqIO_object($file);
        my $fh    = $seqio->_fh();

        # Workaround for Win DB_File bug: step back one byte, but never
        # below offset 0, which seek() would reject.
        $begin-- if $^O =~ /mswin/i && $begin > 0;

        # Move to the start of the record. A failed seek leaves the handle
        # wherever the previous read stopped, so next_seq() would silently
        # return an unrelated record; report the failure instead.
        seek($fh, $begin, 0)
            or $self->throw("Could not seek to offset $begin for id '$id': $!");

        $seq = $seqio->next_seq();
    }

    # We essentially assume that the primary_id for the database
    # is the display_id
    if (defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI')) {
        $seq->primary_id($seq->display_id());
    }

    return $seq;
}
155
156 =head2 _get_SeqIO_object
157
158 Title : _get_SeqIO_object
159 Usage : $index->_get_SeqIO_object( $file )
160 Function: Returns a Bio::SeqIO object for the file
161 Example : $seq = $index->_get_SeqIO_object( 0 )
162 Returns : Bio::SeqIO object
163 Args : File number (an integer)
164
165 =cut
166
# Return the SeqIO reader for file number $i, building it on first use.
# The reader is created on the handle from _file_handle($i) with the format
# named by _file_format, and is kept in $self->{'_seqio_cache'} so later
# calls for the same file number get the same object back.
sub _get_SeqIO_object {
    my ($self, $i) = @_;

    # Already built for this file number: reuse it.
    return $self->{'_seqio_cache'}[$i] if $self->{'_seqio_cache'}[$i];

    my $handle = $self->_file_handle($i);
    my $reader = Bio::SeqIO->new(
        -Format => $self->_file_format,
        -fh     => $handle,
    );

    return $self->{'_seqio_cache'}[$i] = $reader;
}
179
180 =head2 get_Seq_by_id
181
182 Title : get_Seq_by_id
183 Usage : $seq = $db->get_Seq_by_id($id)
184 Function: retrieves a sequence object, identically to
185 ->fetch, but here behaving as a Bio::DB::SeqI
186 Returns : new Bio::Seq object
187 Args : string represents the id
188
189
190 =cut
191
# Bio::DB::SeqI-style accessor: the id is handed straight to fetch() and
# fetch's result is returned unchanged.
sub get_Seq_by_id {
    my $self = shift;
    my ($id) = @_;

    return $self->fetch($id);
}
197
198 =head2 get_Seq_by_acc
199
200 Title : get_Seq_by_acc
201 Usage : $seq = $db->get_Seq_by_acc($acc)
202 Function: retrieves a sequence object, identically to
203 ->fetch, but here behaving as a Bio::DB::SeqI
204 Returns : new Bio::Seq object
205 Args : string represents the accession number
206
207
208 =cut
209
# Bio::DB::SeqI-style accessor: the accession is handed straight to fetch()
# and fetch's result is returned unchanged.
sub get_Seq_by_acc {
    my $self = shift;
    my ($id) = @_;

    return $self->fetch($id);
}
215
216 =head2 get_PrimarySeq_stream
217
218 Title : get_PrimarySeq_stream
219 Usage : $stream = get_PrimarySeq_stream
220 Function: Makes a Bio::DB::SeqStreamI compliant object
221 which provides a single method, next_primary_seq
222 Returns : Bio::DB::SeqStreamI
223 Args : none
224
225
226 =cut
227
# Build a Bio::SeqIO::MultiFile stream over every file recorded in the
# index. File names come from the "__FILE_<n>" entries of the index
# database (first field of each unpacked record); the stream format is the
# one reported by _file_format.
sub get_PrimarySeq_stream {
    my ($self) = @_;
    my $file_total = $self->_file_count() || 0;

    my @paths;
    for my $i (0 .. $file_total - 1) {
        # Only the file name is needed; remaining record fields are ignored.
        my ($path) = $self->unpack_record( $self->db->{"__FILE_$i"} );
        push @paths, $path;
    }

    my $stream = Bio::SeqIO::MultiFile->new(
        '-format' => $self->_file_format,
        -files    => \@paths,
    );
    return $stream;
}
241
242 =head2 get_all_primary_ids
243
244 Title : get_all_primary_ids
245 Usage : @ids = $seqdb->get_all_primary_ids()
246 Function: gives an array of all the primary_ids of the
247 sequence objects in the database. These
248 may be ids (display style) or accession numbers
249 or something else completely different - they
250 *are not* meaningful outside of this database
251 implementation.
252 Example :
253 Returns : an array of strings
254 Args : none
255
256
257 =cut
258
# Return one id per distinct record location in the index.
#
# The same sequence may be indexed under several keys (for instance both a
# name and an accession number), and a database may have no accession
# numbers at all, so ids cannot be told apart by their form. Instead the
# index is made unique on "file:offset": for each location the id seen last
# while walking the database is the one that is kept.
#
#   Args    : none
#   Returns : the kept ids, in no particular order
sub get_all_primary_ids {
    my $self  = shift;
    my $index = $self->db;

    my %id_at;
    ENTRY:
    while (my ($key, $record) = each %$index) {
        # Keys starting with "__" hold internal info, not sequence ids.
        next ENTRY if $key =~ /^__/;

        my ($file, $begin) = $self->unpack_record($record);
        $id_at{"$file:$begin"} = $key;
    }

    return values %id_at;
}
287
288
289 =head2 get_Seq_by_primary_id
290
291 Title : get_Seq_by_primary_id
292 Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string);
293 Function: Gets a Bio::Seq object by the primary id. The primary
294 id in these cases has to come from $db->get_all_primary_ids.
295 There is no other way to get (or guess) the primary_ids
296 in a database.
297
298 The other possibility is to get Bio::PrimarySeqI objects
299 via the get_PrimarySeq_stream and the primary_id field
300 on these objects are specified as the ids to use here.
301 Returns : A Bio::Seq object
302 Args : primary id (as a string)
303 Throws : "acc does not exist" exception
304
305
306 =cut
307
# Primary-id lookup: the id is handed straight to fetch() and fetch's
# result is returned unchanged.
sub get_Seq_by_primary_id {
    my $self = shift;
    my ($id) = @_;

    return $self->fetch($id);
}
312
313 1;