Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Index/AbstractSeq.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::Index::AbstractSeq | |
4 # | |
5 # Cared for by Ewan Birney <birney@ebi.ac.uk> | |
6 # | |
7 # Copyright Ewan Birney | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Index::AbstractSeq - Base class for sequence file indexers | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # Make a new sequence file indexing package | |
20 | |
21 package MyShinyNewIndexer; | |
22 use Bio::Index::AbstractSeq; | |
23 | |
24 @ISA = ('Bio::Index::AbstractSeq'); | |
25 | |
26 # Now provide the necessary methods... | |
27 | |
28 =head1 DESCRIPTION | |
29 | |
30 Provides a common base class for multiple | |
31 sequence files built using the | |
32 Bio::Index::Abstract system, and provides a | |
33 Bio::DB::SeqI interface. | |
34 | |
35 =head1 FEEDBACK | |
36 | |
37 =head2 Mailing Lists | |
38 | |
39 User feedback is an integral part of the evolution of this | |
40 and other Bioperl modules. Send your comments and suggestions preferably | |
41 to one of the Bioperl mailing lists. | |
42 Your participation is much appreciated. | |
43 | |
44 bioperl-l@bioperl.org - General discussion | |
45 http://bioperl.org/MailList.shtml - About the mailing lists | |
46 | |
47 =head2 Reporting Bugs | |
48 | |
49 Report bugs to the Bioperl bug tracking system to help us keep track | |
50 of the bugs and their resolution. | |
51 Bug reports can be submitted via email or the web: | |
52 | |
53 bioperl-bugs@bio.perl.org | |
54 http://bugzilla.bioperl.org/ | |
55 | |
56 =head1 AUTHOR - Ewan Birney | |
57 | |
58 Email birney@ebi.ac.uk | |
59 | |
60 Describe contact details here | |
61 | |
62 =head1 APPENDIX | |
63 | |
64 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
65 | |
66 =head1 SEE ALSO | |
67 | |
68 Bio::Index::Abstract - Module which | |
69 Bio::Index::AbstractSeq inherits from, which | |
70 provides dbm indexing for flat files (which are | |
71 not necessarily sequence files). | |
72 | |
73 =cut | |
74 | |
75 # Let's begin the code ... | |
76 | |
77 | |
78 package Bio::Index::AbstractSeq; | |
79 use vars qw(@ISA); | |
80 use strict; | |
81 | |
82 use Bio::SeqIO::MultiFile; | |
83 use Bio::Index::Abstract; | |
84 use Bio::DB::SeqI; | |
85 | |
86 | |
87 @ISA = qw(Bio::Index::Abstract Bio::DB::SeqI); | |
88 | |
# Constructor. Object creation is delegated to the parent class; this
# subclass only adds an empty cache that _get_SeqIO_object() fills in.
#
# Args    : passed through unchanged to the parent constructor
# Returns : the new object
sub new {
    my $class     = shift;
    my @ctor_args = @_;

    my $self = $class->SUPER::new(@ctor_args);

    # One slot per indexed file number, populated lazily on first use.
    $self->{'_seqio_cache'} = [];

    return $self;
}
96 | |
97 =head2 _file_format | |
98 | |
99 Title : _file_format | |
100 Usage : $self->_file_format | |
101 Function: Derived classes should override this | |
102 method (it throws an exception here) | |
103 to give the file format of the files used | |
104 Example : | |
105 Returns : | |
106 Args : | |
107 | |
108 | |
109 =cut | |
110 | |
# Abstract hook: a derived class overrides this to report the format of the
# files it indexes. The base implementation only raises an exception that
# names the offending class.
#
# Args    : none used
# Throws  : always, via $self->throw
sub _file_format {
    my $self = shift;

    my $class_name = ref $self;
    $self->throw("Class '$class_name' must provide a file format method correctly");
}
117 | |
118 =head2 fetch | |
119 | |
120 Title : fetch | |
121 Usage : $index->fetch( $id ) | |
122 Function: Returns a Bio::Seq object from the index | |
123 Example : $seq = $index->fetch( 'dJ67B12' ) | |
124 Returns : Bio::Seq object | |
125 Args : ID | |
126 | |
127 =cut | |
128 | |
# Look up $id in the index and return the sequence object parsed from the
# start of its record.
#
# Args    : $id - key to look up in the index database
# Returns : the object produced by the SeqIO object's next_seq(), or undef
#           when $id has no (true) record in the index
# Throws  : via $self->throw when the file handle cannot be positioned at
#           the stored record offset
sub fetch {
    my( $self, $id ) = @_;
    my $db = $self->db();
    my $seq;

    if (my $rec = $db->{ $id }) {
        my ($file, $begin) = $self->unpack_record( $rec );

        # Get the (possibly cached) SeqIO object
        my $seqio = $self->_get_SeqIO_object( $file );
        my $fh = $seqio->_fh();

        # Workaround for Win DB_File bug: step back one byte, but never
        # below zero, because seek() rejects a negative absolute offset.
        $begin-- if( $^O =~ /mswin/i && $begin > 0 );

        # Move to the start of the record. A failed seek would leave the
        # handle wherever the previous read stopped, and next_seq() would
        # then silently hand back the wrong record, so fail loudly instead.
        seek($fh, $begin, 0)
            or $self->throw("Could not seek to offset $begin in file $file for id '$id': $!");

        $seq = $seqio->next_seq();
    }

    # we essentially assume that the primary_id for the database
    # is the display_id
    $seq->primary_id($seq->display_id()) if( defined $seq && ref($seq) &&
                                             $seq->isa('Bio::PrimarySeqI') );

    return $seq;
}
155 | |
156 =head2 _get_SeqIO_object | |
157 | |
158 Title : _get_SeqIO_object | |
159 Usage : $index->_get_SeqIO_object( $file ) | |
160 Function: Returns a Bio::SeqIO object for the file | |
161 Example : $seq = $index->_get_SeqIO_object( 0 ) | |
162 Returns : Bio::SeqIO object | |
163 Args : File number (an integer) | |
164 | |
165 =cut | |
166 | |
# Return the SeqIO object for indexed file number $i. The object is built
# on the first request and kept in $self->{'_seqio_cache'}, so later calls
# for the same file number get the same object back.
#
# Args    : $i - file number (an integer)
# Returns : the cached or newly created SeqIO object
sub _get_SeqIO_object {
    my ($self, $i) = @_;

    # Cache hit: reuse the object created by an earlier call.
    return $self->{'_seqio_cache'}[$i] if $self->{'_seqio_cache'}[$i];

    # Cache miss: wrap this file's handle in a new SeqIO object, using the
    # format reported by the (subclass-provided) _file_format method.
    my $handle = $self->_file_handle($i);
    my $reader = Bio::SeqIO->new(
        -Format => $self->_file_format,
        -fh     => $handle,
    );
    $self->{'_seqio_cache'}[$i] = $reader;

    return $self->{'_seqio_cache'}[$i];
}
179 | |
180 =head2 get_Seq_by_id | |
181 | |
182 Title : get_Seq_by_id | |
183 Usage : $seq = $db->get_Seq_by_id($id) | |
184 Function: retrieves a sequence object, identically to | |
185 ->fetch, but here behaving as a Bio::DB::SeqI | |
186 Returns : new Bio::Seq object | |
187 Args : string represents the id | |
188 | |
189 | |
190 =cut | |
191 | |
# Retrieve a sequence by id. This is a thin wrapper that forwards the id to
# fetch() and returns whatever fetch() returns.
#
# Args    : $id - string id
# Returns : the result of $self->fetch($id)
sub get_Seq_by_id {
    my $self = shift;
    my $id   = shift;

    return $self->fetch($id);
}
197 | |
198 =head2 get_Seq_by_acc | |
199 | |
200 Title : get_Seq_by_acc | |
201 Usage : $seq = $db->get_Seq_by_acc($acc) | |
202 Function: retrieves a sequence object, identically to | |
203 ->fetch, but here behaving as a Bio::DB::SeqI | |
204 Returns : new Bio::Seq object | |
205 Args : string represents the accession number | |
206 | |
207 | |
208 =cut | |
209 | |
# Retrieve a sequence by accession number. The accession is looked up
# exactly like an id: it is forwarded to fetch() unchanged.
#
# Args    : $id - string accession number
# Returns : the result of $self->fetch($id)
sub get_Seq_by_acc {
    my $self = shift;
    my $id   = shift;

    return $self->fetch($id);
}
215 | |
216 =head2 get_PrimarySeq_stream | |
217 | |
218 Title : get_PrimarySeq_stream | |
219 Usage : $stream = get_PrimarySeq_stream | |
220 Function: Makes a Bio::DB::SeqStreamI compliant object | |
221 which provides a single method, next_primary_seq | |
222 Returns : Bio::DB::SeqStreamI | |
223 Args : none | |
224 | |
225 | |
226 =cut | |
227 | |
# Build one stream over every file recorded in the index.
#
# The index stores one "__FILE_<n>" record per file, numbered from zero.
# The first field of each unpacked record is collected, in order, and the
# list is handed to Bio::SeqIO::MultiFile together with this index's format.
#
# Args    : none
# Returns : the Bio::SeqIO::MultiFile object
sub get_PrimarySeq_stream {
    my $self       = shift;
    my $file_total = $self->_file_count() || 0;

    my @filenames;
    foreach my $n (0 .. $file_total - 1) {
        # only the first field of the record is needed here
        my ($filename) = $self->unpack_record( $self->db->{"__FILE_$n"} );
        push @filenames, $filename;
    }

    my $stream = Bio::SeqIO::MultiFile->new(
        '-format' => $self->_file_format,
        -files    => \@filenames,
    );
    return $stream;
}
241 | |
242 =head2 get_all_primary_ids | |
243 | |
244 Title : get_all_primary_ids | |
245 Usage : @ids = $seqdb->get_all_primary_ids() | |
246 Function: gives an array of all the primary_ids of the | |
247 sequence objects in the database. These | |
248 may be ids (display style) or accession numbers | |
249 or something else completely different - they | |
250 *are not* meaningful outside of this database | |
251 implementation. | |
252 Example : | |
253 Returns : an array of strings | |
254 Args : none | |
255 | |
256 | |
257 =cut | |
258 | |
# Return one id for every distinct record position in the index.
#
# The same record may be indexed under more than one key (for example both
# a name and an accession number), so the keys cannot simply be returned
# as they are. Each key is filed under its "file:offset" position instead;
# keys that share a position overwrite one another, leaving a single id per
# record. Which of the competing keys survives depends on iteration order.
#
# Args    : none used
# Returns : the surviving ids (a list of strings)
sub get_all_primary_ids {
    my $self = shift;
    my $db   = $self->db;

    my %id_at;
    while (my ($key, $record) = each %$db) {
        # keys starting with "__" hold internal info, not sequences
        next if $key =~ /^__/;

        my ($file, $begin) = $self->unpack_record($record);
        $id_at{"$file:$begin"} = $key;
    }

    return values %id_at;
}
287 | |
288 | |
289 =head2 get_Seq_by_primary_id | |
290 | |
291 Title : get_Seq_by_primary_id | |
292 Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string); | |
293 Function: Gets a Bio::Seq object by the primary id. The primary | |
294 id in these cases has to come from $db->get_all_primary_ids. | |
295 There is no other way to get (or guess) the primary_ids | |
296 in a database. | |
297 | |
298 The other possibility is to get Bio::PrimarySeqI objects | |
299 via the get_PrimarySeq_stream and the primary_id field | |
300 on these objects are specified as the ids to use here. | |
301 Returns : A Bio::Seq object | |
302 Args : primary id (as a string) | |
303 Throws : "acc does not exist" exception | |
304 | |
305 | |
306 =cut | |
307 | |
# Retrieve a sequence by primary id. The primary id is forwarded to fetch()
# unchanged, so the result is whatever fetch() returns for that key.
#
# Args    : $id - primary id (as a string)
# Returns : the result of $self->fetch($id)
sub get_Seq_by_primary_id {
    my $self = shift;
    my $id   = shift;

    return $self->fetch($id);
}
312 | |
313 1; |