Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Index/AbstractSeq.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: AbstractSeq.pm,v 1.16 2002/10/22 07:38:33 lapp Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::Index::AbstractSeq | |
| 4 # | |
| 5 # Cared for by Ewan Birney <birney@ebi.ac.uk> | |
| 6 # | |
| 7 # Copyright Ewan Birney | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::Index::AbstractSeq - Base class for indexes of sequence files | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 # Make a new sequence file indexing package | |
| 20 | |
| 21 package MyShinyNewIndexer; | |
| 22 use Bio::Index::AbstractSeq; | |
| 23 | |
| 24 @ISA = ('Bio::Index::AbstractSeq'); | |
| 25 | |
| 26 # Now provide the necessary methods... | |
| 27 | |
| 28 =head1 DESCRIPTION | |
| 29 | |
| 30 Provides a common base class for multiple | |
| 31 sequence files built using the | |
| 32 Bio::Index::Abstract system, and provides a | |
| 33 Bio::DB::SeqI interface. | |
| 34 | |
| 35 =head1 FEEDBACK | |
| 36 | |
| 37 =head2 Mailing Lists | |
| 38 | |
| 39 User feedback is an integral part of the evolution of this | |
| 40 and other Bioperl modules. Send your comments and suggestions preferably | |
| 41 to one of the Bioperl mailing lists. | |
| 42 Your participation is much appreciated. | |
| 43 | |
| 44 bioperl-l@bioperl.org - General discussion | |
| 45 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 46 | |
| 47 =head2 Reporting Bugs | |
| 48 | |
| 49 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 50 of the bugs and their resolution. | |
| 51 Bug reports can be submitted via email or the web: | |
| 52 | |
| 53 bioperl-bugs@bio.perl.org | |
| 54 http://bugzilla.bioperl.org/ | |
| 55 | |
| 56 =head1 AUTHOR - Ewan Birney | |
| 57 | |
| 58 Email birney@ebi.ac.uk | |
| 59 | |
| 60 Describe contact details here | |
| 61 | |
| 62 =head1 APPENDIX | |
| 63 | |
| 64 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
| 65 | |
| 66 =head1 SEE ALSO | |
| 67 | |
| 68 Bio::Index::Abstract - Module which | |
| 69 Bio::Index::AbstractSeq inherits off, which | |
| 70 provides dbm indexing for flat files (which are | |
| 71 not necessarily sequence files). | |
| 72 | |
| 73 =cut | |
| 74 | |
| 75 # Let's begin the code ... | |
| 76 | |
| 77 | |
| 78 package Bio::Index::AbstractSeq; | |
| 79 use vars qw(@ISA); | |
| 80 use strict; | |
| 81 | |
| 82 use Bio::SeqIO::MultiFile; | |
| 83 use Bio::Index::Abstract; | |
| 84 use Bio::DB::SeqI; | |
| 85 | |
| 86 | |
| 87 @ISA = qw(Bio::Index::Abstract Bio::DB::SeqI); | |
| 88 | |
# Constructor.  Object creation is delegated to the parent class; the
# only thing added here is an empty array, stored under '_seqio_cache',
# which _get_SeqIO_object later fills with one SeqIO object per file
# number.
#
#   Args    : passed through unchanged to the parent constructor
#   Returns : the object produced by the parent constructor
sub new {
    my $class     = shift;
    my @ctor_args = @_;

    my $self = $class->SUPER::new(@ctor_args);

    # Start with no cached SeqIO objects.
    $self->{'_seqio_cache'} = [];

    return $self;
}
| 96 | |
| 97 =head2 _file_format | |
| 98 | |
| 99 Title : _file_format | |
| 100 Usage : $self->_file_format | |
| 101 Function: Derived classes should override this | |
| 102 method (it throws an exception here) | |
| 103 to give the file format of the files used | |
| 104 Example : | |
| 105 Returns : | |
| 106 Args : | |
| 107 | |
| 108 | |
| 109 =cut | |
| 110 | |
# Abstract hook.  Derived classes are expected to override this and give
# the file format of the files they index.  This base implementation
# does nothing except call throw() with a message that names the class
# of the invocant.
sub _file_format {
    my $self = shift;

    my $class_name = ref $self;
    $self->throw("Class '$class_name' must provide a file format method correctly");
}
| 117 | |
| 118 =head2 fetch | |
| 119 | |
| 120 Title : fetch | |
| 121 Usage : $index->fetch( $id ) | |
| 122 Function: Returns a Bio::Seq object from the index | |
| 123 Example : $seq = $index->fetch( 'dJ67B12' ) | |
| 124 Returns : Bio::Seq object | |
| 125 Args : ID | |
| 126 | |
| 127 =cut | |
| 128 | |
# Look up $id in the index and return the sequence object read from the
# indexed file, or undef when the index holds no (true) record for $id.
#
#   Args    : $id - key to look up in the hash returned by db()
#   Returns : whatever next_seq() of the file's SeqIO object yields,
#             or undef if the id is not indexed
sub fetch {
    my $self = shift;
    my $id   = shift;

    my $index = $self->db();
    my $seq;

    my $record = $index->{$id};
    if ($record) {
        # The record unpacks to the file number and the byte offset
        # at which the entry starts.
        my ($file_no, $offset) = $self->unpack_record($record);

        # Get the (possibly cached) SeqIO object and its raw filehandle.
        my $seqio  = $self->_get_SeqIO_object($file_no);
        my $handle = $seqio->_fh();

        # Workaround for a Win DB_File bug: step the offset back by one.
        if ($^O =~ /mswin/i) {
            $offset--;
        }

        # Position the handle at the start of the record, then parse it.
        seek($handle, $offset, 0);
        $seq = $seqio->next_seq();
    }

    # We essentially assume that the primary_id for the database
    # is the display_id.
    if (defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI')) {
        $seq->primary_id( $seq->display_id() );
    }

    return $seq;
}
| 155 | |
| 156 =head2 _get_SeqIO_object | |
| 157 | |
| 158 Title : _get_SeqIO_object | |
| 159 Usage : $index->_get_SeqIO_object( $file ) | |
| 160 Function: Returns a Bio::SeqIO object for the file | |
| 161 Example : $seq = $index->_get_SeqIO_object( 0 ) | |
| 162 Returns : Bio::SeqIO object | |
| 163 Args : File number (an integer) | |
| 164 | |
| 165 =cut | |
| 166 | |
# Return the Bio::SeqIO object for file number $file_no, building it on
# first use and keeping it in the '_seqio_cache' array for later calls.
#
#   Args    : $file_no - file number (index into the cache and argument
#             to _file_handle)
#   Returns : the cached Bio::SeqIO object for that file
sub _get_SeqIO_object {
    my $self    = shift;
    my $file_no = shift;

    if ( !$self->{'_seqio_cache'}[$file_no] ) {
        # Nothing cached yet: wrap the file's handle in a new SeqIO
        # object using the format reported by the concrete class.
        my $handle = $self->_file_handle($file_no);
        $self->{'_seqio_cache'}[$file_no] = Bio::SeqIO->new(
            -Format => $self->_file_format,
            -fh     => $handle,
        );
    }

    return $self->{'_seqio_cache'}[$file_no];
}
| 179 | |
| 180 =head2 get_Seq_by_id | |
| 181 | |
| 182 Title : get_Seq_by_id | |
| 183 Usage : $seq = $db->get_Seq_by_id() | |
| 184 Function: retrieves a sequence object, identically to | |
| 185 ->fetch, but here behaving as a Bio::DB::SeqI | |
| 186 Returns : new Bio::Seq object | |
| 187 Args : string represents the id | |
| 188 | |
| 189 | |
| 190 =cut | |
| 191 | |
# Bio::DB::SeqI-style accessor: retrieve a sequence by id.
# Simply forwards the id to fetch() and returns its result.
#
#   Args    : $id - string id
#   Returns : whatever fetch($id) returns
sub get_Seq_by_id {
    my $self = shift;
    my $id   = shift;

    return $self->fetch($id);
}
| 197 | |
| 198 =head2 get_Seq_by_acc | |
| 199 | |
| 200 Title : get_Seq_by_acc | |
| 201 Usage : $seq = $db->get_Seq_by_acc() | |
| 202 Function: retrieves a sequence object, identically to | |
| 203 ->fetch, but here behaving as a Bio::DB::SeqI | |
| 204 Returns : new Bio::Seq object | |
| 205 Args : string represents the accession number | |
| 206 | |
| 207 | |
| 208 =cut | |
| 209 | |
# Bio::DB::SeqI-style accessor: retrieve a sequence by accession number.
# The accession is looked up exactly like any other key, via fetch().
#
#   Args    : $acc - accession number string
#   Returns : whatever fetch($acc) returns
sub get_Seq_by_acc {
    my $self = shift;
    my ($acc) = @_;

    return $self->fetch($acc);
}
| 215 | |
| 216 =head2 get_PrimarySeq_stream | |
| 217 | |
| 218 Title : get_PrimarySeq_stream | |
| 219 Usage : $stream = $db->get_PrimarySeq_stream() | |
| 220 Function: Makes a Bio::DB::SeqStreamI compliant object | |
| 221 which provides a single method, next_primary_seq | |
| 222 Returns : Bio::DB::SeqStreamI | |
| 223 Args : none | |
| 224 | |
| 225 | |
| 226 =cut | |
| 227 | |
# Build a Bio::SeqIO::MultiFile stream over every file recorded in the
# index.  The files are collected from the "__FILE_<n>" entries of the
# db, for n from 0 up to (_file_count - 1).
#
#   Args    : none
#   Returns : the Bio::SeqIO::MultiFile object created for those files
sub get_PrimarySeq_stream {
    my ($self) = @_;

    my $file_total = $self->_file_count() || 0;

    my @paths;
    for my $n ( 0 .. $file_total - 1 ) {
        # Only the first field of the unpacked record is used here.
        my ($path) = $self->unpack_record( $self->db->{"__FILE_$n"} );
        push @paths, $path;
    }

    my $stream = Bio::SeqIO::MultiFile->new(
        '-format' => $self->_file_format,
        -files    => \@paths,
    );
    return $stream;
}
| 241 | |
| 242 =head2 get_all_primary_ids | |
| 243 | |
| 244 Title : get_all_primary_ids | |
| 245 Usage : @ids = $seqdb->get_all_primary_ids() | |
| 246 Function: gives an array of all the primary_ids of the | |
| 247 sequence objects in the database. These | |
| 248 may be ids (display style) or accession numbers | |
| 249 or something else completely different - they | |
| 250 *are not* meaningful outside of this database | |
| 251 implementation. | |
| 252 Example : | |
| 253 Returns : an array of strings | |
| 254 Args : none | |
| 255 | |
| 256 | |
| 257 =cut | |
| 258 | |
# Return one id per distinct indexed record.
#
# The same record may be indexed under several keys (for instance both
# an accession number and a name), and a database may have no accession
# numbers at all, so accession semantics cannot be relied on.  Instead
# the keys are made unique by the "file:offset" position they point to:
# for each position only one key is kept (the last one seen while
# iterating the db).  Keys beginning with "__" hold internal
# bookkeeping information and are skipped.
#
#   Args    : none
#   Returns : list of ids, one per distinct file/offset pair
sub get_all_primary_ids {
    my $self = shift;

    my $index = $self->db;

    my %id_at_position;
    while ( my ($key, $record) = each %{$index} ) {
        # Skip internal info entries.
        next if $key =~ /^__/;

        my ($file_no, $offset) = $self->unpack_record($record);
        $id_at_position{"$file_no:$offset"} = $key;
    }

    return values %id_at_position;
}
| 287 | |
| 288 | |
| 289 =head2 get_Seq_by_primary_id | |
| 290 | |
| 291 Title : get_Seq_by_primary_id | |
| 292 Usage : $seq = $db->get_Seq_by_primary_id($primary_id_string); | |
| 293 Function: Gets a Bio::Seq object by the primary id. The primary | |
| 294 id in these cases has to come from $db->get_all_primary_ids. | |
| 295 There is no other way to get (or guess) the primary_ids | |
| 296 in a database. | |
| 297 | |
| 298 The other possibility is to get Bio::PrimarySeqI objects | |
| 299 via the get_PrimarySeq_stream and the primary_id field | |
| 300 on these objects are specified as the ids to use here. | |
| 301 Returns : A Bio::Seq object | |
| 302 Args : primary id (as a string) | |
| 303 Throws : "acc does not exist" exception | |
| 304 | |
| 305 | |
| 306 =cut | |
| 307 | |
# Bio::DB::SeqI-style accessor: retrieve a sequence by primary id.
# The primary id is treated as an ordinary index key and handed
# straight to fetch().
#
#   Args    : $primary_id - primary id string
#   Returns : whatever fetch($primary_id) returns
sub get_Seq_by_primary_id {
    my $self       = shift;
    my $primary_id = shift;

    return $self->fetch($primary_id);
}
| 312 | |
| 313 1; |
