comparison variant_effect_predictor/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::DBFile::CollectionAdaptor
24
25 =head1 SYNOPSIS
26
27 For use with a Bio::EnsEMBL::Collector e.g.
28
29 package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor;
30
31 @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor
32 Bio::EnsEMBL::Funcgen::Collector::ResultFeature
33 Bio::EnsEMBL::DBFile::CollectionAdaptor);
34 #DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor
35
36
37 Fetch wrapper methods access file based data via read_collection_blob:
38
39 sub _fetch_from_file_by_Slice_ResultSet{
40
41 #define filepath/config
42
43 my $packed_scores = $self->read_collection_blob(
44 $filepath,
45 $efg_sr_id,
46 $conf->{$window_size}{'byte_offset'},
47 $conf->{$window_size}{'byte_length'},
48 );
49
50 #Do unpacking and object creation here
51
52 }
53
54 =head1 DESCRIPTION
55
56 Adaptor for direct access to collection (.col) files, which are binary, compressed,
57 fixed width format files providing window based values across the genome. Collection
58 files integrate an index block which contains the seq_region byte offset values.
59
60 NOTE: By default all collection files are generated and packed using little endian encoding.
61 Due to the lack of a standard for float encoding (with respect to endianness), Perl packs
62 floats using the implicit endianness of the underlying architecture. This means that accessing
63 float collection files located on a big endian architecture will produce unexpected results.
64
65 # endian issues will disappear with knetfile xsubs
66
67 =head1 SEE ALSO
68
69 Bio::EnsEMBL::DBFile::FileAdaptor
70
71 =cut
72
73
74
75 package Bio::EnsEMBL::DBFile::CollectionAdaptor;
76
77 use strict;
78 use warnings;
79
80 use Bio::EnsEMBL::DBFile::FileAdaptor;
81 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
82 use vars qw(@ISA);
83 @ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor);
84
85
=head2 initialise_filehandle

  Arg[1]     : string - filepath
  Example    : $self->initialise_filehandle($filepath);
  Description: Initialises the cached filehandle for use, in this case reads
               the index (seq_region offsets) from the start of the file and
               stores it in the file cache. If the index cannot be read or is
               malformed, the cached filehandle is set to undef.
  Returntype : hashref - seq_region key => byte offset pairs as stored in the
               file cache (undef if no index has been read successfully)
  Exceptions : warns if no filehandle is cached, if a read fails or falls
               short, or if the index size is not a whole number of entries
  Caller     : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle
  Status     : at risk

=cut

sub initialise_filehandle{
  my ($self, $filepath) = @_;
  my $fh = $self->{file_cache}{$filepath}{filehandle};

  ### INDEX FORMAT ###
  # The first block is the index size in bytes (not including the size block
  # itself), encoded as v (2 bytes).
  #
  # The rest of the index is a list of sr_id (v, 2 bytes) => offset (V, 4 bytes)
  # pairs i.e. 6 bytes per entry. Offsets include the length of the complete
  # index block.
  #
  # sysread is used as it works explicitly in bytes. Buffered and unbuffered
  # I/O must not be mixed on the same handle, hence sysseek is used elsewhere.

  my $error;

  if(! defined $fh){
    # sysread on an undefined handle would die, so report it like any other failure
    $error = "No filehandle cached for $filepath";
  }
  else{
    my ($size_block, $index);
    my $read_bytes = sysread($fh, $size_block, 2);

    if(! defined $read_bytes){
      # undef is a genuine I/O error, only then is $! meaningful
      $error = "Failed to read index size from $filepath\n$!";
    }
    elsif($read_bytes != 2){
      $error = "Failed to read index size from $filepath\nUnexpected end of file";
    }
    else{
      my $index_size = unpack('v', $size_block);

      if($index_size % 6){
        # A partial entry would give a fractional repeat count in the unpack template
        $error = "Malformed index in $filepath\n".
          "Index size($index_size) is not a multiple of 6 bytes";
      }
      else{
        $read_bytes = sysread($fh, $index, $index_size); #Now read index proper

        if(! defined $read_bytes){
          $error = "Failed to read index from $filepath\n$!";
        }
        elsif($read_bytes != $index_size){
          $error = "Failed to read index from $filepath\nUnexpected end of file";
        }
        else{
          # Number of key-value pairs => index size / (key(v 2 bytes) + offset(V 4 bytes))
          my $num_keys     = $index_size / 6;
          my %offset_index = unpack('(vV)'.$num_keys, $index);
          $self->{file_cache}{$filepath}{off_sets} = \%offset_index;
        }
      }
    }
  }

  if(defined $error){
    warn $error;

    #Delete fh as it is useless/unsafe to retry
    undef $self->{file_cache}{$filepath}{filehandle};
  }

  return $self->{file_cache}{$filepath}{off_sets};
}
153
154
=head2 read_collection_blob

  Arg[1]     : string - filepath
  Arg[2]     : int - seq_region_id
  Arg[3]     : int - seq_region offset. The byte offset required to
               locate the required start position
  Arg[4]     : int - byte length to read
  Example    : my $blob_substr = $self->read_collection_blob($filepath,
                                                             $sr_key,
                                                             $sr_offset,
                                                             $byte_length);
  Description: Reads bytes from file given a seq_region_key, byte offset and byte length.
               Sets the cached filehandle to undef if the read errors or returns
               fewer bytes than requested. The filehandle is kept if the seek fails
               or end of file is hit before any byte is read, as the requested
               region may simply be out of range.
  Returntype : string - packed binary data. undef if there is no filehandle, the
               seq_region key is not in the index, the seek fails, or the read
               errors or falls short. Empty string if end of file is encountered
               before any byte is read.
  Exceptions : warns if seek or read errors
  Caller     : general e.g. fetch_from_file_by_Slice_ResultSet
  Status     : at risk

=cut

# We could change this to take a Slice, hence we could check
# whether an EOF error is because the slice is out of range
# and undef only if it is in range i.e. the index/file is corrupt
# overkill?
# This is something the Slice API should warn about
# but will still cause undef'd filehandle here
# Index should also contain ends, so we can validate whether the slice is out of range???


sub read_collection_blob{
  my($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_;

  my $blob_substr;
  my $fh = $self->get_filehandle($filepath, {-binmode => 1});

  # No usable filehandle, or seq_region is not part of the index for this file
  return $blob_substr if ! defined $fh;
  return $blob_substr if ! exists $self->{file_cache}{$filepath}{off_sets}{$sr_key};

  my $sr_start     = $self->{file_cache}{$filepath}{off_sets}{$sr_key};
  my $total_offset = $sr_start + $sr_offset;

  # 0(whence) is SEEK_SET. sysseek returns '0 but true' for position 0,
  # so a simple truth test is safe here.
  if(! sysseek($fh, $total_offset, 0)){
    warn("Failed to seek to byte $total_offset in $filepath");
    #Don't undef fh here as this valid Slice maybe out of range
    #and we don't want to kill a valid fh
    #i.e. Slice start/end is past end of seq_region
    return $blob_substr;
  }

  my $read_bytes = sysread($fh, $blob_substr, $byte_length);

  if(! defined $read_bytes){
    # undef is a genuine I/O error (0 is end of file), only here is $! meaningful
    warn "Failed to read from $filepath\n$!";

    #Delete fh as it is useless/unsafe to retry
    undef $self->{file_cache}{$filepath}{filehandle};
    undef $blob_substr;
  }
  elsif($read_bytes != $byte_length){
    warn "Failed to read from $filepath\n";

    if($read_bytes == 0){
      #This maybe because the slice is out of range!
      #The API gives no warning about this
      warn "End Of File encountered\n";
      warn "Total offset:\t".$sr_start." key($sr_key) + $sr_offset = $total_offset\n";

      #add some checks against the theoretical/true length of the file?
    }
    else{ #Short read. Delete fh as it is useless/unsafe to retry
      undef $self->{file_cache}{$filepath}{filehandle};
      #$blob_substr holds partial data, do not return it
      undef $blob_substr;
    }
  }

  return $blob_substr;
}
241
242
243 1;