Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/DBFile/CollectionAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::DBFile::CollectionAdaptor | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 For use with a Bio::EnsEMBL::Collector e.g. | |
28 | |
29 package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor; | |
30 | |
31 @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor | |
32 Bio::EnsEMBL::Funcgen::Collector::ResultFeature | |
33 Bio::EnsEMBL::DBFile::CollectionAdaptor); | |
34 #DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor | |
35 | |
36 | |
37 Fetch wrapper methods access file based data via read_collection_blob: | |
38 | |
39 sub _fetch_from_file_by_Slice_ResultSet{ | |
40 | |
41 #define filepath/config | |
42 | |
43 my $packed_scores = $self->read_collection_blob( | |
44 $filepath, | |
45 $efg_sr_id, | |
46 $conf->{$window_size}{'byte_offset'}, | |
47 $conf->{$window_size}{'byte_length'}, | |
48 ); | |
49 | |
50 #Do unpacking and object creation here | |
51 | |
52 } | |
53 | |
54 =head1 DESCRIPTION | |
55 | |
56 Adaptor for direct access to collection (.col) files, which are binary, compressed, fixed | |
57 width format files providing window based values across the genome. Collection files | |
58 integrate an index block which contains seq_region byte offset values. | |
59 | |
60 NOTE: By default all collection files are generated and packed using little endian encoding. | |
61 Due to the lack of a standard for float encoding (with respect to endianness), perl packs floats | |
62 using the implicit endianness of the underlying architecture. This means that accessing float | |
63 collection files located on a big endian architecture will produce unexpected results. | |
64 | |
65 # endian issues will disappear with knetfile xsubs | |
66 | |
67 =head1 SEE ALSO | |
68 | |
69 Bio::EnsEMBL::DBFile::FileAdaptor | |
70 | |
71 =cut | |
72 | |
73 | |
74 | |
package Bio::EnsEMBL::DBFile::CollectionAdaptor;

use strict;
use warnings;

use Bio::EnsEMBL::DBFile::FileAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);

# Single inheritance from the generic file adaptor. The methods in this
# package call $self->get_filehandle, which is not defined here and is
# therefore expected to be provided through this base class.
our @ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor);
84 | |
85 | |
=head2 initialise_filehandle

  Arg[1]     : string - filepath
  Example    : $self->initialise_filehandle($filepath);
  Description: Initialises the filehandle for use, in this case reads
               the index (seq_region offsets) from the current position of
               the cached filehandle and stores it in
               $self->{file_cache}{$filepath}{off_sets}.
               The cached filehandle is set to undef if the index size or
               the index itself cannot be read, or if the index size is not
               a whole number of key/offset pairs.
  Returntype : hashref - seq_region_id => byte offset pairs as cached for
               $filepath. This call does not populate it when reading or
               validating the index fails.
  Exceptions : warns if read fails or the index size is invalid
  Caller     : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle
  Status     : at risk

=cut

sub initialise_filehandle {
  my ($self, $filepath) = @_;

  # Make sure the per-file cache entry exists so that every access below
  # works on the hash that is actually attached to $self.
  my $cache = ($self->{file_cache}{$filepath} ||= {});
  my $fh    = $cache->{filehandle};

  ### INDEX FORMAT ###
  # The first 2 bytes hold the index size in bytes, packed as 'v'. The size
  # does not include these 2 bytes.
  # The rest of the index is a list of seq_region_id ('v', 2 bytes) and
  # offset ('V', 4 bytes) pairs.
  # Offsets include the length of the complete index block.
  #
  # sysread is used because it counts in bytes and bypasses buffered I/O;
  # it must not be mixed with buffered read/seek on the same handle.
  my $size_bytes = 2;    # width of the 'v' encoded index size
  my $pair_bytes = 6;    # 'v' key (2 bytes) + 'V' offset (4 bytes)

  my $packed_size;
  my $read_bytes = sysread($fh, $packed_size, $size_bytes);

  # An undefined count is a read error, a short count means end of file.
  if (!defined $read_bytes || $read_bytes != $size_bytes) {
    warn "Failed to read index size from $filepath\n$!";

    # Drop the filehandle as it is useless/unsafe to retry
    undef $cache->{filehandle};
    return $cache->{off_sets};
  }

  my ($index_size) = unpack('v', $packed_size);

  # A size which is not a whole number of pairs would give a fractional
  # repeat count in the unpack template below and a corrupt offset index.
  if ($index_size % $pair_bytes) {
    warn "Invalid index size ($index_size bytes) in $filepath: "
       . "not a multiple of $pair_bytes\n";
    undef $cache->{filehandle};
    return $cache->{off_sets};
  }

  my $index;
  $read_bytes = sysread($fh, $index, $index_size);    # Now read index proper

  if (!defined $read_bytes || $read_bytes != $index_size) {
    warn "Failed to read index from $filepath\n$!";

    # Drop the filehandle as it is useless/unsafe to retry
    undef $cache->{filehandle};
    return $cache->{off_sets};
  }

  # Number of key-value pairs => index size / size of one pair
  my $num_keys     = $index_size / $pair_bytes;
  my %offset_index = unpack('(vV)' . $num_keys, $index);
  $cache->{off_sets} = \%offset_index;

  return $cache->{off_sets};
}
153 | |
154 | |
=head2 read_collection_blob

  Arg[1]     : string - filepath
  Arg[2]     : int - seq_region_id
  Arg[3]     : int - seq_region offset. The byte offset required to
               locate the required start position
  Arg[4]     : int - byte length to read
  Example    : my $blob_substr = $self->read_collection_blob($filepath,
                                                             $sr_key,
                                                             $sr_offset,
                                                             $byte_length);
  Description: Reads bytes from file given a seq_region_key, byte offset and byte length.
               Sets the cached filehandle to undef if the read errors or
               returns only part of the requested bytes. The filehandle is
               kept if the seek fails or the read hits end of file straight
               away, as the requested region may simply be out of range.
  Returntype : string - packed binary data.
               undef if no filehandle is available, the seq_region_id is
               not in the index, the seek fails, or the read errors or is
               incomplete. Empty string if end of file was encountered
               before any byte could be read.
  Exceptions : warns if seek or read errors
  Caller     : general e.g. fetch_from_file_by_Slice_ResultSet
  Status     : at risk

=cut

# We could change this to take a Slice, hence we could check
# whether an EOF error is because the slice is out of range
# and undef only if it is in range i.e. the index/file is corrupt
# overkill?
# This is something the Slice API should warn about
# but will still cause undef'd filehandle here
# Index should also contain ends, so we can validate whether the slice is out of range???


sub read_collection_blob {
  my ($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_;

  my $blob_substr;
  my $fh = $self->get_filehandle($filepath, {-binmode => 1});

  return $blob_substr if !defined $fh;

  # Return from query cache here?
  # cache key = "$filepath:$key:$sr_offset:$byte_length"

  # A seq_region which is not part of the index has no data in this file;
  # this is silently reported as undef.
  return $blob_substr
    if !exists $self->{file_cache}{$filepath}{off_sets}{$sr_key};

  # Start of the seq_region block plus the offset within that block
  my $sr_start     = $self->{file_cache}{$filepath}{off_sets}{$sr_key};
  my $total_offset = $sr_start + $sr_offset;

  # 0 (whence) is SEEK_SET. sysseek returns '0 but true' for position 0,
  # so a false value here always means failure.
  if (!sysseek($fh, $total_offset, 0)) {
    warn("Failed to seek to byte $total_offset in $filepath");

    # Don't undef fh here as this valid Slice maybe out of range
    # and we don't want to kill a valid fh
    # i.e. Slice start/end is past end of seq_region
    return $blob_substr;
  }

  my $read_bytes = sysread($fh, $blob_substr, $byte_length);

  return $blob_substr
    if defined $read_bytes && $read_bytes == $byte_length;

  # undef is a read error, 0 is end of file, anything else is a short read
  warn "Failed to read from $filepath\n$!";

  # The defined test matters: undef == 0 is true in numeric context, so
  # without it a genuine read error would be mistaken for end of file and
  # the broken filehandle would stay in the cache.
  if (defined $read_bytes && $read_bytes == 0) {
    # This maybe because the slice is out of range!
    # The API gives no warning about this
    warn "End Of File encountered\n";
    warn "Total offset:\t" . $sr_start
       . " key($sr_key) + $sr_offset = $total_offset\n";

    # add some checks against the theoretical/true length of the file?
  }
  else {
    # Delete fh as it is useless/unsafe to retry
    undef $self->{file_cache}{$filepath}{filehandle};

    # sysread may have left an empty or partial string in the buffer
    undef $blob_substr;
  }

  return $blob_substr;
}
241 | |
242 | |
243 1; |