0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::DBFile::CollectionAdaptor
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 For use with a Bio::EnsEMBL::Collector e.g.
|
|
28
|
|
29 package Bio::EnsEMBL::Funcgen::DBSQL::ResultFeatureAdaptor;
|
|
30
|
|
31 @ISA = qw(Bio::EnsEMBL::Funcgen::DBSQL::BaseFeatureAdaptor
|
|
32 Bio::EnsEMBL::Funcgen::Collector::ResultFeature
|
|
33 Bio::EnsEMBL::DBFile::CollectionAdaptor);
|
|
34 #DBSQL and DBFile inheritance here due to dynamic nature of ResultFeatureAdaptor
|
|
35
|
|
36
|
|
37 Fetch wrapper methods access file based data via read_collection_blob:
|
|
38
|
|
39 sub _fetch_from_file_by_Slice_ResultSet{
|
|
40
|
|
41 #define filepath/config
|
|
42
|
|
43 my $packed_scores = $self->read_collection_blob(
|
|
44 $filepath,
|
|
45 $efg_sr_id,
|
|
46 $conf->{$window_size}{'byte_offset'},
|
|
47 $conf->{$window_size}{'byte_length'},
|
|
48 );
|
|
49
|
|
50 #Do unpacking and object creation here
|
|
51
|
|
52 }
|
|
53
|
|
54 =head1 DESCRIPTION
|
|
55
|
|
56 Adaptor for direct collection(.col) file access, which are binary compressed fixed
|
|
57 width format files providing window based values across the genome. Collection files
|
|
58 integrate an index block which contains seq_region byte offset values.
|
|
59
|
|
60 NOTE: By default all collection files are generated and packed using little endian encoding.
|
|
61 Due to the lack of standards of float encoding (wrt endianness) perl packs using the
|
|
62 implicit endianness of the underlying architecture. This means that accessing float
|
|
63 collection files located on a big endian architecture will produce unexpected results.
|
|
64
|
|
65 # endian issues will disappear with knetfile xsubs
|
|
66
|
|
67 =head1 SEE ALSO
|
|
68
|
|
69 Bio::EnsEMBL::DBFile::FileAdaptor
|
|
70
|
|
71 =cut
|
|
72
|
|
73
|
|
74
|
|
75 package Bio::EnsEMBL::DBFile::CollectionAdaptor;
|
|
76
|
|
77 use strict;
|
|
78 use warnings;
|
|
79
|
|
80 use Bio::EnsEMBL::DBFile::FileAdaptor;
|
|
81 use Bio::EnsEMBL::Utils::Exception qw(throw warning deprecate);
|
|
82 use vars qw(@ISA);
|
|
83 @ISA = qw(Bio::EnsEMBL::DBFile::FileAdaptor);
|
|
84
|
|
85
|
|
=head2 initialise_filehandle

  Arg[1]     : string - filepath
  Example    : $self->initialise_filehandle($filepath);
  Description: Initialises the cached filehandle for use, in this case
               reads the index (seq_region offsets) from the current
               position of the filehandle and caches it. If the index
               cannot be read, or its size is not a whole number of
               key/offset pairs, the cached filehandle is set to undef.
  Returntype : hashref - seq_region_id => byte offset pairs. On failure
               returns whatever offsets were already cached for this
               filepath (undef if none).
  Exceptions : warns if read fails or the index size is invalid
  Caller     : Bio::EnsEMBL::DBFile::FileAdaptor::get_filehandle
  Status     : at risk

=cut

sub initialise_filehandle {
  my ($self, $filepath) = @_;

  # Per-file cache entry holding the filehandle and, once read, the index.
  my $cache = ($self->{file_cache}{$filepath} ||= {});
  my $fh    = $cache->{filehandle};

  ### INDEX FORMAT ###
  # First block of the index is the index size in bytes (not including
  # the size block itself), packed as v (unsigned 16 bit, little endian).
  #
  # Rest of the index is a list of sr_id (v, 2 bytes) key and
  # offset (V, 4 bytes, little endian) value pairs.
  # Offsets include the length of the complete index block.
  #
  # sysread is used as it counts in bytes explicitly. sysread must not be
  # mixed with buffered read/seek due to I/O buffering differences.

  my $pair_size = 6;    # bytes per (vV) key/offset pair

  # Read index_size first, encoded as v (2 bytes)
  my $size_block;
  my $read_bytes = sysread($fh, $size_block, 2);

  if (! ((defined $read_bytes) && ($read_bytes == 2))) {
    # ! defined is error, 0 is end of file
    warn "Failed to read index size from $filepath\n$!";

    # Delete fh as it is useless/unsafe to retry
    undef $cache->{filehandle};
    return $cache->{off_sets};
  }

  my ($index_size) = unpack('v', $size_block);

  # Now read the index proper
  my $index;
  $read_bytes = sysread($fh, $index, $index_size);

  if (! ((defined $read_bytes) && ($read_bytes == $index_size))) {
    # ! defined is error, 0 is end of file
    warn "Failed to read index from $filepath\n$!";

    # Delete fh as it is useless/unsafe to retry
    undef $cache->{filehandle};
    return $cache->{off_sets};
  }

  if ($index_size % $pair_size) {
    # A partial pair means the index is corrupt. A fractional repeat count
    # would make unpack die on an invalid template, so fail here in the
    # same way as a failed read instead.
    warn "Index size ($index_size bytes) is not a multiple of $pair_size in $filepath\n";
    undef $cache->{filehandle};
    return $cache->{off_sets};
  }

  # Number of key-value pairs => index size / (key(v 2 bytes) + offset(V 4 bytes))
  my $num_keys     = $index_size / $pair_size;
  my %offset_index = unpack('(vV)' . $num_keys, $index);
  $cache->{off_sets} = \%offset_index;

  return $cache->{off_sets};
}
|
|
153
|
|
154
|
|
=head2 read_collection_blob

  Arg[1]     : string - filepath
  Arg[2]     : int - seq_region_id
  Arg[3]     : int - seq_region offset. The byte offset required to
               locate the required start position
  Arg[4]     : int - byte length to read
  Example    : my $blob_substr = $self->read_collection_blob($filepath,
                                                             $sr_key,
                                                             $sr_offset,
                                                             $byte_length);
  Description: Reads bytes from file given a seq_region_key, byte offset and byte length.
               Sets filehandle to undef if the read errors or returns fewer
               bytes than requested. The filehandle is kept if the seek fails
               or End Of File is hit before any bytes are read, as the
               requested region may simply be out of range.
  Returntype : string - packed binary data. Empty string if End Of File was
               hit before any bytes were read. undef if no filehandle is
               available, the seq_region_id is not in the index, the seek
               fails, or the read errors/returns too few bytes.
  Exceptions : warns if seek or read errors
  Caller     : general e.g. fetch_from_file_by_Slice_ResultSet
  Status     : at risk

=cut

# We could change this to take a Slice, hence we could check
# whether an EOF error is because the slice is out of range
# and undef only if it is in range i.e. the index/file is corrupt
# overkill?
# This is something the Slice API should warn about
# but will still cause undef'd filehandle here
# Index should also contain ends, so we can validate whether the slice is out of range???

sub read_collection_blob {
  my ($self, $filepath, $sr_key, $sr_offset, $byte_length) = @_;

  my $blob_substr;
  my $fh = $self->get_filehandle($filepath, {-binmode => 1});

  return $blob_substr if ! defined $fh;

  # Return from query cache here?
  # cache key = "$filepath:$key:$sr_offset:$byte_length"

  # A seq_region which is not part of the index is silently skipped
  return $blob_substr
    if ! exists $self->{file_cache}{$filepath}{off_sets}{$sr_key};

  # Total offset = start of the seq_region block + offset within the block
  my $sr_start     = $self->{file_cache}{$filepath}{off_sets}{$sr_key};
  my $total_offset = $sr_start + $sr_offset;

  # 0 (whence) is SEEK_SET.
  # sysseek returns '0 but true' for position 0, so a simple truth test is safe
  if (! sysseek($fh, $total_offset, 0)) {
    warn("Failed to seek to byte $total_offset in $filepath");
    # Don't undef fh here as this valid Slice maybe out of range
    # and we don't want to kill a valid fh
    # i.e. Slice start/end is past end of seq_region
    return $blob_substr;
  }

  my $read_bytes = sysread($fh, $blob_substr, $byte_length);

  return $blob_substr
    if (defined $read_bytes) && ($read_bytes == $byte_length);

  # ! defined is error, 0 is end of file
  warn "Failed to read from $filepath\n$!";

  # The defined check is required: an undef (error) return is numerically
  # equal to 0, so without it a real read error would be handled as
  # End Of File and the broken filehandle would stay cached.
  if ((defined $read_bytes) && ($read_bytes == 0)) {
    # This maybe because the slice is out of range!
    # The API gives no warning about this
    warn "End Of File encountered\n";
    warn "Total offset:\t" . $sr_start .
      " key($sr_key) + $sr_offset = $total_offset\n";

    # add some checks against the theoretical/true length of the file?
  }
  else {
    # Read error or short read
    # Delete fh as it is useless/unsafe to retry
    undef $self->{file_cache}{$filepath}{filehandle};
    # Discard whatever sysread left in the buffer
    undef $blob_substr;
  }

  return $blob_substr;
}
|
|
241
|
|
242
|
|
243 1;
|