0
|
1 # $Id: LargePrimarySeq.pm,v 1.27 2002/12/01 00:05:21 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Seq::LargePrimarySeq
|
|
4 #
|
|
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
|
|
6 #
|
|
7 # Copyright Ewan Birney
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # updated to utilize File::Temp - jason 2000-12-12
|
|
12 # POD documentation - main docs before the code
|
|
13
|
|
14 =head1 NAME
|
|
15
|
|
16 Bio::Seq::LargePrimarySeq - PrimarySeq object that stores sequence as
|
|
17 files in the tempdir (as found by File::Temp) or the default method in
|
|
18 Bio::Root::Root
|
|
19
|
|
20 =head1 SYNOPSIS
|
|
21
|
|
22 # normal primary seq usage
|
|
23
|
|
24 =head1 DESCRIPTION
|
|
25
|
|
26 This object stores a sequence as a series of files in a temporary
|
|
27 directory. The aim is to allow someone the ability to store very large
|
|
28 sequences (eg, E<gt> 100MBases) in a file system without running out of memory
|
|
29 (eg, on a 64 MB real memory machine!).
|
|
30
|
|
31 Of course, to actually make use of this functionality, the programs
|
|
32 which use this object B<must> not call $primary_seq-E<gt>seq otherwise the
|
|
33 entire sequence will come out into memory and probably paste your
|
|
34 machine. However, calls $primary_seq-E<gt>subseq(10,100) will cause only
|
|
35 90 characters to be brought into real memory.
|
|
36
|
|
37 =head1 FEEDBACK
|
|
38
|
|
39 =head2 Mailing Lists
|
|
40
|
|
41 User feedback is an integral part of the evolution of this
|
|
42 and other Bioperl modules. Send your comments and suggestions preferably
|
|
43 to one of the Bioperl mailing lists.
|
|
44 Your participation is much appreciated.
|
|
45
|
|
46 bioperl-l@bioperl.org - General discussion
|
|
47 http://bio.perl.org/MailList.html - About the mailing lists
|
|
48
|
|
49 =head2 Reporting Bugs
|
|
50
|
|
51 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
52 the bugs and their resolution. Bug reports can be submitted via email
|
|
53 or the web:
|
|
54
|
|
55 bioperl-bugs@bio.perl.org
|
|
56 http://bugzilla.bioperl.org/
|
|
57
|
|
58 =head1 AUTHOR - Ewan Birney, Jason Stajich
|
|
59
|
|
60 Email birney@ebi.ac.uk
|
|
61 Email jason@bioperl.org
|
|
62
|
|
63 =head1 APPENDIX
|
|
64
|
|
65 The rest of the documentation details each of the object
|
|
66 methods. Internal methods are usually preceded with a _
|
|
67
|
|
68 =cut
|
|
69
|
|
70
|
|
71 # Let the code begin...
|
|
72
|
|
73
|
|
74 package Bio::Seq::LargePrimarySeq;
|
|
75 use vars qw($AUTOLOAD @ISA);
|
|
76 use strict;
|
|
77
|
|
78 use Bio::PrimarySeq;
|
|
79 use Bio::Root::IO;
|
|
80
|
|
81 @ISA = qw(Bio::PrimarySeq Bio::Root::IO);
|
|
82
|
|
83 sub new {
|
|
84 my ($class, %params) = @_;
|
|
85
|
|
86 # don't let PrimarySeq set seq until we have
|
|
87 # opened filehandle
|
|
88
|
|
89 my $seq = $params{'-seq'} || $params{'-SEQ'};
|
|
90 if($seq ) {
|
|
91 delete $params{'-seq'};
|
|
92 delete $params{'-SEQ'};
|
|
93 }
|
|
94 my $self = $class->SUPER::new(%params);
|
|
95 $self->_initialize_io(%params);
|
|
96 my $tempdir = $self->tempdir( CLEANUP => 1);
|
|
97 my ($tfh,$file) = $self->tempfile( DIR => $tempdir );
|
|
98
|
|
99 $tfh && $self->_fh($tfh);
|
|
100 $file && $self->_filename($file);
|
|
101 $self->length(0);
|
|
102 $seq && $self->seq($seq);
|
|
103
|
|
104 return $self;
|
|
105 }
|
|
106
|
|
107
|
|
108 sub length {
|
|
109 my ($obj,$value) = @_;
|
|
110 if( defined $value) {
|
|
111 $obj->{'length'} = $value;
|
|
112 }
|
|
113
|
|
114 return (defined $obj->{'length'}) ? $obj->{'length'} : 0;
|
|
115 }
|
|
116
|
|
117 =head2 seq
|
|
118
|
|
119 Title : seq
|
|
120 Usage :
|
|
121 Function:
|
|
122 Example :
|
|
123 Returns :
|
|
124 Args :
|
|
125
|
|
126
|
|
127 =cut
|
|
128
|
|
129 sub seq {
|
|
130 my ($self, $data) = @_;
|
|
131 if( defined $data ) {
|
|
132 if( $self->length() == 0) {
|
|
133 $self->add_sequence_as_string($data);
|
|
134 } else {
|
|
135 $self->warn("Trying to reset the seq string, cannot do this with a LargePrimarySeq - must allocate a new object");
|
|
136 }
|
|
137 }
|
|
138 return $self->subseq(1,$self->length);
|
|
139 }
|
|
140
|
|
141 =head2 subseq
|
|
142
|
|
143 Title : subseq
|
|
144 Usage :
|
|
145 Function:
|
|
146 Example :
|
|
147 Returns :
|
|
148 Args :
|
|
149
|
|
150
|
|
151 =cut
|
|
152
|
|
153 sub subseq{
|
|
154 my ($self,$start,$end) = @_;
|
|
155 my $string;
|
|
156 my $fh = $self->_fh();
|
|
157
|
|
158 if( ref($start) && $start->isa('Bio::LocationI') ) {
|
|
159 my $loc = $start;
|
|
160 if( $loc->length == 0 ) {
|
|
161 $self->warn("Expect location lengths to be > 0");
|
|
162 return '';
|
|
163 } elsif( $loc->end < $loc->start ) {
|
|
164 # what about circular seqs
|
|
165 $self->warn("Expect location start to come before location end");
|
|
166 }
|
|
167 my $seq = '';
|
|
168 if( $loc->isa('Bio::Location::SplitLocationI') ) {
|
|
169 foreach my $subloc ( $loc->sub_Location ) {
|
|
170 if(! seek($fh,$subloc->start() - 1,0)) {
|
|
171 $self->throw("Unable to seek on file $start:$end $!");
|
|
172 }
|
|
173 my $ret = read($fh, $string, $subloc->length());
|
|
174 if( !defined $ret ) {
|
|
175 $self->throw("Unable to read $start:$end $!");
|
|
176 }
|
|
177 if( $subloc->strand < 0 ) {
|
|
178 $string = Bio::PrimarySeq->new(-seq => $string)->revcom()->seq();
|
|
179 }
|
|
180 $seq .= $string;
|
|
181 }
|
|
182 } else {
|
|
183 if(! seek($fh,$loc->start()-1,0)) {
|
|
184 $self->throw("Unable to seek on file ".$loc->start.":".
|
|
185 $loc->end ." $!");
|
|
186 }
|
|
187 my $ret = read($fh, $string, $loc->length());
|
|
188 if( !defined $ret ) {
|
|
189 $self->throw("Unable to read ".$loc->start.":".
|
|
190 $loc->end ." $!");
|
|
191 }
|
|
192 $seq = $string;
|
|
193 }
|
|
194 if( defined $loc->strand &&
|
|
195 $loc->strand < 0 ) {
|
|
196 $seq = Bio::PrimarySeq->new(-seq => $seq)->revcom()->seq();
|
|
197 }
|
|
198 return $seq;
|
|
199 }
|
|
200 if( $start <= 0 || $end > $self->length ) {
|
|
201 $self->throw("Attempting to get a subseq out of range $start:$end vs ".
|
|
202 $self->length);
|
|
203 }
|
|
204 if( $end < $start ) {
|
|
205 $self->throw("Attempting to subseq with end ($end) less than start ($start). To revcom use the revcom function with trunc");
|
|
206 }
|
|
207
|
|
208 if(! seek($fh,$start-1,0)) {
|
|
209 $self->throw("Unable to seek on file $start:$end $!");
|
|
210 }
|
|
211 my $ret = read($fh, $string, $end-$start+1);
|
|
212 if( !defined $ret ) {
|
|
213 $self->throw("Unable to read $start:$end $!");
|
|
214 }
|
|
215 return $string;
|
|
216 }
|
|
217
|
|
218 =head2 add_sequence_as_string
|
|
219
|
|
220 Title : add_sequence_as_string
|
|
221 Usage : $seq->add_sequence_as_string("CATGAT");
|
|
222 Function: Appends additional residues to an existing LargePrimarySeq object.
|
|
223 This allows one to build up a large sequence without storing
|
|
224 entire object in memory.
|
|
225 Returns : Current length of sequence
|
|
226 Args : string to append
|
|
227
|
|
228 =cut
|
|
229
|
|
230 sub add_sequence_as_string{
|
|
231 my ($self,$str) = @_;
|
|
232 my $len = $self->length + CORE::length($str);
|
|
233 my $fh = $self->_fh();
|
|
234 if(! seek($fh,0,2)) {
|
|
235 $self->throw("Unable to seek end of file: $!");
|
|
236 }
|
|
237 $self->_print($str);
|
|
238 $self->length($len);
|
|
239 }
|
|
240
|
|
241
|
|
242 =head2 _filename
|
|
243
|
|
244 Title : _filename
|
|
245 Usage : $obj->_filename($newval)
|
|
246 Function:
|
|
247 Example :
|
|
248 Returns : value of _filename
|
|
249 Args : newvalue (optional)
|
|
250
|
|
251
|
|
252 =cut
|
|
253
|
|
254 sub _filename{
|
|
255 my ($obj,$value) = @_;
|
|
256 if( defined $value) {
|
|
257 $obj->{'_filename'} = $value;
|
|
258 }
|
|
259 return $obj->{'_filename'};
|
|
260
|
|
261 }
|
|
262 =head2 alphabet
|
|
263
|
|
264 Title : alphabet
|
|
265 Usage : $obj->alphabet($newval)
|
|
266 Function:
|
|
267 Example :
|
|
268 Returns : value of alphabet
|
|
269 Args : newvalue (optional)
|
|
270
|
|
271
|
|
272 =cut
|
|
273
|
|
274 sub alphabet{
|
|
275 my ($self,$value) = @_;
|
|
276 if( defined $value) {
|
|
277 $self->SUPER::alphabet($value);
|
|
278 }
|
|
279 return $self->SUPER::alphabet() || 'dna';
|
|
280
|
|
281 }
|
|
282
|
|
283 sub DESTROY {
|
|
284 my $self = shift;
|
|
285 my $fh = $self->_fh();
|
|
286 close($fh) if( defined $fh );
|
|
287 # this should be handled by Tempfile removal, but we'll unlink anyways.
|
|
288 unlink $self->_filename() if defined $self->_filename() && -e $self->_filename;
|
|
289 $self->SUPER::DESTROY();
|
|
290 }
|
|
291
|
|
292 1;
|