0
|
1 # $Id: scf.pm,v 1.23 2002/11/01 11:16:25 heikki Exp $
|
|
2 #
|
|
3 # Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved.
|
|
4 # This module is free software; you can redistribute it and/or
|
|
5 # modify it under the same terms as Perl itself.
|
|
6 #
|
|
7 # Copyright Chad Matsalla
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SeqIO::scf - .scf file input/output stream
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 Do not use this module directly. Use it via the Bio::SeqIO class, see
|
|
20 L<Bio::SeqIO> for more information.
|
|
21
|
|
22 =head1 DESCRIPTION
|
|
23
|
|
24 This object can transform .scf files to and from
|
|
25 Bio::Seq::SeqWithQuality objects. Mechanisms are present to retrieve
|
|
26 trace data from scf files.
|
|
27
|
|
28 =head1 FEEDBACK
|
|
29
|
|
30 =head2 Mailing Lists
|
|
31
|
|
32
|
|
33 User feedback is an integral part of the evolution of this and other
|
|
34 Bioperl modules. Send your comments and suggestions preferably to one
|
|
35 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
36
|
|
37 bioperl-l@bioperl.org - General discussion
|
|
38 http://www.bioperl.org/MailList.shtml - About the mailing lists
|
|
39
|
|
40 =head2 Reporting Bugs
|
|
41
|
|
42 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
43 the bugs and their resolution. Bug reports can be submitted via email
|
|
44 or the web:
|
|
45
|
|
46 bioperl-bugs@bio.perl.org
|
|
47 http://bugzilla.bioperl.org/
|
|
48
|
|
49 =head1 AUTHOR Chad Matsalla
|
|
50
|
|
51 Chad Matsalla
|
|
52 bioinformatics@dieselwurks.com
|
|
53
|
|
54 =head1 CONTRIBUTORS
|
|
55
|
|
56 Jason Stajich, jason@bioperl.org
|
|
57 Tony Cox, avc@sanger.ac.uk
|
|
58 Heikki Lehvaslaiho, heikki@ebi.ac.uk
|
|
59
|
|
60 =head1 APPENDIX
|
|
61
|
|
62 The rest of the documentation details each of the object
|
|
63 methods. Internal methods are usually preceded with a _
|
|
64
|
|
65 =cut
|
|
66
|
|
67 # Let the code begin...
|
|
68
|
|
69 package Bio::SeqIO::scf;
|
|
70 use vars qw(@ISA $DEFAULT_QUALITY);
|
|
71 use strict;
|
|
72 use Bio::SeqIO;
|
|
73 use Bio::Seq::SeqFactory;
|
|
74 require 'dumpvar.pl';
|
|
75
|
|
76 BEGIN {
|
|
77 $DEFAULT_QUALITY= 10;
|
|
78 }
|
|
79
|
|
80 @ISA = qw(Bio::SeqIO);
|
|
81
|
|
82 sub _initialize {
|
|
83 my($self,@args) = @_;
|
|
84 $self->SUPER::_initialize(@args);
|
|
85 if( ! defined $self->sequence_factory ) {
|
|
86 $self->sequence_factory(new Bio::Seq::SeqFactory
|
|
87 (-verbose => $self->verbose(),
|
|
88 -type => 'Bio::Seq::SeqWithQuality'));
|
|
89 }
|
|
90 }
|
|
91
|
|
92 =head2 next_seq()
|
|
93
|
|
94 Title : next_seq()
|
|
95 Usage : $scf = $stream->next_seq()
|
|
96 Function: returns the next scf sequence in the stream
|
|
97 Returns : Bio::Seq::SeqWithQuality object
|
|
98 Args : NONE
|
|
99 Notes : Fills the interface specification for SeqIO.
|
|
100 The SCF specification does not provide for having more then
|
|
101 one sequence in a given scf. So once the filehandle has been open
|
|
102 and passed to SeqIO don't expect to run this function more then
|
|
103 once on a given scf unless you embraced and extended the SCF
|
|
104 standard. (But that's just C R A Z Y talk, isn't it.)
|
|
105
|
|
106 =cut
|
|
107
|
|
108 #'
|
|
109 sub next_seq {
|
|
110 my ($self) = @_;
|
|
111 my ($seq, $seqc, $fh, $buffer, $offset, $length, $read_bytes, @read,
|
|
112 %names);
|
|
113 # set up a filehandle to read in the scf
|
|
114 $fh = $self->_filehandle();
|
|
115 unless ($fh) { # simulate the <> function
|
|
116 if ( !fileno(ARGV) or eof(ARGV) ) {
|
|
117 return unless my $ARGV = shift;
|
|
118 open(ARGV,$ARGV) or
|
|
119 $self->throw("Could not open $ARGV for SCF stream reading $!");
|
|
120 }
|
|
121 $fh = \*ARGV;
|
|
122 }
|
|
123 binmode $fh; # for the Win32/Mac crowds
|
|
124 return unless read $fh, $buffer, 128; # no exception; probably end of file
|
|
125 # the first thing to do is parse the header. This is common
|
|
126 # among all versions of scf.
|
|
127 $self->_set_header($buffer);
|
|
128 # the rest of the the information is different between the
|
|
129 # the different versions of scf.
|
|
130 my $byte = "n";
|
|
131 if ($self->{'version'} lt "3.00") {
|
|
132 # first gather the trace information
|
|
133 $length = $self->{'samples'}*$self->{sample_size}*4;
|
|
134 $buffer = $self->read_from_buffer($fh,$buffer,$length);
|
|
135 if ($self->{sample_size} == 1) {
|
|
136 $byte = "c";
|
|
137 }
|
|
138 @read = unpack "${byte}${length}",$buffer;
|
|
139 # these traces need to be split
|
|
140 $self->_set_v2_traces(\@read);
|
|
141 # now go and get the base information
|
|
142 $offset = $self->{bases_offset};
|
|
143 $length = ($self->{bases} * 12);
|
|
144 seek $fh,$offset,0;
|
|
145 $buffer = $self->read_from_buffer($fh,$buffer,$length);
|
|
146 # now distill the information into its fractions.
|
|
147 $self->_set_v2_bases($buffer);
|
|
148 } else {
|
|
149 my $transformed_read;
|
|
150 foreach (qw(A C G T)) {
|
|
151 $length = $self->{'samples'}*$self->{sample_size};
|
|
152 $buffer = $self->read_from_buffer($fh,$buffer,$length);
|
|
153 if ($self->{sample_size} == 1) {
|
|
154 $byte = "c";
|
|
155 }
|
|
156 @read = unpack "${byte}${length}",$buffer;
|
|
157 # this little spurt of nonsense is because
|
|
158 # the trace values are given in the binary
|
|
159 # file as unsigned shorts but they really
|
|
160 # are signed. 30000 is an arbitrary number
|
|
161 # (will there be any traces with a given
|
|
162 # point greater then 30000? I hope not.
|
|
163 # once the read is read, it must be changed
|
|
164 # from relative
|
|
165 for (my $element=0; $element < scalar(@read); $element++) {
|
|
166 if ($read[$element] > 30000) {
|
|
167 $read[$element] = $read[$element] - 65536;
|
|
168 }
|
|
169 }
|
|
170 $transformed_read = $self->_delta(\@read,"backward");
|
|
171 $self->{'traces'}->{$_} = join(' ',@{$transformed_read});
|
|
172 }
|
|
173 # now go and get the peak index information
|
|
174 $offset = $self->{bases_offset};
|
|
175 $length = ($self->{bases} * 4);
|
|
176 seek $fh,$offset,0;
|
|
177 $buffer = $self->read_from_buffer($fh,$buffer,$length);
|
|
178 $self->_set_v3_peak_indices($buffer);
|
|
179 # now go and get the accuracy information
|
|
180 $buffer = $self->read_from_buffer($fh,$buffer,$length);
|
|
181 $self->_set_v3_base_accuracies($buffer);
|
|
182 # OK, now go and get the base information.
|
|
183 $length = $self->{bases};
|
|
184 $buffer = $self->read_from_buffer($fh,$buffer,$length);
|
|
185 $self->{'parsed'}->{'sequence'} = unpack("a$length",$buffer);
|
|
186 # now, finally, extract the calls from the accuracy information.
|
|
187 $self->_set_v3_quality($self);
|
|
188 }
|
|
189 # now go and get the comment information
|
|
190 $offset = $self->{comments_offset};
|
|
191 seek $fh,$offset,0;
|
|
192 $length = $self->{comment_size};
|
|
193 $buffer = $self->read_from_buffer($fh,$buffer,$length);
|
|
194 $self->_set_comments($buffer);
|
|
195 return $self->sequence_factory->create
|
|
196 (-seq => $self->{'parsed'}->{'sequence'},
|
|
197 -qual => $self->{'parsed'}->{'qualities'},
|
|
198 -id => $self->{'comments'}->{'NAME'}
|
|
199 );
|
|
200 }
|
|
201
|
|
202
|
|
203 =head2 _set_v3_quality()
|
|
204
|
|
205 Title : _set_v3_quality()
|
|
206 Usage : $self->_set_v3_quality()
|
|
207 Function: Set the base qualities from version3 scf's
|
|
208 Returns : Nothing. Alters $self.
|
|
209 Args : None.
|
|
210 Notes :
|
|
211
|
|
212 =cut
|
|
213
|
|
214 #'
|
|
215 sub _set_v3_quality {
|
|
216 my $self = shift;
|
|
217 my @bases = split//,$self->{'parsed'}->{'sequence'};
|
|
218 my (@qualities,$currbase,$currqual,$counter);
|
|
219 for ($counter=0; $counter <= $#bases ; $counter++) {
|
|
220 $currbase = uc($bases[$counter]);
|
|
221 if ($currbase eq "A") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'A'}->[$counter]; }
|
|
222 elsif ($currbase eq "C") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'C'}->[$counter]; }
|
|
223 elsif ($currbase eq "G") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'G'}->[$counter]; }
|
|
224 elsif ($currbase eq "T") { $currqual = $self->{'parsed'}->{'base_accuracies'}->{'T'}->[$counter]; }
|
|
225 else { $currqual = "unknown"; }
|
|
226 push @qualities,$currqual;
|
|
227 }
|
|
228 $self->{'parsed'}->{'qualities'} = \@qualities;
|
|
229 }
|
|
230
|
|
231 =head2 _set_v3_peak_indices($buffer)
|
|
232
|
|
233 Title : _set_v3_peak_indices($buffer)
|
|
234 Usage : $self->_set_v3_peak_indices($buffer);
|
|
235 Function: Unpacks the base accuracies for version3 scf
|
|
236 Returns : Nothing. Alters $self
|
|
237 Args : A scalar containing binary data.
|
|
238 Notes :
|
|
239
|
|
240 =cut
|
|
241
|
|
242 sub _set_v3_peak_indices {
|
|
243 my ($self,$buffer) = @_;
|
|
244 my $length = length($buffer);
|
|
245 my ($offset,@read,@positions);
|
|
246 @read = unpack "N$length",$buffer;
|
|
247 $self->{'parsed'}->{'peak_indices'} = join(' ',@read);
|
|
248 }
|
|
249
|
|
250 =head2 _set_v3_base_accuracies($buffer)
|
|
251
|
|
252 Title : _set_v3_base_accuracies($buffer)
|
|
253 Usage : $self->_set_v3_base_accuracies($buffer)
|
|
254 Function: Set the base accuracies for version 3 scf's
|
|
255 Returns : Nothing. Alters $self.
|
|
256 Args : A scalar containing binary data.
|
|
257 Notes :
|
|
258
|
|
259 =cut
|
|
260
|
|
261 #'
|
|
262 sub _set_v3_base_accuracies {
|
|
263 my ($self,$buffer) = @_;
|
|
264 my $length = length($buffer);
|
|
265 my $qlength = $length/4;
|
|
266 my $offset = 0;
|
|
267 my (@qualities,@sorter,$counter,$round,$last_base);
|
|
268 foreach (qw(A C G T)) {
|
|
269 my @read;
|
|
270 $last_base = $offset + $qlength;
|
|
271 for (;$offset < $last_base; $offset += $qlength) {
|
|
272 @read = unpack "c$qlength", substr($buffer,$offset,$qlength);
|
|
273 $self->{'parsed'}->{'base_accuracies'}->{"$_"} = \@read;
|
|
274 }
|
|
275 }
|
|
276 }
|
|
277
|
|
278
|
|
279 =head2 _set_comments($buffer)
|
|
280
|
|
281 Title : _set_comments($buffer)
|
|
282 Usage : $self->_set_comments($buffer);
|
|
283 Function: Gather the comments section from the scf and parse it into its
|
|
284 components.
|
|
285 Returns : Nothing. Modifies $self.
|
|
286 Args : The buffer. It is expected that the buffer contains a binary
|
|
287 string for the comments section of an scf file according to
|
|
288 the scf file specifications.
|
|
289 Notes : None. Works like Jello.
|
|
290
|
|
291 =cut
|
|
292
|
|
293 sub _set_comments {
|
|
294 my ($self,$buffer) = @_;
|
|
295 my $size = length($buffer);
|
|
296 my $comments_retrieved = unpack "a$size",$buffer;
|
|
297 $comments_retrieved =~ s/\0//;
|
|
298 my @comments_split = split/\n/,$comments_retrieved;
|
|
299 if (@comments_split) {
|
|
300 foreach (@comments_split) {
|
|
301 /(\w+)=(.*)/;
|
|
302 if ($1 && $2) {
|
|
303 $self->{'comments'}->{$1} = $2;
|
|
304 }
|
|
305 }
|
|
306 }
|
|
307 return;
|
|
308 }
|
|
309
|
|
310 =head2 _set_header()
|
|
311
|
|
312 Title : _set_header($buffer)
|
|
313 Usage : $self->_set_header($buffer);
|
|
314 Function: Gather the header section from the scf and parse it into its
|
|
315 components.
|
|
316 Returns : Nothing. Modifies $self.
|
|
317 Args : The buffer. It is expected that the buffer contains a binary
|
|
318 string for the header section of an scf file according to the
|
|
319 scf file specifications.
|
|
320 Notes : None.
|
|
321
|
|
322 =cut
|
|
323
|
|
324 sub _set_header {
|
|
325 my ($self,$buffer) = @_;
|
|
326 ($self->{'scf'},
|
|
327 $self->{'samples'},
|
|
328 $self->{'sample_offset'},
|
|
329 $self->{'bases'},
|
|
330 $self->{'bases_left_clip'},
|
|
331 $self->{'bases_right_clip'},
|
|
332 $self->{'bases_offset'},
|
|
333 $self->{'comment_size'},
|
|
334 $self->{'comments_offset'},
|
|
335 $self->{'version'},
|
|
336 $self->{'sample_size'},
|
|
337 $self->{'code_set'},
|
|
338 @{$self->{'header_spare'}} ) = unpack "a4 NNNNNNNN a4 NN N20", $buffer;
|
|
339 return;
|
|
340
|
|
341 }
|
|
342
|
|
343 =head2 _set_v2_bases($buffer)
|
|
344
|
|
345 Title : _set_v2_bases($buffer)
|
|
346 Usage : $self->_set_v2_bases($buffer);
|
|
347 Function: Gather the bases section from the scf and parse it into its
|
|
348 components.
|
|
349 Returns : Nothing. Modifies $self.
|
|
350 Args : The buffer. It is expected that the buffer contains a binary
|
|
351 string for the bases section of an scf file according to the
|
|
352 scf file specifications.
|
|
353 Notes : None.
|
|
354
|
|
355 =cut
|
|
356
|
|
357 sub _set_v2_bases {
|
|
358 my ($self,$buffer) = @_;
|
|
359 my $length = length($buffer);
|
|
360 my ($offset2,$currbuff,$currbase,$currqual,$sequence,@qualities,@indices);
|
|
361 my @read;
|
|
362 for ($offset2=0;$offset2<$length;$offset2+=12) {
|
|
363 @read = unpack "N C C C C a C3", substr($buffer,$offset2,$length);
|
|
364 push @indices,$read[0];
|
|
365 $currbase = uc($read[5]);
|
|
366 if ($currbase eq "A") { $currqual = $read[1]; }
|
|
367 elsif ($currbase eq "C") { $currqual = $read[2]; }
|
|
368 elsif ($currbase eq "G") { $currqual = $read[3]; }
|
|
369 elsif ($currbase eq "T") { $currqual = $read[4]; }
|
|
370 else { $currqual = "UNKNOWN"; }
|
|
371 $sequence .= $currbase;
|
|
372 push @qualities,$currqual;
|
|
373 }
|
|
374 unless (!@indices) {
|
|
375 $self->{'parsed'}->{'peak_indices'} = join(' ',@indices);
|
|
376 }
|
|
377 $self->{'parsed'}->{'sequence'} = $sequence;
|
|
378 unless (!@qualities) {
|
|
379 $self->{'parsed'}->{'qualities'} = join(' ',@qualities);
|
|
380 }
|
|
381 }
|
|
382
|
|
383 =head2 _set_v2_traces(\@traces_array)
|
|
384
|
|
385 Title : _set_v2_traces(\@traces_array)
|
|
386 Usage : $self->_set_v2_traces(\@traces_array);
|
|
387 Function: Parses an scf Version2 trace array into its base components.
|
|
388 Returns : Nothing. Modifies $self.
|
|
389 Args : A reference to an array of the unpacked traces section of an
|
|
390 scf version2 file.
|
|
391
|
|
392 =cut
|
|
393
|
|
394 sub _set_v2_traces {
|
|
395 my ($self,$rread) = @_;
|
|
396 my @read = @$rread;
|
|
397 my $array = 0;
|
|
398 for (my $offset2 = 0; $offset2< scalar(@read); $offset2+=4) {
|
|
399 if ($array) {
|
|
400 push @{$self->{'traces'}->{'A'}},$read[$offset2];
|
|
401 push @{$self->{'traces'}->{'C'}},$read[$offset2+1];
|
|
402 push @{$self->{'traces'}->{'G'}},$read[$offset2+3];
|
|
403 push @{$self->{'traces'}->{'T'}},$read[$offset2+2];
|
|
404 } else {
|
|
405 $self->{'traces'}->{'A'} .= " ".$read[$offset2];
|
|
406 $self->{'traces'}->{'C'} .= " ".$read[$offset2+1];
|
|
407 $self->{'traces'}->{'G'} .= " ".$read[$offset2+2];
|
|
408 $self->{'traces'}->{'T'} .= " ".$read[$offset2+3];
|
|
409 }
|
|
410 }
|
|
411 return;
|
|
412 }
|
|
413
|
|
414 =head2 get_trace($base_channel)
|
|
415
|
|
416 Title : get_trace($base_channel)
|
|
417 Usage : @a_trace = @{$obj->get_trace("A")};
|
|
418 Function: Return the trace data for the given base.
|
|
419 Returns : A reference to an array containing the trace data for the
|
|
420 given base.
|
|
421 Args : A,C,G, or T. Any other input throws.
|
|
422 Notes :
|
|
423
|
|
424 =cut
|
|
425
|
|
426 sub get_trace {
|
|
427 my ($self,$base_channel) = @_;
|
|
428 $base_channel =~ tr/a-z/A-Z/;
|
|
429 if ($base_channel !~ /A|T|G|C/) {
|
|
430 $self->throw("You tried to ask for a base channel that wasn't A,T,G, or C. Ask for one of those next time.");
|
|
431 } elsif ($base_channel) {
|
|
432 my @temp = split(' ',$self->{'traces'}->{$base_channel});
|
|
433 return \@temp;
|
|
434 }
|
|
435 }
|
|
436
|
|
437 =head2 get_peak_indices()
|
|
438
|
|
439 Title : get_peak_indices()
|
|
440 Usage : @a_trace = @{$obj->get_peak_indices()};
|
|
441 Function: Return the peak indices for this scf.
|
|
442 Returns : A reference to an array containing the peak indices for this scf.
|
|
443 Args : None.
|
|
444 Notes :
|
|
445
|
|
446 =cut
|
|
447
|
|
448 sub get_peak_indices {
|
|
449 my ($self) = shift;
|
|
450 my @temp = split(' ',$self->{'parsed'}->{'peak_indices'});
|
|
451 return \@temp;
|
|
452 }
|
|
453
|
|
454
|
|
455 =head2 get_header()
|
|
456
|
|
457 Title : get_header()
|
|
458 Usage : %header = %{$obj->get_header()};
|
|
459 Function: Return the header for this scf.
|
|
460 Returns : A reference to a hash containing the header for this scf.
|
|
461 Args : None.
|
|
462 Notes :
|
|
463
|
|
464 =cut
|
|
465
|
|
466 sub get_header {
|
|
467 my ($self) = shift;
|
|
468 my %header;
|
|
469 foreach (qw(scf samples sample_offset bases bases_left_clip
|
|
470 bases_right_clip bases_offset comment_size comments_offset
|
|
471 version sample_size code_set peak_indices)) {
|
|
472 $header{"$_"} = $self->{"$_"};
|
|
473 }
|
|
474 return \%header;
|
|
475 }
|
|
476
|
|
477 =head2 _dump_traces_incoming($transformed)
|
|
478
|
|
479 Title : _dump_traces_incoming("transformed")
|
|
480 Usage : &_dump_traces($ra,$rc,$rg,$rt);
|
|
481 Function: Used in debugging. Prints all traces one beside each other.
|
|
482 Returns : Nothing.
|
|
483 Args : References to the arrays containing the traces for A,C,G,T.
|
|
484 Notes : Beats using dumpValue, I'll tell ya. Much better then using
|
|
485 join' ' too.
|
|
486 - if a scalar is included as an argument (any scalar), this
|
|
487 procedure will dump the _delta'd trace. If you don't know what
|
|
488 that means you should not be using this.
|
|
489
|
|
490 =cut
|
|
491
|
|
492 #'
|
|
493 sub _dump_traces_incoming {
|
|
494 my ($self) = @_;
|
|
495 my (@sA,@sT,@sG,@sC);
|
|
496 # @sA = @{$self->{'traces'}->{'A'}};
|
|
497 # @sC = @{$self->{'traces'}->{'C'}};
|
|
498 # @sG = @{$self->{'traces'}->{'G'}};
|
|
499 # @sT = @{$self->{'traces'}->{'T'}};
|
|
500 @sA = @{$self->get_trace('A')};
|
|
501 @sC = @{$self->get_trace('C')};
|
|
502 @sG = @{$self->get_trace('G')};
|
|
503 @sT = @{$self->get_trace('t')};
|
|
504 print ("Count\ta\tc\tg\tt\n");
|
|
505 for (my $curr=0; $curr < scalar(@sG); $curr++) {
|
|
506 print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n");
|
|
507 }
|
|
508 return;
|
|
509 }
|
|
510
|
|
511 =head2 _dump_traces_outgoing($transformed)
|
|
512
|
|
513 Title : _dump_traces_outgoing("transformed")
|
|
514 Usage : &_dump_traces_outgoing(($ra,$rc,$rg,$rt);
|
|
515 Function: Used in debugging. Prints all traces one beside each other.
|
|
516 Returns : Nothing.
|
|
517 Args : References to the arrays containing the traces for A,C,G,T.
|
|
518 Notes : Beats using dumpValue, I\'ll tell ya. Much better then using
|
|
519 join' ' too.
|
|
520 - if a scalar is included as an argument (any scalar), this
|
|
521 procedur will dump the _delta'd trace. If you don't know what
|
|
522 that means you should not be using this.
|
|
523
|
|
524 =cut
|
|
525
|
|
526 sub _dump_traces_outgoing {
|
|
527 my ($self,$transformed) = @_;
|
|
528 my (@sA,@sT,@sG,@sC);
|
|
529 if ($transformed) {
|
|
530 @sA = @{$self->{'text'}->{'t_samples_a'}};
|
|
531 @sC = @{$self->{'text'}->{'t_samples_c'}};
|
|
532 @sG = @{$self->{'text'}->{'t_samples_g'}};
|
|
533 @sT = @{$self->{'text'}->{'t_samples_t'}};
|
|
534 }
|
|
535 else {
|
|
536 @sA = @{$self->{'text'}->{'samples_a'}};
|
|
537 @sC = @{$self->{'text'}->{'samples_c'}};
|
|
538 @sG = @{$self->{'text'}->{'samples_g'}};
|
|
539 @sT = @{$self->{'text'}->{'samples_t'}};
|
|
540 }
|
|
541 print ("Count\ta\tc\tg\tt\n");
|
|
542 for (my $curr=0; $curr < scalar(@sG); $curr++) {
|
|
543 print("$curr\t$sA[$curr]\t$sC[$curr]\t$sG[$curr]\t$sT[$curr]\n");
|
|
544 }
|
|
545 return;
|
|
546 }
|
|
547
|
|
548 =head2 write_seq
|
|
549
|
|
550 Title : write_seq(-SeqWithQuality => $swq, <comments>)
|
|
551 Usage : $obj->write_seq( -SeqWithQuality => $swq,
|
|
552 -version => 2,
|
|
553 -CONV => "Bioperl-Chads Mighty SCF writer.");
|
|
554 Function: Write out an scf.
|
|
555 Returns : Nothing.
|
|
556 Args : Requires: a reference to a SeqWithQuality object to form the
|
|
557 basis for the scf.
|
|
558 if -version is provided, it should be "2" or "3". A SCF of that
|
|
559 version will be written.
|
|
560 Any other arguments are assumed to be comments and are put into
|
|
561 the comments section of the scf. Read the specifications for scf
|
|
562 to decide what might be good to put in here.
|
|
563
|
|
564 Notes :
|
|
565 Someday: (All of this stuff is easy easy easy I just don't have
|
|
566 the requirement or the time.)
|
|
567 - Change the peak scaling factor?
|
|
568 - Change the width of the peak?
|
|
569 - Change the overlap between peaks?
|
|
570
|
|
571 =cut
|
|
572
|
|
573 #'
|
|
574 sub write_seq {
|
|
575 my ($self,%args) = @_;
|
|
576 my %comments;
|
|
577 my ($label,$arg);
|
|
578
|
|
579 my ($swq) = $self->_rearrange([qw(SEQWITHQUALITY)], %args);
|
|
580 unless (ref($swq) eq "Bio::Seq::SeqWithQuality") {
|
|
581 $self->throw("You must pass a Bio::Seq::SeqWithQuality object to write_seq as a parameter named \"SeqWithQuality\"");
|
|
582 }
|
|
583 # verify that there is some sequence or some qualities
|
|
584 # If the $swq with quality has no qualities, set all qualities to 0.
|
|
585 # If the $swq has no sequence, set the sequence to N\'s.
|
|
586 $self->_fill_missing_data($swq);
|
|
587
|
|
588 # all of the rest of the arguments are comments for the scf
|
|
589 foreach $arg (sort keys %args) {
|
|
590 next if ($arg =~ /SeqWithQuality/i);
|
|
591 ($label = $arg) =~ s/^\-//;
|
|
592 $comments{$label} = $args{$arg};
|
|
593 }
|
|
594 if (!$comments{'NAME'}) { $comments{'NAME'} = $swq->id(); }
|
|
595 # HA! Bwahahahaha.
|
|
596 $comments{'CONV'} = "Bioperl-Chads Mighty SCF writer." unless defined $comments{'CONV'};
|
|
597 # now deal with the version of scf they want to write
|
|
598 if ($comments{version}) {
|
|
599 if ($comments{version} != 2 && $comments{version} != 3) {
|
|
600 $self->warn("This module can only write version 2.0 or 3.0 scf's. Writing a version 2.0 scf by default.");
|
|
601 $comments{version} = "2.00";
|
|
602 }
|
|
603 if ($comments{'version'} > 2) {
|
|
604 $comments{'version'} = "3.00";
|
|
605 }
|
|
606 }
|
|
607 else {
|
|
608 $comments{'version'} = "2.00";
|
|
609 }
|
|
610
|
|
611
|
|
612
|
|
613 # set a few things in the header
|
|
614 $self->{'header'}->{'magic'} = ".scf";
|
|
615 $self->{'header'}->{'sample_size'} = "2";
|
|
616 $self->{'header'}->{'bases'} = length($swq->seq());
|
|
617 $self->{'header'}->{'bases_left_clip'} = "0";
|
|
618 $self->{'header'}->{'bases_right_clip'} = "0";
|
|
619 $self->{'header'}->{'version'} = $comments{'version'};
|
|
620 $self->{'header'}->{'sample_size'} = "2";
|
|
621 $self->{'header'}->{'code_set'} = "9";
|
|
622 @{$self->{'header'}->{'spare'}} = qw(0 0 0 0 0 0 0 0 0 0
|
|
623 0 0 0 0 0 0 0 0 0 0);
|
|
624
|
|
625 # create the binary for the comments and file it in $self->{'binaries'}->{'comments'}
|
|
626 $self->_set_binary_comments(\%comments);
|
|
627 # create the binary and the strings for the traces, bases, offsets (if necessary), and accuracies (if necessary)
|
|
628 $self->_set_binary_tracesbases($comments{'version'},$swq->seq(),$swq->qual());
|
|
629
|
|
630 # now set more things in the header
|
|
631 $self->{'header'}->{'samples_offset'} = "128";
|
|
632
|
|
633 my ($b_base_offsets,$b_base_accuracies,$samples_size,$bases_size);
|
|
634 #
|
|
635 # version 2
|
|
636 #
|
|
637 if ($self->{'header'}->{'version'} == 2) {
|
|
638 $samples_size = $self->{'header'}->{'samples'} * 4 *
|
|
639 $self->{'header'}->{'sample_size'};
|
|
640 $bases_size = length($swq->seq()) * 12;
|
|
641 $self->{'header'}->{'bases_offset'} = 128 + length($self->{'binaries'}->{'samples_all'});
|
|
642 $self->{'header'}->{'comments_offset'} = 128 + length($self->{'binaries'}->{'samples_all'}) + length($self->{'binaries'}->{'v2_bases'});
|
|
643 $self->{'header'}->{'comments_size'} = length($self->{'binaries'}->{'comments'});
|
|
644 $self->{'header'}->{'private_size'} = "0";
|
|
645 $self->{'header'}->{'private_offset'} = 128 + $samples_size +
|
|
646 $bases_size + $self->{'header'}->{'comments_size'};
|
|
647 }
|
|
648 else {
|
|
649 $self->{'header'}->{'bases_offset'} = 128 + length($self->{'binaries'}->{'samples_all'});
|
|
650 $self->{'header'}->{'comments_size'} = length($self->{'binaries'}->{'comments'});
|
|
651 # this is:
|
|
652 # bases_offset + base_offsets + accuracies + called_bases + reserved
|
|
653 $self->{'header'}->{'comments_offset'} = $self->{'header'}->{'bases_offset'} + 4*$self->{header}->{'bases'} + 4*$self->{header}->{'bases'} + $self->{header}->{'bases'} + 3*$self->{header}->{'bases'};
|
|
654 $self->{'header'}->{'private_size'} = "0";
|
|
655 $self->{'header'}->{'private_offset'} = $self->{'header'}->{'comments_offset'} + $self->{'header'}->{'comments_size'};
|
|
656 }
|
|
657
|
|
658 $self->_set_binary_header();
|
|
659
|
|
660 # should something better be done rather then returning after
|
|
661 # writing? I don't do any exception trapping here
|
|
662 if ($comments{'version'} == 2) {
|
|
663 # print ("Lengths:\n");
|
|
664 # print("Header : ".length($self->{'binaries'}->{'header'})."\n");
|
|
665 # print("Traces : ".length($self->{'binaries'}->{'samples_all'})."\n");
|
|
666 # print("Bases : ".length($self->{'binaries'}->{'v2_bases'})."\n");
|
|
667 # print("Comments: ".length($self->{'binaries'}->{'comments'})."\n");
|
|
668 $self->_print ($self->{'binaries'}->{'header'}) or return;
|
|
669 $self->_print ($self->{'binaries'}->{'samples_all'}) or return;
|
|
670 $self->_print ($self->{'binaries'}->{'v2_bases'}) or return;
|
|
671 $self->_print ($self->{'binaries'}->{'comments'}) or return;
|
|
672 }
|
|
673 elsif ($comments{'version'} ==3) {
|
|
674 # print ("Lengths:\n");
|
|
675 # print("Header : ".length($self->{'binaries'}->{'header'})."\n");
|
|
676 # print("Traces : ".length($self->{'binaries'}->{'samples_all'})."\n");
|
|
677 # print("Offsets : ".length($self->{'binaries'}->{'v3_peak_offsets'})."\n");
|
|
678 # print("Accuracy: ".length($self->{'binaries'}->{'v3_accuracies_all'})."\n");
|
|
679 # print("Bases : ".length($self->{'binaries'}->{'v3_called_bases'})."\n");
|
|
680 # print("Reserved: ".length($self->{'binaries'}->{'v3_reserved'})."\n");
|
|
681 # print("Comments: ".length($self->{'binaries'}->{'comments'})."\n");
|
|
682 $self->{'header'}->{'comments_offset'} =
|
|
683 128+length($self->{'binaries'}->{'samples_all'})+
|
|
684 length($self->{'binaries'}->{'v3_peak_offsets'})+
|
|
685 length($self->{'binaries'}->{'v3_accuracies_all'})+
|
|
686 length($self->{'binaries'}->{'v3_called_bases'})+
|
|
687 length($self->{'binaries'}->{'v3_reserved'});
|
|
688 $self->{'header'}->{'spare'}->[1] =
|
|
689 $self->{'header'}->{'comments_offset'} +
|
|
690 length($self->{'binaries'}->{'comments'});
|
|
691 $self->_set_binary_header();
|
|
692 $self->_print ($self->{'binaries'}->{'header'}) or print("Couldn't write header\n");
|
|
693 $self->_print ($self->{'binaries'}->{'samples_all'}) or print("Couldn't write samples\n");
|
|
694 $self->_print ($self->{'binaries'}->{'v3_peak_offsets'}) or print("Couldn't write peak offsets\n");
|
|
695 $self->_print ($self->{'binaries'}->{'v3_accuracies_all'}) or print("Couldn't write accuracies\n");
|
|
696 $self->_print ($self->{'binaries'}->{'v3_called_bases'}) or print("Couldn't write called_bases\n");
|
|
697 $self->_print ($self->{'binaries'}->{'v3_reserved'}) or print("Couldn't write reserved\n");
|
|
698 $self->_print ($self->{'binaries'}->{'comments'}) or print ("Couldn't write comments\n");
|
|
699 }
|
|
700
|
|
701 # kinda unnecessary, given the close() below, but maybe that'll go
|
|
702 # away someday.
|
|
703 $self->flush if $self->_flush_on_write && defined $self->_fh;
|
|
704
|
|
705 $self->close();
|
|
706 }
|
|
707
|
|
708 =head2 _set_binary_header()
|
|
709
|
|
710 Title : _set_binary_header();
|
|
711 Usage : $self->_set_binary_header();
|
|
712 Function: Provide the binary string that will be used as the header for
|
|
713 a scfv2 document.
|
|
714 Returns : A binary string.
|
|
715 Args : None. Uses the entries in the $self->{'header'} hash. These
|
|
716 are set on construction of the object (hopefully correctly!).
|
|
717 Notes :
|
|
718
|
|
719 =cut
|
|
720
|
|
721 sub _set_binary_header {
|
|
722 my ($self) = shift;
|
|
723 my $binary = pack "a4 NNNNNNNN a4 NN N20",
|
|
724 (
|
|
725 $self->{'header'}->{'magic'},
|
|
726 $self->{'header'}->{'samples'},
|
|
727 $self->{'header'}->{'samples_offset'},
|
|
728 $self->{'header'}->{'bases'},
|
|
729 $self->{'header'}->{'bases_left_clip'},
|
|
730 $self->{'header'}->{'bases_right_clip'},
|
|
731 $self->{'header'}->{'bases_offset'},
|
|
732 $self->{'header'}->{'comments_size'},
|
|
733 $self->{'header'}->{'comments_offset'},
|
|
734 $self->{'header'}->{'version'},
|
|
735 $self->{'header'}->{'sample_size'},
|
|
736 $self->{'header'}->{'code_set'},
|
|
737 @{$self->{'header'}->{'spare'}});
|
|
738 $self->{'binaries'}->{'header'} = $binary;
|
|
739 }
|
|
740
|
|
741 =head2 _set_binary_tracesbases($version,$sequence,$ref_quality)
|
|
742
|
|
743 Title : _set_binary_tracesbases($version,$sequence,$ref_quality)
|
|
744 Usage : $self->_set_binary_tracesbases($version,$sequence,
|
|
745 $ref_quality);
|
|
746 Function: Constructs the trace and base strings for all scfs
|
|
747 Returns : Nothing. Alters self.
|
|
748 Args : $version - "2" or "3"
|
|
749 $sequence - a scalar containing arbitrary sequence data
|
|
750 $ref_quality - a reference to an array containing quality
|
|
751 values
|
|
752 Notes : This is a really complicated thing.
|
|
753
|
|
754 =cut
|
|
755
|
|
756 sub _set_binary_tracesbases {
|
|
757 my ($self,$version,$sequence,$rqual) = @_;
|
|
758 $sequence =~ tr/a-z/A-Z/;
|
|
759 $self->{'info'}->{'sequence'} = $sequence;
|
|
760 $self->{'info'}->{'sequence_length'} = length($sequence);
|
|
761 my @quals = @$rqual;
|
|
762 # build the ramp for the first base.
|
|
763 # a ramp looks like this "1 4 13 29 51 71 80 71 51 29 13 4 1" times the quality score.
|
|
764 # REMEMBER: A C G T
|
|
765 # note to self-> smooth this thing out a bit later
|
|
766 @{$self->{'text'}->{'ramp'}} = qw( 1 4 13 29 51 75 80 75 51 29 13 4 1 );
|
|
767 # the width of the ramp
|
|
768 $self->{'text'}->{'ramp_width'} = scalar(@{$self->{'text'}->{'ramp'}});
|
|
769 # how far should the peaks overlap?
|
|
770 $self->{'text'}->{'ramp_overlap'} = 1;
|
|
771 # where should the peaks be located?
|
|
772 $self->{'text'}->{'peak_at'} = 7;
|
|
773 $self->{'text'}->{'ramp_total_length'} =
|
|
774 $self->{'info'}->{'sequence_length'} * $self->{'text'}->{'ramp_width'}
|
|
775 - $self->{'info'}->{'sequence_length'} * $self->{'text'}->{'ramp_overlap'};
|
|
776 # create some empty arrays
|
|
777 # my (@sam_a,@sam_c,@sam_g,@sam_t,$pos);
|
|
778 my $pos;
|
|
779 my $total_length = $self->{'text'}->{ramp_total_length};
|
|
780 for ($pos=0;$pos<=$total_length;$pos++) {
|
|
781 $self->{'text'}->{'samples_a'}[$pos] = $self->{'text'}->{'samples_c'}[$pos]
|
|
782 = $self->{'text'}->{'samples_g'}[$pos] = $self->{'text'}->{'samples_t'}[$pos] = "0";
|
|
783 }
|
|
784 # $self->_dump_traces();
|
|
785 # now populate them
|
|
786 my ($current_base,$place_base_at,$peak_quality,$ramp_counter,$current_ramp,$ramp_position);
|
|
787 my $sequence_length = $self->{'info'}->{'sequence_length'};
|
|
788 my $half_ramp = int($self->{'text'}->{'ramp_width'}/2);
|
|
789 for ($pos = 0; $pos<$sequence_length;$pos++) {
|
|
790 $current_base = substr($self->{'info'}->{'sequence'},$pos,1);
|
|
791 # where should the peak for this base be placed? Modeled after a mktrace scf
|
|
792 $place_base_at = ($pos * $self->{'text'}->{'ramp_width'}) -
|
|
793 ($pos * $self->{'text'}->{'ramp_overlap'}) -
|
|
794 $half_ramp + $self->{'text'}->{'ramp_width'} - 1;
|
|
795 push @{$self->{'text'}->{'v3_peak_offsets'}},$place_base_at;
|
|
796 $peak_quality = $quals[$pos];
|
|
797 if ($current_base eq "A") {
|
|
798 $ramp_position = $place_base_at - $half_ramp;
|
|
799 for ($current_ramp = 0; $current_ramp < $self->{'text'}->{'ramp_width'}; $current_ramp++) {
|
|
800 $self->{'text'}->{'samples_a'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp];
|
|
801 }
|
|
802 push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,$peak_quality,0,0,0,$current_base,0,0,0);
|
|
803 push @{$self->{'text'}->{'v3_base_accuracy_a'}},$peak_quality;
|
|
804 foreach (qw(g c t)) {
|
|
805 push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0;
|
|
806 }
|
|
807 }
|
|
808 elsif ($current_base eq "C") {
|
|
809 $ramp_position = $place_base_at - $half_ramp;
|
|
810 for ($current_ramp = 0; $current_ramp < $self->{'text'}->{'ramp_width'}; $current_ramp++) {
|
|
811 $self->{'text'}->{'samples_c'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp];
|
|
812 }
|
|
813 push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,0,$peak_quality,0,0,$current_base,0,0,0);
|
|
814 push @{$self->{'text'}->{'v3_base_accuracy_c'}},$peak_quality;
|
|
815 foreach (qw(g a t)) {
|
|
816 push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0;
|
|
817 }
|
|
818 } elsif ($current_base eq "G") {
|
|
819 $ramp_position = $place_base_at - $half_ramp;
|
|
820 for ($current_ramp = 0;
|
|
821 $current_ramp < $self->{'text'}->{'ramp_width'};
|
|
822 $current_ramp++) {
|
|
823 $self->{'text'}->{'samples_g'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp];
|
|
824 }
|
|
825 push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,0,0,$peak_quality,0,$current_base,0,0,0);
|
|
826 push @{$self->{'text'}->{"v3_base_accuracy_g"}},$peak_quality;
|
|
827 foreach (qw(a c t)) {
|
|
828 push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0;
|
|
829 }
|
|
830 }
|
|
831 elsif( $current_base eq "T" ) {
|
|
832 $ramp_position = $place_base_at - $half_ramp;
|
|
833 for ($current_ramp = 0; $current_ramp < $self->{'text'}->{'ramp_width'}; $current_ramp++) {
|
|
834 $self->{'text'}->{'samples_t'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp];
|
|
835 }
|
|
836 push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,0,0,0,$peak_quality,$current_base,0,0,0);
|
|
837 push @{$self->{'text'}->{'v3_base_accuracy_t'}},$peak_quality;
|
|
838 foreach (qw(g c a)) {
|
|
839 push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0;
|
|
840 }
|
|
841 } elsif ($current_base eq "N") {
|
|
842 $ramp_position = $place_base_at - $half_ramp;
|
|
843 for ($current_ramp = 0;
|
|
844 $current_ramp < $self->{'text'}->{'ramp_width'};
|
|
845 $current_ramp++) {
|
|
846 $self->{'text'}->{'samples_a'}[$ramp_position+$current_ramp] = $peak_quality * $self->{'text'}->{'ramp'}[$current_ramp];
|
|
847 }
|
|
848 push @{$self->{'text'}->{'v2_bases'}},($place_base_at+1,$peak_quality,
|
|
849 $peak_quality,$peak_quality,$peak_quality,
|
|
850 $current_base,0,0,0);
|
|
851 foreach (qw(a c g t)) {
|
|
852 push @{$self->{'text'}->{"v3_base_accuracy_$_"}},0;
|
|
853 }
|
|
854 }
|
|
855 else {
|
|
856 # don't print this.
|
|
857 # print ("The current base ($current_base) is not a base. Hmmm.\n");
|
|
858 }
|
|
859 }
|
|
860 foreach (qw(a c g t)) {
|
|
861 pop @{$self->{'text'}->{"samples_$_"}};
|
|
862 }
|
|
863
|
|
864 # set the samples in the header
|
|
865 $self->{'header'}->{'samples'} = scalar(@{$self->{'text'}->{'samples_a'}});
|
|
866
|
|
867 # create the final trace string (this is version dependent)
|
|
868 $self->_make_trace_string($version);
|
|
869 # create the binary for v2 bases
|
|
870 if ($self->{'header'}->{'version'} == 2) {
|
|
871 my ($packstring,@pack_array,$pos2,$tester,@unpacked);
|
|
872 for ($pos = 0; $pos<$sequence_length;$pos++) {
|
|
873 my @pack_array = @{$self->{'text'}->{'v2_bases'}}[$pos*9..$pos*9+8];
|
|
874 $self->{'binaries'}->{'v2_bases'} .= pack "N C C C C a C3",@pack_array;
|
|
875 }
|
|
876 # now create the binary for the traces
|
|
877 my $trace_pack_length = scalar(@{$self->{'text'}->{'samples_all'}});
|
|
878 $self->{'binaries'}->{'samples_all'} .= pack "n$trace_pack_length",@{$self->{'text'}->{'samples_all'}};
|
|
879 }
|
|
880 else {
|
|
881 # now for the version 3 stuff!
|
|
882 # delta the trace data
|
|
883 my @temp;
|
|
884 foreach (qw(a c g t)) {
|
|
885 $self->{'text'}->{"t_samples_$_"} = $self->_delta($self->{'text'}->{"samples_$_"},"forward");
|
|
886 if ($_ eq 'a') {
|
|
887 @temp = @{$self->{'text'}->{"t_samples_a"}};
|
|
888 @{$self->{'text'}->{'samples_all'}} = @{$self->{'text'}->{"t_samples_a"}};
|
|
889 }
|
|
890 else {
|
|
891 push @{$self->{'text'}->{'samples_all'}},@{$self->{'text'}->{"t_samples_$_"}};
|
|
892 }
|
|
893 }
|
|
894 # now create the binary for the traces
|
|
895 my $trace_pack_length = scalar(@{$self->{'text'}->{'samples_all'}});
|
|
896
|
|
897 $self->{'binaries'}->{'samples_all'} .= pack "n$trace_pack_length",@{$self->{'text'}->{'samples_all'}};
|
|
898
|
|
899 # peak offsets
|
|
900 my $length = scalar(@{$self->{'text'}->{'v3_peak_offsets'}});
|
|
901 $self->{'binaries'}->{'v3_peak_offsets'} = pack "N$length",@{$self->{'text'}->{'v3_peak_offsets'}};
|
|
902 # base accuracies
|
|
903 @{$self->{'text'}->{'v3_accuracies_all'}} = @{$self->{'text'}->{"v3_base_accuracy_a"}};
|
|
904 foreach (qw(c g t)) {
|
|
905 @{$self->{'text'}->{'v3_accuracies_all'}} = (@{$self->{'text'}->{'v3_accuracies_all'}},@{$self->{'text'}->{"v3_base_accuracy_$_"}});
|
|
906 }
|
|
907 $length = scalar(@{$self->{'text'}->{'v3_accuracies_all'}});
|
|
908
|
|
909 $self->{'binaries'}->{'v3_accuracies_all'} = pack "c$length",@{$self->{'text'}->{'v3_accuracies_all'}};
|
|
910 # called bases
|
|
911 $length = length($self->{'info'}->{'sequence'});
|
|
912 my @seq = split(//,$self->{'info'}->{'sequence'});
|
|
913 # pack the string
|
|
914 $self->{'binaries'}->{'v3_called_bases'} = $self->{'info'}->{'sequence'};
|
|
915 # finally, reserved for future use
|
|
916 $length = $self->{'info'}->{'sequence_length'};
|
|
917 for (my $counter=0; $counter < $length; $counter++) {
|
|
918 push @temp,0;
|
|
919 }
|
|
920 $self->{'binaries'}->{'v3_reserved'} = pack "N$length",@temp;
|
|
921 }
|
|
922 }
|
|
923
|
|
924 =head2 _make_trace_string($version)
|
|
925
|
|
926 Title : _make_trace_string($version)
|
|
927 Usage : $self->_make_trace_string($version)
|
|
928 Function: Merges trace data for the four bases to produce an scf
|
|
929 trace string. _requires_ $version
|
|
930 Returns : Nothing. Alters $self.
|
|
931 Args : $version - a version number. "2" or "3"
|
|
932 Notes :
|
|
933
|
|
934 =cut
|
|
935
|
|
936 sub _make_trace_string {
|
|
937 my ($self,$version) = @_;
|
|
938 my @traces;
|
|
939 my @traces_view;
|
|
940 my @as = @{$self->{'text'}->{'samples_a'}};
|
|
941 my @cs = @{$self->{'text'}->{'samples_c'}};
|
|
942 my @gs = @{$self->{'text'}->{'samples_g'}};
|
|
943 my @ts = @{$self->{'text'}->{'samples_t'}};
|
|
944 if ($version == 2) {
|
|
945 for (my $curr=0; $curr < scalar(@as); $curr++) {
|
|
946 $as[$curr] = $DEFAULT_QUALITY unless defined $as[$curr];
|
|
947 $cs[$curr] = $DEFAULT_QUALITY unless defined $cs[$curr];
|
|
948 $gs[$curr] = $DEFAULT_QUALITY unless defined $gs[$curr];
|
|
949 $ts[$curr] = $DEFAULT_QUALITY unless defined $ts[$curr];
|
|
950 push @traces,($as[$curr],$cs[$curr],$gs[$curr],$ts[$curr]);
|
|
951 }
|
|
952 }
|
|
953 elsif ($version == 3) {
|
|
954 @traces = (@as,@cs,@gs,@ts);
|
|
955 }
|
|
956 else {
|
|
957 $self->throw("No idea what version required to make traces here. You gave #$version# Bailing.");
|
|
958 }
|
|
959 my $length = scalar(@traces);
|
|
960 $self->{'text'}->{'samples_all'} = \@traces;
|
|
961
|
|
962 }
|
|
963
|
|
964 =head2 _set_binary_comments(\@comments)
|
|
965
|
|
966 Title : _set_binary_comments(\@comments)
|
|
967 Usage : $self->_set_binary_comments(\@comments);
|
|
968 Function: Provide a binary string that will be the comments section of
|
|
969 the scf file. See the scf specifications for detailed
|
|
970 specifications for the comments section of an scf file. Hint:
|
|
971 CODE=something\nBODE=something\n\0
|
|
972 Returns : Nothing. Alters self.
|
|
973 Args : A reference to an array containing comments.
|
|
974 Notes : None.
|
|
975
|
|
976 =cut
|
|
977
|
|
978 sub _set_binary_comments {
|
|
979 my ($self,$rcomments) = @_;
|
|
980 my $comments_string = '';
|
|
981 my %comments = %$rcomments;
|
|
982 foreach my $key (sort keys %comments) {
|
|
983 $comments{$key} ||= '';
|
|
984 $comments_string .= "$key=$comments{$key}\n";
|
|
985 }
|
|
986 $comments_string .= "\n\0";
|
|
987 $self->{'header'}->{'comments'} = $comments_string;
|
|
988 my $length = length($comments_string);
|
|
989 $self->{'binaries'}->{'comments'} = pack "A$length",$comments_string;
|
|
990 $self->{'header'}->{'comments'} = $comments_string;
|
|
991 }
|
|
992
|
|
993 =head2 _fill_missing_data($swq)
|
|
994
|
|
995 Title : _fill_missing_data($swq)
|
|
996 Usage : $self->_fill_missing_data($swq);
|
|
997 Function: If the $swq with quality has no qualities, set all qualities
|
|
998 to 0.
|
|
999 If the $swq has no sequence, set the sequence to N's.
|
|
1000 Returns : Nothing. Modifies the SeqWithQuality that was passed as an
|
|
1001 argument.
|
|
1002 Args : A reference to a Bio::Seq::SeqWithQuality
|
|
1003 Notes : None.
|
|
1004
|
|
1005 =cut
|
|
1006
|
|
1007 #'
|
|
1008 sub _fill_missing_data {
|
|
1009 my ($self,$swq) = @_;
|
|
1010 my $qual_obj = $swq->qual_obj();
|
|
1011 my $seq_obj = $swq->seq_obj();
|
|
1012 if ($qual_obj->length() == 0 && $seq_obj->length() != 0) {
|
|
1013 my $fake_qualities = ("$DEFAULT_QUALITY ")x$seq_obj->length();
|
|
1014 $swq->qual($fake_qualities);
|
|
1015 }
|
|
1016 if ($seq_obj->length() == 0 && $qual_obj->length != 0) {
|
|
1017 my $sequence = ("N")x$qual_obj->length();
|
|
1018 $swq->seq($sequence);
|
|
1019 }
|
|
1020 }
|
|
1021
|
|
1022 =head2 _delta(\@trace_data,$direction)
|
|
1023
|
|
1024 Title : _delta(\@trace_data,$direction)
|
|
1025 Usage : $self->_delta(\@trace_data,$direction);
|
|
1026 Function:
|
|
1027 Returns : A reference to an array containing modified trace values.
|
|
1028 Args : A reference to an array containing trace data and a string
|
|
1029 indicating the direction of conversion. ("forward" or
|
|
1030 "backward").
|
|
1031 Notes : This code is taken from the specification for SCF3.2.
|
|
1032 http://www.mrc-lmb.cam.ac.uk/pubseq/manual/formats_unix_4.html
|
|
1033
|
|
1034 =cut
|
|
1035
|
|
1036
|
|
1037 sub _delta {
|
|
1038 my ($self,$rsamples,$direction) = @_;
|
|
1039 my @samples = @$rsamples;
|
|
1040 # /* If job == DELTA_IT:
|
|
1041 # * change a series of sample points to a series of delta delta values:
|
|
1042 # * ie change them in two steps:
|
|
1043 # * first: delta = current_value - previous_value
|
|
1044 # * then: delta_delta = delta - previous_delta
|
|
1045 # * else
|
|
1046 # * do the reverse
|
|
1047 # */
|
|
1048 # int i;
|
|
1049 # uint_2 p_delta, p_sample;
|
|
1050
|
|
1051 my ($i,$num_samples,$p_delta,$p_sample,@samples_converted);
|
|
1052
|
|
1053 # c-programmers are funny people with their single-letter variables
|
|
1054
|
|
1055 if ( $direction eq "forward" ) {
|
|
1056 $p_delta = 0;
|
|
1057 for ($i=0; $i < scalar(@samples); $i++) {
|
|
1058 $p_sample = $samples[$i];
|
|
1059 $samples[$i] = $samples[$i] - $p_delta;
|
|
1060 $p_delta = $p_sample;
|
|
1061 }
|
|
1062 $p_delta = 0;
|
|
1063 for ($i=0; $i < scalar(@samples); $i++) {
|
|
1064 $p_sample = $samples[$i];
|
|
1065 $samples[$i] = $samples[$i] - $p_delta;
|
|
1066 $p_delta = $p_sample;
|
|
1067 }
|
|
1068 }
|
|
1069 elsif ($direction eq "backward") {
|
|
1070 $p_sample = 0;
|
|
1071 for ($i=0; $i < scalar(@samples); $i++) {
|
|
1072 $samples[$i] = $samples[$i] + $p_sample;
|
|
1073 $p_sample = $samples[$i];
|
|
1074 }
|
|
1075 $p_sample = 0;
|
|
1076 for ($i=0; $i < scalar(@samples); $i++) {
|
|
1077 $samples[$i] = $samples[$i] + $p_sample;
|
|
1078 $p_sample = $samples[$i];
|
|
1079 }
|
|
1080 }
|
|
1081 else {
|
|
1082 $self->warn("Bad direction. Use \"forward\" or \"backward\".");
|
|
1083 }
|
|
1084 return \@samples;
|
|
1085 }
|
|
1086
|
|
1087 =head2 _unpack_magik($buffer)
|
|
1088
|
|
1089 Title : _unpack_magik($buffer)
|
|
1090 Usage : $self->_unpack_magik($buffer)
|
|
1091 Function: What unpack specification should be used? Try them all.
|
|
1092 Returns : Nothing.
|
|
1093 Args : A buffer containing arbitrary binary data.
|
|
1094 Notes : Eliminate the ambiguity and the guesswork. Used in the
|
|
1095 adaptation of _delta(), mostly.
|
|
1096
|
|
1097 =cut
|
|
1098
|
|
1099 sub _unpack_magik {
|
|
1100 my ($self,$buffer) = @_;
|
|
1101 my $length = length($buffer);
|
|
1102 my (@read,$counter);
|
|
1103 foreach (qw(c C s S i I l L n N v V)) {
|
|
1104 @read = unpack "$_$length", $buffer;
|
|
1105 print ("----- Unpacked with $_\n");
|
|
1106 for ($counter=0; $counter < 20; $counter++) {
|
|
1107 print("$read[$counter]\n");
|
|
1108 }
|
|
1109 }
|
|
1110 }
|
|
1111
|
|
1112 =head2 read_from_buffer($filehandle,$buffer,$length)
|
|
1113
|
|
1114 Title : read_from_buffer($filehandle,$buffer,$length)
|
|
1115 Usage : $self->read_from_buffer($filehandle,$buffer,$length);
|
|
1116 Function: Read from the buffer.
|
|
1117 Returns : $buffer, containing a read of $length
|
|
1118 Args : a filehandle, a buffer, and a read length
|
|
1119 Notes : I just got tired of typing
|
|
1120 "unless (length($buffer) == $length)" so I put it here.
|
|
1121
|
|
1122 =cut
|
|
1123
|
|
1124 sub read_from_buffer {
|
|
1125 my ($self,$fh,$buffer,$length) = @_;
|
|
1126 read $fh, $buffer, $length;
|
|
1127 unless (length($buffer) == $length) {
|
|
1128 $self->warn("The read was incomplete! Trying harder.");
|
|
1129 my $missing_length = $length - length($buffer);
|
|
1130 my $buffer2;
|
|
1131 read $fh,$buffer2,$missing_length;
|
|
1132 $buffer .= $buffer2;
|
|
1133 if (length($buffer) != $length) {
|
|
1134 $self->throw("Unexpected end of file while reading from SCF file. I should have read $length but instead got ".length($buffer)."! Current file position is ".tell($fh).".");
|
|
1135 }
|
|
1136 }
|
|
1137
|
|
1138 return $buffer;
|
|
1139 }
|
|
1140
|
|
1141 =head2 _dump_keys()
|
|
1142
|
|
1143 Title : _dump_keys()
|
|
1144 Usage : &_dump_keys($a_reference_to_some_hash)
|
|
1145 Function: Dump out the keys in a hash.
|
|
1146 Returns : Nothing.
|
|
1147 Args : A reference to a hash.
|
|
1148 Notes : A debugging method.
|
|
1149
|
|
1150 =cut
|
|
1151
|
|
1152 sub _dump_keys {
|
|
1153 my $rhash = shift;
|
|
1154 if ($rhash !~ /HASH/) {
|
|
1155 print("_dump_keys: that was not a hash.\nIt was #$rhash# which was this reference:".ref($rhash)."\n");
|
|
1156 return;
|
|
1157 }
|
|
1158 print("_dump_keys: The keys for $rhash are:\n");
|
|
1159 foreach (sort keys %$rhash) {
|
|
1160 print("$_\n");
|
|
1161 }
|
|
1162 }
|
|
1163
|
|
1164 =head2 _dump_base_accuracies()
|
|
1165
|
|
1166 Title : _dump_base_accuracies()
|
|
1167 Usage : $self->_dump_base_accuracies();
|
|
1168 Function: Dump out the v3 base accuracies in an easy to read format.
|
|
1169 Returns : Nothing.
|
|
1170 Args : None.
|
|
1171 Notes : A debugging method.
|
|
1172
|
|
1173 =cut
|
|
1174
|
|
1175 sub _dump_base_accuracies {
|
|
1176 my $self = shift;
|
|
1177 print("Dumping base accuracies! for v3\n");
|
|
1178 print("There are this many elements in a,c,g,t:\n");
|
|
1179 print(scalar(@{$self->{'text'}->{'v3_base_accuracy_a'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_t'}})."\n");
|
|
1180 my $number_traces = scalar(@{$self->{'text'}->{'v3_base_accuracy_a'}});
|
|
1181 for (my $counter=0; $counter < $number_traces; $counter++ ) {
|
|
1182 print("$counter\t");
|
|
1183 print $self->{'text'}->{'v3_base_accuracy_a'}->[$counter]."\t";
|
|
1184 print $self->{'text'}->{'v3_base_accuracy_c'}->[$counter]."\t";
|
|
1185 print $self->{'text'}->{'v3_base_accuracy_g'}->[$counter]."\t";
|
|
1186 print $self->{'text'}->{'v3_base_accuracy_t'}->[$counter]."\t";
|
|
1187 print("\n");
|
|
1188 }
|
|
1189 }
|
|
1190
|
|
1191 =head2 _dump_peak_indices_incoming()
|
|
1192
|
|
1193 Title : _dump_peak_indices_incoming()
|
|
1194 Usage : $self->_dump_peak_indices_incoming();
|
|
1195 Function: Dump out the v3 peak indices in an easy to read format.
|
|
1196 Returns : Nothing.
|
|
1197 Args : None.
|
|
1198 Notes : A debugging method.
|
|
1199
|
|
1200 =cut
|
|
1201
|
|
1202 sub _dump_peak_indices_incoming {
|
|
1203 my $self = shift;
|
|
1204 print("Dump peak indices incoming!\n");
|
|
1205 my $length = $self->{'bases'};
|
|
1206 print("The length is $length\n");
|
|
1207 for (my $count=0; $count < $length; $count++) {
|
|
1208 print("$count\t$self->{parsed}->{peak_indices}->[$count]\n");
|
|
1209 }
|
|
1210 }
|
|
1211
|
|
1212 =head2 _dump_base_accuracies_incoming()
|
|
1213
|
|
1214 Title : _dump_base_accuracies_incoming()
|
|
1215 Usage : $self->_dump_base_accuracies_incoming();
|
|
1216 Function: Dump out the v3 base accuracies in an easy to read format.
|
|
1217 Returns : Nothing.
|
|
1218 Args : None.
|
|
1219 Notes : A debugging method.
|
|
1220
|
|
1221 =cut
|
|
1222
|
|
1223 sub _dump_base_accuracies_incoming {
|
|
1224 my $self = shift;
|
|
1225 print("Dumping base accuracies! for v3\n");
|
|
1226 # print("There are this many elements in a,c,g,t:\n");
|
|
1227 # print(scalar(@{$self->{'parsed'}->{'v3_base_accuracy_a'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_c'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_g'}}).",".scalar(@{$self->{'text'}->{'v3_base_accuracy_t'}})."\n");
|
|
1228 my $number_traces = $self->{'bases'};
|
|
1229 for (my $counter=0; $counter < $number_traces; $counter++ ) {
|
|
1230 print("$counter\t");
|
|
1231 foreach (qw(A T G C)) {
|
|
1232 print $self->{'parsed'}->{'base_accuracies'}->{$_}->[$counter]."\t";
|
|
1233 }
|
|
1234 print("\n");
|
|
1235 }
|
|
1236 }
|
|
1237
|
|
1238
|
|
1239 =head2 _dump_comments()
|
|
1240
|
|
1241 Title : _dump_comments()
|
|
1242 Usage : $self->_dump_comments();
|
|
1243 Function: Debug dump the comments section from the scf.
|
|
1244 Returns : Nothing.
|
|
1245 Args : Nothing.
|
|
1246 Notes : None.
|
|
1247
|
|
1248 =cut
|
|
1249
|
|
1250 sub _dump_comments {
|
|
1251 my ($self) = @_;
|
|
1252 warn ("SCF comments:\n");
|
|
1253 foreach my $k (keys %{$self->{'comments'}}) {
|
|
1254 warn ("\t {$k} ==> ", $self->{'comments'}->{$k}, "\n");
|
|
1255 }
|
|
1256 }
|
|
1257
|
|
1258
|
|
1259 1;
|
|
1260 __END__
|
|
1261
|
|
1262
|