0
|
1 # BioPerl module for Bio::SeqIO::fastq
|
|
2 #
|
|
3 # Cared for by Tony Cox <avc@sanger.ac.uk>
|
|
4 #
|
|
5 # Copyright Tony Cox
|
|
6 #
|
|
7 # You may distribute this module under the same terms as perl itself
|
|
8 # _history
|
|
9 # October 29, 2001 incept data
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SeqIO::fastq - fastq sequence input/output stream
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 Do not use this module directly. Use it via the Bio::SeqIO class.
|
|
20
|
|
21 =head1 DESCRIPTION
|
|
22
|
|
23 This object can transform Bio::Seq and Bio::Seq::SeqWithQuality
|
|
24 objects to and from fastq flat file databases.
|
|
25
|
|
26 Fastq is a file format used frequently at the Sanger Centre to bundle
|
|
27 a fasta sequence and its quality data. A typical fastq entry takes
|
|
28 the form:
|
|
29
|
|
30 @HCDPQ1D0501
|
|
31 GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT.....
|
|
32 +HCDPQ1D0501
|
|
33 !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65.....
|
|
34
|
|
35 Fastq files have sequence and quality data on a single line and the
|
|
36 quality values are single-byte encoded. To retrieve the decimal values
|
|
37 for qualities you need to subtract 33 (or Octal 41) from each byte and
|
|
38 then convert to a '2 digit + 1 space' integer. You can check if 33 is
|
|
39 the right number because the first byte, which is always '!',
|
|
40 corresponds to a quality value of 0.
|
|
41
|
|
42 =head1 FEEDBACK
|
|
43
|
|
44 =head2 Mailing Lists
|
|
45
|
|
46 User feedback is an integral part of the evolution of this and other
|
|
47 Bioperl modules. Send your comments and suggestions preferably to one
|
|
48 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
49
|
|
50 bioperl-l@bioperl.org - General discussion
|
|
51 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
52
|
|
53 =head2 Reporting Bugs
|
|
54
|
|
55 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
56 of the bugs and their resolution.
|
|
57 Bug reports can be submitted via email or the web:
|
|
58
|
|
59 bioperl-bugs@bio.perl.org
|
|
60 http://bugzilla.bioperl.org/
|
|
61
|
|
62 =head1 AUTHORS - Tony Cox
|
|
63
|
|
64 Email: avc@sanger.ac.uk
|
|
65
|
|
66
|
|
67 =head1 APPENDIX
|
|
68
|
|
69 The rest of the documentation details each of the object
|
|
70 methods. Internal methods are usually preceded with a _
|
|
71
|
|
72 =cut
|
|
73
|
|
74 # Let the code begin...
|
|
75
|
|
76 package Bio::SeqIO::fastq;
|
|
77 use vars qw(@ISA);
|
|
78 use strict;
|
|
79 # Object preamble - inherits from Bio::SeqIO
|
|
80
|
|
81 use Bio::SeqIO;
|
|
82 use Bio::Seq::SeqFactory;
|
|
83
|
|
84 @ISA = qw(Bio::SeqIO);
|
|
85
|
|
# Initialise the stream through the parent class, then make sure a
# sequence factory is available for next_seq() to build objects with.
#
# Args    : the constructor argument list, handed unchanged to the
#           parent class's _initialize.
# Returns : whatever the final conditional evaluates to; callers should
#           not rely on it.
sub _initialize {
    my ($self, @args) = @_;
    $self->SUPER::_initialize(@args);

    # Keep a factory that is already defined; otherwise install one that
    # creates Bio::Seq::SeqWithQuality objects.  The constructor is called
    # with arrow syntax rather than the ambiguous indirect-object form.
    if ( !defined $self->sequence_factory ) {
        $self->sequence_factory(
            Bio::Seq::SeqFactory->new(
                -verbose => $self->verbose(),
                -type    => 'Bio::Seq::SeqWithQuality',
            )
        );
    }
}
|
|
93
|
|
94
|
|
95 =head2 next_seq
|
|
96
|
|
97 Title : next_seq
|
|
98 Usage : $seq = $stream->next_seq()
|
|
99 Function: returns the next sequence in the stream
|
|
100 Returns : Bio::Seq::SeqWithQuality object
|
|
101 Args : NONE
|
|
102
|
|
103 =cut
|
|
104
|
|
# Read the next record from the stream and build a sequence object from it.
#
# Records are read with "\n@" as the input record separator, so an entry
# may or may not start with '@' and may or may not end with the separator;
# the entry pattern tolerates both.
#
# Args    : none
# Returns : the object produced by $self->sequence_factory->create, or
#           nothing once _readline yields no more data.
# Throws  : through $self->throw when the entry or its header line cannot
#           be parsed.
sub next_seq {
    my ($self) = @_;
    local $/ = "\n\@";

    return unless my $entry = $self->_readline;

    if ($entry eq '@') {    # very first one
        return unless $entry = $self->_readline;
    }

    # The last record of a file may lack its terminating newline.  The
    # pattern below needs one after the quality data, so supply it rather
    # than rejecting an otherwise valid record.
    $entry .= "\n" unless $entry =~ /\n\@?\z/;

    # Captures: header line, sequence, '+' line (ignored), quality data.
    my ($top, $sequence, undef, $qualsequence) = $entry =~ /^
        \@?(.+?)\n
        ([^\@]*?)\n
        \+?(.+?)\n
        (.*)\n
        /xs
        or $self->throw("Can't parse fastq entry");

    # \S+ guarantees a non-empty id whenever this pattern matches.
    my ($id, $fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/
        or $self->throw("Can't parse fastq header");

    $sequence     =~ s/\s//g;    # Remove whitespace
    $qualsequence =~ s/\s//g;

    if (length($sequence) != length($qualsequence)) {
        $self->warn("Fastq sequence/quality data length mismatch error\n");

        # Pass one pre-joined string: a reporter that only looks at its
        # first message argument would otherwise drop the two lengths.
        $self->warn("Sequence: $top, seq length: " . length($sequence)
                  . " Qual length: " . length($qualsequence) . " \n");
        $self->warn("$sequence\n");
        $self->warn("$qualsequence\n");
        $self->warn("FROM ENTRY: \n\n$entry\n");
    }

    # Decode each quality byte (offset 33) into a space-terminated decimal
    # value.  $qual stays undefined when there is no quality data at all.
    my $qual;
    $qual .= (ord($_) - 33) . " " for split //, $qualsequence;

    # for empty sequences we need to know the mol.type
    my $alphabet = $self->alphabet();
    if (length($sequence) == 0) {
        # let's default to dna
        $alphabet = "dna" unless defined $alphabet;
    }
    else {
        # we don't need it really, so disable
        $alphabet = undef;
    }

    # create the SeqWithQuality object
    my $seq = $self->sequence_factory->create(
        -qual       => $qual,
        -seq        => $sequence,
        -id         => $id,
        -primary_id => $id,
        -desc       => $fulldesc,
        -alphabet   => $alphabet,
    );

    # if there wasn't one before, set the guessed type
    $self->alphabet($seq->alphabet());

    return $seq;
}
|
|
171
|
|
172 =head2 write_seq
|
|
173
|
|
174 Title : write_seq
|
|
175 Usage : $stream->write_seq(@seq)
|
|
176 Function: writes the $seq object into the stream
|
|
177 Returns : 1 for success and 0 for error
|
|
178 Args : Bio::Seq::SeqWithQuality or Bio::Seq object
|
|
179
|
|
180
|
|
181 =cut
|
|
182
|
|
# Write each given sequence object to the stream as a '>'-headed record:
# a header line made of the display id plus the optional description,
# followed by the residues broken into lines of at most 60 characters.
#
# Args    : one or more sequence objects
# Returns : 1 when everything was written; returns early with nothing as
#           soon as a _print call reports failure.
sub write_seq {
    my ($self, @seq) = @_;

    foreach my $seq (@seq) {
        my $residues = $seq->seq;
        my $label    = $seq->display_id();

        # Append the description, with embedded newlines removed, when the
        # object offers one and it is non-empty.
        if ($seq->can('desc')) {
            my $desc = $seq->desc();
            if ($desc) {
                $desc =~ s/\n//g;
                $label .= " $desc";
            }
        }

        # An empty sequence is still written, as a lone newline.
        my $body = "\n";
        if (length($residues) > 0) {
            ($body = $residues) =~ s/(.{1,60})/$1\n/g;
        }

        return unless $self->_print(">", $label, "\n", $body);
    }

    # Flush only when the stream asks for it and a filehandle is defined.
    if ($self->_flush_on_write && defined $self->_fh) {
        $self->flush;
    }
    return 1;
}
|
|
204
|
|
205 =head2 write_qual
|
|
206
|
|
207 Title : write_qual
|
|
208 Usage : $stream->write_qual(@seq)
|
|
209 Function: writes the $seq object into the stream
|
|
210 Returns : 1 for success and 0 for error
|
|
211 Args : Bio::Seq::SeqWithQuality object
|
|
212
|
|
213
|
|
214 =cut
|
|
215
|
|
# Write the quality values of each given object to the stream as a
# '>'-headed record: a header line (display id plus optional description)
# followed by the space-separated quality values.
#
# Objects that are not Bio::Seq::SeqWithQuality are reported with warn()
# and skipped.
#
# Args    : one or more Bio::Seq::SeqWithQuality objects
# Returns : 1 when done; returns early with nothing as soon as a _print
#           call reports failure.
sub write_qual {
    my ($self, @seq) = @_;

    foreach my $seq (@seq) {
        unless ($seq->isa("Bio::Seq::SeqWithQuality")) {
            warn("You can't write quality data without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n");
            next;
        }

        my @qual = @{ $seq->qual };
        my $top  = $seq->display_id();
        if ($seq->can('desc') and my $desc = $seq->desc()) {
            $desc =~ s/\n//g;
            $top .= " $desc";
        }

        my $qual = "";
        if (@qual) {
            # Start a new line each time the accumulated text (line breaks
            # included) grows past the next multiple of 60 characters.
            my $max = 60;
            foreach my $value (@qual) {
                $qual .= $value . " ";
                if (length($qual) > $max) {
                    $qual .= "\n";
                    $max += 60;
                }
            }
        }
        else {
            $qual = "\n";
        }

        $self->_print(">", $top, "\n", $qual, "\n") or return;
    }

    # Honour flush-on-write the same way write_seq does.
    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}
|
|
247
|
|
248 =head2 write_fastq
|
|
249
|
|
250 Title : write_fastq
|
|
251 Usage : $stream->write_fastq(@seq)
|
|
252 Function: writes the $seq object into the stream
|
|
253 Returns : 1 for success and 0 for error
|
|
254 Args : Bio::Seq::SeqWithQuality object
|
|
255
|
|
256
|
|
257 =cut
|
|
258
|
|
# Write each given object to the stream as a four-line fastq record:
#   @<header>, the sequence, +<header>, the encoded quality string.
# The header is the display id plus the optional description; each quality
# value is encoded as the single character chr(value + 33).
#
# Objects that are not Bio::Seq::SeqWithQuality are reported with warn()
# and skipped.
#
# Args    : one or more Bio::Seq::SeqWithQuality objects
# Returns : 1 when done; returns early with nothing as soon as a _print
#           call reports failure.
sub write_fastq {
    my ($self, @seq) = @_;

    foreach my $seq (@seq) {
        unless ($seq->isa("Bio::Seq::SeqWithQuality")) {
            warn("You can't write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n");
            next;
        }

        # An empty (or undefined) sequence is written as an empty line so
        # that the record keeps exactly four lines.
        my $str = $seq->seq;
        $str = "" unless defined $str;

        my @qual = @{ $seq->qual };
        my $top  = $seq->display_id();
        if ($seq->can('desc') and my $desc = $seq->desc()) {
            $desc =~ s/\n//g;
            $top .= " $desc";
        }

        # No quality values likewise yields an empty quality line.
        my $qual = join '', map { chr($_ + 33) } @qual;

        $self->_print("\@", $top, "\n", $str, "\n") or return;
        $self->_print("+", $top, "\n", $qual, "\n") or return;
    }

    # Honour flush-on-write the same way write_seq does.
    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return 1;
}
|
|
290 1;
|