comparison variant_effect_predictor/Bio/SeqIO/fastq.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # BioPerl module for Bio::SeqIO::fastq
2 #
3 # Cared for by Tony Cox <avc@sanger.ac.uk>
4 #
5 # Copyright Tony Cox
6 #
7 # You may distribute this module under the same terms as perl itself
8 # _history
9 # October 29, 2001 incept data
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::SeqIO::fastq - fastq sequence input/output stream
16
17 =head1 SYNOPSIS
18
19 Do not use this module directly. Use it via the Bio::SeqIO class.
20
21 =head1 DESCRIPTION
22
23 This object can transform Bio::Seq and Bio::Seq::SeqWithQuality
24 objects to and from fastq flat file databases.
25
26 Fastq is a file format used frequently at the Sanger Centre to bundle
27 a fasta sequence and its quality data. A typical fastaq entry takes
28 the from:
29
30 @HCDPQ1D0501
31 GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT.....
32 +HCDPQ1D0501
33 !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65.....
34
35 Fastq files have sequence and quality data on a single line and the
36 quality values are single-byte encoded. To retrieve the decimal values
37 for qualities you need to subtract 33 (or Octal 41) from each byte and
38 then convert to a '2 digit + 1 space' integer. You can check if 33 is
39 the right number because the first byte which is always '!'
40 corresponds to a quality value of 0.
41
42 =head1 FEEDBACK
43
44 =head2 Mailing Lists
45
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to one
48 of the Bioperl mailing lists. Your participation is much appreciated.
49
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/MailList.shtml - About the mailing lists
52
53 =head2 Reporting Bugs
54
55 Report bugs to the Bioperl bug tracking system to help us keep track
56 the bugs and their resolution.
57 Bug reports can be submitted via email or the web:
58
59 bioperl-bugs@bio.perl.org
60 http://bugzilla.bioperl.org/
61
62 =head1 AUTHORS - Tony Cox
63
64 Email: avc@sanger.ac.uk
65
66
67 =head1 APPENDIX
68
69 The rest of the documentation details each of the object
70 methods. Internal methods are usually preceded with a _
71
72 =cut
73
74 # Let the code begin...
75
76 package Bio::SeqIO::fastq;
77 use vars qw(@ISA);
78 use strict;
79 # Object preamble - inherits from Bio::Root::Object
80
81 use Bio::SeqIO;
82 use Bio::Seq::SeqFactory;
83
84 @ISA = qw(Bio::SeqIO);
85
86 sub _initialize {
87 my($self,@args) = @_;
88 $self->SUPER::_initialize(@args);
89 if( ! defined $self->sequence_factory ) {
90 $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality'));
91 }
92 }
93
94
95 =head2 next_seq
96
97 Title : next_seq
98 Usage : $seq = $stream->next_seq()
99 Function: returns the next sequence in the stream
100 Returns : Bio::Seq::SeqWithQuality object
101 Args : NONE
102
103 =cut
104
105 sub next_seq {
106
107 my( $self ) = @_;
108 my $seq;
109 my $alphabet;
110 local $/ = "\n\@";
111
112 return unless my $entry = $self->_readline;
113
114 if ($entry eq '@') { # very first one
115 return unless $entry = $self->_readline;
116 }
117 my ($top,$sequence,$top2,$qualsequence) = $entry =~ /^
118 \@?(.+?)\n
119 ([^\@]*?)\n
120 \+?(.+?)\n
121 (.*)\n
122 /xs
123 or $self->throw("Can't parse fastq entry");
124 my ($id,$fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/
125 or $self->throw("Can't parse fastq header");
126 if ($id eq '') {$id=$fulldesc;} # FIX incase no space between \@ and name
127 $sequence =~ s/\s//g; # Remove whitespace
128 $qualsequence =~ s/\s//g;
129
130 if(length($sequence) != length($qualsequence)){
131 $self->warn("Fastq sequence/quality data length mismatch error\n");
132 $self->warn("Sequence: $top, seq length: ",length($sequence), " Qual length: ", length($qualsequence), " \n");
133 $self->warn("$sequence\n");
134 $self->warn("$qualsequence\n");
135 $self->warn("FROM ENTRY: \n\n$entry\n");
136 }
137
138 my @qual = split('', $qualsequence);
139
140 my $qual;
141 foreach (@qual) {$qual .= (unpack("C",$_) - 33) ." "};
142
143
144 # for empty sequences we need to know the mol.type
145 $alphabet = $self->alphabet();
146 if(length($sequence) == 0) {
147 if(! defined($alphabet)) {
148 # let's default to dna
149 $alphabet = "dna";
150 }
151 } else {
152 # we don't need it really, so disable
153 $alphabet = undef;
154 }
155
156 # create the SeqWithQuality object
157 $seq = $self->sequence_factory->create(
158 -qual => $qual,
159 -seq => $sequence,
160 -id => $id,
161 -primary_id => $id,
162 -desc => $fulldesc,
163 -alphabet => $alphabet
164 );
165
166 # if there wasn't one before, set the guessed type
167 $self->alphabet($seq->alphabet());
168
169 return $seq;
170 }
171
172 =head2 write_seq
173
174 Title : write_seq
175 Usage : $stream->write_seq(@seq)
176 Function: writes the $seq object into the stream
177 Returns : 1 for success and 0 for error
178 Args : Bio::Seq::SeqWithQuality or Bio::seq object
179
180
181 =cut
182
183 sub write_seq {
184 my ($self,@seq) = @_;
185 foreach my $seq (@seq) {
186 my $str = $seq->seq;
187 my $top = $seq->display_id();
188 if ($seq->can('desc') and my $desc = $seq->desc()) {
189 $desc =~ s/\n//g;
190 $top .= " $desc";
191 }
192 if(length($str) > 0) {
193 $str =~ s/(.{1,60})/$1\n/g;
194 } else {
195 $str = "\n";
196 }
197
198 $self->_print (">",$top,"\n",$str) or return;
199 }
200
201 $self->flush if $self->_flush_on_write && defined $self->_fh;
202 return 1;
203 }
204
205 =head2 write_qual
206
207 Title : write_qual
208 Usage : $stream->write_qual(@seq)
209 Function: writes the $seq object into the stream
210 Returns : 1 for success and 0 for error
211 Args : Bio::Seq::SeqWithQuality object
212
213
214 =cut
215
216 sub write_qual {
217 my ($self,@seq) = @_;
218 foreach my $seq (@seq) {
219 unless ($seq->isa("Bio::Seq::SeqWithQuality")){
220 warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n");
221 next;
222 }
223 my @qual = @{$seq->qual};
224 my $top = $seq->display_id();
225 if ($seq->can('desc') and my $desc = $seq->desc()) {
226 $desc =~ s/\n//g;
227 $top .= " $desc";
228 }
229 my $qual = "" ;
230 if(scalar(@qual) > 0) {
231 my $max = 60;
232 for (my $q = 0;$q<scalar(@qual);$q++){
233 $qual .= $qual[$q] . " ";
234 if(length($qual) > $max){
235 $qual .= "\n";
236 $max += 60;
237 }
238 }
239 } else {
240 $qual = "\n";
241 }
242
243 $self->_print (">",$top,"\n",$qual,"\n") or return;
244 }
245 return 1;
246 }
247
248 =head2 write_fastq
249
250 Title : write_fastq
251 Usage : $stream->write_fastq(@seq)
252 Function: writes the $seq object into the stream
253 Returns : 1 for success and 0 for error
254 Args : Bio::Seq::SeqWithQuality object
255
256
257 =cut
258
259 sub write_fastq {
260 my ($self,@seq) = @_;
261 foreach my $seq (@seq) {
262 unless ($seq->isa("Bio::Seq::SeqWithQuality")){
263 warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n");
264 next;
265 }
266 my $str = $seq->seq;
267 my @qual = @{$seq->qual};
268 my $top = $seq->display_id();
269 if ($seq->can('desc') and my $desc = $seq->desc()) {
270 $desc =~ s/\n//g;
271 $top .= " $desc";
272 }
273 if(length($str) == 0) {
274 $str = "\n";
275 }
276 my $qual = "" ;
277 if(scalar(@qual) > 0) {
278 for (my $q = 0;$q<scalar(@qual);$q++){
279 $qual .= chr($qual[$q] + 33);
280 }
281 } else {
282 $qual = "\n";
283 }
284
285 $self->_print ("\@",$top,"\n",$str,"\n") or return;
286 $self->_print ("+",$top,"\n",$qual,"\n") or return;
287 }
288 return 1;
289 }
290 1;