Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqIO/fastq.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # BioPerl module for Bio::SeqIO::fastq | |
2 # | |
3 # Cared for by Tony Cox <avc@sanger.ac.uk> | |
4 # | |
5 # Copyright Tony Cox | |
6 # | |
7 # You may distribute this module under the same terms as perl itself | |
8 # _history | |
9 # October 29, 2001 incept data | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SeqIO::fastq - fastq sequence input/output stream | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 Do not use this module directly. Use it via the Bio::SeqIO class. | |
20 | |
21 =head1 DESCRIPTION | |
22 | |
23 This object can transform Bio::Seq and Bio::Seq::SeqWithQuality | |
24 objects to and from fastq flat file databases. | |
25 | |
26 Fastq is a file format used frequently at the Sanger Centre to bundle | |
27 a fasta sequence and its quality data. A typical fastaq entry takes | |
28 the from: | |
29 | |
30 @HCDPQ1D0501 | |
31 GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT..... | |
32 +HCDPQ1D0501 | |
33 !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65..... | |
34 | |
35 Fastq files have sequence and quality data on a single line and the | |
36 quality values are single-byte encoded. To retrieve the decimal values | |
37 for qualities you need to subtract 33 (or Octal 41) from each byte and | |
38 then convert to a '2 digit + 1 space' integer. You can check if 33 is | |
39 the right number because the first byte which is always '!' | |
40 corresponds to a quality value of 0. | |
41 | |
42 =head1 FEEDBACK | |
43 | |
44 =head2 Mailing Lists | |
45 | |
46 User feedback is an integral part of the evolution of this and other | |
47 Bioperl modules. Send your comments and suggestions preferably to one | |
48 of the Bioperl mailing lists. Your participation is much appreciated. | |
49 | |
50 bioperl-l@bioperl.org - General discussion | |
51 http://bioperl.org/MailList.shtml - About the mailing lists | |
52 | |
53 =head2 Reporting Bugs | |
54 | |
55 Report bugs to the Bioperl bug tracking system to help us keep track | |
56 the bugs and their resolution. | |
57 Bug reports can be submitted via email or the web: | |
58 | |
59 bioperl-bugs@bio.perl.org | |
60 http://bugzilla.bioperl.org/ | |
61 | |
62 =head1 AUTHORS - Tony Cox | |
63 | |
64 Email: avc@sanger.ac.uk | |
65 | |
66 | |
67 =head1 APPENDIX | |
68 | |
69 The rest of the documentation details each of the object | |
70 methods. Internal methods are usually preceded with a _ | |
71 | |
72 =cut | |
73 | |
74 # Let the code begin... | |
75 | |
76 package Bio::SeqIO::fastq; | |
77 use vars qw(@ISA); | |
78 use strict; | |
79 # Object preamble - inherits from Bio::Root::Object | |
80 | |
81 use Bio::SeqIO; | |
82 use Bio::Seq::SeqFactory; | |
83 | |
84 @ISA = qw(Bio::SeqIO); | |
85 | |
86 sub _initialize { | |
87 my($self,@args) = @_; | |
88 $self->SUPER::_initialize(@args); | |
89 if( ! defined $self->sequence_factory ) { | |
90 $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality')); | |
91 } | |
92 } | |
93 | |
94 | |
95 =head2 next_seq | |
96 | |
97 Title : next_seq | |
98 Usage : $seq = $stream->next_seq() | |
99 Function: returns the next sequence in the stream | |
100 Returns : Bio::Seq::SeqWithQuality object | |
101 Args : NONE | |
102 | |
103 =cut | |
104 | |
105 sub next_seq { | |
106 | |
107 my( $self ) = @_; | |
108 my $seq; | |
109 my $alphabet; | |
110 local $/ = "\n\@"; | |
111 | |
112 return unless my $entry = $self->_readline; | |
113 | |
114 if ($entry eq '@') { # very first one | |
115 return unless $entry = $self->_readline; | |
116 } | |
117 my ($top,$sequence,$top2,$qualsequence) = $entry =~ /^ | |
118 \@?(.+?)\n | |
119 ([^\@]*?)\n | |
120 \+?(.+?)\n | |
121 (.*)\n | |
122 /xs | |
123 or $self->throw("Can't parse fastq entry"); | |
124 my ($id,$fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/ | |
125 or $self->throw("Can't parse fastq header"); | |
126 if ($id eq '') {$id=$fulldesc;} # FIX incase no space between \@ and name | |
127 $sequence =~ s/\s//g; # Remove whitespace | |
128 $qualsequence =~ s/\s//g; | |
129 | |
130 if(length($sequence) != length($qualsequence)){ | |
131 $self->warn("Fastq sequence/quality data length mismatch error\n"); | |
132 $self->warn("Sequence: $top, seq length: ",length($sequence), " Qual length: ", length($qualsequence), " \n"); | |
133 $self->warn("$sequence\n"); | |
134 $self->warn("$qualsequence\n"); | |
135 $self->warn("FROM ENTRY: \n\n$entry\n"); | |
136 } | |
137 | |
138 my @qual = split('', $qualsequence); | |
139 | |
140 my $qual; | |
141 foreach (@qual) {$qual .= (unpack("C",$_) - 33) ." "}; | |
142 | |
143 | |
144 # for empty sequences we need to know the mol.type | |
145 $alphabet = $self->alphabet(); | |
146 if(length($sequence) == 0) { | |
147 if(! defined($alphabet)) { | |
148 # let's default to dna | |
149 $alphabet = "dna"; | |
150 } | |
151 } else { | |
152 # we don't need it really, so disable | |
153 $alphabet = undef; | |
154 } | |
155 | |
156 # create the SeqWithQuality object | |
157 $seq = $self->sequence_factory->create( | |
158 -qual => $qual, | |
159 -seq => $sequence, | |
160 -id => $id, | |
161 -primary_id => $id, | |
162 -desc => $fulldesc, | |
163 -alphabet => $alphabet | |
164 ); | |
165 | |
166 # if there wasn't one before, set the guessed type | |
167 $self->alphabet($seq->alphabet()); | |
168 | |
169 return $seq; | |
170 } | |
171 | |
172 =head2 write_seq | |
173 | |
174 Title : write_seq | |
175 Usage : $stream->write_seq(@seq) | |
176 Function: writes the $seq object into the stream | |
177 Returns : 1 for success and 0 for error | |
178 Args : Bio::Seq::SeqWithQuality or Bio::seq object | |
179 | |
180 | |
181 =cut | |
182 | |
183 sub write_seq { | |
184 my ($self,@seq) = @_; | |
185 foreach my $seq (@seq) { | |
186 my $str = $seq->seq; | |
187 my $top = $seq->display_id(); | |
188 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
189 $desc =~ s/\n//g; | |
190 $top .= " $desc"; | |
191 } | |
192 if(length($str) > 0) { | |
193 $str =~ s/(.{1,60})/$1\n/g; | |
194 } else { | |
195 $str = "\n"; | |
196 } | |
197 | |
198 $self->_print (">",$top,"\n",$str) or return; | |
199 } | |
200 | |
201 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
202 return 1; | |
203 } | |
204 | |
205 =head2 write_qual | |
206 | |
207 Title : write_qual | |
208 Usage : $stream->write_qual(@seq) | |
209 Function: writes the $seq object into the stream | |
210 Returns : 1 for success and 0 for error | |
211 Args : Bio::Seq::SeqWithQuality object | |
212 | |
213 | |
214 =cut | |
215 | |
216 sub write_qual { | |
217 my ($self,@seq) = @_; | |
218 foreach my $seq (@seq) { | |
219 unless ($seq->isa("Bio::Seq::SeqWithQuality")){ | |
220 warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); | |
221 next; | |
222 } | |
223 my @qual = @{$seq->qual}; | |
224 my $top = $seq->display_id(); | |
225 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
226 $desc =~ s/\n//g; | |
227 $top .= " $desc"; | |
228 } | |
229 my $qual = "" ; | |
230 if(scalar(@qual) > 0) { | |
231 my $max = 60; | |
232 for (my $q = 0;$q<scalar(@qual);$q++){ | |
233 $qual .= $qual[$q] . " "; | |
234 if(length($qual) > $max){ | |
235 $qual .= "\n"; | |
236 $max += 60; | |
237 } | |
238 } | |
239 } else { | |
240 $qual = "\n"; | |
241 } | |
242 | |
243 $self->_print (">",$top,"\n",$qual,"\n") or return; | |
244 } | |
245 return 1; | |
246 } | |
247 | |
248 =head2 write_fastq | |
249 | |
250 Title : write_fastq | |
251 Usage : $stream->write_fastq(@seq) | |
252 Function: writes the $seq object into the stream | |
253 Returns : 1 for success and 0 for error | |
254 Args : Bio::Seq::SeqWithQuality object | |
255 | |
256 | |
257 =cut | |
258 | |
259 sub write_fastq { | |
260 my ($self,@seq) = @_; | |
261 foreach my $seq (@seq) { | |
262 unless ($seq->isa("Bio::Seq::SeqWithQuality")){ | |
263 warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); | |
264 next; | |
265 } | |
266 my $str = $seq->seq; | |
267 my @qual = @{$seq->qual}; | |
268 my $top = $seq->display_id(); | |
269 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
270 $desc =~ s/\n//g; | |
271 $top .= " $desc"; | |
272 } | |
273 if(length($str) == 0) { | |
274 $str = "\n"; | |
275 } | |
276 my $qual = "" ; | |
277 if(scalar(@qual) > 0) { | |
278 for (my $q = 0;$q<scalar(@qual);$q++){ | |
279 $qual .= chr($qual[$q] + 33); | |
280 } | |
281 } else { | |
282 $qual = "\n"; | |
283 } | |
284 | |
285 $self->_print ("\@",$top,"\n",$str,"\n") or return; | |
286 $self->_print ("+",$top,"\n",$qual,"\n") or return; | |
287 } | |
288 return 1; | |
289 } | |
290 1; |