Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqIO/fastq.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # BioPerl module for Bio::SeqIO::fastq | |
| 2 # | |
| 3 # Cared for by Tony Cox <avc@sanger.ac.uk> | |
| 4 # | |
| 5 # Copyright Tony Cox | |
| 6 # | |
| 7 # You may distribute this module under the same terms as perl itself | |
| 8 # _history | |
| 9 # October 29, 2001 incept data | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::SeqIO::fastq - fastq sequence input/output stream | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 Do not use this module directly. Use it via the Bio::SeqIO class. | |
| 20 | |
| 21 =head1 DESCRIPTION | |
| 22 | |
| 23 This object can transform Bio::Seq and Bio::Seq::SeqWithQuality | |
| 24 objects to and from fastq flat file databases. | |
| 25 | |
| 26 Fastq is a file format used frequently at the Sanger Centre to bundle | |
| 27 a fasta sequence and its quality data. A typical fastaq entry takes | |
| 28 the from: | |
| 29 | |
| 30 @HCDPQ1D0501 | |
| 31 GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT..... | |
| 32 +HCDPQ1D0501 | |
| 33 !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65..... | |
| 34 | |
| 35 Fastq files have sequence and quality data on a single line and the | |
| 36 quality values are single-byte encoded. To retrieve the decimal values | |
| 37 for qualities you need to subtract 33 (or Octal 41) from each byte and | |
| 38 then convert to a '2 digit + 1 space' integer. You can check if 33 is | |
| 39 the right number because the first byte which is always '!' | |
| 40 corresponds to a quality value of 0. | |
| 41 | |
| 42 =head1 FEEDBACK | |
| 43 | |
| 44 =head2 Mailing Lists | |
| 45 | |
| 46 User feedback is an integral part of the evolution of this and other | |
| 47 Bioperl modules. Send your comments and suggestions preferably to one | |
| 48 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 49 | |
| 50 bioperl-l@bioperl.org - General discussion | |
| 51 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 52 | |
| 53 =head2 Reporting Bugs | |
| 54 | |
| 55 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 56 the bugs and their resolution. | |
| 57 Bug reports can be submitted via email or the web: | |
| 58 | |
| 59 bioperl-bugs@bio.perl.org | |
| 60 http://bugzilla.bioperl.org/ | |
| 61 | |
| 62 =head1 AUTHORS - Tony Cox | |
| 63 | |
| 64 Email: avc@sanger.ac.uk | |
| 65 | |
| 66 | |
| 67 =head1 APPENDIX | |
| 68 | |
| 69 The rest of the documentation details each of the object | |
| 70 methods. Internal methods are usually preceded with a _ | |
| 71 | |
| 72 =cut | |
| 73 | |
| 74 # Let the code begin... | |
| 75 | |
| 76 package Bio::SeqIO::fastq; | |
| 77 use vars qw(@ISA); | |
| 78 use strict; | |
| 79 # Object preamble - inherits from Bio::Root::Object | |
| 80 | |
| 81 use Bio::SeqIO; | |
| 82 use Bio::Seq::SeqFactory; | |
| 83 | |
| 84 @ISA = qw(Bio::SeqIO); | |
| 85 | |
| 86 sub _initialize { | |
| 87 my($self,@args) = @_; | |
| 88 $self->SUPER::_initialize(@args); | |
| 89 if( ! defined $self->sequence_factory ) { | |
| 90 $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality')); | |
| 91 } | |
| 92 } | |
| 93 | |
| 94 | |
| 95 =head2 next_seq | |
| 96 | |
| 97 Title : next_seq | |
| 98 Usage : $seq = $stream->next_seq() | |
| 99 Function: returns the next sequence in the stream | |
| 100 Returns : Bio::Seq::SeqWithQuality object | |
| 101 Args : NONE | |
| 102 | |
| 103 =cut | |
| 104 | |
| 105 sub next_seq { | |
| 106 | |
| 107 my( $self ) = @_; | |
| 108 my $seq; | |
| 109 my $alphabet; | |
| 110 local $/ = "\n\@"; | |
| 111 | |
| 112 return unless my $entry = $self->_readline; | |
| 113 | |
| 114 if ($entry eq '@') { # very first one | |
| 115 return unless $entry = $self->_readline; | |
| 116 } | |
| 117 my ($top,$sequence,$top2,$qualsequence) = $entry =~ /^ | |
| 118 \@?(.+?)\n | |
| 119 ([^\@]*?)\n | |
| 120 \+?(.+?)\n | |
| 121 (.*)\n | |
| 122 /xs | |
| 123 or $self->throw("Can't parse fastq entry"); | |
| 124 my ($id,$fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/ | |
| 125 or $self->throw("Can't parse fastq header"); | |
| 126 if ($id eq '') {$id=$fulldesc;} # FIX incase no space between \@ and name | |
| 127 $sequence =~ s/\s//g; # Remove whitespace | |
| 128 $qualsequence =~ s/\s//g; | |
| 129 | |
| 130 if(length($sequence) != length($qualsequence)){ | |
| 131 $self->warn("Fastq sequence/quality data length mismatch error\n"); | |
| 132 $self->warn("Sequence: $top, seq length: ",length($sequence), " Qual length: ", length($qualsequence), " \n"); | |
| 133 $self->warn("$sequence\n"); | |
| 134 $self->warn("$qualsequence\n"); | |
| 135 $self->warn("FROM ENTRY: \n\n$entry\n"); | |
| 136 } | |
| 137 | |
| 138 my @qual = split('', $qualsequence); | |
| 139 | |
| 140 my $qual; | |
| 141 foreach (@qual) {$qual .= (unpack("C",$_) - 33) ." "}; | |
| 142 | |
| 143 | |
| 144 # for empty sequences we need to know the mol.type | |
| 145 $alphabet = $self->alphabet(); | |
| 146 if(length($sequence) == 0) { | |
| 147 if(! defined($alphabet)) { | |
| 148 # let's default to dna | |
| 149 $alphabet = "dna"; | |
| 150 } | |
| 151 } else { | |
| 152 # we don't need it really, so disable | |
| 153 $alphabet = undef; | |
| 154 } | |
| 155 | |
| 156 # create the SeqWithQuality object | |
| 157 $seq = $self->sequence_factory->create( | |
| 158 -qual => $qual, | |
| 159 -seq => $sequence, | |
| 160 -id => $id, | |
| 161 -primary_id => $id, | |
| 162 -desc => $fulldesc, | |
| 163 -alphabet => $alphabet | |
| 164 ); | |
| 165 | |
| 166 # if there wasn't one before, set the guessed type | |
| 167 $self->alphabet($seq->alphabet()); | |
| 168 | |
| 169 return $seq; | |
| 170 } | |
| 171 | |
| 172 =head2 write_seq | |
| 173 | |
| 174 Title : write_seq | |
| 175 Usage : $stream->write_seq(@seq) | |
| 176 Function: writes the $seq object into the stream | |
| 177 Returns : 1 for success and 0 for error | |
| 178 Args : Bio::Seq::SeqWithQuality or Bio::seq object | |
| 179 | |
| 180 | |
| 181 =cut | |
| 182 | |
| 183 sub write_seq { | |
| 184 my ($self,@seq) = @_; | |
| 185 foreach my $seq (@seq) { | |
| 186 my $str = $seq->seq; | |
| 187 my $top = $seq->display_id(); | |
| 188 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
| 189 $desc =~ s/\n//g; | |
| 190 $top .= " $desc"; | |
| 191 } | |
| 192 if(length($str) > 0) { | |
| 193 $str =~ s/(.{1,60})/$1\n/g; | |
| 194 } else { | |
| 195 $str = "\n"; | |
| 196 } | |
| 197 | |
| 198 $self->_print (">",$top,"\n",$str) or return; | |
| 199 } | |
| 200 | |
| 201 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
| 202 return 1; | |
| 203 } | |
| 204 | |
| 205 =head2 write_qual | |
| 206 | |
| 207 Title : write_qual | |
| 208 Usage : $stream->write_qual(@seq) | |
| 209 Function: writes the $seq object into the stream | |
| 210 Returns : 1 for success and 0 for error | |
| 211 Args : Bio::Seq::SeqWithQuality object | |
| 212 | |
| 213 | |
| 214 =cut | |
| 215 | |
| 216 sub write_qual { | |
| 217 my ($self,@seq) = @_; | |
| 218 foreach my $seq (@seq) { | |
| 219 unless ($seq->isa("Bio::Seq::SeqWithQuality")){ | |
| 220 warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); | |
| 221 next; | |
| 222 } | |
| 223 my @qual = @{$seq->qual}; | |
| 224 my $top = $seq->display_id(); | |
| 225 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
| 226 $desc =~ s/\n//g; | |
| 227 $top .= " $desc"; | |
| 228 } | |
| 229 my $qual = "" ; | |
| 230 if(scalar(@qual) > 0) { | |
| 231 my $max = 60; | |
| 232 for (my $q = 0;$q<scalar(@qual);$q++){ | |
| 233 $qual .= $qual[$q] . " "; | |
| 234 if(length($qual) > $max){ | |
| 235 $qual .= "\n"; | |
| 236 $max += 60; | |
| 237 } | |
| 238 } | |
| 239 } else { | |
| 240 $qual = "\n"; | |
| 241 } | |
| 242 | |
| 243 $self->_print (">",$top,"\n",$qual,"\n") or return; | |
| 244 } | |
| 245 return 1; | |
| 246 } | |
| 247 | |
| 248 =head2 write_fastq | |
| 249 | |
| 250 Title : write_fastq | |
| 251 Usage : $stream->write_fastq(@seq) | |
| 252 Function: writes the $seq object into the stream | |
| 253 Returns : 1 for success and 0 for error | |
| 254 Args : Bio::Seq::SeqWithQuality object | |
| 255 | |
| 256 | |
| 257 =cut | |
| 258 | |
| 259 sub write_fastq { | |
| 260 my ($self,@seq) = @_; | |
| 261 foreach my $seq (@seq) { | |
| 262 unless ($seq->isa("Bio::Seq::SeqWithQuality")){ | |
| 263 warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); | |
| 264 next; | |
| 265 } | |
| 266 my $str = $seq->seq; | |
| 267 my @qual = @{$seq->qual}; | |
| 268 my $top = $seq->display_id(); | |
| 269 if ($seq->can('desc') and my $desc = $seq->desc()) { | |
| 270 $desc =~ s/\n//g; | |
| 271 $top .= " $desc"; | |
| 272 } | |
| 273 if(length($str) == 0) { | |
| 274 $str = "\n"; | |
| 275 } | |
| 276 my $qual = "" ; | |
| 277 if(scalar(@qual) > 0) { | |
| 278 for (my $q = 0;$q<scalar(@qual);$q++){ | |
| 279 $qual .= chr($qual[$q] + 33); | |
| 280 } | |
| 281 } else { | |
| 282 $qual = "\n"; | |
| 283 } | |
| 284 | |
| 285 $self->_print ("\@",$top,"\n",$str,"\n") or return; | |
| 286 $self->_print ("+",$top,"\n",$qual,"\n") or return; | |
| 287 } | |
| 288 return 1; | |
| 289 } | |
| 290 1; |
