Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqIO/fastq.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/fastq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,290 @@ +# BioPerl module for Bio::SeqIO::fastq +# +# Cared for by Tony Cox <avc@sanger.ac.uk> +# +# Copyright Tony Cox +# +# You may distribute this module under the same terms as perl itself +# _history +# October 29, 2001 incept data + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::fastq - fastq sequence input/output stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq and Bio::Seq::SeqWithQuality +objects to and from fastq flat file databases. + +Fastq is a file format used frequently at the Sanger Centre to bundle +a fasta sequence and its quality data. A typical fastaq entry takes +the from: + + @HCDPQ1D0501 + GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT..... + +HCDPQ1D0501 + !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65..... + +Fastq files have sequence and quality data on a single line and the +quality values are single-byte encoded. To retrieve the decimal values +for qualities you need to subtract 33 (or Octal 41) from each byte and +then convert to a '2 digit + 1 space' integer. You can check if 33 is +the right number because the first byte which is always '!' +corresponds to a quality value of 0. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Tony Cox + +Email: avc@sanger.ac.uk + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::fastq; +use vars qw(@ISA); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::SeqWithQuality')); + } +} + + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq::SeqWithQuality object + Args : NONE + +=cut + +sub next_seq { + + my( $self ) = @_; + my $seq; + my $alphabet; + local $/ = "\n\@"; + + return unless my $entry = $self->_readline; + + if ($entry eq '@') { # very first one + return unless $entry = $self->_readline; + } + my ($top,$sequence,$top2,$qualsequence) = $entry =~ /^ + \@?(.+?)\n + ([^\@]*?)\n + \+?(.+?)\n + (.*)\n + /xs + or $self->throw("Can't parse fastq entry"); + my ($id,$fulldesc) = $top =~ /^\s*(\S+)\s*(.*)/ + or $self->throw("Can't parse fastq header"); + if ($id eq '') {$id=$fulldesc;} # FIX incase no space between \@ and name + $sequence =~ s/\s//g; # Remove whitespace + $qualsequence =~ s/\s//g; + + if(length($sequence) != length($qualsequence)){ + $self->warn("Fastq sequence/quality data length mismatch error\n"); + $self->warn("Sequence: $top, seq length: ",length($sequence), " Qual length: ", length($qualsequence), " \n"); + $self->warn("$sequence\n"); + $self->warn("$qualsequence\n"); + $self->warn("FROM ENTRY: \n\n$entry\n"); + } + + my @qual = split('', $qualsequence); + + my $qual; + foreach (@qual) {$qual .= (unpack("C",$_) - 33) ." "}; + + + # for empty sequences we need to know the mol.type + $alphabet = $self->alphabet(); + if(length($sequence) == 0) { + if(! defined($alphabet)) { + # let's default to dna + $alphabet = "dna"; + } + } else { + # we don't need it really, so disable + $alphabet = undef; + } + + # create the SeqWithQuality object + $seq = $self->sequence_factory->create( + -qual => $qual, + -seq => $sequence, + -id => $id, + -primary_id => $id, + -desc => $fulldesc, + -alphabet => $alphabet + ); + + # if there wasn't one before, set the guessed type + $self->alphabet($seq->alphabet()); + + return $seq; +} + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq::SeqWithQuality or Bio::seq object + + +=cut + +sub write_seq { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + my $str = $seq->seq; + my $top = $seq->display_id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + if(length($str) > 0) { + $str =~ s/(.{1,60})/$1\n/g; + } else { + $str = "\n"; + } + + $self->_print (">",$top,"\n",$str) or return; + } + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + +=head2 write_qual + + Title : write_qual + Usage : $stream->write_qual(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq::SeqWithQuality object + + +=cut + +sub write_qual { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + unless ($seq->isa("Bio::Seq::SeqWithQuality")){ + warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); + next; + } + my @qual = @{$seq->qual}; + my $top = $seq->display_id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + my $qual = "" ; + if(scalar(@qual) > 0) { + my $max = 60; + for (my $q = 0;$q<scalar(@qual);$q++){ + $qual .= $qual[$q] . " "; + if(length($qual) > $max){ + $qual .= "\n"; + $max += 60; + } + } + } else { + $qual = "\n"; + } + + $self->_print (">",$top,"\n",$qual,"\n") or return; + } + return 1; +} + +=head2 write_fastq + + Title : write_fastq + Usage : $stream->write_fastq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq::SeqWithQuality object + + +=cut + +sub write_fastq { + my ($self,@seq) = @_; + foreach my $seq (@seq) { + unless ($seq->isa("Bio::Seq::SeqWithQuality")){ + warn("You can write FASTQ without supplying a Bio::Seq::SeqWithQuality object! ", ref($seq), "\n"); + next; + } + my $str = $seq->seq; + my @qual = @{$seq->qual}; + my $top = $seq->display_id(); + if ($seq->can('desc') and my $desc = $seq->desc()) { + $desc =~ s/\n//g; + $top .= " $desc"; + } + if(length($str) == 0) { + $str = "\n"; + } + my $qual = "" ; + if(scalar(@qual) > 0) { + for (my $q = 0;$q<scalar(@qual);$q++){ + $qual .= chr($qual[$q] + 33); + } + } else { + $qual = "\n"; + } + + $self->_print ("\@",$top,"\n",$str,"\n") or return; + $self->_print ("+",$top,"\n",$qual,"\n") or return; + } + return 1; +} +1;