Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Assembly/IO/phrap.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/IO/phrap.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,313 @@ +# $Id: phrap.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $ +# +# BioPerl driver for phrap.out file +# +# Copyright by Robson F. de Souza +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::IO::phrap - driver to load phrap.out files. + +=head1 SYNOPSYS + + # Building an input stream + use Bio::Assembly::IO; + + # Assembly loading methods + $io = new Bio::Assembly::IO(-file=>"SGC0-424.phrap.out", + -format=>"phrap"); + + $assembly = $io->next_assembly; + +=head1 DESCRIPTION + +This package was developed to load the phrap.out files from the +(phred/phrap/consed) package by Phill Green. This files contain just +the messages printed to standard out by phrap when building an +assembly. This output is redirected by phredPhrap perl-script to a +file in the project's directory and hold some bit of information +regarding assembly quality, connections between contigs and clone's +position inside contigs. It should be noted that such files have no +data about the sequence. neither for contig consensus nor for any +aligned sequence. Anyway, such information may be loaded from Fasta +files in the projects directory and added to the assembly object +later. + +Note that, because no sequence is loaded for the contig consensus and +locations for aligned sequences are only given in "ungapped consensus" +coordinates in a phrap.out file, you can't make coordinate changes in +assemblies loaded by pharp.pm, unless you add an aligned +coordinates for each sequence to each contig's features collection +yourself. See L<Bio::Assembly::Contig::Coordinate_Systems> and +L<Bio::Assembly::Contig::Feature_collection>.. + +This driver also loads singlets into the assembly contigs as Bio::Seq +objects, altough without their sequence strings. It also adds a +feature for the entire sequence, thus storing the singlet length in +its end position, and adds a tag '_nof_trimmed_nonX' to the feature, +which stores the number of non-vector bases in the singlet. + +=head2 Implementation + +Assemblies are loaded into Bio::Assembly::Scaffold objects composed by +Bio::Assembly::Contig objects. No features are added to Bio::Assembly::Contig +"_aligned_coord:$seqID" feature class, therefore you can't make +coordinate changes in contigs loaded by this module. Contig objects +created by this module will have the following special feature +classes, identified by their primary tags, in their features +collection: + +"_main_contig_feature:$ID" : main feature for contig $ID. This + feature is used to store information + about the entire consensus + sequence. This feature always start at + base 1 and its end position is the + consensus sequence length. A tag, + 'trimmed_length' holds the length of the + trimmed good quality region inside the + consensus sequence. + +"_covered_region:$index" : coordinates for valid clones inside the + contig. $index is the covered region + number, starting at 1 for the covered + region closest to the consensus sequence + first base. + +"_unalign_coord:$seqID" : location of a sequence in "ungapped + consensus" coordinates (consensus + sequence without gaps). Primary and + secondary scores, indel and + substitutions statistics are stored as + feature tags. + +"_internal_clones:$cloneID" : clones inside contigs $cloneID should be + used as the unique id for each + clone. These features have six tags: + '_1st_name', which is the id of the + upstream (5') aligned sequence + delimiting the clone; '_1st_strand', the + upstream sequence strand in the + alignment; '_2nd_name', downstream (3') + sequence id; '_2nd_strand', the + downstream sequence strand in the + alignment; '_length', unaligned clone + length; '_rejected', a boolean flag, + which is false if the clone is valid and + true if it was rejected. + +All coordinates for the features above are expressed as "ungapped +consensus" coordinates (See L<Bio::Assembly::Contig::Coordinate_Systems>.. + +=head2 Feature collection + +# + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + + +=head1 AUTHOR - Robson Francisco de Souza + +Email rfsouza@citri.iq.usp.br + +head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::IO::phrap; + +use strict; +use vars qw(@ISA); + +use Bio::Assembly::IO; +use Bio::Assembly::Scaffold; +use Bio::Assembly::Contig; +use Bio::LocatableSeq; +use Bio::Seq; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::Assembly::IO); + +=head2 next_assembly + + Title : next_assembly + Usage : $unigene = $stream->next_assembly() + Function: returns the next assembly in the stream + Returns : Bio::Assembly::Scaffold object + Args : NONE + +=cut + +sub next_assembly { + my $self = shift; # Package reference + + # Resetting assembly data structure + my $Assembly = Bio::Assembly::Scaffold->new(-source=>'phrap'); + + # Looping over all phrap out file lines + my ($contigOBJ); + while ($_ = $self->_readline) { + chomp; + + # Loading exact dupicated reads list +# /Exact duplicate reads:/ && do { +# my @exact_dupl; +# while (<FILE>) { +# last if (/^\s*$/); +# /(\S+)\s+(\S+)/ && do { +# push(@exact_dupl,[$1,$2]); +# }; +# $self->{'assembly'}{'exact_dupl_reads'} = +# new Data::Table(\@exact_dupl,['included','excluded'],0); +# } +# }; + + # Loading singlets reads data + /^(\d+) isolated singletons/ && do { + while ($_ = $self->_readline) { + chomp; + last if (/^$/); + if (/^\s+(\S+)\s+(\d+)\s+\((\d+)\)/) { + my $seqID = $1; my $length = $2; + my $nof_trimmed_nonX = $3; + my $seq = new Bio::Seq(-strand=>1, + -primary_id=>$seqID); + my $f = Bio::SeqFeature::Generic->new + (-start=>1, -end=>$seq->length(), + -primary=>$seq->primary_id(), + -tag=>{ '_nof_trimmed_nonX' => $nof_trimmed_nonX } + ); + $seq->add_SeqFeature($f); + $Assembly->add_singlet($seq); + } + } + }; + + # Loading contig information + /^Contig (\d+)\.\s+(\d+) reads?; (\d+) bp \(untrimmed\), (\d+) \(trimmed\)\./ && do { + my $nof_reads = $2; my $length = $3; my $trimmed_length = $4; + $contigOBJ = Bio::Assembly::Contig->new(-id=>$1, -source=>'phrap'); + my $feat = Bio::SeqFeature::Generic->new(-start=>1, + -end=>$length, + -primary=>"_main_contig_feature:".$contigOBJ->id(), + -tag=>{ '_trimmed_length' => $trimmed_length } + ); + $contigOBJ->add_features([ $feat ],1); + $Assembly->add_contig($contigOBJ); + }; + + # Loading read information + /^(C?)\s+(-?\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+\(\s*(\d+)\)\s+(\d+\.\d*)\s+(\d+\.\d*)\s+(\d+\.\d*)/ && do { + my $strand = ($1 eq 'C' ? -1 : 1); + my $readID = $4; my $start = $2; my $end = $3; + my $primary_score = $5; my $secondary_score = $6; + my $substitutions = $7; my $deletions = $8; my $insertions = $9; + my $seq = Bio::LocatableSeq->new(-start=>$start, + -end=>$end, + -strand=>$strand, + -id=>$readID, + -primary_id=>$readID, + -alphabet=>'dna'); + my $unalign_coord = Bio::SeqFeature::Generic->new(-start=>$start, + -end=>$end, + -primary=>"_unalign_coord:$readID", + -tag=>{'_primary_score'=>$primary_score, + '_secondary_score'=>$secondary_score, + '_substitutions'=>$substitutions, + '_insertions'=>,$insertions, + '_deletions'=>$deletions } + ); + $unalign_coord->attach_seq($seq); + $contigOBJ->add_seq($seq); $contigOBJ->add_features([ $unalign_coord ]); + }; + + # Loading INTERNAL clones description + /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ && do { + my $contigID = $1; + my $contig = $Assembly->get_contig_by_id($contigID); + while ($_ = $self->_readline) { + my (@data,$rejected,$c1_strand,$c2_strand); + + (@data = /\s+(\*?)\s+(C?)\s+(\S+)\s+(C?)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/) && do { + if ($data[0] eq '*') { $rejected = 1 } else { $rejected = 0 } + $c1_strand = ($data[1] eq 'C' ? -1 : 1); + $c2_strand = ($data[3] eq 'C' ? -1 : 1); + (my $clone_name = $data[2]) =~ s/^(\S+)\.\w.*/$1/; + my $clone = Bio::SeqFeature::Generic->new(-start=>$data[6], + -end=>$data[7], + -strand=>0, + -primary=>"_internal_clone:$clone_name", + -tag=>{'_1st_strand'=>,$c1_strand, + '_2nd_strand'=>,$c2_strand, + '_1st_name'=>$data[2], + '_2nd_name'=>$data[4], + '_length'=>$data[5], + '_rejected'=>$rejected + } + ); + $contig->add_features([ $clone ]); + }; + + /Covered regions:/ && do { + my %coord = /(\d+)/g; my $i = 0; + foreach my $start (sort { $a <=> $b } keys %coord) { + my $cov = Bio::SeqFeature::Generic->new(-start=>$start, + -end=>$coord{$start}, + -primary=>'_covered_region:'.++$i + ); + # 1: attach feature to contig consensus, if any + $contig->add_features([ $cov ],1); + } + last; # exit while loop + }; # /Covered regions:/ + + } # while ($_ = $self->_readline) + }; # /INTERNAL\s+Contig\s+(\d+)\s+opp\s+sense/ + + } # while ($_ = $self->_readline) + return $Assembly; +} + +=head2 write_assembly + + Title : write_assembly + Usage : $ass_io->write_assembly($assembly) + Function: Write the assembly object in Phrap compatible ACE format + Returns : 1 on success, 0 for error + Args : A Bio::Assembly::Scaffold object + +=cut + +sub write_assemebly { + my $self = shift; + + $self->throw("Writing phrap.out files is not implemented yet! Sorry..."); +} + +1; + +__END__