Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/Assembly/IO/ace.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Assembly/IO/ace.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,383 @@
+# $Id: ace.pm,v 1.1 2002/11/04 14:38:14 heikki Exp $
+#
+## BioPerl module for Bio::Assembly::IO::ace
+#
+# Copyright by Robson F. de Souza
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Assembly::IO::ace -  module to load phrap ACE files.
+
+=head1 SYNOPSYS
+
+    # Building an input stream
+    use Bio::Assembly::IO;
+
+    # Assembly loading methods
+    $io = new Bio::Assembly::IO(-file=>"SGC0-424.fasta.screen.ace.1",
+                         -format=>"ace");
+
+    $assembly = $io->next_assembly;
+
+=head1 DESCRIPTION
+
+This package loads the ACE files from the (phred/phrap/consed) package
+by Phill Green.  It was written to be used as a driver module for
+Bio::Assembly::IO input/output.
+
+=head2 Implemention
+
+Assemblies are loaded into Bio::Assembly::Scaffold objects composed by
+Bio::Assembly::Contig objects. In addition to default
+"_aligned_coord:$seqID" feature class from Bio::Assembly::Contig, contig
+objects loaded by this module will have the following special feature
+classes in their feature collection:
+
+"_align_clipping:$seqID" : location of subsequence in sequence $seqID
+                           which is aligned to the contig
+
+"_quality_clipping:$seqID" : location of good quality subsequence for
+                             sequence $seqID
+
+"consensus tags", as they are called in Consed's documentation, is
+equivalent to a bioperl sequence feature and, therefore, are added to
+the feature collection using their type field (see Consed's README.txt
+file) as primary tag.
+
+"read tags" are also sequence features and are stored as
+sub_SeqFeatures of the sequence's coordinate feature (the
+corresponding "_aligned_coord:$seqID" feature, easily accessed through
+get_seq_coord() method).
+
+"whole assembly tags" have no start and end, as they are not
+associated to any particular sequence in the assembly, and are added
+to the assembly's annotation collection using phrap as tag.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to the
+Bioperl mailing lists  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                 - General discussion
+  http://bio.perl.org/MailList.html     - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Robson Francisco de Souza
+
+Email rfsouza@citri.iq.usp.br
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+package Bio::Assembly::IO::ace;
+
+use strict;
+use vars qw(@ISA);
+
+use Bio::Assembly::IO;
+use Bio::Assembly::Scaffold;
+use Bio::Assembly::Contig;
+use Bio::LocatableSeq;
+use Bio::Annotation::SimpleValue;
+use Bio::Seq::PrimaryQual;
+use Bio::SeqFeature::Generic;
+
+@ISA = qw(Bio::Assembly::IO);
+
+=head1 Parser methods
+
+=head2 next_assembly
+
+ Title   : next_assembly
+ Usage   : $unigene = $stream->next_assembly()
+ Function: returns the next assembly in the stream
+ Returns : Bio::Assembly::Scaffold object
+ Args    : NONE
+
+=cut
+
+sub next_assembly {
+    my $self = shift; # Object reference
+    local $/="\n";
+
+    # Resetting assembly data structure
+    my $assembly = Bio::Assembly::Scaffold->new(-source=>'phrap');
+
+    # Looping over all ACE file lines
+    my ($contigOBJ,$read_name);
+    my $read_data = {}; # Temporary holder for read data
+    while ($_ = $self->_readline) { # By now, ACE files hold a single assembly
+	chomp;
+
+	# Loading assembly information (ASsembly field)
+#	(/^AS (\d+) (\d+)/) && do {
+#	    $assembly->_set_nof_contigs($1);
+#	    $assembly->_set_nof_sequences_in_contigs($2);
+#	};
+
+	# Loading contig sequence (COntig sequence field)
+	(/^CO Contig(\d+) (\d+) (\d+) (\d+) (\w+)/) && do { # New contig found!
+	    my $contigID = $1;
+	    $contigOBJ = Bio::Assembly::Contig->new(-source=>'phrap', -id=>$contigID);
+#	    $contigOBJ->set_nof_bases($2); # Contig length in base pairs
+#	    $contigOBJ->set_nof_reads($3); # Number of reads in this contig
+#	    $contigOBJ->set_nof_segments($4); # Number of read segments selected for consensus assembly
+	    my $ori = ($5 eq 'U' ? 1 : -1); # 'C' if contig was complemented (using consed) or U if not (default)
+	    $contigOBJ->strand($ori);
+	    my $consensus_sequence = undef;
+	    while ($_ = $self->_readline) { # Looping over contig lines
+		chomp;                   # Drop <ENTER> (\n) on current line
+		last if (/^$/);          # Stop if empty line (contig end) is found
+		s/\*/-/g; # Forcing '-' as gap symbol
+		$consensus_sequence .= $_;
+	    }
+
+	    my $consensus_length = length($consensus_sequence);
+	    $consensus_sequence = Bio::LocatableSeq->new(-seq=>$consensus_sequence,
+							      -start=>1,
+							      -end=>$consensus_length,
+							      -id=>$contigID);
+	    $contigOBJ->set_consensus_sequence($consensus_sequence);
+	    $assembly->add_contig($contigOBJ);
+	};
+
+	# Loading contig qualities... (Base Quality field)
+	/^BQ/ && do {
+	    my $consensus = $contigOBJ->get_consensus_sequence()->seq();
+	    my ($i,$j,@tmp);
+	    my @quality = ();
+	    $j = 0;
+	    while ($_ = $self->_readline) {
+		chomp;
+		last if (/^$/);
+		@tmp = grep { /^\d+$/ } split(/\s+/);
+		$i = 0;
+		my $previous = 0;
+		my $next     = 0;
+		while ($i<=$#tmp) {
+		    # IF base is a gap, quality is the average for neighbouring sites
+		    if (substr($consensus,$j,1) eq '-') {
+			$previous = $tmp[$i-1] unless ($i == 0);
+			if ($i < $#tmp) {
+			    $next = $tmp[$i+1];
+			} else {
+			    $next = 0;
+			}
+			push(@quality,int(($previous+$next)/2));
+		    } else {
+			push(@quality,$tmp[$i]);
+			$i++;
+		    }
+		    $j++;
+		}
+	    }
+
+	    my $qual = Bio::Seq::PrimaryQual->new(-qual=>join(" ",@quality),
+						  -id=>$contigOBJ->id());
+	    $contigOBJ->set_consensus_quality($qual);
+	};
+
+	# Loading read info... (Assembled From field)
+	/^AF (\S+) (C|U) (-*\d+)/ && do {
+	    $read_name = $1; my $ori = $2;
+	    $read_data->{$read_name}{'padded_start'} = $3; # aligned start
+	    $ori = ( $ori eq 'U' ? 1 : -1);
+	    $read_data->{$read_name}{'strand'}  = $ori;
+	};
+
+	# Loading base segments definitions (Base Segment field)
+#	/^BS (\d+) (\d+) (\S+)/ && do {
+#	    if (exists($self->{'contigs'}[$contig]{'reads'}{$3}{'segments'})) {
+#		$self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} .= " " . $1 . " " . $2;
+#	    } else { $self->{'contigs'}[$contig]{'reads'}{$3}{'segments'} = $1 . " " . $2 }
+#	};
+
+	# Loading reads... (ReaD sequence field
+	/^RD (\S+) (-*\d+) (\d+) (\d+)/ && do {
+	    $read_name = $1;
+	    $read_data->{$read_name}{'length'} = $2; # number_of_padded_bases
+	    $read_data->{$read_name}{'contig'} = $contigOBJ;
+#	    $read_data->{$read_name}{'number_of_read_info_items'} = $3;
+#	    $read_data->{$read_name}{'number_of_tags'}            = $4;
+	    my $read_sequence;
+	    while ($_ = $self->_readline) {
+		chomp;
+		last if (/^$/);
+		s/\*/-/g; # Forcing '-' as gap symbol
+		$read_sequence .= $_; # aligned read sequence
+	    }
+	    my $read = Bio::LocatableSeq->new(-seq=>$read_sequence,
+					      -start=>1,
+					      -end=>$read_data->{$read_name}{'length'},
+					      -strand=>$read_data->{$read_name}{'strand'},
+					      -id=>$read_name,
+					      -primary_id=>$read_name,
+					      -alphabet=>'dna');
+
+	    # Adding read location and sequence to contig ("gapped consensus" coordinates)
+	    my $padded_start = $read_data->{$read_name}{'padded_start'};
+	    my $padded_end   = $padded_start + $read_data->{$read_name}{'length'} - 1;
+	    my $coord = Bio::SeqFeature::Generic->new(-start=>$padded_start,
+						      -end=>$padded_end,
+						      -strand=>$read_data->{$read_name}{'strand'},
+						      -tag => { 'contig' => $contigOBJ->id }
+						      );
+	    $contigOBJ->set_seq_coord($coord,$read);
+	};
+
+	# Loading read trimming and alignment ranges...
+	/^QA (-?\d+) (-?\d+) (-?\d+) (-?\d+)/ && do {
+	    my $qual_start  = $1; my $qual_end  = $2;
+	    my $align_start = $3; my $align_end = $4;
+
+	    unless ($align_start == -1 && $align_end == -1) {
+		$align_start = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_start);
+		$align_end   = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$align_end);
+		my $align_feat = Bio::SeqFeature::Generic->new(-start=>$align_start,
+							       -end=>$align_end,
+							       -strand=>$read_data->{$read_name}{'strand'},
+							       -primary=>"_align_clipping:$read_name");
+		$align_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
+		$contigOBJ->add_features([ $align_feat ], 0);
+	    }
+
+	    unless ($qual_start == -1 && $qual_end == -1) {
+		$qual_start  = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_start);
+		$qual_end    = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$qual_end);
+		my $qual_feat = Bio::SeqFeature::Generic->new(-start=>$qual_start,
+							      -end=>$qual_end,
+							      -strand=>$read_data->{$read_name}{'strand'},
+							      -primary=>"_quality_clipping:$read_name");
+		$qual_feat->attach_seq( $contigOBJ->get_seq_by_name($read_name) );
+		$contigOBJ->add_features([ $qual_feat ], 0);
+	    }
+	};
+
+	# Loading read description (DeScription fields)
+#	/^DS / && do {
+#	    /CHEM: (\S+)/ && do {
+#		$self->{'contigs'}[$contig]{'reads'}{$read_name}{'chemistry'} = $1;
+#	    };
+#	    /CHROMAT_FILE: (\S+)/ && do {
+#		$self->{'contigs'}[$contig]{'reads'}{$read_name}{'chromat_file'} = $1;
+#	    };
+#	    /DIRECTION: (\w+)/ && do {
+#		my $ori = $1;
+#		if    ($ori eq 'rev') { $ori = 'C' }
+#		elsif ($ori eq 'fwd') { $ori = 'U' }
+#		$self->{'contigs'}[$contig]{'reads'}{$read_name}{'strand'} = $ori;
+#	    };
+#	    /DYE: (\S+)/ && do {
+#		$self->{'contigs'}[$contig]{'reads'}{$read_name}{'dye'} = $1;
+#	    };
+#	    /PHD_FILE: (\S+)/ && do {
+#		$self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_file'} = $1;
+#	    };
+#	    /TEMPLATE: (\S+)/ && do {
+#		$self->{'contigs'}[$contig]{'reads'}{$read_name}{'template'} = $1;
+#	    };
+#	    /TIME: (\S+ \S+ \d+ \d+\:\d+\:\d+ \d+)/ && do {
+#		$self->{'contigs'}[$contig]{'reads'}{$read_name}{'phd_time'} = $1;
+#	    };
+#	};
+
+	# Loading contig tags ('tags' in phrap terminology, but Bioperl calls them features)
+	/^CT\s*\{/ && do {
+	    my ($contigID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
+	    $contigID =~ s/^Contig//i;
+	    my $extra_info = undef;
+	    while ($_ = $self->_readline) {
+		last if (/\}/);
+		$extra_info .= $_;
+	    }
+	    my $contig_tag = Bio::SeqFeature::Generic->new(-start=>$start,
+							   -end=>$end,
+							   -primary=>$type,
+							   -tag=>{ 'source' => $source,
+								   'creation_date' => $date,
+								   'extra_info' => $extra_info
+							       });
+	    $assembly->get_contig_by_id($contigID)->add_features([ $contig_tag ],1);
+	};
+
+	# Loading read tags
+	/^RT\s*\{/ && do {
+	    my ($readID,$type,$source,$start,$end,$date) = split(' ',$self->_readline);
+	    my $extra_info = undef;
+	    while ($_ = $self->_readline) {
+		last if (/\}/);
+		$extra_info .= $_;
+	    }
+	    $start  = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$start);
+	    $end    = $contigOBJ->change_coord("aligned $read_name",'gapped consensus',$end);
+	    my $read_tag = Bio::SeqFeature::Generic->new(-start=>$start,
+							 -end=>$end,
+							 -primary=>$type,
+							 -tag=>{ 'source' => $source,
+								 'creation_date' => $date,
+								 'extra_info' => $extra_info
+								 });
+	    my $contig = $read_data->{$readID}{'contig'};
+	    my $coord  = $contig->get_seq_coord( $contig->get_seq_by_name($readID) );
+	    $coord->add_sub_SeqFeature($read_tag);
+	};
+
+	# Loading read tags
+	/^WA\s*\{/ && do {
+	    my ($type,$source,$date) = split(' ',$self->_readline);
+	    my $extra_info = undef;
+	    while ($_ = $self->_readline) {
+		last if (/\}/);
+		$extra_info = $_;
+	    }
+#?	    push(@line,\@extra_info);
+	    my $assembly_tag = join(" ","TYPE: ",$type,"PROGRAM:",$source,
+				    "DATE:",$date,"DATA:",$extra_info);
+	    $assembly_tag = Bio::Annotation::SimpleValue->new(-value=>$assembly_tag);
+	    $assembly->annotation->add_Annotation('phrap',$assembly_tag);
+	};
+
+    } # while ($_ = $self->_readline)
+
+    $assembly->update_seq_list();
+    return $assembly;
+}
+
+=head2 write_assembly
+
+    Title   : write_assembly
+    Usage   : $ass_io->write_assembly($assembly)
+    Function: Write the assembly object in Phrap compatible ACE format
+    Returns : 1 on success, 0 for error
+    Args    : A Bio::Assembly::Scaffold object
+
+=cut
+
+sub write_assemebly {
+    my $self = shift;
+
+    $self->throw("Writing phrap ACE files is not implemented yet! Sorry...");
+}
+
+1;
+
+__END__
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children