Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Assembly/ContigAnalysis.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/ContigAnalysis.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,500 @@ +# $Id: ContigAnalysis.pm,v 1.2 2002/12/01 00:03:28 jason Exp $ +# +# BioPerl module for Bio::Assembly::ContigAnalysis +# +# Cared for by Robson francisco de Souza <rfsouza@citri.iq.usp.br> +# +# Copyright Robson Francisco de Souza +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::ContigAnalysis - + Perform analysis on sequence assembly contigs. + +=head1 SYNOPSIS + + # Module loading + use Bio::Assembly::ContigAnalysis; + + # Assembly loading methods + my $ca = new Bio::Assembly::ContigAnalysis( -contig=>$contigOBJ ); + + my @lcq = $ca->low_consensus_quality; + my @hqd = $ca->high_quality_discrepancies; + my @ss = $ca->single_strand_regions; + +=head1 DESCRIPTION + +A contig is as a set of sequences, locally aligned to each other, when +the sequences in a pair may be aligned. It may also include a +consensus sequence. Bio::Assembly::ContigAnalysis is a module +holding a collection of methods to analyze contig objects. It was +developed around the Bio::Assembly::Contig implementation of contigs and +can not work with another contig interface. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Robson Francisco de Souza + +Email: rfsouza@citri.iq.usp.br + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::ContigAnalysis; + +use Bio::Root::Root; +use strict; +use vars qw(@ISA); + +@ISA = qw(Bio::Root::Root); + +=head1 Object creator + +=head2 new + + Title : new + Usage : my $contig = Bio::Assembly::ContigAnalysis->new(-contig=>$contigOBJ); + Function : Creates a new contig analysis object + Returns : Bio::Assembly::ContigAnalysis + Args : + -contig : a Bio::Assembly::Contig object + +=cut + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my ($contigOBJ) = $self->_rearrange([qw(CONTIG)],@args); + unless ($contigOBJ->isa("Bio::Assembly::Contig")) { + $self->throw("ContigAnal works only on Bio::Assembly::Contig objects\n"); + } + + $self->{'_objref'} = $contigOBJ; + return $self; +} + +=head1 Analysis methods + +=head2 high_quality_discrepancies + + Title : high_quality_discrepancies + Usage : my $sfc = $ContigAnal->high_quality_discrepancies(); + Function : + + Locates all high quality discrepancies among aligned + sequences and the consensus sequence. + + Note: see Bio::Assembly::Contig POD documentation, + section "Coordinate System", for a definition of + available types. Default coordinate system type is + "gapped consensus", i.e. consensus sequence (with gaps) + coordinates. If limits are not specified, the entire + alignment is analyzed. + + Returns : Bio::SeqFeature::Collection + Args : optional arguments are + -threshold : cutoff value for low quality (minimum high quality) + Default: 40 + -ignore : number of bases that will not be analysed at + both ends of contig aligned elements + Default: 5 + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +sub high_quality_discrepancies { + my ($self,@args) = shift; # Package reference + + my ($threshold,$ignore,$start,$end,$type) = + $self->_rearrange([qw(THRESHOLD IGNORE START END TYPE)],@args); + + # Defining default threhold and HQD_ignore + $threshold = 40 unless (defined($threshold)); + $ignore = 5 unless (defined($ignore)); + $type = 'gapped consensus' unless (defined($type)); + + # Changing input coordinates system (if needed) + if (defined $start && $type ne 'gapped consensus') { + $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start); + } elsif (!defined $start) { + $start = 1; + } + if (defined $end && $type ne 'gapped consensus') { + $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end); + } elsif (!defined $end) { + $end = $self->{'_objref'}->get_consensus_length(); + } + + # Scanning each read sequence and the contig sequence and + # adding discrepancies to Bio::SeqFeature::Collection + my @seqIDs = $self->{'_objref'}->get_seq_ids(-start=>$start, + -end=>$end, + -type=>$type); + my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq; + + my @HQD = (); + foreach my $seqID (@seqIDs) { + # Setting aligned read sub-sequence limits and loading data + my $seq = $self->{'_objref'}->get_seq_by_name($seqID); + my $qual = $self->{'_objref'}->get_qual_by_name($seqID); + unless (defined $qual) { + $self->warn("Can't correctly evaluate HQD without aligned sequence qualities for $seqID"); + next; + } + my $sequence = $seq->seq; + my @quality = @{ $qual->qual }; + + # Scanning the aligned region of each read + my $seq_ix = 0; + my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID"); + my ($astart,$aend) = ($coord->start,$coord->end); + $astart = $astart + $ignore; # Redefining limits to evaluate HQDs (jump $ignore at start) + $aend = $aend - $ignore; # Redefining limits to evaluate HQDs (stop $ignore before end) + + my ($d_start,$d_end,$i); + for ($i=$astart-1; $i<=$aend-1; $i++) { + # Changing coordinate $i+1 from 'gapped consensus' mode to "aligned $seqID" (coordinate $seq_ix) + $seq_ix = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$i+1); + next unless (($i >= $start) && ($i <= $end)); + + my $r_base = uc(substr($sequence,$seq_ix-1,1)); + my $c_base = uc(substr($consensus,$i,1)); + + # Discrepant region start: store $d_start and $type + (!defined($d_start) && + ($r_base ne $c_base) && + ($quality[$seq_ix-1] >= $threshold)) && do { + $d_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i+1); + #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n"; + next; + }; + + # Quality change or end of discrepant region: store limits and undef $d_start + if (defined($d_start) && + (($quality[$seq_ix-1] < $threshold) || + (uc($r_base) eq uc($c_base)))) { + $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + #print $seqID," ",$r_base," ",$i+1," ",$c_base," ",$contig_ix-1," ",$quality[$i]," $type\n"; + push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID", + -start=>$d_start, + -end=>$d_end, + -strand=>$seq->strand()) ); + $d_start = undef; + next; + } + } # for ($i=$astart-1; $i<=$aend-1; $i++) + + # Loading discrepancies located at sub-sequence end, if any. + if (defined($d_start)) { + $d_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + push(@HQD, Bio::SeqFeature::Generic->new(-primary=>"high_quality_discrepancy:$seqID", + -start=>$d_start, + -end=>$d_end, + -strand=>$seq->strand()) ); + } + } # foreach my $seqID (@seqIDs) + + return @HQD; +} + +=head2 low_consensus_quality + + Title : low_consensus_quality + Usage : my $sfc = $ContigAnal->low_consensus_quality(); + Function : Locates all low quality regions in the consensus + Returns : an array of Bio::SeqFeature::Generic objects + Args : optional arguments are + -threshold : cutoff value for low quality (minimum high quality) + Default: 25 + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +sub low_consensus_quality { + my ($self,@args) = shift; # Packege reference + + my ($threshold,$start,$end,$type) = + $self->_rearrange([qw(THRESHOLD START END TYPE)],@args); + + # Setting default value for threshold + $threshold = 25 unless (defined($threshold)); + + # Loading qualities + my @quality = @{ $self->{'_objref'}->get_consensus_quality()->qual }; + + # Changing coordinates to gap mode noaln (consed: consensus without alignments) + $start = 1 unless (defined($start)); + if (defined $start && defined $type && ($type ne 'gapped consensus')) { + $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start); + $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); + } + $end = $self->{'_objref'}->get_consensus_length unless (defined $end); + + # Scanning @quality vector and storing intervals limits with base qualities less then + # the threshold value + my ($lcq_start); + my ($i,@LCQ); + for ($i=$start-1; $i<=$end-1; $i++) { +# print $quality[$i],"\t",$i,"\n"; + if (!defined($lcq_start) && (($quality[$i] <= $threshold) || ($quality[$i] == 98))) { + $lcq_start = $i+1; + } elsif (defined($lcq_start) && ($quality[$i] > $threshold)) { + $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start); + my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start, + -end=>$lcq_end, + -primary=>'low_consensus_quality') ); + $lcq_start = undef; + } + } + + if (defined $lcq_start) { + $lcq_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$lcq_start); + my $lcq_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + push(@LCQ, Bio::SeqFeature::Generic->new(-start=>$lcq_start, + -end=>$lcq_end, + -primary=>'low_consensus_quality') ); + } + + return @LCQ; +} + +=head2 not_confirmed_on_both_strands + + Title : low_quality_consensus + Usage : my $sfc = $ContigAnal->low_quality_consensus(); + Function : + + Locates all regions whose consensus bases were not + confirmed by bases from sequences aligned in both + orientations, i.e., in such regions, no bases in aligned + sequences of either +1 or -1 strand agree with the + consensus bases. + + Returns : an array of Bio::SeqFeature::Generic objects + Args : optional arguments are + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +sub not_confirmed_on_both_strands { + my ($self,@args) = shift; # Package reference + + my ($start,$end,$type) = + $self->_rearrange([qw(START END TYPE)],@args); + + # Changing coordinates to default system 'align' (contig sequence with alignments) + $start = 1 unless (defined($start)); + if (defined($type) && ($type ne 'gapped consensus')) { + $start = $self->{'_objref'}->change_coord($type,'gapped consensus',$start); + $end = $self->{'_objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); + } + $end = $self->{'_objref'}->get_consensus_length unless (defined($end)); + + # Scanning alignment + my %confirmed = (); # If ($confirmed{$orientation}[$i] > 0) then $i is confirmed in $orientation strand + my ($i); + my $consensus = $self->{'_objref'}->get_consensus_sequence()->seq; + foreach my $seqID ($self->{'_objref'}->get_seq_ids) { + # Setting aligned read sub-sequence limits and loading data + my $seq = $self->{'_objref'}->get_seq_by_name($seqID); + my $sequence = $seq->seq; + + # Scanning the aligned regions of each read and registering confirmed sites + my $contig_ix = 0; + my $coord = $self->{'_objref'}->get_seq_feat_by_tag($seq,"_align_clipping:$seqID"); + my ($astart,$aend,$orientation) = ($coord->start,$coord->end,$coord->strand); + $astart = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$astart); + $aend = $self->{'_objref'}->change_coord('gapped consensus',"aligned $seqID",$aend); + for ($i=$astart-1; $i<=$aend-1; $i++) { + # $i+1 in 'align' mode is $contig_ix + $contig_ix = $self->{'_objref'}->change_coord("aligned $seqID",'gapped consensus',$i+1); + next unless (($contig_ix >= $start) && ($contig_ix <= $end)); + my $r_base = uc(substr($sequence,$i,1)); + my $c_base = uc(substr($consensus,$contig_ix-1,1)); + if ($c_base eq '-') { + $confirmed{$orientation}[$contig_ix] = -1; + } elsif (uc($r_base) eq uc($c_base)) { # Non discrepant region found + $confirmed{$orientation}[$contig_ix]++; + } + } # for ($i=$astart-1; $i<=$aend-1; $i++) + } # foreach $seqID (@reads) + + # Locating non-confirmed aligned regions for each orientation in $confirmed registry + my ($orientation); + my @NCBS = (); + foreach $orientation (keys %confirmed) { + my ($ncbs_start,$ncbs_end); + + for ($i=$start; $i<=$end; $i++) { + if (!defined($ncbs_start) && + (!defined($confirmed{$orientation}[$i]) || ($confirmed{$orientation}[$i] == 0))) { + $ncbs_start = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i); + } elsif (defined($ncbs_start) && + defined($confirmed{$orientation}[$i]) && + ($confirmed{$orientation}[$i] > 0)) { + $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$i-1); + push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start, + -end=>$ncbs_end, + -strand=>$orientation, + -primary=>"not_confirmed_on_both_strands") ); + $ncbs_start = undef; + } + } + + if (defined($ncbs_start)) { # NCBS at the end of contig + $ncbs_end = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$end); + push(@NCBS, Bio::SeqFeature::Generic->new(-start=>$ncbs_start, + -end=>$ncbs_end, + -strand=>$orientation, + -primary=>'not_confirmed_on_both_strands') ); + } + } + + return @NCBS; +} + +=head2 single_strand + + Title : single_strand + Usage : my $sfc = $ContigAnal->single_strand(); + Function : + + Locates all regions covered by aligned sequences only in + one of the two strands, i.e., regions for which aligned + sequence's strand() method returns +1 or -1 for all + sequences. + + Returns : an array of Bio::SeqFeature::Generic objects + Args : optional arguments are + -start : start of interval that will be analyzed + -end : start of interval that will be analyzed + -type : coordinate system type for interval + +=cut + +#' +sub single_strand { + my ($self,@args) = shift; # Package reference + + my ($start,$end,$type) = + $self->_rearrange([qw(START END TYPE)],@args); + + # Changing coordinates to gap mode align (consed: consensus sequence with alignments) + $type = 'gapped consensus' unless(defined($type)); + $start = 1 unless (defined($start)); + if (defined($type) && $type ne 'gapped consensus') { + $start = $self->{'objref'}->change_coord($type,'gapped consensus',$start); + $end = $self->{'objref'}->change_coord($type,'gapped consensus',$end) if (defined($end)); + } + ($end) = $self->{'_objref'}->get_consensus_length unless (defined($end)); + + # Loading complete list of coordinates for aligned sequences + my $sfc = $self->{'_objref'}->get_features_collection(); + my @forward = grep { $_->primary_tag =~ /^_aligned_coord:/ } + $sfc->features_in_range(-start=>$start, + -end=>$end, + -contain=>0, + -strand=>1, + -strandmatch=>'strong'); + my @reverse = grep { $_->primary_tag =~ /^_aligned_coord:/ } + $sfc->features_in_range(-start=>$start, + -end=>$end, + -contain=>0, + -strand=>-1, + -strandmatch=>'strong'); + # Merging overlapping features + @forward = $self->_merge_overlapping_features(@forward); + @reverse = $self->_merge_overlapping_features(@reverse); + + # Finding single stranded regions + my ($length) = $self->{'_objref'}->get_consensus_length; + $length = $self->{'_objref'}->change_coord('gapped consensus','ungapped consensus',$length); + @forward = $self->_complementary_features_list(1,$length,@forward); + @reverse = $self->_complementary_features_list(1,$length,@reverse); + + my @SS = (); + foreach my $feat (@forward, @reverse) { + $feat->primary_tag('single_strand_region'); + push(@SS,$feat); + } + + return @SS; +} + +=head1 Internal Methods + +=head2 _merge_overlapping_features + + Title : _merge_overlapping_features + Usage : my @feat = $ContigAnal->_merge_overlapping_features(@features); + Function : Merge all overlapping features into features + that hold original features as sub-features + Returns : array of Bio::SeqFeature::Generic objects + Args : array of Bio::SeqFeature::Generic objects + +=cut + +sub _merge_overlapping_features { + my ($self,@feat) = @_; + + $self->throw_not_implemented(); +} + +=head2 _complementary_features_list + + Title : _complementary_features_list + Usage : @feat = $ContigAnal->_complementary_features_list($start,$end,@features); + Function : Build a list of features for regions + not covered by features in @features array + Returns : array of Bio::SeqFeature::Generic objects + Args : + $start : [integer] start of first output feature + $end : [integer] end of last output feature + @features : array of Bio::SeqFeature::Generic objects + +=cut + +sub _complementary_features_list { + my ($self,$start,$end,@feat) = @_; + + $self->throw_not_implemented(); +} + +1; + +__END__