Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Compara/BaseGenomicAlignSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Compara/BaseGenomicAlignSet.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,492 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::Compara::BaseGenomicAlignSet - Base class for GenomicAlignBlock and GenomicAlignTree + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::EnsEMBL::Compara::BaseGenomicAlignSet; +use strict; + +# Object preamble +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Exception qw(throw warning info deprecate verbose); + + +=head2 slice + + Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice + Example : my $slice = $genomic_align_block->slice; + Example : $genomic_align_block->slice($slice); + Description: get/set for attribute slice. + Returntype : Bio::EnsEMBL::Slice object + Exceptions : + Caller : general + Status : Stable + +=cut + +sub slice { + my ($self, $reference_slice) = @_; + + if (defined($reference_slice)) { +# throw "[$reference_slice] is not a Bio::EnsEMBL::Slice" +# unless $reference_slice->isa("Bio::EnsEMBL::Slice"); + $self->{'reference_slice'} = $reference_slice; + } + + return $self->{'reference_slice'}; +} + +=head2 reference_slice + + Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice + Example : my $reference_slice = $genomic_align_block->reference_slice; + Example : $genomic_align_block->reference_slice($slice); + Description: Alias for slice method. TO BE DEPRECATED. + Returntype : Bio::EnsEMBL::Slice object + Exceptions : + Caller : general + Status : Stable + +=cut + +sub reference_slice { + my ($self, $reference_slice) = @_; + + return $self->slice($reference_slice); +} + +=head2 start + + Arg [1] : (optional) integer $start + Example : my $start = $genomic_align_block->start; + Example : $genomic_align_block->start(1035); + Description: get/set for attribute reference_slice_start. A value of 0 will set + the attribute to undefined. + Returntype : integer + Exceptions : none + Caller : general + +=cut + +sub start { + my $self = shift; + return $self->reference_slice_start(@_); +} + + +=head2 reference_slice_start + + Arg [1] : integer $reference_slice_start + Example : my $reference_slice_start = $genomic_align_block->reference_slice_start; + Example : $genomic_align_block->reference_slice_start(1035); + Description: get/set for attribute reference_slice_start. A value of 0 will set + the attribute to undefined. + Returntype : integer + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub reference_slice_start { + my ($self, $reference_slice_start) = @_; + + if (defined($reference_slice_start)) { + $self->{'reference_slice_start'} = ($reference_slice_start or undef); + } + + return $self->{'reference_slice_start'}; +} + + +=head2 end + + Arg [1] : (optional) integer $end + Example : my $end = $genomic_align_block->end; + Example : $genomic_align_block->end(1283); + Description: get/set for attribute reference_slice_end. A value of 0 will set + the attribute to undefined. + Returntype : integer + Exceptions : none + Caller : general + +=cut + +sub end { + my $self = shift; + return $self->reference_slice_end(@_); +} + + +=head2 reference_slice_end + + Arg [1] : integer $reference_slice_end + Example : my $reference_slice_end = $genomic_align_block->reference_slice_end; + Example : $genomic_align_block->reference_slice_end(1283); + Description: get/set for attribute reference_slice_end. A value of 0 will set + the attribute to undefined. + Returntype : integer + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub reference_slice_end { + my ($self, $reference_slice_end) = @_; + + if (defined($reference_slice_end)) { + $self->{'reference_slice_end'} = ($reference_slice_end or undef); + } + + return $self->{'reference_slice_end'}; + +} + +=head2 strand + + Arg [1] : integer $strand + Example : my $strand = $genomic_align_block->strand; + Example : $genomic_align_block->strand(-1); + Description: get/set for attribute strand. A value of 0 will set + the attribute to undefined. + Returntype : integer + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub strand { + my ($self, $reference_slice_strand) = @_; + + if (defined($reference_slice_strand)) { + $self->{'reference_slice_strand'} = ($reference_slice_strand or undef); + } + + return $self->{'reference_slice_strand'}; +} + +=head2 reference_slice_strand + + Arg [1] : integer $reference_slice_strand + Example : my $reference_slice_strand = $genomic_align_block->reference_slice_strand; + Example : $genomic_align_block->reference_slice_strand(-1); + Description: get/set for attribute reference_slice_strand. A value of 0 will set + the attribute to undefined. + Returntype : integer + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub reference_slice_strand { + my ($self, $reference_slice_strand) = @_; + + if (defined($reference_slice_strand)) { + $self->{'reference_slice_strand'} = ($reference_slice_strand or undef); + } + + return $self->{'reference_slice_strand'}; +} + +=head2 get_original_strand + + Args : none + Example : if (!$genomic_align_block->get_original_strand()) { + # original GenomicAlignBlock has been reverse-complemented + } + Description: getter for the _orignal_strand attribute + Returntype : none + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub get_original_strand { + my ($self) = @_; + + if (!defined($self->{_original_strand})) { + $self->{_original_strand} = 1; + } + + return $self->{_original_strand}; +} + +=head2 restrict_between_reference_positions + + Arg[1] : [optional] int $start, refers to the reference_dnafrag + Arg[2] : [optional] int $end, refers to the reference_dnafrag + Arg[3] : [optional] Bio::EnsEMBL::Compara::GenomicAlign $reference_GenomicAlign + Arg[4] : [optional] boolean $skip_empty_GenomicAligns + Example : none + Description: restrict this GenomicAlignBlock. It returns a new object unless no + restriction is needed. In that case, it returns the original unchanged + object + It might be the case that the restricted region coincide with a gap + in one or several GenomicAligns. By default these GenomicAligns are + returned with a dnafrag_end equals to its dnafrag_start + 1. For instance, + a GenomicAlign with dnafrag_start = 12345 and dnafrag_end = 12344 + correspond to a block which goes on this region from before 12345 to + after 12344, ie just between 12344 and 12345. You can choose to remove + these empty GenomicAligns by setting $skip_empty_GenomicAligns to any + true value. + Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object in scalar context. In + list context, returns the previous object and the start and end + positions of the restriction in alignment coordinates (from 1 to + alignment_length) + Exceptions : return undef if reference positions lie outside of the alignment + Caller : general + Status : At risk + +=cut + +sub restrict_between_reference_positions { + my ($self, $start, $end, $reference_genomic_align, $skip_empty_GenomicAligns) = @_; + my $genomic_align_set; + my $new_reference_genomic_align; + my $new_genomic_aligns = []; + + $reference_genomic_align ||= $self->reference_genomic_align; + throw("A reference Bio::EnsEMBL::Compara::GenomicAlign must be given") if (!$reference_genomic_align); + $start = $reference_genomic_align->dnafrag_start if (!defined($start)); + $end = $reference_genomic_align->dnafrag_end if (!defined($end)); + + if ($start > $reference_genomic_align->dnafrag_end or $end < $reference_genomic_align->dnafrag_start) { + # restricting outside of boundaries => return undef object + warn("restricting outside of boundaries => return undef object: $start-$end (".$reference_genomic_align->dnafrag_start."-".$reference_genomic_align->dnafrag_end.")"); + return wantarray ? (undef, undef, undef) : undef; + } + my $number_of_base_pairs_to_trim_from_the_start = $start - $reference_genomic_align->dnafrag_start; + my $number_of_base_pairs_to_trim_from_the_end = $reference_genomic_align->dnafrag_end - $end; + + my $is_ref_low_coverage = 0; + if ($reference_genomic_align->cigar_line =~ /X/) { + $is_ref_low_coverage = 1; + } + + ## Skip if no restriction is needed. Return original object! We are still going on with the + ## restriction when either excess_at_the_start or excess_at_the_end are 0 as a (multiple) + ## alignment may start or end with gaps in the reference species. In that case, we want to + ## trim these gaps from the alignment as they fall just outside of the region of interest + + ##Exception if the reference species is low coverage, then need to continue + ##with this routine to find out the correct align positions + if ($number_of_base_pairs_to_trim_from_the_start < 0 and $number_of_base_pairs_to_trim_from_the_end < 0 and !$is_ref_low_coverage) { + return wantarray ? ($self, 1, $self->length) : $self; + } + + my $negative_strand = ($reference_genomic_align->dnafrag_strand == -1); + + ## Create a new Bio::EnsEMBL::Compara::GenomicAlignBlock object + throw("Reference GenomicAlign not found!") if (!$reference_genomic_align); + + my @reference_cigar = grep {$_} split(/(\d*[GDMXI])/, $reference_genomic_align->cigar_line); + if ($negative_strand) { + @reference_cigar = reverse @reference_cigar; + } + + #If this is negative, eg when a slice starts in one block and ends in + #another, need to set to 0 to ensure we enter the loop below. This + #fixes a bug when using a 2x species as the reference and fetching using + #an expanded align slice. + if ($number_of_base_pairs_to_trim_from_the_start < 0) { + $number_of_base_pairs_to_trim_from_the_start = 0; + } + + ## Parse start of cigar_line for the reference GenomicAlign + my $counter_of_trimmed_columns_from_the_start = 0; # num. of bp in the alignment to be trimmed + + if ($number_of_base_pairs_to_trim_from_the_start >= 0) { + my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the start) + ## Loop through the cigar pieces + while (my $cigar = shift(@reference_cigar)) { + # Parse each cigar piece + my ($num, $type) = ($cigar =~ /^(\d*)([GDMXI])/); + $num = 1 if ($num eq ""); + + # Insertions are not part of the alignment, don't count them + if ($type ne "I") { + $counter_of_trimmed_columns_from_the_start += $num; + } + + # Matches and insertions are actual base pairs in the reference + if ($type eq "M" or $type eq "I") { + $counter_of_trimmed_base_pairs += $num; + # If this cigar piece is too long and we overshoot the number of base pairs we want to trim, + # we substitute this cigar piece by a shorter one + if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_start) { + my $new_cigar_piece = ""; + # length of the new cigar piece + my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_start; + if ($length > 1) { + $new_cigar_piece = $length.$type; + } elsif ($length == 1) { + $new_cigar_piece = $type; + } + unshift(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece); + + # There is no need to correct the counter of trimmed columns if we are in an insertion + # when we overshoot + if ($type eq "M") { + $counter_of_trimmed_columns_from_the_start -= $length; + } + + ## We don't want to start with an insertion or a deletion. Trim them! + while (@reference_cigar and $reference_cigar[0] =~ /[DI]/) { + my ($num, $type) = ($reference_cigar[0] =~ /^(\d*)([DIGMX])/); + $num = 1 if ($num eq ""); + # only counts deletions, insertions are not part of the aligment + $counter_of_trimmed_columns_from_the_start += $num if ($type eq "D"); + shift(@reference_cigar); + } + last; + } + } + } + } + + #If this is negative, eg when a slice starts in one block and ends in + #another, need to set to 0 to ensure we enter the loop below. This + #fixes a bug when using a 2x species as the reference and fetching using + #an expanded align slice. + if ($number_of_base_pairs_to_trim_from_the_end < 0) { + $number_of_base_pairs_to_trim_from_the_end = 0; + } + + ## Parse end of cigar_line for the reference GenomicAlign + my $counter_of_trimmed_columns_from_the_end = 0; # num. of bp in the alignment to be trimmed + if ($number_of_base_pairs_to_trim_from_the_end >= 0) { + my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the end) + ## Loop through the cigar pieces + while (my $cigar = pop(@reference_cigar)) { + # Parse each cigar piece + my ($num, $type) = ($cigar =~ /^(\d*)([DIGMX])/); + $num = 1 if ($num eq ""); + + # Insertions are not part of the alignment, don't count them + if ($type ne "I") { + $counter_of_trimmed_columns_from_the_end += $num; + } + + # Matches and insertions are actual base pairs in the reference + if ($type eq "M" or $type eq "I") { + $counter_of_trimmed_base_pairs += $num; + # If this cigar piece is too long and we overshoot the number of base pairs we want to trim, + # we substitute this cigar piece by a shorter one + if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_end) { + my $new_cigar_piece = ""; + # length of the new cigar piece + my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_end; + if ($length > 1) { + $new_cigar_piece = $length.$type; + } elsif ($length == 1) { + $new_cigar_piece = $type; + } + push(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece); + + # There is no need to correct the counter of trimmed columns if we are in an insertion + # when we overshoot + if ($type eq "M") { + $counter_of_trimmed_columns_from_the_end -= $length; + } + + ## We don't want to end with an insertion or a deletion. Trim them! + while (@reference_cigar and $reference_cigar[-1] =~ /[DI]/) { + my ($num, $type) = ($reference_cigar[-1] =~ /^(\d*)([DIGMX])/); + $num = 1 if ($num eq ""); + # only counts deletions, insertions are not part of the aligment + $counter_of_trimmed_columns_from_the_end += $num if ($type eq "D"); + pop(@reference_cigar); + } + last; + } + } + } + } + + ## Skip if no restriction is needed. Return original object! This may happen when + ## either excess_at_the_start or excess_at_the_end are 0 but the alignment does not + ## start or end with gaps in the reference species. + if ($counter_of_trimmed_columns_from_the_start <= 0 and $counter_of_trimmed_columns_from_the_end <= 0) { + return wantarray ? ($self, 1, $self->length) : $self; + } + + my ($aln_start, $aln_end); + if ($negative_strand) { + $aln_start = $counter_of_trimmed_columns_from_the_end + 1; + $aln_end = $self->length - $counter_of_trimmed_columns_from_the_start; + } else { + $aln_start = $counter_of_trimmed_columns_from_the_start + 1; + $aln_end = $self->length - $counter_of_trimmed_columns_from_the_end; + } + + $genomic_align_set = $self->restrict_between_alignment_positions($aln_start, $aln_end, $skip_empty_GenomicAligns); + $new_reference_genomic_align = $genomic_align_set->reference_genomic_align; + + if (!defined $self->{'restricted_aln_start'}) { + $self->{'restricted_aln_start'} = 0; + } + if (!defined $self->{'restricted_aln_end'}) { + $self->{'restricted_aln_end'} = 0; + } + $genomic_align_set->{'restricted_aln_start'} = $counter_of_trimmed_columns_from_the_start + $self->{'restricted_aln_start'}; + $genomic_align_set->{'restricted_aln_end'} = $counter_of_trimmed_columns_from_the_end + $self->{'restricted_aln_end'}; + #$genomic_align_set->{'original_length'} = $self->length; + + #Need to use original gab length. If original_length is not set, length has + #not changed. Needed when use 2X genome as reference. + if (defined $self->{'original_length'}) { + $genomic_align_set->{'original_length'} = $self->{'original_length'}; + } else { + $genomic_align_set->{'original_length'} = $self->length; + } + + if (defined $self->slice) { + if ($self->strand == 1) { + $genomic_align_set->start($new_reference_genomic_align->dnafrag_start - + $self->slice->start + 1); + $genomic_align_set->end($new_reference_genomic_align->dnafrag_end - + $self->slice->start + 1); + $genomic_align_set->strand(1); + } else { + $genomic_align_set->start($self->{reference_slice}->{end} - + $new_reference_genomic_align->{dnafrag_end} + 1); + $genomic_align_set->end($self->{reference_slice}->{end} - + $new_reference_genomic_align->{dnafrag_start} + 1); + $genomic_align_set->strand(-1); + } + } + + return wantarray ? ($genomic_align_set, $aln_start, $aln_end) : $genomic_align_set; +} + +1;