Mercurial > repos > willmclaren > ensembl_vep
view variant_effect_predictor/Bio/EnsEMBL/Compara/BaseGenomicAlignSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line source
=head1 LICENSE Copyright (c) 1999-2012 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <dev@ensembl.org>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =head1 NAME Bio::EnsEMBL::Compara::BaseGenomicAlignSet - Base class for GenomicAlignBlock and GenomicAlignTree =head1 APPENDIX The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ =cut # Let the code begin... package Bio::EnsEMBL::Compara::BaseGenomicAlignSet; use strict; # Object preamble use Bio::EnsEMBL::Utils::Argument qw(rearrange); use Bio::EnsEMBL::Utils::Exception qw(throw warning info deprecate verbose); =head2 slice Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice Example : my $slice = $genomic_align_block->slice; Example : $genomic_align_block->slice($slice); Description: get/set for attribute slice. Returntype : Bio::EnsEMBL::Slice object Exceptions : Caller : general Status : Stable =cut sub slice { my ($self, $reference_slice) = @_; if (defined($reference_slice)) { # throw "[$reference_slice] is not a Bio::EnsEMBL::Slice" # unless $reference_slice->isa("Bio::EnsEMBL::Slice"); $self->{'reference_slice'} = $reference_slice; } return $self->{'reference_slice'}; } =head2 reference_slice Arg [1] : (optional) Bio::EnsEMBL::Slice $reference_slice Example : my $reference_slice = $genomic_align_block->reference_slice; Example : $genomic_align_block->reference_slice($slice); Description: Alias for slice method. TO BE DEPRECATED. Returntype : Bio::EnsEMBL::Slice object Exceptions : Caller : general Status : Stable =cut sub reference_slice { my ($self, $reference_slice) = @_; return $self->slice($reference_slice); } =head2 start Arg [1] : (optional) integer $start Example : my $start = $genomic_align_block->start; Example : $genomic_align_block->start(1035); Description: get/set for attribute reference_slice_start. A value of 0 will set the attribute to undefined. Returntype : integer Exceptions : none Caller : general =cut sub start { my $self = shift; return $self->reference_slice_start(@_); } =head2 reference_slice_start Arg [1] : integer $reference_slice_start Example : my $reference_slice_start = $genomic_align_block->reference_slice_start; Example : $genomic_align_block->reference_slice_start(1035); Description: get/set for attribute reference_slice_start. A value of 0 will set the attribute to undefined. Returntype : integer Exceptions : none Caller : general Status : Stable =cut sub reference_slice_start { my ($self, $reference_slice_start) = @_; if (defined($reference_slice_start)) { $self->{'reference_slice_start'} = ($reference_slice_start or undef); } return $self->{'reference_slice_start'}; } =head2 end Arg [1] : (optional) integer $end Example : my $end = $genomic_align_block->end; Example : $genomic_align_block->end(1283); Description: get/set for attribute reference_slice_end. A value of 0 will set the attribute to undefined. Returntype : integer Exceptions : none Caller : general =cut sub end { my $self = shift; return $self->reference_slice_end(@_); } =head2 reference_slice_end Arg [1] : integer $reference_slice_end Example : my $reference_slice_end = $genomic_align_block->reference_slice_end; Example : $genomic_align_block->reference_slice_end(1283); Description: get/set for attribute reference_slice_end. A value of 0 will set the attribute to undefined. Returntype : integer Exceptions : none Caller : general Status : Stable =cut sub reference_slice_end { my ($self, $reference_slice_end) = @_; if (defined($reference_slice_end)) { $self->{'reference_slice_end'} = ($reference_slice_end or undef); } return $self->{'reference_slice_end'}; } =head2 strand Arg [1] : integer $strand Example : my $strand = $genomic_align_block->strand; Example : $genomic_align_block->strand(-1); Description: get/set for attribute strand. A value of 0 will set the attribute to undefined. Returntype : integer Exceptions : none Caller : general Status : Stable =cut sub strand { my ($self, $reference_slice_strand) = @_; if (defined($reference_slice_strand)) { $self->{'reference_slice_strand'} = ($reference_slice_strand or undef); } return $self->{'reference_slice_strand'}; } =head2 reference_slice_strand Arg [1] : integer $reference_slice_strand Example : my $reference_slice_strand = $genomic_align_block->reference_slice_strand; Example : $genomic_align_block->reference_slice_strand(-1); Description: get/set for attribute reference_slice_strand. A value of 0 will set the attribute to undefined. Returntype : integer Exceptions : none Caller : general Status : Stable =cut sub reference_slice_strand { my ($self, $reference_slice_strand) = @_; if (defined($reference_slice_strand)) { $self->{'reference_slice_strand'} = ($reference_slice_strand or undef); } return $self->{'reference_slice_strand'}; } =head2 get_original_strand Args : none Example : if (!$genomic_align_block->get_original_strand()) { # original GenomicAlignBlock has been reverse-complemented } Description: getter for the _orignal_strand attribute Returntype : none Exceptions : none Caller : general Status : Stable =cut sub get_original_strand { my ($self) = @_; if (!defined($self->{_original_strand})) { $self->{_original_strand} = 1; } return $self->{_original_strand}; } =head2 restrict_between_reference_positions Arg[1] : [optional] int $start, refers to the reference_dnafrag Arg[2] : [optional] int $end, refers to the reference_dnafrag Arg[3] : [optional] Bio::EnsEMBL::Compara::GenomicAlign $reference_GenomicAlign Arg[4] : [optional] boolean $skip_empty_GenomicAligns Example : none Description: restrict this GenomicAlignBlock. It returns a new object unless no restriction is needed. In that case, it returns the original unchanged object It might be the case that the restricted region coincide with a gap in one or several GenomicAligns. By default these GenomicAligns are returned with a dnafrag_end equals to its dnafrag_start + 1. For instance, a GenomicAlign with dnafrag_start = 12345 and dnafrag_end = 12344 correspond to a block which goes on this region from before 12345 to after 12344, ie just between 12344 and 12345. You can choose to remove these empty GenomicAligns by setting $skip_empty_GenomicAligns to any true value. Returntype : Bio::EnsEMBL::Compara::GenomicAlignBlock object in scalar context. In list context, returns the previous object and the start and end positions of the restriction in alignment coordinates (from 1 to alignment_length) Exceptions : return undef if reference positions lie outside of the alignment Caller : general Status : At risk =cut sub restrict_between_reference_positions { my ($self, $start, $end, $reference_genomic_align, $skip_empty_GenomicAligns) = @_; my $genomic_align_set; my $new_reference_genomic_align; my $new_genomic_aligns = []; $reference_genomic_align ||= $self->reference_genomic_align; throw("A reference Bio::EnsEMBL::Compara::GenomicAlign must be given") if (!$reference_genomic_align); $start = $reference_genomic_align->dnafrag_start if (!defined($start)); $end = $reference_genomic_align->dnafrag_end if (!defined($end)); if ($start > $reference_genomic_align->dnafrag_end or $end < $reference_genomic_align->dnafrag_start) { # restricting outside of boundaries => return undef object warn("restricting outside of boundaries => return undef object: $start-$end (".$reference_genomic_align->dnafrag_start."-".$reference_genomic_align->dnafrag_end.")"); return wantarray ? (undef, undef, undef) : undef; } my $number_of_base_pairs_to_trim_from_the_start = $start - $reference_genomic_align->dnafrag_start; my $number_of_base_pairs_to_trim_from_the_end = $reference_genomic_align->dnafrag_end - $end; my $is_ref_low_coverage = 0; if ($reference_genomic_align->cigar_line =~ /X/) { $is_ref_low_coverage = 1; } ## Skip if no restriction is needed. Return original object! We are still going on with the ## restriction when either excess_at_the_start or excess_at_the_end are 0 as a (multiple) ## alignment may start or end with gaps in the reference species. In that case, we want to ## trim these gaps from the alignment as they fall just outside of the region of interest ##Exception if the reference species is low coverage, then need to continue ##with this routine to find out the correct align positions if ($number_of_base_pairs_to_trim_from_the_start < 0 and $number_of_base_pairs_to_trim_from_the_end < 0 and !$is_ref_low_coverage) { return wantarray ? ($self, 1, $self->length) : $self; } my $negative_strand = ($reference_genomic_align->dnafrag_strand == -1); ## Create a new Bio::EnsEMBL::Compara::GenomicAlignBlock object throw("Reference GenomicAlign not found!") if (!$reference_genomic_align); my @reference_cigar = grep {$_} split(/(\d*[GDMXI])/, $reference_genomic_align->cigar_line); if ($negative_strand) { @reference_cigar = reverse @reference_cigar; } #If this is negative, eg when a slice starts in one block and ends in #another, need to set to 0 to ensure we enter the loop below. This #fixes a bug when using a 2x species as the reference and fetching using #an expanded align slice. if ($number_of_base_pairs_to_trim_from_the_start < 0) { $number_of_base_pairs_to_trim_from_the_start = 0; } ## Parse start of cigar_line for the reference GenomicAlign my $counter_of_trimmed_columns_from_the_start = 0; # num. of bp in the alignment to be trimmed if ($number_of_base_pairs_to_trim_from_the_start >= 0) { my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the start) ## Loop through the cigar pieces while (my $cigar = shift(@reference_cigar)) { # Parse each cigar piece my ($num, $type) = ($cigar =~ /^(\d*)([GDMXI])/); $num = 1 if ($num eq ""); # Insertions are not part of the alignment, don't count them if ($type ne "I") { $counter_of_trimmed_columns_from_the_start += $num; } # Matches and insertions are actual base pairs in the reference if ($type eq "M" or $type eq "I") { $counter_of_trimmed_base_pairs += $num; # If this cigar piece is too long and we overshoot the number of base pairs we want to trim, # we substitute this cigar piece by a shorter one if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_start) { my $new_cigar_piece = ""; # length of the new cigar piece my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_start; if ($length > 1) { $new_cigar_piece = $length.$type; } elsif ($length == 1) { $new_cigar_piece = $type; } unshift(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece); # There is no need to correct the counter of trimmed columns if we are in an insertion # when we overshoot if ($type eq "M") { $counter_of_trimmed_columns_from_the_start -= $length; } ## We don't want to start with an insertion or a deletion. Trim them! while (@reference_cigar and $reference_cigar[0] =~ /[DI]/) { my ($num, $type) = ($reference_cigar[0] =~ /^(\d*)([DIGMX])/); $num = 1 if ($num eq ""); # only counts deletions, insertions are not part of the aligment $counter_of_trimmed_columns_from_the_start += $num if ($type eq "D"); shift(@reference_cigar); } last; } } } } #If this is negative, eg when a slice starts in one block and ends in #another, need to set to 0 to ensure we enter the loop below. This #fixes a bug when using a 2x species as the reference and fetching using #an expanded align slice. if ($number_of_base_pairs_to_trim_from_the_end < 0) { $number_of_base_pairs_to_trim_from_the_end = 0; } ## Parse end of cigar_line for the reference GenomicAlign my $counter_of_trimmed_columns_from_the_end = 0; # num. of bp in the alignment to be trimmed if ($number_of_base_pairs_to_trim_from_the_end >= 0) { my $counter_of_trimmed_base_pairs = 0; # num of bp in the reference sequence we trim (from the end) ## Loop through the cigar pieces while (my $cigar = pop(@reference_cigar)) { # Parse each cigar piece my ($num, $type) = ($cigar =~ /^(\d*)([DIGMX])/); $num = 1 if ($num eq ""); # Insertions are not part of the alignment, don't count them if ($type ne "I") { $counter_of_trimmed_columns_from_the_end += $num; } # Matches and insertions are actual base pairs in the reference if ($type eq "M" or $type eq "I") { $counter_of_trimmed_base_pairs += $num; # If this cigar piece is too long and we overshoot the number of base pairs we want to trim, # we substitute this cigar piece by a shorter one if ($counter_of_trimmed_base_pairs > $number_of_base_pairs_to_trim_from_the_end) { my $new_cigar_piece = ""; # length of the new cigar piece my $length = $counter_of_trimmed_base_pairs - $number_of_base_pairs_to_trim_from_the_end; if ($length > 1) { $new_cigar_piece = $length.$type; } elsif ($length == 1) { $new_cigar_piece = $type; } push(@reference_cigar, $new_cigar_piece) if ($new_cigar_piece); # There is no need to correct the counter of trimmed columns if we are in an insertion # when we overshoot if ($type eq "M") { $counter_of_trimmed_columns_from_the_end -= $length; } ## We don't want to end with an insertion or a deletion. Trim them! while (@reference_cigar and $reference_cigar[-1] =~ /[DI]/) { my ($num, $type) = ($reference_cigar[-1] =~ /^(\d*)([DIGMX])/); $num = 1 if ($num eq ""); # only counts deletions, insertions are not part of the aligment $counter_of_trimmed_columns_from_the_end += $num if ($type eq "D"); pop(@reference_cigar); } last; } } } } ## Skip if no restriction is needed. Return original object! This may happen when ## either excess_at_the_start or excess_at_the_end are 0 but the alignment does not ## start or end with gaps in the reference species. if ($counter_of_trimmed_columns_from_the_start <= 0 and $counter_of_trimmed_columns_from_the_end <= 0) { return wantarray ? ($self, 1, $self->length) : $self; } my ($aln_start, $aln_end); if ($negative_strand) { $aln_start = $counter_of_trimmed_columns_from_the_end + 1; $aln_end = $self->length - $counter_of_trimmed_columns_from_the_start; } else { $aln_start = $counter_of_trimmed_columns_from_the_start + 1; $aln_end = $self->length - $counter_of_trimmed_columns_from_the_end; } $genomic_align_set = $self->restrict_between_alignment_positions($aln_start, $aln_end, $skip_empty_GenomicAligns); $new_reference_genomic_align = $genomic_align_set->reference_genomic_align; if (!defined $self->{'restricted_aln_start'}) { $self->{'restricted_aln_start'} = 0; } if (!defined $self->{'restricted_aln_end'}) { $self->{'restricted_aln_end'} = 0; } $genomic_align_set->{'restricted_aln_start'} = $counter_of_trimmed_columns_from_the_start + $self->{'restricted_aln_start'}; $genomic_align_set->{'restricted_aln_end'} = $counter_of_trimmed_columns_from_the_end + $self->{'restricted_aln_end'}; #$genomic_align_set->{'original_length'} = $self->length; #Need to use original gab length. If original_length is not set, length has #not changed. Needed when use 2X genome as reference. if (defined $self->{'original_length'}) { $genomic_align_set->{'original_length'} = $self->{'original_length'}; } else { $genomic_align_set->{'original_length'} = $self->length; } if (defined $self->slice) { if ($self->strand == 1) { $genomic_align_set->start($new_reference_genomic_align->dnafrag_start - $self->slice->start + 1); $genomic_align_set->end($new_reference_genomic_align->dnafrag_end - $self->slice->start + 1); $genomic_align_set->strand(1); } else { $genomic_align_set->start($self->{reference_slice}->{end} - $new_reference_genomic_align->{dnafrag_end} + 1); $genomic_align_set->end($self->{reference_slice}->{end} - $new_reference_genomic_align->{dnafrag_start} + 1); $genomic_align_set->strand(-1); } } return wantarray ? ($genomic_align_set, $aln_start, $aln_end) : $genomic_align_set; } 1;