Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Utils/CigarString.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/CigarString.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,350 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 AUTHOR + +Juguang Xiao <juguang@tll.org.sg> + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar +strings + +=head1 DESCRIPTION + +Sequence alignment hits were previously stored within the core database +as ungapped alignments. This imposed 2 major constraints on alignments: + +a) alignments for a single hit record would require multiple rows in the + database, and +b) it was not possible to accurately retrieve the exact original + alignment. + +Therefore, in the new branch sequence alignments are now stored as +ungapped alignments in the cigar line format (where CIGAR stands for +Concise Idiosyncratic Gapped Alignment Report). + +In the cigar line format alignments are sotred as follows: + + M: Match + D: Deletino + I: Insertion + +An example of an alignment for a hypthetical protein match is shown +below: + + + Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL... + PG P G GP R PLGP + Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD... + +protein_align_feature table as the following cigar line: + + 7M4D12M2I2MD7M + +=cut + +package Bio::EnsEMBL::Utils::CigarString; + +use strict; +use vars qw(@ISA); +use Bio::EnsEMBL::Root; + +@ISA = qw(Bio::EnsEMBL::Root); + +=head2 split_hsp + + Name : split_hsp (this name is derived from the original sub in BlastWorn) + Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object. +my $factory = new Bio::EnsEMBL::Utils::CigarString; +my $cigar_string = $factory->split_hsp($hsp); + + Function: generate cigar string. + Argument: a HSP object. + Returns : a text string. + +=cut + +sub split_hsp { + my ($self, $hsp) = @_; + + $self->throw("a defined object needed") unless($hsp && defined($hsp)); + unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ + $self->throw("a HSP object needed"); + } + + my ($qtype, $htype) = $self->_findTypes($hsp); + my ($qstrand, $hstrand) = $self->_findStrands($hsp); + my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype); + + my @gaps = (); + my @qchars = split(//, $hsp->query_string); + my @hchars = split(//, $hsp->hit_string); + my $qstart; + if($qstrand == 1){ + $qstart = $hsp->query->start; + }elsif($qstart == -1){ + $qstart = $hsp->query->end; + }else{ + $self->warn("[$qstart], invalid strand value on query"); + $qstart = $hsp->query->start; + # Is this a SearchIO's bug??? + } + + my $hstart; + if($hstrand == 1){ + $hstart = $hsp->subject->start; + }elsif($hstrand != -1){ + $hstart = $hsp->subject->end; + }else{ + $self->throw("[$hstart], invalid strand value on subject"); + } + + my $qend = $qstart; + my $hend = $hstart; + my $count = 0; + my $found = 0; + + my @align_coordinates = (); + while($count <= $#qchars){ + if($qchars[$count] ne '-' && $hchars[$count] ne '-') { + $qend += $qinc; + $hend += $hinc; + $found = 1; + }else{ # gapped region + push(@align_coordinates, [$qstart, $hstart]) if($found == 1); + + $qstart = $qend; + $qstart += $qinc if($qchars[$count] ne '-'); + + $hstart = $hend; + $hstart += $hinc if($hchars[$count] ne '-'); + + $qend = $qstart; + $hend = $hstart; + $found = 0; + } + $count++; + } + + if($found){ + push(@align_coordinates, [$qstart, $hstart]); + } + + my $cigar_string = ""; + my $last = $#align_coordinates; + if($last >= 0){ + for(my $i=0; $i<$last; $i++){ + my $q_this_start = $align_coordinates[$i]->[0]; + my $q_next_start = $align_coordinates[$i+1]->[0]; + my $q_length = ($q_next_start-$q_this_start-1)*$qinc; + $q_length = abs($q_length); + my $h_this_start = $align_coordinates[$i]->[1]; + my $h_next_start = $align_coordinates[$i+1]->[1]; + my $h_length = ($h_next_start-$h_this_start-1)*$hinc; + $h_length = abs($h_length); + + my $diff = $q_length - $h_length; + if($diff > 0){ # Insertion + $cigar_string .= $diff unless($diff == 1); + $cigar_string .= 'I'; + }elsif($diff < 0){ # Deletion + $cigar_string .= -$diff unless($diff == -1); + $cigar_string .= 'D'; + }else{ # e.g $diff == 0, Match + $cigar_string .= $q_length unless($q_length == 1); + $cigar_string .= 'M'; + } + + } # for + } # if + + return $cigar_string; +} + + +sub _findStrands { + my ($self,$hsp) = @_; + + my $qstrand = $hsp->query->strand; + unless($qstrand == 1 || $qstrand == -1){ + $self->warn("query's strand value is neither 1 or -1"); + $qstrand = 1; + } + + my $hstrand = $hsp->subject->strand; + unless($hstrand == 1 || $hstrand == -1){ + $self->warn("subject's strand value is neither 1 or -1"); + $hstrand = 1; + } + + return ( $qstrand, $hstrand); +} + +sub _findTypes { + my ($self,$hsp) = @_; + + my $type1; + my $type2; + my $len1 = $hsp->query->length(); + my $len2 = $hsp->subject->length(); + + if ($len1/$len2 > 2) { + $type1 = 'dna'; + $type2 = 'pep'; + } elsif ($len2/$len1 > 2) { + $type1 = 'pep'; + $type2 = 'dna'; + } else { + $type1 = 'dna'; + $type2 = 'dna'; + } + + return ($type1,$type2); +} + +sub _findIncrements { + my ($self,$qstrand,$hstrand,$qtype,$htype) = @_; + + my $qinc = 1 * $qstrand; + my $hinc = 1 * $hstrand; + + if ($qtype eq 'dna' && $htype eq 'pep') { + $qinc = 3 * $qinc; + } + if ($qtype eq 'pep' && $htype eq 'dna') { + $hinc = 3 * $hinc; + } + + return ($qinc,$hinc); +} + +# This is a core logic of cigar string. The finite state machine theory is +# apply. See the below table, x-axis represents the input, with 3 options: +# (+/+) -- Both current query and subject bases are non-gap. Match +# (-/+) -- The current query base is gap, but subject not. Deletion +# (+/-) -- The current subject base is gap, but query not. Insertion +# While the y-axis means the current state with letter 'M', 'D', 'I' +# +# The content of this table is the action taken in response of the input and +# the current state. +# R remain the state, counter increment. +# G;X generate the cigar line based on the current state and counter; +# clear the counter to zero and change to the state X +# +# || +/+ | -/+ | +/- | +# -------+----------------------+ +# M || R | G;D | G;I | +# ------------------------------+ +# D || G;M | R | G;I | +# ------------------------------+ +# I || G;M | G;D | R | +# + +=head2 generate_cigar_string + + Name : generate_cigar_string + Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars); + Function: generate the cigar string for a piece of alignment. + Args: 2 array references. The lengths of 2 arrays are the same + Return: a cigar string + +=cut + +# Developer's Note: The method is originally abstracted from the concept of +# cigar string. It only asks the essential information of 2 sequence characters +# of the alignment, while the BlastWorn::split_HSP asks more unused information +# for cigar string, which is useful to form align_coordinates. - Juguang + +my ($count, $state); # strictly only used in the following 2 subs + +sub generate_cigar_string { + +# my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_; + + my ($self, $qchars_ref, $hchars_ref) = @_; + + my @qchars = @{$qchars_ref}; + my @hchars = @{$hchars_ref}; + + unless(scalar(@qchars) == scalar(@hchars)){ + $self->throw("two sequences are not equal in lengths"); + } + + $count = 0; + $state = 'M'; # the current state of gaps, (M, D, I) + + my $cigar_string = ''; + for(my $i=0; $i <= $#qchars; $i++){ + my $qchar = $qchars[$i]; + my $hchar = $hchars[$i]; + if($qchar ne '-' && $hchar ne '-'){ # Match + $cigar_string .= $self->_sub_cigar_string('M'); + }elsif($qchar eq '-'){ # Deletion + $cigar_string .= $self->_sub_cigar_string('D'); + }elsif($hchar eq '-'){ # Insertion + $cigar_string .= $self->_sub_cigar_string('I'); + }else{ + $self->throw("Impossible state that 2 gaps on each seq aligned"); + } + } + $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail. + return $cigar_string; +} + +sub _sub_cigar_string { + my ($self, $new_state) = @_; + my $sub_cigar_string = ''; + if($state eq $new_state){ + $count++; # Remain the state and increase the counter + }else{ + $sub_cigar_string .= $count unless $count == 1; + $sub_cigar_string .= $state; + $count = 1; + $state = $new_state; + } + return $sub_cigar_string; +} + +=head2 generate_cigar_string_by_hsp + + Name : generate_cigar_string_by_hsp + Usage : my $hsp; # a ready GenericHSP object +my $cigar_string = $self->generate_cigar_string_by_hsp($hsp); + Function: generate a cigar string by given HSP object. + Args : a GenericHSP object + Returns: a text string of cigar string + +=cut + +sub generate_cigar_string_by_hsp { + my ($self, $hsp) = @_; + + unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ + $self->throw("A GenericHSP object needed"); + } + + my @qchars = split(//, $hsp->query_string); + my @hchars = split(//, $hsp->hit_string); + + return $self->generate_cigar_string(\@qchars, \@hchars); +} + +1;