Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IdMapping/SyntenyRegion.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,528 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::IdMapping::SyntenyRegion - object representing syntenic regions + +=head1 SYNOPSIS + + # create a new SyntenyRegion from a source and a target gene + my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast( [ + $source_gene->start, $source_gene->end, + $source_gene->strand, $source_gene->seq_region_name, + $target_gene->start, $target_gene->end, + $target_gene->strand, $target_gene->seq_region_name, + $entry->score, + ] ); + + # merge with another SyntenyRegion + my $merged_sr = $sr->merge($sr1); + + # score a gene pair against this SyntenyRegion + my $score = + $sr->score_location_relationship( $source_gene1, $target_gene1 ); + +=head1 DESCRIPTION + +This object represents a synteny between a source and a target location. +SyntenyRegions are built from mapped genes, and the their score is +defined as the score of the gene mapping. For merged SyntenyRegions, +scores are combined. + +=head1 METHODS + + new_fast + source_start + source_end + source_strand + source_seq_region_name + target_start + target_end + target_strand + target_seq_region_name + score + merge + stretch + score_location_relationship + to_string + +=cut + +package Bio::EnsEMBL::IdMapping::SyntenyRegion; + +use strict; +use warnings; +no warnings 'uninitialized'; + +use Bio::EnsEMBL::Utils::Exception qw(throw warning); + + +=head2 new_fast + + Arg[1] : Arrayref $array_ref - the arrayref to bless into the + SyntenyRegion object + Example : my $sr = Bio::EnsEMBL::IdMapping::SyntenyRegion->new_fast([ + ]); + Description : Constructor. On instantiation, source and target regions are + reverse complemented so that source is always on forward strand. + Return type : a Bio::EnsEMBL::IdMapping::SyntenyRegion object + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub new_fast { + my $class = shift; + my $array_ref = shift; + + # reverse complement source and target so that source is always on forward + # strand; this will make merging and other comparison operations easier + # at later stages + if ($array_ref->[2] == -1) { + $array_ref->[2] = 1; + $array_ref->[6] = -1 * $array_ref->[6]; + } + + return bless $array_ref, $class; +} + + +=head2 source_start + + Arg[1] : (optional) Int - source location start coordinate + Description : Getter/setter for source location start coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_start { + my $self = shift; + $self->[0] = shift if (@_); + return $self->[0]; +} + + +=head2 source_end + + Arg[1] : (optional) Int - source location end coordinate + Description : Getter/setter for source location end coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_end { + my $self = shift; + $self->[1] = shift if (@_); + return $self->[1]; +} + + +=head2 source_strand + + Arg[1] : (optional) Int - source location strand + Description : Getter/setter for source location strand. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_strand { + my $self = shift; + $self->[2] = shift if (@_); + return $self->[2]; +} + + +=head2 source_seq_region_name + + Arg[1] : (optional) String - source location seq_region name + Description : Getter/setter for source location seq_region name. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub source_seq_region_name { + my $self = shift; + $self->[3] = shift if (@_); + return $self->[3]; +} + + +=head2 target_start + + Arg[1] : (optional) Int - target location start coordinate + Description : Getter/setter for target location start coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_start { + my $self = shift; + $self->[4] = shift if (@_); + return $self->[4]; +} + + +=head2 target_end + + Arg[1] : (optional) Int - target location end coordinate + Description : Getter/setter for target location end coordinate. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_end { + my $self = shift; + $self->[5] = shift if (@_); + return $self->[5]; +} + + +=head2 target_strand + + Arg[1] : (optional) Int - target location strand + Description : Getter/setter for target location strand. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_strand { + my $self = shift; + $self->[6] = shift if (@_); + return $self->[6]; +} + + +=head2 target_seq_region_name + + Arg[1] : (optional) String - target location seq_region name + Description : Getter/setter for target location seq_region name. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub target_seq_region_name { + my $self = shift; + $self->[7] = shift if (@_); + return $self->[7]; +} + + +=head2 score + + Arg[1] : (optional) Float - score + Description : Getter/setter for the score between source and target location. + Return type : Int + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub score { + my $self = shift; + $self->[8] = shift if (@_); + return $self->[8]; +} + + +=head2 merge + + Arg[1] : Bio::EnsEMBL::IdMapping::SyntenyRegion $sr - another + SyntenyRegion + Example : $merged_sr = $sr->merge($other_sr); + Description : Merges two overlapping SyntenyRegions if they meet certain + criteria (see documentation in the code for details). Score is + calculated as a combined distance score. If the two + SyntenyRegions aren't mergeable, this method returns undef. + Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion or undef + Exceptions : warns on bad scores + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub merge { + my ($self, $sr) = @_; + + # must be on same seq_region + if ($self->source_seq_region_name ne $sr->source_seq_region_name or + $self->target_seq_region_name ne $sr->target_seq_region_name) { + return 0; + } + + # target must be on same strand + return 0 unless ($self->target_strand == $sr->target_strand); + + # find the distance of source and target pair and compare + my $source_dist = $sr->source_start - $self->source_start; + my $target_dist; + if ($self->target_strand == 1) { + $target_dist = $sr->target_start - $self->target_start; + } else { + $target_dist = $self->target_end - $sr->target_end; + } + + # prevent division by zero error + if ($source_dist == 0 or $target_dist == 0) { + warn("WARNING: source_dist ($source_dist) and/or target_dist ($target_dist) is zero.\n"); + return 0; + } + + # calculate a distance score + my $dist = $source_dist - $target_dist; + $dist = -$dist if ($dist < 0); + my $d1 = $dist/$source_dist; + $d1 = -$d1 if ($d1 < 0); + my $d2 = $dist/$target_dist; + $d2 = -$d2 if ($d2 < 0); + my $dist_score = 1 - $d1 - $d2; + + # distance score must be more than 50% + return 0 if ($dist_score < 0.5); + + my $new_score = $dist_score * ($sr->score + $self->score)/2; + + if ($new_score > 1) { + warn("WARNING: Bad merge score: $new_score\n"); + } + + # extend SyntenyRegion to cover both sources and targets, set merged score + # and return + if ($sr->source_start < $self->source_start) { + $self->source_start($sr->source_start); + } + if ($sr->source_end > $self->source_end) { + $self->source_end($sr->source_end); + } + + if ($sr->target_start < $self->target_start) { + $self->target_start($sr->target_start); + } + if ($sr->target_end > $self->target_end) { + $self->target_end($sr->target_end); + } + + $self->score($new_score); + + return $self; +} + + +=head2 stretch + + Arg[1] : Float $factor - stretching factor + Example : $stretched_sr = $sr->stretch(2); + Description : Extends this SyntenyRegion to span a $factor * $score more area. + Return type : Bio::EnsEMBL::IdMapping::SyntenyRegion + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub stretch { + my ($self, $factor) = @_; + + my $source_adjust = int(($self->source_end - $self->source_start + 1) * + $factor * $self->score); + $self->source_start($self->source_start - $source_adjust); + $self->source_end($self->source_end + $source_adjust); + #warn sprintf(" sss %d %d %d\n", $source_adjust, $self->source_start, + # $self->source_end); + + my $target_adjust = int(($self->target_end - $self->target_start + 1) * + $factor * $self->score); + $self->target_start($self->target_start - $target_adjust); + $self->target_end($self->target_end + $target_adjust); + + return $self; +} + + +=head2 score_location_relationship + + Arg[1] : Bio::EnsEMBL::IdMapping::TinyGene $source_gene - source gene + Arg[2] : Bio::EnsEMBL::IdMapping::TinyGene $target_gene - target gene + Example : my $score = $sr->score_location_relationship($source_gene, + $target_gene); + Description : This function calculates how well the given source location + interpolates on given target location inside this SyntenyRegion. + + Scoring is done the following way: Source and target location + are normalized with respect to this Regions source and target. + Source range will then be somewhere close to 0.0-1.0 and target + range anything around that. + + The extend of the covered area between source and target range + is a measurement of how well they agree (smaller extend is + better). The extend (actually 2*extend) is reduced by the size + of the regions. This will result in 0.0 if they overlap + perfectly and bigger values if they dont. + + This is substracted from 1.0 to give the score. The score is + likely to be below zero, but is cut off at 0.0f. + + Finally, the score is multiplied with the score of the synteny + itself. + Return type : Float + Exceptions : warns if score out of range + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + + + +sub score_location_relationship { + my ($self, $source_gene, $target_gene) = @_; + + # must be on same seq_region + if (($self->source_seq_region_name ne $source_gene->seq_region_name) or + ($self->target_seq_region_name ne $target_gene->seq_region_name)) { + return 0; + } + + # strand relationship must be the same (use logical XOR to find out) + if (($self->source_strand == $source_gene->strand) xor + ($self->target_strand == $target_gene->strand)) { + return 0; + } + + # normalise source location + my $source_rel_start = ($source_gene->start - $self->source_start) / + ($self->source_end - $self->source_start + 1); + + my $source_rel_end = ($source_gene->end - $self->source_start + 1) / + ($self->source_end - $self->source_start + 1); + + #warn " aaa ".$self->to_string."\n"; + #warn sprintf(" bbb %.6f %.6f\n", $source_rel_start, $source_rel_end); + + # cut off if the source location is completely outside + return 0 if ($source_rel_start > 1.1 or $source_rel_end < -0.1); + + # normalise target location + my ($target_rel_start, $target_rel_end); + my $t_length = $self->target_end - $self->target_start + 1; + + if ($self->target_strand == 1) { + + $target_rel_start = ($target_gene->start - $self->target_start) / $t_length; + + $target_rel_end = ($target_gene->end - $self->target_start + 1) / $t_length; + + } else { + $target_rel_start = ($self->target_end - $target_gene->end) / $t_length; + $target_rel_end = ($self->target_end - $target_gene->start + 1) / $t_length; + } + + my $added_range = (($target_rel_end > $source_rel_end) ? $target_rel_end : + $source_rel_end) - + (($target_rel_start < $source_rel_start) ? $target_rel_start : + $source_rel_start); + + my $score = $self->score * (1 - (2 * $added_range - $target_rel_end - + $source_rel_end + $target_rel_start + $source_rel_start)); + + #warn " ccc ".sprintf("%.6f:%.6f:%.6f:%.6f:%.6f\n", $added_range, + # $source_rel_start, $source_rel_end, $target_rel_start, $target_rel_end); + + $score = 0 if ($score < 0); + + # sanity check + if ($score > 1) { + warn "Out of range score ($score) for ".$source_gene->id.":". + $target_gene->id."\n"; + } + + return $score; +} + + +=head2 to_string + + Example : print LOG $sr->to_string, "\n"; + Description : Returns a string representation of the SyntenyRegion object. + Useful for debugging and logging. + Return type : String + Exceptions : none + Caller : Bio::EnsEMBL::IdMapping::SyntenyFramework + Status : At Risk + : under development + +=cut + +sub to_string { + my $self = shift; + return sprintf("%s:%s-%s:%s %s:%s-%s:%s %.6f", + $self->source_seq_region_name, + $self->source_start, + $self->source_end, + $self->source_strand, + $self->target_seq_region_name, + $self->target_start, + $self->target_end, + $self->target_strand, + $self->score + ); +} + + +1; +