Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/IndividualSlice.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/IndividualSlice.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,655 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::IndividualSlice - SubClass of the Slice. Represents the +slice of the genome for a certain individual (applying the alleles for +this individual) + +=head1 SYNOPSIS + + $sa = $db->get_SliceAdaptor; + + $slice = + $sa->fetch_by_region( 'chromosome', 'X', 1_000_000, 2_000_000 ); + + $individualSlice = $slice->get_by_Individual($individual_name); + + # Get the sequence from the Individual Slice: will contain IUPAC codes + # for SNPs and Ensembl ambiguity codes for indels + my $seq = $individualSlice->seq(); + print $seq; + + # Get a subSlice of the Strain + my $subSlice_individual = + $individualSlice->sub_Slice( 5_000, 8_000, 1 ) + + # Compare two different individuals in the same Slice + my $sliceIndividual2 = $slice->get_by_Individual($individual_name2); + my $differences = + $individualSlice->get_all_differences_IndividualSlice( + $sliceIndividual2); + + foreach my $af ( @{$differences} ) { + print + "There is a difference between $individual_name " + . "and $individual_name2 at ", + $af->start, "-", $af->end, + " with allele ", $af->allele_string(), "\n"; + } + +=head1 DESCRIPTION + +A IndividualSlice object represents a region of a genome for a certain +individual. It can be used to retrieve sequence or features from a +individual. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::IndividualSlice; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Utils::Argument qw(rearrange); +use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); +use Bio::EnsEMBL::Slice; +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning); + +@ISA = qw(Bio::EnsEMBL::Slice); + + +=head2 new + + Arg [1..N] : List of named arguments + Bio::EnsEMBL::CoordSystem COORD_SYSTEM + string SEQ_REGION_NAME, + int START, + int END, + string VERSION (optional, defaults to '') + int STRAND, (optional, defaults to 1) + Bio::EnsEMBL::DBSQL::SliceAdaptor ADAPTOR (optional) + Arg[N+1] : string $individual_name + Example : $individualSlice = Bio::EnsEMBL::IndividualSlice->new(-coord_system => $cs, + -start => 1, + -end => 10000, + -strand => 1, + -seq_region_name => 'X', + -seq_region_length => 12e6, + -individual_name => $individual_name); + Description : Creates a new Bio::EnsEMBL::IndividualSlice object that will contain a shallow copy of the + Slice object, plus additional information such as the individual this Slice refers to + and listref of Bio::EnsEMBL::Variation::AlleleFeatures of differences with the + reference sequence + ReturnType : Bio::EnsEMBL::IndividualSlice + Exceptions : none + Caller : general + +=cut + +sub new{ + my $caller = shift; + my $class = ref($caller) || $caller; + + #create the IndividualSlice object as the Slice, plus the individual attribute + my ($individual_name, $sample_id) = rearrange(['INDIVIDUAL', 'SAMPLE_ID'],@_); + + my $self = $class->SUPER::new(@_); + + $self->{'individual_name'} = $individual_name; + $self->{'sample_id'} = $sample_id; + + return $self; + +} + +=head2 individual_name + + Arg [1] : (optional) string $individual_name + Example : my $individual_name = $individualSlice->individual_name(); + Description : Getter/Setter for the name of the individual in the slice + ReturnType : string + Exceptions : none + Caller : general + +=cut + +sub individual_name{ + my $self = shift; + if (@_){ + $self->{'individual_name'} = shift @_; + } + return $self->{'individual_name'}; +} + +=head2 seq + + Arg [1] : none + Example : print "SEQUENCE = ", $strainSlice->seq(); + Description: Returns the sequence of the region represented by this + StrainSlice formatted as a string. + Returntype : string + Exceptions : none + Caller : general + +=cut + +sub seq { + my $self = shift; + + # special case for in-between (insert) coordinates + return '' if($self->start() == $self->end() + 1); + + return $self->{'seq'} if($self->{'seq'}); + + if($self->adaptor()) { + my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); + my $reference_sequence = $seqAdaptor->fetch_by_Slice_start_end_strand($self,1,undef,1); #get the reference sequence for that slice + #apply all differences to the reference sequence + + # sort edits in reverse order to remove complication of + # adjusting downstream edits + my @allele_features_ordered = sort {$b->start() <=> $a->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + + foreach my $af (@allele_features_ordered){ + $af->apply_edit($reference_sequence); #change, in the reference sequence, the af + } +# return substr(${$reference_sequence},0,1) if ($self->length == 1); + return ${$reference_sequence}; #returns the reference sequence, applying the alleleFeatures + } + + # no attached sequence, and no db, so just return Ns + return 'N' x $self->length(); +} + +=head2 get_all_differences_Slice + + Args : none + Example : my $differences = $individualSlice->get_all_differences_Slice() + Description : Gets all differences between the IndividualSlice object and the Slice is defined + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : none + Caller : general + +=cut + +sub get_all_differences_Slice{ + my $self = shift; + my $differences; #reference to the array with the differences between Slice and StrainSlice + my $ref_allele; + foreach my $difference (@{$self->{'alleleFeatures'}}){ + if ($difference->length_diff == 0){ + #the difference is a SNP, check if it is the same as the reference allele + $ref_allele = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand); + $ref_allele = '-' if ($ref_allele eq ''); + if ($ref_allele ne $difference->allele_string){ + #when the alleleFeature is different from the reference allele, add to the differences list + push @{$differences},$difference; + } + } + else{ + push @{$differences},$difference; + } + } + + return $differences; + +} + +=head2 get_all_differences_IndividualSlice + + Arg[1] : Bio::EnsEMBL::IndividualSlice $is + Example : my $differences = $individualSlice->get_all_differences_IndividualSlice($individualslice) + Description : Gets differences between 2 IndividualSlice objects + ReturnType : listref of Bio::EnsEMBL::Variation::AlleleFeature + Exceptions : thrown on bad argument + Caller : general + +=cut + +sub get_all_differences_IndividualSlice{ + my $self = shift; + my $individualSlice = shift; + + if (!ref($individualSlice) || !$individualSlice->isa('Bio::EnsEMBL::IndividualSlice')){ + throw('Bio::EnsEMBL::IndividualSlice arg expected'); + } + if ( @{$self->{'alleleFeatures'}} == 0 && @{$individualSlice->{'alleleFeatures'}} == 0){ + return undef; #there are no differences in any of the Individuals + + } + my $differences; #differences between individuals + if (@{$individualSlice->{'alleleFeatures'}} == 0){ + #need to create a copy of alleleFeature for the first Individual + foreach my $difference (@{$self->{'alleleFeatures'}}){ + my %vf = %$difference; + push @{$differences},bless \%vf,ref($difference); + } + } + elsif (@{$self->{'alleleFeatures'}} == 0){ + #need to create a copy of AlleleFeature, but changing the allele by the allele in the reference sequence + foreach my $difference (@{$individualSlice->{'alleleFeatures'}}){ + push @{$differences}, $individualSlice->_convert_difference($difference); + } + } + else{ + #both individuals have differences + #create a hash with the differences in the first slice + my %allele_features_self = map {$_->start.'-'.$_->end => $_} @{$self->{'alleleFeatures'}}; + foreach my $difference (@{$individualSlice->{'alleleFeatures'}}){ + #there is no difference in the other individual slice, convert the allele + if (!defined $allele_features_self{$difference->start.'-'.$difference->end}){ + push @{$differences},$individualSlice->_convert_difference($difference); + } + else{ + #if it is defined and have the same allele, delete from the hash since it is not a difference + #between the individuals + if ($allele_features_self{$difference->start.'-'.$difference->end}->allele_string eq $difference->allele_string){ + delete $allele_features_self{$difference->start.'-'.$difference->end}; + } + } + } + #and finally, make a shallow copy of the differences in the first individual + foreach my $difference (values %allele_features_self){ + my %vf = %$difference; + push @{$differences},bless \%vf,ref($difference); + } + + } + #need to map differences to the first individual, self, since the coordinates are in the Slice coordinate system + my $mapper = $self->mapper(); #now that we have the differences, map them in the IndividualSlice + my @results; + foreach my $difference (@{$differences}){ + @results = $mapper->map_coordinates('Slice',$difference->start,$difference->end,$difference->strand,'Slice'); + #we can have 3 possibilities: + #the difference is an insertion and when mapping returns the boundaries of the insertion in the IndividualSlice + if (@results == 2){ + #the first position in the result is the beginning of the insertion + if($results[0]->start < $results[1]->start){ + $difference->start($results[0]->end+1); + $difference->end($results[1]->start-1); + } + else{ + #it is the second position the beginning of the insertion + $difference->start($results[1]->end+1); + $difference->end($results[0]->start-1); + } + $difference->strand($results[0]->strand); + } + else{ + #it can be either a SNP or a deletion, and we have the coordinates in the result, etither a Bio::EnsEMBL::Mapper::Coordinate + # or a Bio::EnsEMBL::Mapper::IndelCoordinate + $difference->start($results[0]->start); + $difference->end($results[0]->end); + $difference->strand($results[0]->strand); + } + } + + return $differences; +} + +#for a given AlleleFeature, converts the allele into the reference allele and returns +#the converted AlleleFeature + +sub _convert_difference{ + my $self = shift; + my $difference = shift; + my %new_af = %$difference; #make a copy of the alleleFeature + #and change the allele with the one from the reference Slice + $new_af{'allele_string'} = $self->SUPER::subseq($difference->start,$difference->end,$difference->strand); + return bless \%new_af,ref($difference); +} + +=head2 mapper + + Args : none + Description: Getter for the mapper between the between the IndividualSlice and the Slice it refers to. + It is done automatically when necessary to create subSlice or to get the differences between individuals + Returntype : Bio::EnsEMBL::Mapper + Exceptions : none + Caller : Internal function + +=cut + +sub mapper{ + my $self = shift; + + if (@_) { + #allow to create again the mapper + delete $self->{'mapper'}; + } + if(!defined $self->{'mapper'}){ + #create the mapper between the Slice and StrainSlice + my $mapper = Bio::EnsEMBL::Mapper->new('Slice','IndividualSlice'); + #align with Slice + #get all the VariationFeatures in the Individual Slice, from start to end in the Slice + my @allele_features_ordered = sort {$a->start() <=> $b->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + + my $start_slice = 1; + my $end_slice; + my $start_individual = 1; + my $end_individual; + my $length_allele; + my $total_length_diff = 0; + #we will walk from left to right in the slice object, updating the start and end individual every time + #there is a new alleleFeature in the Individual + foreach my $allele_feature (@allele_features_ordered){ + #we have a insertion/deletion: marks the beginning of new slice move coordinates + if ($allele_feature->length_diff != 0){ + $total_length_diff += $allele_feature->length_diff; + $length_allele = $allele_feature->length + $allele_feature->length_diff(); #length of the allele in the Individual + $end_slice = $allele_feature->start() - 1; #set the end of the slice before the alleleFeature + if ($end_slice >= $start_slice){ + #normal cases (not with gaps) + $end_individual = $end_slice - $start_slice + $start_individual; #set the end of the individual from the beginning plus the offset + #add the sequence that maps + $mapper->add_map_coordinates('Slice',$start_slice,$end_slice,1,'IndividualSlice',$start_individual,$end_individual); + #and add the indel + $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $allele_feature->length,1,'IndividualSlice',$end_individual+1,$end_individual + $length_allele); + $start_individual = $end_individual + $length_allele + 1; #set the beginning of the individual after the allele + } + else{ + #add the indel + $mapper->add_indel_coordinates('Slice',$end_slice+1,$end_slice + $allele_feature->length,1,'IndividualSlice',$end_individual+1,$end_individual + $length_allele); + $start_individual += $length_allele; + } + $start_slice = $end_slice + $allele_feature->length+ 1; #set the beginning of the slice after the variation feature + } + } + if ($start_slice <= $self->length){ + #if we haven't reached the end of the IndividualSlice, add the final map coordinates between the individual and the slice + $mapper->add_map_coordinates('Slice',$start_slice,$self->length,1,'IndividualSlice',$start_individual,$start_individual + $self->length - $start_slice); + } + + $mapper->add_map_coordinates('Slice', -$self->start+1, 0,1, 'IndividualSlice', -$self->start +1,0) if ($self->start > 0); #before individualSlice + $mapper->add_map_coordinates('Slice', $self->length + 1,$self->seq_region_length - ($self->length +1),1, 'IndividualSlice', $self->length + 1 + $total_length_diff,$self->seq_region_length + $total_length_diff - ($self->length +1) ) if ($self->length <= $self->seq_region_length); #after strainSlice + $self->{'mapper'} = $mapper; + } + return $self->{'mapper'}; +} + +=head2 sub_Slice + + Arg 1 : int $start + Arg 2 : int $end + Arge [3] : int $strand + Example : none + Description: Makes another IndividualSlice that covers only part of this IndividualSlice + with the appropriate differences to the reference Slice + If a slice is requested which lies outside of the boundaries + of this function will return undef. This means that + behaviour will be consistant whether or not the slice is + attached to the database (i.e. if there is attached sequence + to the slice). Alternatively the expand() method or the + SliceAdaptor::fetch_by_region method can be used instead. + Returntype : Bio::EnsEMBL::IndividualSlice or undef if arguments are wrong + Exceptions : thrown when trying to get the subSlice in the middle of a + insertion + Caller : general + +=cut + +sub sub_Slice { + my ( $self, $start, $end, $strand ) = @_; + my $mapper = $self->mapper(); + #map from the Individual to the Slice to get the sub_Slice, and then, apply the differences in the subSlice + my @results = $mapper->map_coordinates('IndividualSlice',$start,$end,$strand,'IndividualSlice'); + my $new_start; + my $new_end; + my $new_strand; + my $new_seq; + #Get need start and end for the subSlice of the IndividualSlice + my @results_ordered = sort {$a->start <=> $b->start} grep {ref($_) eq 'Bio::EnsEMBL::Mapper::Coordinate'} @results; + $new_start = $results_ordered[0]->start(); + $new_strand = $results_ordered[0]->strand() if (ref($results_ordered[0]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); +# $new_strand = $results_ordered[-1]->strand() if (ref($results_ordered[-1]) eq 'Bio::EnsEMBL::Mapper::Coordinate'); + $new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice + + my $subSlice = $self->SUPER::sub_Slice($new_start,$new_end,$new_strand); + $subSlice->{'individual_name'} = $self->{'individual_name'}; + + my $new_alleles; #reference to an array that will contain the variationFeatures in the new subSlice + #update the VariationFeatures in the sub_Slice of the Individual + my %af; + my $new_allele_feature; + foreach my $alleleFeature (@{$self->{'alleleFeatures'}}){ + $new_allele_feature = $alleleFeature->transfer($subSlice); + #only transfer the coordinates to the SubSlice that are within the boundaries + if ($new_allele_feature->start >= 1 && $new_allele_feature->end <= $subSlice->length){ + push @{$new_alleles}, $new_allele_feature; + } + } + $subSlice->{'alleleFeatures'} = $new_alleles; + return $subSlice; + +} + +=head2 subseq + + Arg [1] : int $startBasePair + relative to start of slice, which is 1. + Arg [2] : int $endBasePair + relative to start of slice. + Arg [3] : (optional) int $strand + The strand of the individual slice to obtain sequence from. Default + value is 1. + Description: returns string of dna sequence + Returntype : txt + Exceptions : end should be at least as big as start + strand must be set + Caller : general + +=cut + +sub subseq { + my ( $self, $start, $end, $strand ) = @_; + + if ( $end+1 < $start ) { + throw("End coord + 1 is less than start coord"); + } + + # handle 'between' case for insertions + return '' if( $start == $end + 1); + + $strand = 1 unless(defined $strand); + + if ( $strand != -1 && $strand != 1 ) { + throw("Invalid strand [$strand] in call to Slice::subseq."); + } + + my $subseq; + my $seq; + if($self->adaptor){ + my $seqAdaptor = $self->adaptor()->db()->get_SequenceAdaptor(); + $subseq = ${$seqAdaptor->fetch_by_Slice_start_end_strand($self,$start,$end,$strand)}; #get the reference sequence for that slice + #apply all differences to the reference sequence + # sort edits in reverse order to remove complication of + # adjusting downstream edits + my @allele_features_ordered = sort {$b->start() <=> $a->start() || $b->end() <=> $a->end()} @{$self->{'alleleFeatures'}} if (defined $self->{'alleleFeatures'}); + my $af_start; + my $af_end; + foreach my $af (@allele_features_ordered){ + if (($af->start - $start +1 > 0) && ($end - $af->end > 0)){ + #save the current start and end of the alleleFeature before changing for apply_edit + $af_start = $af->start; + $af_end = $af->end; + #apply the difference if the feature is in the new slice + $af->start($af->start - $start +1); + $af->end($af->end - $start +1); + $af->apply_edit(\$subseq); #change, in the reference sequence, the af + #restore the initial values of alleleFeature start and end + $af->start($af_start); + $af->end($af_end); + + } + } + } + else { + ## check for gap at the beginning and pad it with Ns + if ($start < 1) { + $subseq = "N" x (1 - $start); + $start = 1; + } + $subseq .= substr ($self->seq(), $start-1, $end - $start + 1); + ## check for gap at the end and pad it with Ns + if ($end > $self->length()) { + $subseq .= "N" x ($end - $self->length()); + } + reverse_comp(\$subseq) if($strand == -1); + } + return $subseq; + +} + +=head2 get_all_Transcripts + + Args : None + Example : @transcripts = @{$individualslice->get_all_Transcripts)}; + Description: Gets all transcripts which overlap this Individual Slice. If you want to + specify a particular analysis or type, then you are better off + using get_all_Genes or get_all_Genes_by_type and iterating + through the transcripts of each gene. + Returntype : reference to a list of Bio::EnsEMBL::Transcripts + Exceptions : none + Caller : general + +=cut + +sub get_all_Transcripts { + my $self = shift; + + my $transcripts = $self->SUPER::get_all_Transcripts(1); + $self->map_to_Individual($transcripts); + + return $transcripts; +} + + +=head2 get_all_Exons + + Arg [1] : (optional) string $dbtype + The dbtype of exons to obtain. This assumes that the db has + been added to the DBAdaptor under this name (using the + DBConnection::add_db_adaptor method). + Example : @exons = @{$individualSlice->get_all_Exons}; + Description: Gets all exons which overlap this IndividualSlice. Note that these exons + will not be associated with any transcripts, so this may not + be terribly useful. + Returntype : reference to a list of Bio::EnsEMBL::Exons + Exceptions : none + Caller : general + +=cut + +sub get_all_Exons { + my $self = shift; + my $dbtype = shift; + + my $exons = $self->SUPER::get_all_Exons($dbtype); + $self->map_to_Individual($exons); #map the exons to the Individual + + return $exons; +} + +=head2 get_all_Genes + + Arg [1] : (optional) string $logic_name + The name of the analysis used to generate the genes to retrieve + Arg [2] : (optional) string $dbtype + The dbtype of genes to obtain. This assumes that the db has + been added to the DBAdaptor under this name (using the + DBConnection::add_db_adaptor method). + Example : @genes = @{$individualSlice->get_all_Genes}; + Description: Retrieves all genes that overlap this slice. + Returntype : listref of Bio::EnsEMBL::Genes + Exceptions : none + Caller : none + +=cut + +sub get_all_Genes{ + my ($self, $logic_name, $dbtype) = @_; + + my $genes = $self->SUPER::get_all_Genes($logic_name, $dbtype, 1); + + $self->map_to_Individual($genes); + + foreach my $gene (@{$genes}){ + $self->map_to_Individual($gene->get_all_Exons); #map the Exons to the Individual + $self->map_to_Individual($gene->get_all_Transcripts); #map the Transcripts to the Individual + } + + return $genes; +} + +=head2 map_to_Individual + + Arg[1] : ref $features + Example : $individualSlice->map_to_Individual($exons); + Description : Gets the features from the Slice and maps it in the IndividualSlice, using the mapper + between Slice and IndividualSlice + ReturnType : None + Exceptions : None + Caller : general + +=cut + +sub map_to_Individual{ + my $self = shift; + my $features = shift; + + my $mapper = $self->mapper(); + my (@results, @results_ordered, $new_start, $new_end, $new_strand); + #foreach of the transcripts, map them to the IndividualSlice and replace the Slice with the IndividualSlice + foreach my $feature (@{$features}){ + $feature->slice($self); #replace the IndividualSlice as the Slice for this feature (the Slice plus the AlleleFeatures) + #map from the Slice to the Individual Slice + my @results = $mapper->map_coordinates('Slice',$feature->start,$feature->end,$feature->strand,'Slice'); + #from the results, order them but filter out those that are not coordinates + @results_ordered = sort {$a->start <=> $b->start} grep {ref($_) eq 'Bio::EnsEMBL::Mapper::Coordinate'} @results; + $new_start = $results_ordered[0]->start(); + $new_strand = $results_ordered[0]->strand(); + $new_end = $results_ordered[-1]->end(); #get last element of the array, the end of the slice + $feature->start($new_start); #update new coordinates + $feature->end($new_end); + $feature->strand($new_strand); + } +} + +sub alleleFeatures{ + my $self = shift; + return $self->{'alleleFeatures'}; +} + +sub add_AlleleFeature{ + my $self = shift; + + if (@_){ + if(!ref($_[0]) || !$_[0]->isa('Bio::EnsEMBL::Variation::AlleleFeature')) { + throw("Bio::EnsEMBL::Variation::AlleleFeature argument expected"); + } + #add the alleleFeature to the individualSlice + push @{$self->{'alleleFeatures'}},shift; + } +} +1;