Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,508 @@ +# +# Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature +# + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=head1 NAME + +Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe +genomic mapping. + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Funcgen::ProbeFeature; + +my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( + -PROBE => $probe, + -MISMATCHCOUNT => 0, + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -ANALYSIS => $analysis, + -CIGAR_STRING => '1U2M426D2M1m21M', +); + + +=head1 DESCRIPTION + +An ProbeFeature object represents the genomic placement of an Probe +object. The data are stored in the probe_feature table. + +=cut + +use strict; +use warnings; + +package Bio::EnsEMBL::Funcgen::ProbeFeature; + +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Feature; +use Bio::EnsEMBL::Funcgen::Storable; +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable); + + +=head2 new + + Arg [-PROBE] : Bio::EnsEMBL::Funcgen::Probe - probe + A ProbeFeature must have a probe. This probe must already be stored if + you plan to store the feature. + Arg [-MISMATCHCOUNT]: int + Number of mismatches over the length of the probe. + Arg [-SLICE] : Bio::EnsEMBL::Slice + The slice on which this feature is. + Arg [-START] : int + The start coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-END] : int + The end coordinate of this feature relative to the start of the slice + it is sitting on. Coordinates start at 1 and are inclusive. + Arg [-STRAND] : int + The orientation of this feature. Valid values are 1, -1 and 0. + Arg [-dbID] : (optional) int + Internal database ID. + Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor + Database adaptor. + Example : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( + -PROBE => $probe, + -MISMATCHCOUNT => 0, + -SLICE => $chr_1_slice, + -START => 1_000_000, + -END => 1_000_024, + -STRAND => -1, + -ANALYSIS => $analysis, + -CIGARLINE => '15M2m3d4M', + #Can represent transcript alignment as gapped genomic alignments + #D(eletions) representing introns + #Lowercase m's showing sequence mismatches + ); + Description: Constructor for ProbeFeature objects. + Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature + Exceptions : None + Caller : General + Status : At risk + +=cut + +sub new { + my $caller = shift; + + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_); + + my ($probe, $mismatchcount, $pid, $cig_line) + = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_); + + #remove mismatch? + #mandatory args? + + #warn "creating probe feature with $pid"; + $self->{'probe_id'} = $pid if $pid; + $self->probe($probe) if $probe; + $self->mismatchcount($mismatchcount) if defined $mismatchcount;#do not remove until probe mapping pipeline fixed + $self->cigar_string($cig_line) if defined $cig_line; + + #do we need to validate this against the db? Grab from slice and create new if not present? Will this be from the dnadb? + + #do we need this coordsys id if we're passing a slice? We should have the method but not in here? + + return $self; +} + +=head2 new_fast + + Args : Hashref with all internal attributes set + Example : none + Description: Quick and dirty version of new. Only works if the code is very + disciplined. + Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature + Exceptions : None + Caller : General + Status : At Risk + +=cut + +sub new_fast { + bless ($_[1], $_[0]); +} + +=head2 probeset + + Arg [1] : (optional) string - probeset + Example : my $probeset = $feature->probeset(); + Description: Getter and setter for the probeset for this feature. Shortcut + for $feature->probe->probeset(), which should be used instead. + Probeset is not persisted if set with this method. + Returntype : string + Exceptions : None + Caller : General + Status : Medium Risk + : Use $feature->probe->probeset() because this may be removed + +=cut + +sub probeset { + my $self = shift; + + $self->{'probeset'} = shift if @_; + + if (! $self->{'probeset'}) { + $self->{'probeset'} = $self->probe()->probeset(); + } + + #We could bypass this entirely and call directly using proveset_id? + + + return $self->{'probeset'}; +} + + +#Only ever needs to be set in _objs_from_sth +#This is to allow linkage of probe_feature glyphs without retrieving the probeset. + +sub probeset_id{ + my $self = shift; + + return $self->{'_probeset_id'}; +} + +=head2 mismatchcount + + Arg [1] : int - number of mismatches + Example : my $mismatches = $feature->mismatchcount(); + Description: Getter and setter for number of mismatches for this feature. + Returntype : int + Exceptions : None + Caller : General + Status : High Risk + +=cut + +sub mismatchcount { + my $self = shift; + + + #replace with dynamic check of cigarline? + + $self->{'mismatchcount'} = shift if @_; + + return $self->{'mismatchcount'}; +} + + +=head2 cigar_string + + Arg [1] : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment) + Example : my $cg = $feature->cigar_string(); + Description: Getter and setter for number of the cigar line attribute for this feature. + Returntype : str + Exceptions : None + Caller : General + Status : High Risk + +=cut + +sub cigar_string { + my $self = shift; + + $self->{'cigar_string'} = shift if @_; + + return $self->{'cigar_string'}; +} + + +=head2 probe + + Arg [1] : Bio::EnsEMBL::Funcgen::Probe - probe + Example : my $probe = $feature->probe(); + Description: Getter, setter and lazy loader of probe attribute for + ProbeFeature objects. Features are retrieved from the database + without attached probes, so retrieving probe information for a + feature will involve another query. + Returntype : Bio::EnsEMBL::Funcgen::Probe + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub probe { + my $self = shift; + my $probe = shift; + + #can we not use _probe_id here? + #why is probe_id not set sometimes? + + + #warn "in pf and probe is ".$self->{'probe_id'}; + + if ($probe) { + + #warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'}; + + if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) { + throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object'); + } + $self->{'probe'} = $probe; + } + + if ( ! defined $self->{'probe'}){ + # && $self->dbID() && $self->adaptor() ) { + #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self); + #warn "fetching probe with dbID ".$self->probe_id(); + $self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id()); + } + return $self->{'probe'}; +} + + +=head2 probe_id + + Example : my $probe_id = $pfeature->probe_id(); + Description: Getter for the probe db id of the ProbeFeature + Returntype : int + Exceptions : None + Caller : General + Status : at risk + +=cut + +sub probe_id{ + my $self = shift; + + return $self->{'probe_id'} || $self->probe->dbID(); +} + +=head2 get_results_by_channel_id + + Arg [1] : int - channel_id (mandatory) + Arg [2] : string - Analysis name e.g. RawValue, VSN (optional) + Example : my @results = $feature->results(); + Description: Getter, setter and lazy loader of results attribute for + ProbeFeature objects. + Returntype : List ref to arrays containing ('score', 'Analysis logic_name'); + Exceptions : None + Caller : General + Status : Medium Risk + +=cut + +sub get_results_by_channel_id { + my $self = shift; + my $channel_id = shift; + my $anal_name = shift; + + warn("This method not fully implemented, remove/deprecate?"); + + #$self->{'results'} ||= {}; + $self->{'results_complete'} ||= 0; + + if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){ + #fetch all, set complete set flag + $self->{'results_complete'} ||= 1 if(! $anal_name); + + foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(), + $channel_id, $anal_name)}){ + + $self->{'results'}{$$results_ref[1]} = $$results_ref[0]; + } + } + + return $self->{'results'} +} + + +#The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment +#This may be called with no preceding Experiment specificity +#this would return results for all experiments +#do we need to set a default Experiment? + + +#THis should return both Chip and Channel based results +#just Chip for now +#maybe retrieve and hash all if not Analysis object passed? Then return what? + + +=head2 get_result_by_Analysis_ExperimentalChips + + Arg [1] : Bio::EnsEMBL::Analysis + Arg [2] : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip + Example : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips); + Description: Getter of results attribute for a given Analysis and set of ExperimentalChips + Returntype : float + Exceptions : Throws is no Analysis or ExperimentalChips are not passed? + Caller : General + Status : High Risk + +=cut + + +#make ExperimentalChips optional? + +#or have ResultSetAdaptor? Do we need a ResultSet? +#may not have ExperimentalChip, so would need to return ec dbID aswell + + +######This will break/return anomalous if +#ECs are passed from different experiments +#ECs are passed from different Arrays + + +sub get_result_by_Analysis_ExperimentalChips{ + my ($self, $anal, $exp_chips) = @_; + + throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0); + throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis"); + + my (%query_ids, %all_ids, %ac_ids); + my $anal_name = $anal->logic_name(); + + foreach my $ec(@$exp_chips){ + + throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects") + if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip"); + + #my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID(); + + #$array_id ||= $tmp_id; + + #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id) + + #if(exists $ac_ids{$ec->array_chip_id()}){ +# throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates"); + # } + + $ac_ids{$ec->array_chip_id()} = 1; + $all_ids{$ec->dbID()} = 1; + $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()}); + + } + + + my @ec_ids = keys %query_ids; + my @all_ids = keys %all_ids; + + + #warn "ec ids @ec_ids\n"; + #warn "all ids @all_ids\n"; + + #$self->{'results'} ||= {}; + #$self->{'results_complete'} ||= 0;#do we need this now? + + if((scalar(@all_ids) - scalar(@ec_ids))> 1){ + throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids"); + } + elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){ + #fetch all, set complete set flag + #$self->{'results_complete'} ||= 1 if(! $anal_name); + #would need to look up chip and channel analyses here and call relevant fetch + #or pass the chip and then build the query as = or IN dependent on context of logic name + #if there are multiple results, last one will overwrite others + #could do foreach here to deal with retrieving all i.e. no logic name + #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips + #build the cache based on logic name and table_id + #cahce key?? should we cat the ec_ids together? + + my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(), + $anal, + \@ec_ids)}; + + #Remove lines with no result + while(@result_refs && (! $result_refs[0]->[0])){ + shift @result_refs; + } + + my $num_results = scalar(@result_refs); + my ($result, $mpos); + #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1); + + #No sort needed as we sort in the query + + if($num_results == 1){ + $result = $result_refs[0]->[0]; + } + elsif($num_results == 2){#mean + $result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2; + + } + elsif($num_results > 2){#median or mean of median flanks + $mpos = $num_results/2; + + if($mpos =~ /\./){#true median + $mpos =~ s/\..*//; + $mpos ++; + $result = $result_refs[$mpos]->[0]; + }else{ + $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ; + } + } + + $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result; + } + + #do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips + + #how are we going to retrieve the result for one of many possible ec id keys? + #options, cat ec dbids as key, and grep them to find full key, then return result + #this may hide the duplicate chip problem + #If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result + #We shoulld pick up duplicates before this happens + #If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data + + my @keys; + foreach my $id(@all_ids){ + my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); + #Hacky needs sorting, quick fix for release!! + + if(@tmp){ + push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); + + last; + } + + } + + throw("Got more than one key for the results cache") if scalar(@keys) > 1; + + return $self->{'results'}{$anal_name}{$keys[0]}; +} + + +#Will this be too slow, can we not do one query across all tables + +sub get_result_by_ResultSet{ + my ($self, $rset) = @_; + + my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset); + + return median($results); +} + + + + + +1; +