Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,508 @@
+#
+# Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature
+#
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe
+genomic mapping.
+
+=head1 SYNOPSIS
+
+use Bio::EnsEMBL::Funcgen::ProbeFeature;
+
+my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
+	-PROBE         => $probe,
+	-MISMATCHCOUNT => 0,
+	-SLICE         => $chr_1_slice,
+	-START         => 1_000_000,
+	-END           => 1_000_024,
+	-STRAND        => -1,
+    -ANALYSIS      => $analysis,
+    -CIGAR_STRING  => '1U2M426D2M1m21M',
+); 
+
+
+=head1 DESCRIPTION
+
+An ProbeFeature object represents the genomic placement of an Probe
+object. The data are stored in the probe_feature table.
+
+=cut
+
+use strict;
+use warnings;
+
+package Bio::EnsEMBL::Funcgen::ProbeFeature;
+
+use Bio::EnsEMBL::Utils::Argument qw( rearrange );
+use Bio::EnsEMBL::Utils::Exception qw( throw );
+use Bio::EnsEMBL::Feature;
+use Bio::EnsEMBL::Funcgen::Storable;
+use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median);
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable);
+
+
+=head2 new
+
+  Arg [-PROBE]        : Bio::EnsEMBL::Funcgen::Probe - probe
+        A ProbeFeature must have a probe. This probe must already be stored if
+		you plan to store the feature.
+  Arg [-MISMATCHCOUNT]: int
+        Number of mismatches over the length of the probe. 
+  Arg [-SLICE]        : Bio::EnsEMBL::Slice
+        The slice on which this feature is.
+  Arg [-START]        : int
+        The start coordinate of this feature relative to the start of the slice
+		it is sitting on. Coordinates start at 1 and are inclusive.
+  Arg [-END]          : int
+        The end coordinate of this feature relative to the start of the slice
+		it is sitting on. Coordinates start at 1 and are inclusive.
+  Arg [-STRAND]       : int
+        The orientation of this feature. Valid values are 1, -1 and 0.
+  Arg [-dbID]         : (optional) int
+        Internal database ID.
+  Arg [-ADAPTOR]      : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor
+        Database adaptor.
+  Example    : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
+				   -PROBE         => $probe,
+				   -MISMATCHCOUNT => 0,
+				   -SLICE         => $chr_1_slice,
+				   -START         => 1_000_000,
+				   -END           => 1_000_024,
+				   -STRAND        => -1,
+				   -ANALYSIS      => $analysis,
+                   -CIGARLINE     => '15M2m3d4M', 
+                   #Can represent transcript alignment as gapped genomic alignments
+                   #D(eletions) representing introns
+                   #Lowercase m's showing sequence mismatches
+			   ); 
+  Description: Constructor for ProbeFeature objects.
+  Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
+  Exceptions : None
+  Caller     : General
+  Status     : At risk
+
+=cut
+
+sub new {
+  my $caller = shift;
+	
+  my $class = ref($caller) || $caller;
+	
+  my $self = $class->SUPER::new(@_);
+	
+  my ($probe, $mismatchcount, $pid, $cig_line)
+    = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_);
+
+  #remove mismatch?
+  #mandatory args?
+  
+  #warn "creating probe feature with $pid";
+  $self->{'probe_id'} = $pid if $pid;
+  $self->probe($probe) if $probe;
+  $self->mismatchcount($mismatchcount)  if defined $mismatchcount;#do not remove until probe mapping pipeline fixed
+  $self->cigar_string($cig_line)          if defined $cig_line;
+   
+  #do we need to validate this against the db?  Grab from slice and create new if not present?  Will this be from the dnadb?
+  
+  #do we need this coordsys id if we're passing a slice?  We should have the method but not in here?
+
+  return $self;
+}
+
+=head2 new_fast
+
+  Args       : Hashref with all internal attributes set
+  Example    : none
+  Description: Quick and dirty version of new. Only works if the code is very
+               disciplined. 
+  Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub new_fast {
+  bless ($_[1], $_[0]);
+}
+
+=head2 probeset
+
+  Arg [1]    : (optional) string - probeset
+  Example    : my $probeset = $feature->probeset();
+  Description: Getter and setter for the probeset for this feature. Shortcut
+               for $feature->probe->probeset(), which should be used instead.
+			   Probeset is not persisted if set with this method.
+  Returntype : string
+  Exceptions : None
+  Caller     : General
+  Status     : Medium Risk
+             : Use $feature->probe->probeset() because this may be removed
+
+=cut
+
+sub probeset {
+    my $self = shift;
+	
+    $self->{'probeset'} = shift if @_;
+	
+    if (! $self->{'probeset'}) {
+	  $self->{'probeset'} = $self->probe()->probeset();
+    }
+
+	#We could bypass this entirely and call directly using proveset_id?
+
+	
+    return $self->{'probeset'};
+}
+
+
+#Only ever needs to be set in _objs_from_sth
+#This is to allow linkage of probe_feature glyphs without retrieving the probeset.
+
+sub probeset_id{
+  my $self = shift;
+
+  return $self->{'_probeset_id'};
+}
+
+=head2 mismatchcount
+
+  Arg [1]    : int - number of mismatches
+  Example    : my $mismatches = $feature->mismatchcount();
+  Description: Getter and setter for number of mismatches for this feature.
+  Returntype : int
+  Exceptions : None
+  Caller     : General
+  Status     : High Risk
+
+=cut
+
+sub mismatchcount {
+    my $self = shift;
+	
+
+    #replace with dynamic check of cigarline?
+
+    $self->{'mismatchcount'} = shift if @_;
+	
+    return $self->{'mismatchcount'};
+}
+
+
+=head2 cigar_string
+
+  Arg [1]    : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment)
+  Example    : my $cg = $feature->cigar_string();
+  Description: Getter and setter for number of the cigar line attribute for this feature.
+  Returntype : str
+  Exceptions : None
+  Caller     : General
+  Status     : High Risk
+
+=cut
+
+sub cigar_string {
+  my $self = shift;
+  
+  $self->{'cigar_string'} = shift if @_;
+	
+  return $self->{'cigar_string'};
+}
+
+
+=head2 probe
+
+  Arg [1]    : Bio::EnsEMBL::Funcgen::Probe - probe
+  Example    : my $probe = $feature->probe();
+  Description: Getter, setter and lazy loader of probe attribute for
+               ProbeFeature objects. Features are retrieved from the database
+			   without attached probes, so retrieving probe information for a
+			   feature will involve another query.
+  Returntype : Bio::EnsEMBL::Funcgen::Probe
+  Exceptions : None
+  Caller     : General
+  Status     : at risk
+
+=cut
+
+sub probe {
+  my $self = shift;
+  my $probe = shift;
+  
+  #can we not use _probe_id here?
+  #why is probe_id not set sometimes?
+  
+  
+  #warn "in pf and probe is ".$self->{'probe_id'};
+
+  if ($probe) {
+
+	#warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'};
+    
+    if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) {
+      throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object');
+    }
+    $self->{'probe'} = $probe;
+  }
+
+  if ( ! defined $self->{'probe'}){
+	# && $self->dbID() && $self->adaptor() ) {
+    #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self);
+	#warn "fetching probe with dbID ".$self->probe_id();
+	$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id());
+  }
+  return $self->{'probe'};
+}
+
+
+=head2 probe_id
+
+  Example    : my $probe_id = $pfeature->probe_id();
+  Description: Getter for the probe db id of the ProbeFeature
+  Returntype : int
+  Exceptions : None
+  Caller     : General
+  Status     : at risk
+
+=cut
+
+sub probe_id{
+  my $self = shift;
+
+  return $self->{'probe_id'} || $self->probe->dbID();
+}
+
+=head2 get_results_by_channel_id
+
+  Arg [1]    : int - channel_id (mandatory)
+  Arg [2]    : string - Analysis name e.g. RawValue, VSN (optional)
+  Example    : my @results = $feature->results();
+  Description: Getter, setter and lazy loader of results attribute for
+               ProbeFeature objects.
+  Returntype : List ref to arrays containing ('score', 'Analysis logic_name');
+  Exceptions : None
+  Caller     : General
+  Status     : Medium Risk
+
+=cut
+
+sub get_results_by_channel_id {
+    my $self = shift;
+    my $channel_id = shift;
+    my $anal_name = shift;
+
+    warn("This method not fully implemented, remove/deprecate?");
+
+    #$self->{'results'} ||= {};
+    $self->{'results_complete'} ||= 0;
+	
+    if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){
+      #fetch all, set complete set flag
+      $self->{'results_complete'} ||= 1 	if(! $anal_name);
+      
+      foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(), 
+										  $channel_id, $anal_name)}){
+	
+	$self->{'results'}{$$results_ref[1]} = $$results_ref[0];
+      }
+    }
+    
+    return $self->{'results'}
+}
+
+
+#The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment
+#This may be called with no preceding Experiment specificity
+#this would return results for all experiments
+#do we need to set a default Experiment?
+
+
+#THis should return both Chip and Channel based results
+#just Chip for now
+#maybe retrieve and hash all if not Analysis object passed?  Then return what?  
+
+
+=head2 get_result_by_Analysis_ExperimentalChips
+
+  Arg [1]    : Bio::EnsEMBL::Analysis
+  Arg [2]    : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip
+  Example    : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips);
+  Description: Getter of results attribute for a given Analysis and set of ExperimentalChips
+  Returntype : float
+  Exceptions : Throws is no Analysis or ExperimentalChips are not passed?
+  Caller     : General
+  Status     : High Risk
+
+=cut
+
+
+#make ExperimentalChips optional?
+
+#or have ResultSetAdaptor?  Do we need a ResultSet?
+#may not have ExperimentalChip, so would need to return ec dbID aswell
+
+
+######This will break/return anomalous if
+#ECs are passed from different experiments
+#ECs are passed from different Arrays
+
+
+sub get_result_by_Analysis_ExperimentalChips{
+    my ($self, $anal, $exp_chips) = @_;
+
+    throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0);
+    throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis");
+
+    my (%query_ids, %all_ids, %ac_ids);
+    my $anal_name = $anal->logic_name();
+    
+    foreach my $ec(@$exp_chips){
+				
+      throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects") 
+	if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip");
+
+		#my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID();
+      
+		#$array_id ||= $tmp_id;
+      
+      #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id)
+      
+      #if(exists  $ac_ids{$ec->array_chip_id()}){
+#	throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates");
+ #     }
+      
+      $ac_ids{$ec->array_chip_id()} = 1;
+      $all_ids{$ec->dbID()} = 1;
+      $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()});
+      
+    }
+    
+    
+    my @ec_ids = keys %query_ids;
+    my @all_ids = keys %all_ids;
+    
+    
+    #warn "ec ids @ec_ids\n";
+    #warn "all ids @all_ids\n";
+    
+    #$self->{'results'} ||= {};
+    #$self->{'results_complete'} ||= 0;#do we need this now?
+    
+    if((scalar(@all_ids) - scalar(@ec_ids))> 1){
+      throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids");
+    }		
+    elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){
+      #fetch all, set complete set flag
+      #$self->{'results_complete'} ||= 1 	if(! $anal_name);
+      #would need to look up chip and channel analyses here and call relevant fetch
+      #or pass the chip and then build the query as = or IN dependent on context of logic name
+      #if there are multiple results, last one will overwrite others
+      #could do foreach here to deal with retrieving all i.e. no logic name
+      #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips
+      #build the cache based on logic name and table_id
+      #cahce key??  should we cat the ec_ids together?
+
+      my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(), 
+												$anal,
+												\@ec_ids)};
+
+      #Remove lines with no result
+      while(@result_refs && (! $result_refs[0]->[0])){
+	shift @result_refs;
+      }
+
+      my $num_results = scalar(@result_refs);
+      my ($result, $mpos);
+      #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1);
+
+      #No sort needed as we sort in the query
+
+      if($num_results == 1){
+	$result = $result_refs[0]->[0];
+      }
+      elsif($num_results == 2){#mean
+	$result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2;
+    
+      }
+      elsif($num_results > 2){#median or mean of median flanks
+	$mpos = $num_results/2;
+    
+	if($mpos =~ /\./){#true median
+	  $mpos =~ s/\..*//;
+	  $mpos ++;
+	  $result =  $result_refs[$mpos]->[0];
+	}else{
+	  $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ;
+	}
+      }
+      
+      $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result;
+    }
+
+	#do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips
+
+	#how are we going to retrieve the result for one of many possible ec id keys?
+	#options, cat ec dbids as key, and grep them to find full key, then return result
+	#this may hide the duplicate chip problem
+	#If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result
+	#We shoulld pick up duplicates before this happens
+	#If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data
+	
+	my @keys;
+	foreach my $id(@all_ids){
+	  my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
+	  #Hacky needs sorting, quick fix for release!!
+
+	  if(@tmp){
+	    push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
+
+	    last;
+	  }
+
+	}
+
+    throw("Got more than one key for the results cache") if scalar(@keys) > 1;
+
+    return $self->{'results'}{$anal_name}{$keys[0]};
+}
+
+
+#Will this be too slow, can we not do one query across all tables
+
+sub get_result_by_ResultSet{
+    my ($self, $rset) = @_;
+
+    my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset);
+   
+    return median($results);
+}
+
+
+
+
+
+1;
+
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children