ensembl: variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm comparison

comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm @ 0:1f6dce3d34e0

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1f6dce3d34e0
+#
+# Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature
+#
+=head1 LICENSE
+Copyright (c) 1999-2011 The European Bioinformatics Institute and
+Genome Research Limited.  All rights reserved.
+This software is distributed under a modified Apache license.
+For license details, please see
+http://www.ensembl.org/info/about/code_licence.html
+=head1 CONTACT
+Please email comments or questions to the public Ensembl
+developers list at <ensembl-dev@ebi.ac.uk>.
+Questions may also be sent to the Ensembl help desk at
+<helpdesk@ensembl.org>.
+=head1 NAME
+Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe
+genomic mapping.
+=head1 SYNOPSIS
+use Bio::EnsEMBL::Funcgen::ProbeFeature;
+my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
+	-PROBE         => $probe,
+	-MISMATCHCOUNT => 0,
+	-SLICE         => $chr_1_slice,
+	-START         => 1_000_000,
+	-END           => 1_000_024,
+	-STRAND        => -1,
+-ANALYSIS      => $analysis,
+-CIGAR_STRING  => '1U2M426D2M1m21M',
+);
+=head1 DESCRIPTION
+An ProbeFeature object represents the genomic placement of an Probe
+object. The data are stored in the probe_feature table.
+=cut
+use strict;
+use warnings;
+package Bio::EnsEMBL::Funcgen::ProbeFeature;
+use Bio::EnsEMBL::Utils::Argument qw( rearrange );
+use Bio::EnsEMBL::Utils::Exception qw( throw );
+use Bio::EnsEMBL::Feature;
+use Bio::EnsEMBL::Funcgen::Storable;
+use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median);
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable);
+=head2 new
+Arg [-PROBE]        : Bio::EnsEMBL::Funcgen::Probe - probe
+A ProbeFeature must have a probe. This probe must already be stored if
+		you plan to store the feature.
+Arg [-MISMATCHCOUNT]: int
+Number of mismatches over the length of the probe.
+Arg [-SLICE]        : Bio::EnsEMBL::Slice
+The slice on which this feature is.
+Arg [-START]        : int
+The start coordinate of this feature relative to the start of the slice
+		it is sitting on. Coordinates start at 1 and are inclusive.
+Arg [-END]          : int
+The end coordinate of this feature relative to the start of the slice
+		it is sitting on. Coordinates start at 1 and are inclusive.
+Arg [-STRAND]       : int
+The orientation of this feature. Valid values are 1, -1 and 0.
+Arg [-dbID]         : (optional) int
+Internal database ID.
+Arg [-ADAPTOR]      : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor
+Database adaptor.
+Example    : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
+				   -PROBE         => $probe,
+				   -MISMATCHCOUNT => 0,
+				   -SLICE         => $chr_1_slice,
+				   -START         => 1_000_000,
+				   -END           => 1_000_024,
+				   -STRAND        => -1,
+				   -ANALYSIS      => $analysis,
+-CIGARLINE     => '15M2m3d4M',
+#Can represent transcript alignment as gapped genomic alignments
+#D(eletions) representing introns
+#Lowercase m's showing sequence mismatches
+			   );
+Description: Constructor for ProbeFeature objects.
+Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
+Exceptions : None
+Caller     : General
+Status     : At risk
+=cut
+sub new {
+my $caller = shift;
+my $class = ref($caller) || $caller;
+my $self = $class->SUPER::new(@_);
+my ($probe, $mismatchcount, $pid, $cig_line)
+= rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_);
+#remove mismatch?
+#mandatory args?
+#warn "creating probe feature with $pid";
+$self->{'probe_id'} = $pid if $pid;
+$self->probe($probe) if $probe;
+$self->mismatchcount($mismatchcount)  if defined $mismatchcount;#do not remove until probe mapping pipeline fixed
+$self->cigar_string($cig_line)          if defined $cig_line;
+#do we need to validate this against the db?  Grab from slice and create new if not present?  Will this be from the dnadb?
+#do we need this coordsys id if we're passing a slice?  We should have the method but not in here?
+return $self;
+}
+=head2 new_fast
+Args       : Hashref with all internal attributes set
+Example    : none
+Description: Quick and dirty version of new. Only works if the code is very
+disciplined.
+Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
+Exceptions : None
+Caller     : General
+Status     : At Risk
+=cut
+sub new_fast {
+bless ($_[1], $_[0]);
+}
+=head2 probeset
+Arg [1]    : (optional) string - probeset
+Example    : my $probeset = $feature->probeset();
+Description: Getter and setter for the probeset for this feature. Shortcut
+for $feature->probe->probeset(), which should be used instead.
+			   Probeset is not persisted if set with this method.
+Returntype : string
+Exceptions : None
+Caller     : General
+Status     : Medium Risk
+: Use $feature->probe->probeset() because this may be removed
+=cut
+sub probeset {
+my $self = shift;
+$self->{'probeset'} = shift if @_;
+if (! $self->{'probeset'}) {
+	  $self->{'probeset'} = $self->probe()->probeset();
+}
+	#We could bypass this entirely and call directly using proveset_id?
+return $self->{'probeset'};
+}
+#Only ever needs to be set in _objs_from_sth
+#This is to allow linkage of probe_feature glyphs without retrieving the probeset.
+sub probeset_id{
+my $self = shift;
+return $self->{'_probeset_id'};
+}
+=head2 mismatchcount
+Arg [1]    : int - number of mismatches
+Example    : my $mismatches = $feature->mismatchcount();
+Description: Getter and setter for number of mismatches for this feature.
+Returntype : int
+Exceptions : None
+Caller     : General
+Status     : High Risk
+=cut
+sub mismatchcount {
+my $self = shift;
+#replace with dynamic check of cigarline?
+$self->{'mismatchcount'} = shift if @_;
+return $self->{'mismatchcount'};
+}
+=head2 cigar_string
+Arg [1]    : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment)
+Example    : my $cg = $feature->cigar_string();
+Description: Getter and setter for number of the cigar line attribute for this feature.
+Returntype : str
+Exceptions : None
+Caller     : General
+Status     : High Risk
+=cut
+sub cigar_string {
+my $self = shift;
+$self->{'cigar_string'} = shift if @_;
+return $self->{'cigar_string'};
+}
+=head2 probe
+Arg [1]    : Bio::EnsEMBL::Funcgen::Probe - probe
+Example    : my $probe = $feature->probe();
+Description: Getter, setter and lazy loader of probe attribute for
+ProbeFeature objects. Features are retrieved from the database
+			   without attached probes, so retrieving probe information for a
+			   feature will involve another query.
+Returntype : Bio::EnsEMBL::Funcgen::Probe
+Exceptions : None
+Caller     : General
+Status     : at risk
+=cut
+sub probe {
+my $self = shift;
+my $probe = shift;
+#can we not use _probe_id here?
+#why is probe_id not set sometimes?
+#warn "in pf and probe is ".$self->{'probe_id'};
+if ($probe) {
+	#warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'};
+if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) {
+throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object');
+}
+$self->{'probe'} = $probe;
+}
+if ( ! defined $self->{'probe'}){
+	# && $self->dbID() && $self->adaptor() ) {
+#$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self);
+	#warn "fetching probe with dbID ".$self->probe_id();
+	$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id());
+}
+return $self->{'probe'};
+}
+=head2 probe_id
+Example    : my $probe_id = $pfeature->probe_id();
+Description: Getter for the probe db id of the ProbeFeature
+Returntype : int
+Exceptions : None
+Caller     : General
+Status     : at risk
+=cut
+sub probe_id{
+my $self = shift;
+return $self->{'probe_id'} || $self->probe->dbID();
+}
+=head2 get_results_by_channel_id
+Arg [1]    : int - channel_id (mandatory)
+Arg [2]    : string - Analysis name e.g. RawValue, VSN (optional)
+Example    : my @results = $feature->results();
+Description: Getter, setter and lazy loader of results attribute for
+ProbeFeature objects.
+Returntype : List ref to arrays containing ('score', 'Analysis logic_name');
+Exceptions : None
+Caller     : General
+Status     : Medium Risk
+=cut
+sub get_results_by_channel_id {
+my $self = shift;
+my $channel_id = shift;
+my $anal_name = shift;
+warn("This method not fully implemented, remove/deprecate?");
+#$self->{'results'} ||= {};
+$self->{'results_complete'} ||= 0;
+if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){
+#fetch all, set complete set flag
+$self->{'results_complete'} ||= 1 	if(! $anal_name);
+foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(),
+										  $channel_id, $anal_name)}){
+	$self->{'results'}{$$results_ref[1]} = $$results_ref[0];
+}
+}
+return $self->{'results'}
+}
+#The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment
+#This may be called with no preceding Experiment specificity
+#this would return results for all experiments
+#do we need to set a default Experiment?
+#THis should return both Chip and Channel based results
+#just Chip for now
+#maybe retrieve and hash all if not Analysis object passed?  Then return what?
+=head2 get_result_by_Analysis_ExperimentalChips
+Arg [1]    : Bio::EnsEMBL::Analysis
+Arg [2]    : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip
+Example    : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips);
+Description: Getter of results attribute for a given Analysis and set of ExperimentalChips
+Returntype : float
+Exceptions : Throws is no Analysis or ExperimentalChips are not passed?
+Caller     : General
+Status     : High Risk
+=cut
+#make ExperimentalChips optional?
+#or have ResultSetAdaptor?  Do we need a ResultSet?
+#may not have ExperimentalChip, so would need to return ec dbID aswell
+######This will break/return anomalous if
+#ECs are passed from different experiments
+#ECs are passed from different Arrays
+sub get_result_by_Analysis_ExperimentalChips{
+my ($self, $anal, $exp_chips) = @_;
+throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0);
+throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis");
+my (%query_ids, %all_ids, %ac_ids);
+my $anal_name = $anal->logic_name();
+foreach my $ec(@$exp_chips){
+throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects")
+	if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip");
+		#my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID();
+		#$array_id ||= $tmp_id;
+#throw("You have passed ExperimentalChips from different if($array_id != $tmp_id)
+#if(exists  $ac_ids{$ec->array_chip_id()}){
+#	throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates");
+#     }
+$ac_ids{$ec->array_chip_id()} = 1;
+$all_ids{$ec->dbID()} = 1;
+$query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()});
+}
+my @ec_ids = keys %query_ids;
+my @all_ids = keys %all_ids;
+#warn "ec ids @ec_ids\n";
+#warn "all ids @all_ids\n";
+#$self->{'results'} ||= {};
+#$self->{'results_complete'} ||= 0;#do we need this now?
+if((scalar(@all_ids) - scalar(@ec_ids))> 1){
+throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids");
+}
+elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){
+#fetch all, set complete set flag
+#$self->{'results_complete'} ||= 1 	if(! $anal_name);
+#would need to look up chip and channel analyses here and call relevant fetch
+#or pass the chip and then build the query as = or IN dependent on context of logic name
+#if there are multiple results, last one will overwrite others
+#could do foreach here to deal with retrieving all i.e. no logic name
+#Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips
+#build the cache based on logic name and table_id
+#cahce key??  should we cat the ec_ids together?
+my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(),
+												$anal,
+												\@ec_ids)};
+#Remove lines with no result
+while(@result_refs && (! $result_refs[0]->[0])){
+	shift @result_refs;
+}
+my $num_results = scalar(@result_refs);
+my ($result, $mpos);
+#throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1);
+#No sort needed as we sort in the query
+if($num_results == 1){
+	$result = $result_refs[0]->[0];
+}
+elsif($num_results == 2){#mean
+	$result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2;
+}
+elsif($num_results > 2){#median or mean of median flanks
+	$mpos = $num_results/2;
+	if($mpos =~ /\./){#true median
+	  $mpos =~ s/\..*//;
+	  $mpos ++;
+	  $result =  $result_refs[$mpos]->[0];
+	}else{
+	  $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ;
+	}
+}
+$self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result;
+}
+	#do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips
+	#how are we going to retrieve the result for one of many possible ec id keys?
+	#options, cat ec dbids as key, and grep them to find full key, then return result
+	#this may hide the duplicate chip problem
+	#If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result
+	#We shoulld pick up duplicates before this happens
+	#If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data
+	my @keys;
+	foreach my $id(@all_ids){
+	  my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
+	  #Hacky needs sorting, quick fix for release!!
+	  if(@tmp){
+	    push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
+	    last;
+	  }
+	}
+throw("Got more than one key for the results cache") if scalar(@keys) > 1;
+return $self->{'results'}{$anal_name}{$keys[0]};
+}
+#Will this be too slow, can we not do one query across all tables
+sub get_result_by_ResultSet{
+my ($self, $rset) = @_;
+my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset);
+return median($results);
+}
+1;

Mercurial > repos > mahtabm > ensembl

comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm @ 0:1f6dce3d34e0