Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line source
# # Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature # =head1 LICENSE Copyright (c) 1999-2011 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =head1 NAME Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe genomic mapping. =head1 SYNOPSIS use Bio::EnsEMBL::Funcgen::ProbeFeature; my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( -PROBE => $probe, -MISMATCHCOUNT => 0, -SLICE => $chr_1_slice, -START => 1_000_000, -END => 1_000_024, -STRAND => -1, -ANALYSIS => $analysis, -CIGAR_STRING => '1U2M426D2M1m21M', ); =head1 DESCRIPTION An ProbeFeature object represents the genomic placement of an Probe object. The data are stored in the probe_feature table. =cut use strict; use warnings; package Bio::EnsEMBL::Funcgen::ProbeFeature; use Bio::EnsEMBL::Utils::Argument qw( rearrange ); use Bio::EnsEMBL::Utils::Exception qw( throw ); use Bio::EnsEMBL::Feature; use Bio::EnsEMBL::Funcgen::Storable; use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median); use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable); =head2 new Arg [-PROBE] : Bio::EnsEMBL::Funcgen::Probe - probe A ProbeFeature must have a probe. This probe must already be stored if you plan to store the feature. Arg [-MISMATCHCOUNT]: int Number of mismatches over the length of the probe. Arg [-SLICE] : Bio::EnsEMBL::Slice The slice on which this feature is. Arg [-START] : int The start coordinate of this feature relative to the start of the slice it is sitting on. Coordinates start at 1 and are inclusive. Arg [-END] : int The end coordinate of this feature relative to the start of the slice it is sitting on. Coordinates start at 1 and are inclusive. Arg [-STRAND] : int The orientation of this feature. Valid values are 1, -1 and 0. Arg [-dbID] : (optional) int Internal database ID. Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor Database adaptor. Example : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( -PROBE => $probe, -MISMATCHCOUNT => 0, -SLICE => $chr_1_slice, -START => 1_000_000, -END => 1_000_024, -STRAND => -1, -ANALYSIS => $analysis, -CIGARLINE => '15M2m3d4M', #Can represent transcript alignment as gapped genomic alignments #D(eletions) representing introns #Lowercase m's showing sequence mismatches ); Description: Constructor for ProbeFeature objects. Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature Exceptions : None Caller : General Status : At risk =cut sub new { my $caller = shift; my $class = ref($caller) || $caller; my $self = $class->SUPER::new(@_); my ($probe, $mismatchcount, $pid, $cig_line) = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_); #remove mismatch? #mandatory args? #warn "creating probe feature with $pid"; $self->{'probe_id'} = $pid if $pid; $self->probe($probe) if $probe; $self->mismatchcount($mismatchcount) if defined $mismatchcount;#do not remove until probe mapping pipeline fixed $self->cigar_string($cig_line) if defined $cig_line; #do we need to validate this against the db? Grab from slice and create new if not present? Will this be from the dnadb? #do we need this coordsys id if we're passing a slice? We should have the method but not in here? return $self; } =head2 new_fast Args : Hashref with all internal attributes set Example : none Description: Quick and dirty version of new. Only works if the code is very disciplined. Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature Exceptions : None Caller : General Status : At Risk =cut sub new_fast { bless ($_[1], $_[0]); } =head2 probeset Arg [1] : (optional) string - probeset Example : my $probeset = $feature->probeset(); Description: Getter and setter for the probeset for this feature. Shortcut for $feature->probe->probeset(), which should be used instead. Probeset is not persisted if set with this method. Returntype : string Exceptions : None Caller : General Status : Medium Risk : Use $feature->probe->probeset() because this may be removed =cut sub probeset { my $self = shift; $self->{'probeset'} = shift if @_; if (! $self->{'probeset'}) { $self->{'probeset'} = $self->probe()->probeset(); } #We could bypass this entirely and call directly using proveset_id? return $self->{'probeset'}; } #Only ever needs to be set in _objs_from_sth #This is to allow linkage of probe_feature glyphs without retrieving the probeset. sub probeset_id{ my $self = shift; return $self->{'_probeset_id'}; } =head2 mismatchcount Arg [1] : int - number of mismatches Example : my $mismatches = $feature->mismatchcount(); Description: Getter and setter for number of mismatches for this feature. Returntype : int Exceptions : None Caller : General Status : High Risk =cut sub mismatchcount { my $self = shift; #replace with dynamic check of cigarline? $self->{'mismatchcount'} = shift if @_; return $self->{'mismatchcount'}; } =head2 cigar_string Arg [1] : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment) Example : my $cg = $feature->cigar_string(); Description: Getter and setter for number of the cigar line attribute for this feature. Returntype : str Exceptions : None Caller : General Status : High Risk =cut sub cigar_string { my $self = shift; $self->{'cigar_string'} = shift if @_; return $self->{'cigar_string'}; } =head2 probe Arg [1] : Bio::EnsEMBL::Funcgen::Probe - probe Example : my $probe = $feature->probe(); Description: Getter, setter and lazy loader of probe attribute for ProbeFeature objects. Features are retrieved from the database without attached probes, so retrieving probe information for a feature will involve another query. Returntype : Bio::EnsEMBL::Funcgen::Probe Exceptions : None Caller : General Status : at risk =cut sub probe { my $self = shift; my $probe = shift; #can we not use _probe_id here? #why is probe_id not set sometimes? #warn "in pf and probe is ".$self->{'probe_id'}; if ($probe) { #warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'}; if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) { throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object'); } $self->{'probe'} = $probe; } if ( ! defined $self->{'probe'}){ # && $self->dbID() && $self->adaptor() ) { #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self); #warn "fetching probe with dbID ".$self->probe_id(); $self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id()); } return $self->{'probe'}; } =head2 probe_id Example : my $probe_id = $pfeature->probe_id(); Description: Getter for the probe db id of the ProbeFeature Returntype : int Exceptions : None Caller : General Status : at risk =cut sub probe_id{ my $self = shift; return $self->{'probe_id'} || $self->probe->dbID(); } =head2 get_results_by_channel_id Arg [1] : int - channel_id (mandatory) Arg [2] : string - Analysis name e.g. RawValue, VSN (optional) Example : my @results = $feature->results(); Description: Getter, setter and lazy loader of results attribute for ProbeFeature objects. Returntype : List ref to arrays containing ('score', 'Analysis logic_name'); Exceptions : None Caller : General Status : Medium Risk =cut sub get_results_by_channel_id { my $self = shift; my $channel_id = shift; my $anal_name = shift; warn("This method not fully implemented, remove/deprecate?"); #$self->{'results'} ||= {}; $self->{'results_complete'} ||= 0; if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){ #fetch all, set complete set flag $self->{'results_complete'} ||= 1 if(! $anal_name); foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(), $channel_id, $anal_name)}){ $self->{'results'}{$$results_ref[1]} = $$results_ref[0]; } } return $self->{'results'} } #The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment #This may be called with no preceding Experiment specificity #this would return results for all experiments #do we need to set a default Experiment? #THis should return both Chip and Channel based results #just Chip for now #maybe retrieve and hash all if not Analysis object passed? Then return what? =head2 get_result_by_Analysis_ExperimentalChips Arg [1] : Bio::EnsEMBL::Analysis Arg [2] : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip Example : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips); Description: Getter of results attribute for a given Analysis and set of ExperimentalChips Returntype : float Exceptions : Throws is no Analysis or ExperimentalChips are not passed? Caller : General Status : High Risk =cut #make ExperimentalChips optional? #or have ResultSetAdaptor? Do we need a ResultSet? #may not have ExperimentalChip, so would need to return ec dbID aswell ######This will break/return anomalous if #ECs are passed from different experiments #ECs are passed from different Arrays sub get_result_by_Analysis_ExperimentalChips{ my ($self, $anal, $exp_chips) = @_; throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0); throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis"); my (%query_ids, %all_ids, %ac_ids); my $anal_name = $anal->logic_name(); foreach my $ec(@$exp_chips){ throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects") if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip"); #my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID(); #$array_id ||= $tmp_id; #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id) #if(exists $ac_ids{$ec->array_chip_id()}){ # throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates"); # } $ac_ids{$ec->array_chip_id()} = 1; $all_ids{$ec->dbID()} = 1; $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()}); } my @ec_ids = keys %query_ids; my @all_ids = keys %all_ids; #warn "ec ids @ec_ids\n"; #warn "all ids @all_ids\n"; #$self->{'results'} ||= {}; #$self->{'results_complete'} ||= 0;#do we need this now? if((scalar(@all_ids) - scalar(@ec_ids))> 1){ throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids"); } elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){ #fetch all, set complete set flag #$self->{'results_complete'} ||= 1 if(! $anal_name); #would need to look up chip and channel analyses here and call relevant fetch #or pass the chip and then build the query as = or IN dependent on context of logic name #if there are multiple results, last one will overwrite others #could do foreach here to deal with retrieving all i.e. no logic name #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips #build the cache based on logic name and table_id #cahce key?? should we cat the ec_ids together? my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(), $anal, \@ec_ids)}; #Remove lines with no result while(@result_refs && (! $result_refs[0]->[0])){ shift @result_refs; } my $num_results = scalar(@result_refs); my ($result, $mpos); #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1); #No sort needed as we sort in the query if($num_results == 1){ $result = $result_refs[0]->[0]; } elsif($num_results == 2){#mean $result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2; } elsif($num_results > 2){#median or mean of median flanks $mpos = $num_results/2; if($mpos =~ /\./){#true median $mpos =~ s/\..*//; $mpos ++; $result = $result_refs[$mpos]->[0]; }else{ $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ; } } $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result; } #do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips #how are we going to retrieve the result for one of many possible ec id keys? #options, cat ec dbids as key, and grep them to find full key, then return result #this may hide the duplicate chip problem #If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result #We shoulld pick up duplicates before this happens #If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data my @keys; foreach my $id(@all_ids){ my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); #Hacky needs sorting, quick fix for release!! if(@tmp){ push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); last; } } throw("Got more than one key for the results cache") if scalar(@keys) > 1; return $self->{'results'}{$anal_name}{$keys[0]}; } #Will this be too slow, can we not do one query across all tables sub get_result_by_ResultSet{ my ($self, $rset) = @_; my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset); return median($results); } 1;