Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/DataSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
#
# Ensembl module for Bio::EnsEMBL::Funcgen::DataSet
#
# You may distribute this module under the same terms as Perl itself


=head1 LICENSE

  Copyright (c) 1999-2011 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.


=head1 NAME

Bio::EnsEMBL::Funcgen::DataSet - A module to represent DataSet object.


=head1 SYNOPSIS

use Bio::EnsEMBL::Funcgen::DataSet;

my $data_set = Bio::EnsEMBL::Funcgen::DataSet->new(
                                                   -DBID            => $dbID,
                                                   -ADAPTOR         => $self,
                                                   -SUPPORTING_SETS => [$rset],
                                                   -FEATURE_SET     => $fset,
                                                   -DISPLAYABLE     => 1,
                                                   -NAME            => 'DATASET1',
                                                  );



=head1 DESCRIPTION

A DataSet object provides access to either or both raw results and AnnotatedFeatures
for a given experiment within a Slice, associated with set wide experimental meta data.
This was aimed primarily at easing access to data via the web API by creating
a wrapper class with convenience methods.  The focus of this class is to contain raw and
associated processed/analysed data to be displayed as a set within the browser i.e. an
experiment may have different cell lines, features or time points, these would require different DataSets.
# However a DataSet may contain mixed data types i.e. promoter & histone???? No give separate sets?
May have duplicates for raw data but only one predicted features track??
The data in this class is kept as lightweight as possible with data being loaded dynamically.


=cut

use strict;
use warnings;

package Bio::EnsEMBL::Funcgen::DataSet;

use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate);
use Bio::EnsEMBL::Funcgen::Storable;

use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Funcgen::Storable);
#Should not be a Set as is sufficiently different


=head2 new

  Arg [-FEATURE_SET]     : (optional) Bio::EnsEMBL::Funcgen::FeatureSet, the product
                           FeatureSet of this DataSet (see product_FeatureSet)
  Arg [-SUPPORTING_SETS] : (optional) Arrayref of stored supporting sets
                           (see add_supporting_sets)
  Arg [-NAME]            : (optional) String, name of this DataSet
  Arg [...]              : All arguments are also passed on, unchanged, to the
                           parent (Storable) constructor

  Example    : my $dset = Bio::EnsEMBL::Funcgen::DataSet->new(
                                                             -SUPPORTING_SETS => [$fset1, $fset2],
                                                             -FEATURE_SET     => $fset,
                                                             -DISPLAYABLE     => 1,
                                                             -NAME            => 'DATASET1',
                                                             );

  Description: Constructor for DataSet objects.
  Returntype : Bio::EnsEMBL::Funcgen::DataSet
  Exceptions : Throws if a dbID is defined and the immediate caller is not
               Bio::EnsEMBL::Funcgen::DBSQL::DataSetAdaptor.
               Errors raised by add_supporting_sets and product_FeatureSet
               for invalid sets are not caught here.
  Caller     : General
  Status     : At risk

=cut

sub new {
  my $caller = shift;

  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_);

  #do we need to add $fg_ids to this?  Currently maintaining one feature_group focus.(combi exps?)
  my ($fset, $sets, $name)
    = rearrange(['FEATURE_SET', 'SUPPORTING_SETS', 'NAME'], @_);

  # Package name of whoever called new(); used below to restrict dbID usage.
  my @caller = caller();

  #do we need to pass experiment_id to check that table_name/id correspond for storage?
  #'EXPERIMENT_ID', 'EXPERIMENT_IDS',

  #Can have more than one experiment_id for a combined feature set. But shouldn't query like that.
  #therefore we need to be able to track back from feature to ec's rather than exps.
  #as there may be mixed data in an exp which didn't necessarily contribute to the combined feature
  #We are now separating potentially different featuretype from the same exp into different result_groups
  #therefore we only have to track back to the result_group e.g. the contig chip set

  #We also need a way of pulling back GOLDEN/combined resultssets based on feature_set_id
  #Set status as GOLDEN, then pull back displayable or GOLDEN raw results

  #Could link experiment_feature_type/feature_set to ec or result_set table?
  #latter would mean we don't have to specify which ec, just part of set.
  #This will make it easier for populating pfs but will mean that we can't easily track back to a particular ec without doing some probe/slice look up via the array chip.
  #Not really a requirement, so let's take this hit.

  #Could then maybe use DataSet to store pfs, otherwise we'd have to pass the rset or at the very least the result_set_id.
  #do we need some control of creating new objects with dbID and adding result_groups/feature_sets and them storing/updating them
  #potential for someone to create one from new using a duplicate dbID and then linking incorrect data to a pre-existing ResultGroup
  #can we check whether caller is DataSetAdaptor if we have dbID?

  if($self->dbID() && $caller[0] ne "Bio::EnsEMBL::Funcgen::DBSQL::DataSetAdaptor"){
    throw('You must use the DataSetAdaptor to generate DataSets with dbID i.e. from the DB,'.
          ' as this module accomodates updating which may cause incorrect data if the object'.
          ' is not generated from the DB');
  }


  # Hash of arrayrefs of supporting sets, keyed by analysis dbID.
  $self->{'supporting_sets'} ||= {};
  #throw("Must specify at least one Result/FeatureSet") if((! $sets) && (! $fset));
  #removed this to allow generation of DataSets without feature sets
  #could reimplement this if we change the DataSetAdaptor::_obj_from_sth

  $self->add_supporting_sets($sets) if $sets;
  $self->product_FeatureSet($fset)  if $fset;
  $self->name($name)                if $name;

  return $self;
}







#methods
#set wide display label(predicted_feature) + more wordy label for wiggle tracks?
#defined by experiment type i.e. time course would require timepoint in display label
#deal with this dynamically or have display_label in table
#Need call on type, or fetch all would

#_get_ec_ids or contigsets?
#this should now be an intrinsic part of this class/adaptor

#cell line
#feature_type
#displayable...should have one for the whole set and one for each raw and predicted?

#have analysis as arg?
# Or do we get all analysis sets?
#we need to be able to set analyses for DataSets dynamically from DB
#pick up all DataSets
#displayable field in DataSets also?

#If we have mixed types in the same experiment then we could get promoter features and histone wiggle tracks displayed together
#Not v.good for display purposes?  We may want to separate the promoter and histone tracks, or we may want all the experiment data together but of mixed types.
#We need to be able to pull back the experiment type for each set, therefore this needs setting on an ec level, not an experiment level.
#This is also v.reliant on putting contig set info in place, otherwise we may get mixed chip types in same set.

#get_raw_analysis_name
#get_predicted_feature_analysis_name
#set ResultFeatures and AnnotatedFeatures in hash keyed by analysis_name?

#Need to change to simple accessor
#or should we maintain to provide explicit method for delineating between parent and supporting FeatureSets?
#yes, and sub the feature_type/cell_type checks


=head2 product_FeatureSet

  Arg [1]    : (optional) Bio::EnsEMBL::Funcgen::FeatureSet
  Example    : $data_set->product_FeatureSet($fset);
  Description: Getter and setter for the main feature_set attribute for this DataSet.
               The FeatureSet can only be set once; setting it also validates/sets
               the feature and cell type of the DataSet.
  Returntype : Bio::EnsEMBL::Funcgen::FeatureSet, or undef if none has been set
  Exceptions : Throws if not a valid FeatureSet or if main feature_set has already been set.
  Caller     : General
  Status     : At Risk - change to get_product_FeatureSet

=cut

sub product_FeatureSet {
  my ($self, $fset) = @_;

  if($fset){

    if(! (ref($fset) && $fset->isa("Bio::EnsEMBL::Funcgen::FeatureSet"))){
      throw("Need to pass a valid Bio::EnsEMBL::Funcgen::FeatureSet");
    }

    if(defined $self->{'feature_set'}){
      throw("The main feature_set has already been set for this DataSet, maybe you want add_supporting_sets?");
    }
    else{
      $self->_validate_and_set_types($fset);
      $self->{'feature_set'} = $fset;
    }
  }

  return $self->{'feature_set'};
}


=head2 add_supporting_sets

  Arg [1]    : Arrayref of stored Bio::EnsEMBL::Funcgen::Set objects (not DataSets)
  Example    : $dset->add_supporting_sets([$rset]);
  Description: Adds Result/FeatureSets to the DataSet, grouped by the dbID of
               their analysis.
  Returntype : none
  Exceptions : Throws if set not valid for supporting_set type of DataSet
               Throws if supporting_sets is not an array ref
  Caller     : General
  Status     : At Risk

=cut


sub add_supporting_sets {
  my ($self, $sets) = @_;

  #should we handle displayable here, and propagate to the ResultSet if update_status is set
  #is there scope to write a Funcgen::Storable, which provides convenience methods to StatusAdaptor?
  #would have to make sure Feature object also inherited from Funcgen::Storable as well as BaseFeature

  if(ref($sets) ne 'ARRAY'){
    # Avoid an 'uninitialized' warning while building the message for undef.
    throw("Supporting sets need to be a reference to an ARRAY:\t".(defined $sets ? $sets : 'undef'));
  }

  foreach my $set(@$sets){

    if(!(ref($set) && $set->isa('Bio::EnsEMBL::Funcgen::Set') && $set->set_type ne 'data' && $set->dbID)){
      throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::Set which is not a DataSet:\t$set");
    }
    #set type cannot be data at present as it does not inherit from Set.pm



    #Only validate if we are dealing with result type data
    #As we can have various cell/feature_types for compound analyses e.g. RegulatoryFeatures

    $self->_validate_and_set_types($set) if $set->set_type() ne 'feature';

    #should ResultSet/Adaptor contain all the fetch_methods, and leave DataSet as a kind of organisational class as a single point of access.
    #DataSetAdaptor to perform the ordering according to feature/celltype
    #This will still not resolve the complex data sets which can be accommodated by the DB.
    #Maybe we can keep the data sets as simple as there are and confer the association by tracking back to the experiment?
    #Would there only ever be one experiment for a complex data_set?


    #Can have more than one experiment for a compound feature set, would we ever want to display raw data?
    #This is actually an easier problem unless we are displaying two feature types(i.e. complex and compound)

    # Bucket the set under its analysis dbID, creating the bucket on first use.
    my $analysis_id = $set->analysis->dbID();
    $self->{'supporting_sets'}->{$analysis_id} ||= [];
    push @{$self->{'supporting_sets'}->{$analysis_id}}, $set;
  }

  return;
}


=head2 _validate_and_set_types

  Arg [1]    : Bio::EnsEMBL::Feature/ResultSet object
  Example    : $dset->_validate_and_set_types($rset);
  Description: Validates and sets DataSet cell and feature types.
               The first set seen defines the types; later sets must match by name.
               Reads the 'feature_type' and 'cell_type' hash entries of the set
               directly rather than going through accessor methods.
  Returntype : none
  Exceptions : Throws if types not valid
  Caller     : General
  Status     : At Risk

=cut


sub _validate_and_set_types{
  my ($self, $set) = @_;

  #slightly dodgy bypassing methods, but extendable

  #This currently restricts all set types to one cell and feature type
  #this is incorrect for feature_set types as we want to munge several feature and possibly cell types
  #into one combined data set.
  #this should set it to the FeatureSet type if is feature_set data_set
  #this only works as we only validate supporting_sets if type is not feature

  for my $type('feature_type', 'cell_type'){

    if(defined $self->{$type}){

      #Need to test isa here?  Why is this passing the defined test if not set?
      if($set->{$type}->name() ne $self->{$type}->name()){

        throw(ref($set)." $type(".$set->{$type}->name().
              ") does not match DataSet $type(".$self->{$type}->name().")");

      }
    }
    else{
      $self->{$type} = $set->{$type};
    }
  }

  return;
}



=head2 get_supporting_sets_by_Analysis

  Arg [1]    : Bio::EnsEMBL::Analysis
  Arg [2]    : (optional) status - e.g 'DISPLAYABLE'
  Example    : my @anal_sets = @{$data_set->get_supporting_sets_by_Analysis($analysis)};
  Description: Getter for the SupportingSet objects of a given Analysis.
  Returntype : ARRAYREF
  Exceptions : Throws if arg is not a valid stored Bio::EnsEMBL::Analysis
  Caller     : General
  Status     : At Risk

=cut

sub get_supporting_sets_by_Analysis {
  my ($self, $analysis, $status) = @_;


  #should we handle displayable here, and propagate to the ResultSet if update_status is set
  #is there scope to write a Funcgen::Storable, which provides convenience methods to StatusAdaptor?
  #would have to make sure Feature object also inherited from Funcgen::Storable as well as BaseFeature


  if(! (ref($analysis) && $analysis->isa("Bio::EnsEMBL::Analysis") && $analysis->dbID())){
    throw("Need to pass a valid stored Bio::EnsEMBL::Analysis");
  }

  #will have to generate new array of object here if we want to filter displayable
  #This may result in returning a ref to the stored ResultSets for no status
  #And a ref to the abstracted/filtered i.e. non-stored ResultSets if we have a status
  #This could cause problems if people want to edit the real ResultSets via the refs
  #If we edit the ResultSets like this, we would still store via their adaptor
  #so would need to refresh DataSet anyway.

  #should ResultSet/Adaptor contain all the fetch_methods, and leave DataSet as a kind of organisational class as a single point of access.
  #DataSetAdaptor to perform the ordering according to feature/celltype
  #This will still not resolve the complex data sets which can be accommodated by the DB.
  #Maybe we can keep the data sets as simple as there are and confer the association by tracking back to the experiment?
  #Would there only ever be one experiment for a complex data_set?


  #Can have more than one experiment for a compound feature set, would we ever want to display raw data?
  #This is actually an easier problem unless we are displaying two feature types(i.e. complex and compound)

  #could we have >1 rset with the same analysis?

  # Fall back to an empty list so that looking up an unknown analysis does
  # not autovivify an empty bucket in the supporting_sets hash.
  my $analysis_sets = $self->{'supporting_sets'}->{$analysis->dbID()} || [];

  my @rsets;

  foreach my $anal_rset(@$analysis_sets){

    if(! defined $status){
      push @rsets, $anal_rset;
    }
    elsif($anal_rset->has_status($status)){
      push @rsets, $anal_rset;
    }
  }

  return \@rsets;
}



=head2 get_supporting_sets

  Arg [1]    : (optional) status - e.g 'DISPLAYABLE'
  Arg [2]    : (optional) set type - 'result', 'feature' or 'input'
  Example    : my @status_sets = @{$data_set->get_supporting_sets($status)};
  Description: Getter for the supporting sets of this DataSet, optionally
               restricted to a given status and/or set type.
  Returntype : Arrayref
  Exceptions : Throws if an invalid set type is specified
  Caller     : General
  Status     : At Risk

=cut

sub get_supporting_sets{
  my ($self, $status, $set_type) = @_;
  #swap the args here

  #Add analysis here and make above method wrapper

  #Validate type here
  if($set_type &&
     ($set_type ne 'result' &&
      $set_type ne 'feature' &&
      $set_type ne 'input')){
    throw("You have specified an invalid supporting set type:\t$set_type");
  }


  my @ssets;

  foreach my $anal_id(keys %{$self->{'supporting_sets'}}){

    foreach my $sset(@{$self->{'supporting_sets'}->{$anal_id}}){

      if(defined $status &&
         (! $sset->has_status($status))){
        next;
      }

      if(defined $set_type &&
         ($sset->set_type ne $set_type)){
        next;
      }

      push @ssets, $sset;
    }
  }

  return \@ssets;
}




=head2 get_displayable_supporting_sets

  Arg [1]    : (optional) set type - 'result', 'feature' or 'input'
  Example    : my @displayable_rsets = @{$data_set->get_displayable_supporting_sets()};
  Description: Convenience method for web display
  Returntype : Arrayref
  Exceptions : Throws if an invalid set type is specified
  Caller     : General
  Status     : At Risk

=cut

sub get_displayable_supporting_sets{
  my ($self, $set_type) = @_;

  return $self->get_supporting_sets('DISPLAYABLE', $set_type);
}



=head2 get_displayable_product_FeatureSet

  Example    : my $fset = $data_set->get_displayable_product_FeatureSet();
  Description: Convenience method for web display
  Returntype : Bio::EnsEMBL::Funcgen::FeatureSet, or undef if there is no
               product FeatureSet or it is not DISPLAYABLE
  Exceptions : None
  Caller     : General
  Status     : At Risk

=cut

sub get_displayable_product_FeatureSet{
  my $self = shift;

  my $fset = $self->product_FeatureSet();

  # A DataSet can be built without a product FeatureSet, so guard against
  # calling has_status on undef.
  return ($fset && $fset->has_status('DISPLAYABLE')) ? $fset : undef;
}





=head2 name

  Arg [1]    : (optional) string - name
  Example    : $dset->name('DATASET1');
  Description: Getter/Setter for the name of this DataSet.
  Returntype : string
  Exceptions : None
  Caller     : General
  Status     : At Risk

=cut

sub name {
  my $self = shift;

  $self->{'name'} = shift if @_;

  return $self->{'name'};
}




#The following attributes are generated dynamically from the constituent Result/FeatureSets

=head2 cell_type

  Example    : my $dset_ctype_name = $dset->cell_type->name();
  Description: Getter for the cell_type for this DataSet.
  Returntype : Bio::EnsEMBL::Funcgen::CellType
  Exceptions : None
  Caller     : General
  Status     : At Risk

=cut

sub cell_type {
  my $self = shift;

  return $self->{'cell_type'};
}

=head2 feature_type

  Example    : my $dset_ftype_name = $dset->feature_type->name();
  Description: Getter for the feature_type for this DataSet.
  Returntype : Bio::EnsEMBL::Funcgen::FeatureType
  Exceptions : None
  Caller     : General
  Status     : At Risk

=cut

sub feature_type {
  my $self = shift;

  return $self->{'feature_type'};
}





=head2 display_label

  Example    : print $rset->display_label();
  Description: Getter for the display_label attribute for this DataSet.
               This is more appropriate for the predicted_features of the set.
               Use the individual display_labels for each raw result set.
               The label is built on first access and cached.
  Returntype : str
  Exceptions : None
  Caller     : General
  Status     : At Risk

=cut

sub display_label {
  my $self = shift;


  #Add display label in table?

  if(! $self->{'display_label'}){

    #This does not account for DataSet without a product FeatureSet
    my $fset = $self->product_FeatureSet;

    if($fset && ($fset->feature_type->class() eq 'Regulatory Feature')){
      $self->{'display_label'} = 'Regulatory Features';
    }
    else{
      # Methods are called on feature_type and cell_type here, so both need
      # to have been set on the DataSet before this branch is reached.
      $self->{'display_label'} = $self->feature_type->name()." -";
      $self->{'display_label'} .= " ".($self->cell_type->display_label() ||
                                       $self->cell_type->description()   ||
                                       $self->cell_type()->name());
      $self->{'display_label'} .= " Enriched Sites";
    }
  }

  return $self->{'display_label'};
}


#sub get_type_config{
#  my ($self) = @_;
#
#  if (! defined $self->{type_config}){
#    $self->{type_config} = $self->adaptor->fetch_type_config_by_DataSet($self);
#  }
#
#  return $self->{type_config};
#}



1;