Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Funcgen.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/Funcgen.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,399 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen + +=head1 DESCRIPTION + +'Funcgen' is a base class for other runnables of the Funcgen Hive Pipeline +It performs common tasks such as connecting to the EFG DB etc... + +=cut + +package Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen; + +use warnings; +use strict; + +use Bio::EnsEMBL::Funcgen::Utils::Helper; +use Bio::EnsEMBL::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; +use Bio::EnsEMBL::Funcgen::InputSet; +use Bio::EnsEMBL::Funcgen::DataSet; +use Bio::EnsEMBL::Funcgen::FeatureSet; +use Bio::EnsEMBL::Funcgen::AnnotatedFeature; +#use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor; +use base ('Bio::EnsEMBL::Hive::Process'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Data::Dumper; + +#This defines a set of parameters based on given parameters from the pipeline: +sub fetch_input { # nothing to fetch... just the DB parameters... + my $self = shift @_; + + #An example of debug, in case needed + #print Dumper $self->param('dnadb'); + if(!$self->param('bin_dir')){ throw "Folder with funcgen binaries bin_dir required"; } + $self->_bin_dir($self->param('bin_dir')); + + my $dnadb_params = $self->param('dnadb') || throw "No parameters for Core DB"; + my $efgdb_params = $self->param('efgdb') || throw "No parameters for EFG DB"; + + #Get efg connection, otherwise fail.. + eval{ + $self->_efgdba(Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor->new + ( + %{ $efgdb_params }, + #let efg dba hanle dnadb + { + -dnadb_name => $dnadb_params->{-dbname}, + -dnadb_user => $dnadb_params->{-user}, + -dnadb_host => $dnadb_params->{-host}, + -dnadb_port => $dnadb_params->{-port}, + -dnadb_pass => $dnadb_params->{-pass}, + + } + )); + + #Actually test connections + $self->_efgdba->dbc->db_handle; + $self->_efgdba->dnadb->dbc->db_handle; + }; + + if($@) { throw "Error creating the EFG DBAdaptor and/or dna DBAdaptor $@"; } + + #Set some params + my $cell_type = $self->param('cell_type') || throw "No cell_type given"; + my $feature_type = $self->param('feature_type') || throw "No feature_type given"; + my $experiment_name = $self->param('experiment_name') || throw "No experiment_name given"; + $self->_experiment_name($experiment_name); + my $set_name = $self->param('set_name') || $cell_type."_".$feature_type."_".$experiment_name; + $self->_set_name($set_name); + my $group_name = $self->param('group') || 'efg'; + my $species = $self->param('species') || throw "No species defined"; + $self->_species($species); + my $assembly = $self->param('assembly') || throw "No assembly version given"; + $self->_assembly($assembly); + my $file_type = $self->param('file_type') || throw "No file type given"; + $self->_file_type($file_type); + my $work_dir = $self->param('work_dir') || throw "'work_dir' is a required parameter"; + $self->_work_dir($work_dir); + + + + #Configure DBAdaptors + my $efgdba = $self->_efgdba(); + #To avoid farm issues... + $efgdba->dbc->disconnect_when_inactive(1); + $efgdba->dnadb->dbc->disconnect_when_inactive(1); + + #Fetch & Set object params + #CellType + my $cta = $efgdba->get_CellTypeAdaptor(); + my $ct_obj = $cta->fetch_by_name($cell_type); + if(!$ct_obj){ throw "Cell type $cell_type does not exist in the database"; } + $self->_cell_type($ct_obj); + + #FeatureType + my $fta = $efgdba->get_FeatureTypeAdaptor(); + my $ft_obj = $fta->fetch_by_name($feature_type); + if(!$ft_obj){ throw "Feature type $feature_type does not exist in the database"; } + $self->_feature_type($ft_obj); + + #ExperimentalGroup + my $ega = $efgdba->get_ExperimentalGroupAdaptor(); + my $eg_obj = $ega->fetch_by_name($group_name); + if(!$eg_obj){ throw "Experimental Group $group_name does not exist in the database"; } + $self->_group($eg_obj); + + + if($file_type eq 'sam'){ + #Change the directory structure so it will agree with the rest, without the need to do uc() + my $sam_header = $self->_work_dir()."/sam_header/".$species."/".$species."_"; + $sam_header .= $ct_obj->gender() ? $ct_obj->gender() : 'male'; + #Carefull with naming standards... + #$sam_header .= "_".$assembly."_unmasked.fa.fai"; + $sam_header .= "_".$assembly."_unmasked.fasta.fai"; + $self->_sam_header($sam_header); + } + + #Work with conventions here too?? work_dir/output/dbname ?? + my $output_dir = $self->param('output_dir') || throw "'output_dir' is a required parameter"; + $self->_output_dir($output_dir."/".$experiment_name); + + return 1; +} + + +sub run { + my $self = shift @_; + + return 1; +} + + +sub write_output { + my $self = shift @_; + + return 1; + +} + + +#Private Function to check and create Experiment and Feature/Data sets as needed +#Requires some global parameters that are not set in Funcgen->fetch_input, such as +#'analysis', 'feature_set_name', 'data_set_name' (these could be given as local parameters...) +sub _check_Experiment { + + #Todo make it more generic and accept multiple input_subsets + #Also maybe pass parameters as hash list... + my ($self, $analysis, $input_subset, $fset_name) = @_; + + #Global parameters set in Funcgen->fetch_input + my $efgdba = $self->_efgdba(); + my $set_name = $self->_set_name(); + my $group = $self->_group(); + my $cell_type = $self->_cell_type(); + my $feature_type = $self->_feature_type(); + + my $iset_name = $set_name; + my $dset_name = $fset_name; + + # set experiment: Reuse if already exists? (This comes from result sets) + my $ea = $efgdba->get_ExperimentAdaptor; + my $exp = $ea->fetch_by_name($set_name); + + my @date = (localtime)[5,4,3]; + $date[0] += 1900; $date[1]++; + + if (! defined $exp) { + + #Group needs to be set manually, like Cell_Type and Feature_Type + #Do not create Group on the fly here, as it will cause concurrency issues... + $exp = Bio::EnsEMBL::Funcgen::Experiment->new + ( + -NAME => $set_name, + -EXPERIMENTAL_GROUP => $group, + -DATE => join('-', @date), + -PRIMARY_DESIGN_TYPE => 'binding_site_identification', + -ADAPTOR => $ea, + ); + + ($exp) = @{$ea->store($exp)}; + + } + throw("Can't create experiment $set_name ") unless $exp; + + my $isa = $efgdba->get_InputSetAdaptor(); + my $iset = $isa->fetch_by_name($iset_name); + + if (! defined $iset){ + + $iset = Bio::EnsEMBL::Funcgen::InputSet->new + ( + -name => $iset_name, + -experiment => $exp, + -feature_type => $feature_type, + -cell_type => $cell_type, + -vendor => 'SOLEXA', + -format => 'SEQUENCING', + -feature_class => 'result' + # Analysis is not being used?? + #-analysis => $self->feature_analysis, + ); + warn "Storing new InputSet:\t$iset_name\n"; + ($iset) = @{$isa->store($iset)}; + + $iset->add_new_subset($input_subset); + $iset->adaptor->store_InputSubsets($iset->get_InputSubsets); + } else { + + #We only expect one subset here (? why??)... + #shouldn't we be adding the control file also when used?? But this is SWEmbl-specific... + #And it should be the same file name... + #Maybe do some file checking here??? + warn "InputSet already exists:\t$iset_name\n"; + my @issets = @{$iset->get_InputSubsets}; + + #if(scalar(@issets) > 1){ + # throw("InputSet $iset_name has more than one InputSubset:\t".join("\t", (map $_->name, @issets))); + #} elsif((scalar(@issets) == 1) && ($issets[0]->name ne $self->param('input_file'))){ + # throw("InputSet $iset_name already has an InputSubset(".$issets[0]->name.") which does not match ".$self->param('input_file')); + #} elsif(scalar(@issets) == 0){ #we can just add this InputSubset + # $iset->add_new_subset($self->input_id); + # $iset->adaptor->store_InputSubsets($iset->get_InputSubsets); + #} + + if(scalar(@issets)==0){ + #we can just add this InputSubset. Add an extra 'input:' as prefix? + $iset->add_new_subset($input_subset); + $iset->adaptor->store_InputSubsets($iset->get_InputSubsets); + } else { + #warn("Need to uncomment this section!! - it was commented just for testing purposes!!"); + #we just need to check if our file(s) is(are) already here... + if(!$iset->get_subset_by_name($input_subset)){ + #throw("InputSet $iset_name has InputSubsets(".join("\t", (map $_->name, @issets)).") which do not match ".$input_subset); + #warn("InputSet $iset_name has InputSubsets(".join("\t", (map $_->name, @issets)).") which do not match ".$input_subset); + } + } + } + + my $fsa = $efgdba->get_FeatureSetAdaptor(); + my $fset = $fsa->fetch_by_name($fset_name); + + if ( ! defined $fset ) { + + $fset = Bio::EnsEMBL::Funcgen::FeatureSet->new + ( + -analysis => $analysis, + -feature_type => $feature_type, + -cell_type => $cell_type, + -name => $fset_name, + -feature_class => 'annotated', + -experiment_id => $exp->dbID, + #The adaptor is needed to store! + -adaptor => $fsa + + ); + + warn "Storing new FeatureSet:\t$fset_name\n"; + ($fset) = @{$fsa->store($fset)}; + + } + else { + warn "FeatureSet already exists:\t$fset_name\n"; + + if(@{$efgdba->get_AnnotatedFeatureAdaptor->fetch_all_by_FeatureSets([$fset])}){ + throw "Feature Set $set_name already contains data. Please rollback before rerunning"; + } + + } + + my $dsa = $efgdba->get_DataSetAdaptor; + my $dset = $dsa->fetch_by_name($dset_name); + + + if ( ! defined $dset ) { + + $dset = Bio::EnsEMBL::Funcgen::DataSet->new + ( + -SUPPORTING_SETS => [$iset], + -FEATURE_SET => $fset, + -DISPLAYABLE => 1, + -NAME => $dset_name, + -SUPPORTING_SET_TYPE => 'input', + ); + + warn "Storing new DataSet:\t$dset_name\n"; + ($dset) = @{$dsa->store($dset)} + } + else { + + warn "DataSet already exists:\t$dset_name\n"; + + # need to check whether InputSets and supporting_sets are the same and + # possibly add InputSet to supporting_sets + + my $ssets = $dset->get_supporting_sets(); + + my %ssets_dbIDs = (); + map { $ssets_dbIDs{$_->dbID}='' } (@{$ssets}); + $dset->add_supporting_sets([ $iset ]) if (! exists $ssets_dbIDs{$iset->dbID}); + + } + +} + + +#Private Generic getter and setter +sub _getter_setter { + my ($self, $param_name, $param_value) = @_; + if(!$param_name){ return undef; } + if(!$param_value){ + $param_value = $self->param($param_name); + } else { + $self->param($param_name, $param_value); + } + return $param_value; +} + +# Private getter / setters : Maybe do some validation in some cases... + +#Private getter / setter to the EFG DB Adaptor +sub _efgdba { + return $_[0]->_getter_setter('efgdb',$_[1]); +} + +#Private getter / setter to the Core DB Adaptor +sub _dnadba { + return $_[0]->_getter_setter('dnadb',$_[1]); +} + +#Private getter / setter to the Cell Type object +sub _cell_type { + return $_[0]->_getter_setter('cell_type',$_[1]); +} + +#Private getter / setter to the Feature Type object +sub _feature_type { + return $_[0]->_getter_setter('feature_type',$_[1]); +} + +#Private getter / setter to the Species name +sub _species { + return $_[0]->_getter_setter('species',$_[1]); +} + +#Private getter / setter to the assembly name +sub _assembly { + return $_[0]->_getter_setter('assembly',$_[1]); +} + +#Private getter / setter to the Analysis object +sub _analysis { + return $_[0]->_getter_setter('analysis',$_[1]); +} + +#Private getter / setter to the Group +sub _group { + return $_[0]->_getter_setter('group',$_[1]); +} + +#Private getter / setter to the Experiment Name (do not mix with the Set Name) +sub _experiment_name { + return $_[0]->_getter_setter('experiment_name',$_[1]); +} + +#Private getter / setter to the Set Name +sub _set_name { + return $_[0]->_getter_setter('set_name',$_[1]); +} + +#Private getter / setter to the file type +sub _file_type { + return $_[0]->_getter_setter('file_type',$_[1]); +} + +#Private getter / setter to the sam header (only set when file type is sam) +sub _sam_header { + return $_[0]->_getter_setter('sam_header',$_[1]); +} + +#Private getter / setter to the work folder +sub _work_dir { + return $_[0]->_getter_setter('work_dir',$_[1]); +} + +#Private getter / setter to the output folder +sub _output_dir { + return $_[0]->_getter_setter('output_dir',$_[1]); +} + +#Private getter / setter to the bin folder +sub _bin_dir { + return $_[0]->_getter_setter('bin_dir',$_[1]); +} + +1;