Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupPeaksPipeline.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupPeaksPipeline.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,146 @@
+=pod 
+
+=head1 NAME
+
+Bio::EnsEMBL::Hive::RunnableDB::Funcgen::SetupPeaksPipeline
+
+=head1 DESCRIPTION
+
+'SetupPeakPipeline' Does all the setup before the Peaks Pipeline
+Checks for existence of Cell and Feature Type 
+(only flags if they do not exist, does not try to create them!!)
+Creates Experiment and Set Entries, Checks Analysis, Group etc...
+This Runnable CANNOT be run multiple times in parallell!
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::RunnableDB::SetupPeaksPipeline;
+
+use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl');
+
+use warnings;
+use strict;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; 
+use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
+#use Data::Dumper;
+
+sub fetch_input {   # fetch parameters...
+  my $self = shift @_;
+  
+  $self->SUPER::fetch_input();
+
+  my $efgdba = $self->_efgdba();
+  #my $dnadba = $self->param('dnadb');
+  
+  #Need to change this...
+  
+  #my $group = $self->_group_name();
+  my $group = 'efg';
+  $efgdba->fetch_group_details($group) || throw "Group $group does not exist in the database. Please create it"; 
+
+  #Theoretically we should be getting this at the registry...
+  #my $csa = $dnadba->get_adaptor('coordsystem');
+
+  my $assembly = $self->_assembly();
+  ##Check if the assembly is allowed in the current db we're working on...
+  ##In certain DBs this doesn't work: e.g. dev_*_funcgen
+  #if(!grep(/$assembly/, map($_->version, @{ $csa->fetch_all()}))){ throw $assembly." is not a valid assembly"; }
+
+  #Checks if the input dir for this experiment_name exists... all input files (including controls must be in this folder)
+  my $input_dir = $self->_input_dir();
+  if(! -d $input_dir){ throw("Didn't find input directory $input_dir"); }
+  
+  #Sets up the output dir for this experiment_name
+  my $output_dir = $self->_output_dir();
+  if(! -d $output_dir){ 
+    system("mkdir -p $output_dir") && throw("Couldn't create output directory $output_dir");
+  }
+  
+  ##Better pass this as a parameter... as sometimes the db does not return the species name... 
+  #my $species = $dnadba->species;
+  my $species = $self->_species();
+
+  my $file_type = $self->_file_type();
+  if(($file_type ne 'sam') && ($file_type ne 'bed')){ throw "File type $file_type currently not supported"; }
+
+  #infer sam_header from species and assembly...
+  if($file_type eq 'sam'){
+    #Change the directory structure so it will agree with the rest, without the need to do uc()
+    my $sam_header = $self->_sam_header();
+    if(! -e $sam_header){ throw " File type is sam but could not find sam header $sam_header"; }
+  }
+
+  #Check if input file exists... do not do preprocessing here... as this can be done in parallel... 
+  my $input_file = $self->_input_file();
+  if(! -e $input_dir."/".$input_file){ throw " Couldn't find input file $input_file in $input_dir"; }
+
+  #Check if control file exists
+  if(!$self->_skip_control()){ 
+    my $control_file = $self->_control_file();
+    if(! -e  $input_dir."/".$control_file ){ 
+      #Force throw or just warn?
+      throw "Couldn't find control file ${input_dir}/${control_file}";
+      #$self->_skip_control(1);
+    }
+  } else {
+    throw "CCAT requires a control. Cannot skip it" if($self->_analysis()->logic_name =~ /ccat/i);
+  }
+
+  return 1;
+}
+
+sub run {   # Check parameters and do appropriate database/file operations... 
+  my $self = shift @_;
+
+  #Preprocess control file if needed. this cannot be done in parallel as several sets may require same control file
+  if(!$self->_skip_control()){
+    #Do not mix this "input" file with the true data input file...
+    my $input_file = $self->_input_dir().'/'.$self->_control_file();
+    my $output_file = $self->_output_dir().'/'.$self->_control_file();
+    #Only do it if it hasn't already been done..
+    if(! -e $output_file){
+      $self->_preprocess_file($input_file, $output_file, $self->_file_type()) || throw "Error processing file $input_file";
+    }
+  }
+
+  # Check experiment, data set, feature set and create as appropriate...
+  $self->_check_Experiment($self->_analysis(), $self->_input_file(), $self->_feature_set_name());
+
+  if(!$self->_skip_control()){
+    #Add control file as a subset, if needed...
+    my $input_set = $self->_efgdba()->get_InputSetAdaptor()->fetch_by_name($self->_set_name());
+    if(!$input_set) { throw "Input set was not created"; }
+    my $isubset_name = $self->_control_file();
+    if(!$input_set->get_subset_by_name($isubset_name)){ 
+      #Change InputSetAdaptor so one subset could be stored each time?
+      $input_set->add_new_subset($isubset_name);
+      #this expects a behavior where subsets already stored will just be ignored and no error is thrown
+      $input_set->adaptor->store_InputSubsets($input_set->get_InputSubsets);
+    }
+  }
+
+  return 1;
+}
+
+
+sub write_output {  # Create the relevant job
+  my $self = shift @_;
+
+  #These numbers need to be parameters...
+  ## If feature type is H3K36me3 or H3K27me3 use broad peak caller... Maybe all histone data?
+  #if($self->_feature_type()->class eq 'Histone'){
+  #if(($self->_feature_type()->name eq 'H3K36me3') || ($self->_feature_type()->name eq 'H3K27me3')){
+  #  $self->dataflow_output_id( $self->input_id, $self->_broad_peak_id);
+
+  if($self->_analysis()->logic_name =~ /ccat/i){
+    $self->dataflow_output_id( $self->input_id, 4);
+  } else {
+    $self->dataflow_output_id( $self->input_id, 3);
+  }
+  
+  return 1;
+
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children