comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SetupPeaksPipeline.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 NAME
4
5 Bio::EnsEMBL::Hive::RunnableDB::Funcgen::SetupPeaksPipeline
6
7 =head1 DESCRIPTION
8
9 'SetupPeakPipeline' Does all the setup before the Peaks Pipeline
10 Checks for existence of Cell and Feature Type
11 (only flags if they do not exist, does not try to create them!!)
12 Creates Experiment and Set Entries, Checks Analysis, Group etc...
13 This Runnable CANNOT be run multiple times in parallell!
14
15 =cut
16
17 package Bio::EnsEMBL::Funcgen::RunnableDB::SetupPeaksPipeline;
18
19 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl');
20
21 use warnings;
22 use strict;
23 use Bio::EnsEMBL::DBSQL::DBAdaptor;
24 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
25 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
26 #use Data::Dumper;
27
28 sub fetch_input { # fetch parameters...
29 my $self = shift @_;
30
31 $self->SUPER::fetch_input();
32
33 my $efgdba = $self->_efgdba();
34 #my $dnadba = $self->param('dnadb');
35
36 #Need to change this...
37
38 #my $group = $self->_group_name();
39 my $group = 'efg';
40 $efgdba->fetch_group_details($group) || throw "Group $group does not exist in the database. Please create it";
41
42 #Theoretically we should be getting this at the registry...
43 #my $csa = $dnadba->get_adaptor('coordsystem');
44
45 my $assembly = $self->_assembly();
46 ##Check if the assembly is allowed in the current db we're working on...
47 ##In certain DBs this doesn't work: e.g. dev_*_funcgen
48 #if(!grep(/$assembly/, map($_->version, @{ $csa->fetch_all()}))){ throw $assembly." is not a valid assembly"; }
49
50 #Checks if the input dir for this experiment_name exists... all input files (including controls must be in this folder)
51 my $input_dir = $self->_input_dir();
52 if(! -d $input_dir){ throw("Didn't find input directory $input_dir"); }
53
54 #Sets up the output dir for this experiment_name
55 my $output_dir = $self->_output_dir();
56 if(! -d $output_dir){
57 system("mkdir -p $output_dir") && throw("Couldn't create output directory $output_dir");
58 }
59
60 ##Better pass this as a parameter... as sometimes the db does not return the species name...
61 #my $species = $dnadba->species;
62 my $species = $self->_species();
63
64 my $file_type = $self->_file_type();
65 if(($file_type ne 'sam') && ($file_type ne 'bed')){ throw "File type $file_type currently not supported"; }
66
67 #infer sam_header from species and assembly...
68 if($file_type eq 'sam'){
69 #Change the directory structure so it will agree with the rest, without the need to do uc()
70 my $sam_header = $self->_sam_header();
71 if(! -e $sam_header){ throw " File type is sam but could not find sam header $sam_header"; }
72 }
73
74 #Check if input file exists... do not do preprocessing here... as this can be done in parallel...
75 my $input_file = $self->_input_file();
76 if(! -e $input_dir."/".$input_file){ throw " Couldn't find input file $input_file in $input_dir"; }
77
78 #Check if control file exists
79 if(!$self->_skip_control()){
80 my $control_file = $self->_control_file();
81 if(! -e $input_dir."/".$control_file ){
82 #Force throw or just warn?
83 throw "Couldn't find control file ${input_dir}/${control_file}";
84 #$self->_skip_control(1);
85 }
86 } else {
87 throw "CCAT requires a control. Cannot skip it" if($self->_analysis()->logic_name =~ /ccat/i);
88 }
89
90 return 1;
91 }
92
93 sub run { # Check parameters and do appropriate database/file operations...
94 my $self = shift @_;
95
96 #Preprocess control file if needed. this cannot be done in parallel as several sets may require same control file
97 if(!$self->_skip_control()){
98 #Do not mix this "input" file with the true data input file...
99 my $input_file = $self->_input_dir().'/'.$self->_control_file();
100 my $output_file = $self->_output_dir().'/'.$self->_control_file();
101 #Only do it if it hasn't already been done..
102 if(! -e $output_file){
103 $self->_preprocess_file($input_file, $output_file, $self->_file_type()) || throw "Error processing file $input_file";
104 }
105 }
106
107 # Check experiment, data set, feature set and create as appropriate...
108 $self->_check_Experiment($self->_analysis(), $self->_input_file(), $self->_feature_set_name());
109
110 if(!$self->_skip_control()){
111 #Add control file as a subset, if needed...
112 my $input_set = $self->_efgdba()->get_InputSetAdaptor()->fetch_by_name($self->_set_name());
113 if(!$input_set) { throw "Input set was not created"; }
114 my $isubset_name = $self->_control_file();
115 if(!$input_set->get_subset_by_name($isubset_name)){
116 #Change InputSetAdaptor so one subset could be stored each time?
117 $input_set->add_new_subset($isubset_name);
118 #this expects a behavior where subsets already stored will just be ignored and no error is thrown
119 $input_set->adaptor->store_InputSubsets($input_set->get_InputSubsets);
120 }
121 }
122
123 return 1;
124 }
125
126
127 sub write_output { # Create the relevant job
128 my $self = shift @_;
129
130 #These numbers need to be parameters...
131 ## If feature type is H3K36me3 or H3K27me3 use broad peak caller... Maybe all histone data?
132 #if($self->_feature_type()->class eq 'Histone'){
133 #if(($self->_feature_type()->name eq 'H3K36me3') || ($self->_feature_type()->name eq 'H3K27me3')){
134 # $self->dataflow_output_id( $self->input_id, $self->_broad_peak_id);
135
136 if($self->_analysis()->logic_name =~ /ccat/i){
137 $self->dataflow_output_id( $self->input_id, 4);
138 } else {
139 $self->dataflow_output_id( $self->input_id, 3);
140 }
141
142 return 1;
143
144 }
145
146 1;