=pod

=head1 NAME

Bio::EnsEMBL::Funcgen::RunnableDB::SetupPeaksPipeline

=head1 DESCRIPTION

'SetupPeaksPipeline' does all the setup needed before the Peaks pipeline runs.
It checks for the existence of the Cell and Feature Types (it only flags them
if they do not exist, it does not try to create them!), creates the Experiment
and Set entries, and checks the Analysis, Group etc.

This Runnable CANNOT be run multiple times in parallel!
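
As a rough sketch only (not taken from this module): an analysis using this
Runnable could be seeded with the standard ensembl-hive seed_pipeline.pl
script. The logic_name and the input_id parameter names below are purely
illustrative and must match the actual pipeline configuration; the real
parameters are interpreted by the SWEmbl base class (or its parents):

  seed_pipeline.pl -url mysql://user:pass@host:3306/peaks_hive_db \
                   -logic_name SetupPeaksPipeline \
                   -input_id '{ "experiment_name" => "my_experiment", "cell_type" => "GM12878", "feature_type" => "CTCF" }'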

=cut

package Bio::EnsEMBL::Funcgen::RunnableDB::SetupPeaksPipeline;

use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl');

use warnings;
use strict;
use Bio::EnsEMBL::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
#use Data::Dumper;

sub fetch_input {   # fetch parameters...
  my $self = shift @_;

  $self->SUPER::fetch_input();

  my $efgdba = $self->_efgdba();
  #my $dnadba = $self->param('dnadb');

  #Need to change this...

  #my $group = $self->_group_name();
  my $group = 'efg';
  $efgdba->fetch_group_details($group) || throw "Group $group does not exist in the database. Please create it";

  #Theoretically we should be getting this from the registry...
  #my $csa = $dnadba->get_adaptor('coordsystem');

  my $assembly = $self->_assembly();
  ##Check if the assembly is allowed in the current db we're working on...
  ##In certain DBs this doesn't work: e.g. dev_*_funcgen
  #if(!grep(/$assembly/, map($_->version, @{ $csa->fetch_all()}))){ throw $assembly." is not a valid assembly"; }

  #Check if the input dir for this experiment_name exists... all input files (including controls) must be in this folder
  my $input_dir = $self->_input_dir();
  if(! -d $input_dir){ throw("Didn't find input directory $input_dir"); }

  #Set up the output dir for this experiment_name
  my $output_dir = $self->_output_dir();
  if(! -d $output_dir){
    system("mkdir -p $output_dir") && throw("Couldn't create output directory $output_dir");
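    # Note: system() returns the shell exit status (0 on success), so the
    # throw above fires only when the mkdir fails (returns non-zero).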
  }

  ##Better pass this as a parameter... as sometimes the db does not return the species name...
  #my $species = $dnadba->species;
  my $species = $self->_species();

  my $file_type = $self->_file_type();
  if(($file_type ne 'sam') && ($file_type ne 'bed')){ throw "File type $file_type is currently not supported"; }

  #Infer the sam_header from the species and assembly...
  if($file_type eq 'sam'){
    #Change the directory structure so it will agree with the rest, without the need to do uc()
    my $sam_header = $self->_sam_header();
    if(! -e $sam_header){ throw "File type is sam but could not find sam header $sam_header"; }
  }

  #Check if the input file exists... do not do any preprocessing here, as that can be done in parallel...
  my $input_file = $self->_input_file();
  if(! -e $input_dir."/".$input_file){ throw "Couldn't find input file $input_file in $input_dir"; }

  #Check if the control file exists
  if(!$self->_skip_control()){
    my $control_file = $self->_control_file();
    if(! -e $input_dir."/".$control_file ){
      #Force throw or just warn?
      throw "Couldn't find control file ${input_dir}/${control_file}";
      #$self->_skip_control(1);
    }
  } else {
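    #CCAT calls enrichment relative to a control sample, so a control cannot be skipped for it.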
    throw "CCAT requires a control. Cannot skip it" if($self->_analysis()->logic_name =~ /ccat/i);
  }

  return 1;
}

sub run {   # Check parameters and do the appropriate database/file operations...
  my $self = shift @_;

  #Preprocess the control file if needed. This cannot be done in parallel, as several sets may require the same control file.
  if(!$self->_skip_control()){
    #Do not confuse this "input" file with the true data input file...
    my $input_file  = $self->_input_dir().'/'.$self->_control_file();
    my $output_file = $self->_output_dir().'/'.$self->_control_file();
    #Only do it if it hasn't already been done...
    if(! -e $output_file){
      $self->_preprocess_file($input_file, $output_file, $self->_file_type()) || throw "Error processing file $input_file";
    }
  }
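  # Rationale: several sets can share one control, so two parallel jobs could
  # otherwise race to write the same preprocessed file; doing it here, serially
  # and only once, avoids both the race and the duplicated work.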

  # Check the experiment, data set and feature set, creating them as appropriate...
  $self->_check_Experiment($self->_analysis(), $self->_input_file(), $self->_feature_set_name());

  if(!$self->_skip_control()){
    #Add the control file as a subset, if needed...
    my $input_set = $self->_efgdba()->get_InputSetAdaptor()->fetch_by_name($self->_set_name());
    if(!$input_set) { throw "Input set was not created"; }
    my $isubset_name = $self->_control_file();
    if(!$input_set->get_subset_by_name($isubset_name)){
      #Change the InputSetAdaptor so that one subset could be stored each time?
      $input_set->add_new_subset($isubset_name);
      #This expects a behavior where subsets that are already stored are just ignored and no error is thrown
      $input_set->adaptor->store_InputSubsets($input_set->get_InputSubsets);
    }
  }

  return 1;
}

sub write_output {   # Create the relevant job
  my $self = shift @_;

  #These numbers need to be parameters...
  ## If the feature type is H3K36me3 or H3K27me3, use the broad peak caller... Maybe for all histone data?
  #if($self->_feature_type()->class eq 'Histone'){
  #if(($self->_feature_type()->name eq 'H3K36me3') || ($self->_feature_type()->name eq 'H3K27me3')){
  #  $self->dataflow_output_id( $self->input_id, $self->_broad_peak_id);

  if($self->_analysis()->logic_name =~ /ccat/i){
    $self->dataflow_output_id( $self->input_id, 4);
  } else {
    $self->dataflow_output_id( $self->input_id, 3);
  }
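  # Branches 3 and 4 are hard-coded here (see the comment above about making
  # them parameters) and must match the dataflow rules in the pipeline
  # configuration: CCAT-style analyses flow down branch 4, everything else
  # down branch 3.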

  return 1;
}

1;