0
|
1 =pod
|
|
2
|
|
3 =head1 NAME
|
|
4
|
|
5 Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT
|
|
6
|
|
7 =head1 DESCRIPTION
|
|
8
|
|
9 'RunCCAT' Runs CCAT "broad peak" caller and stores peaks as an annotated feature set.
|
|
10 Assumes Files are organized and named with a specific convention
|
|
11 ($repository_folder)/experiment_name/cell-type_feature-type/
|
|
12 unless specific files are given
|
|
13
|
|
14 =cut
|
|
15
|
|
16 package Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT;
|
|
17
|
|
18 use warnings;
|
|
19 use strict;
|
|
20 use Bio::EnsEMBL::Funcgen::Utils::Helper;
|
|
21 use Bio::EnsEMBL::DBSQL::DBAdaptor;
|
|
22 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
|
|
23 use Bio::EnsEMBL::Funcgen::InputSet;
|
|
24 use Bio::EnsEMBL::Funcgen::DataSet;
|
|
25 use Bio::EnsEMBL::Funcgen::FeatureSet;
|
|
26 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
|
|
27 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
|
|
28 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd);
|
|
29
|
|
30
|
|
31 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl');
|
|
32
|
|
33 sub fetch_input { # fetch and preprocess the input file plus do some more checking
|
|
34 my $self = shift @_;
|
|
35
|
|
36 $self->SUPER::fetch_input();
|
|
37
|
|
38 my $efgdba = $self->_efgdba();
|
|
39 my $fsa = $efgdba->get_FeatureSetAdaptor();
|
|
40 if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){
|
|
41 throw "Feature Set was not Created";
|
|
42 }
|
|
43
|
|
44 my $bin_dir = $self->_bin_dir();
|
|
45
|
|
46 my $analysis = $self->_analysis();
|
|
47 #Use the definitions that are on the database
|
|
48
|
|
49 my $cell_type = $self->_cell_type()->name;
|
|
50 my $feature_type = $self->_feature_type()->name;
|
|
51 my $experiment_name = $self->_experiment_name();
|
|
52
|
|
53 my $file_type = $self->_file_type();
|
|
54
|
|
55 if(($file_type ne 'sam') && ($file_type ne 'bed') ){
|
|
56 throw "Only sam and bed currently supported for CCAT";
|
|
57 }
|
|
58
|
|
59 my $output_dir = $self->_output_dir();
|
|
60
|
|
61 my $size_file = $output_dir."/".$self->_set_name.".sizes";
|
|
62
|
|
63 #Get the size file... similar to the sam header...
|
|
64 open(FILE, $self->_sam_header);
|
|
65 #Consider having it pregenerated...
|
|
66 open(SIZES,">".$size_file);
|
|
67 while(<FILE>){
|
|
68 chomp;
|
|
69 /^(\S+)\s+(\d+)\s+/;
|
|
70 my $slice = $1;
|
|
71 my $slice_size = $2;
|
|
72 if(!$slice || !$slice_size){ throw " Could not process sam header line $_ "; }
|
|
73
|
|
74 #Mouse Hack!!
|
|
75 next if(($self->_species eq 'mus_musculus') && !($slice =~ /chromosome/));
|
|
76 print SIZES $slice."\t".$slice_size."\n";
|
|
77 }
|
|
78 close SIZES;
|
|
79 close FILE;
|
|
80
|
|
81 $self->_size_file($size_file);
|
|
82
|
|
83 my $input_dir = $self->_input_dir();
|
|
84
|
|
85 my $file_name = $self->_input_file();
|
|
86 my $input_file = $input_dir."/".$file_name;
|
|
87 if(-e $input_file){
|
|
88 my $output_file = $output_dir."/".$file_name;
|
|
89 if(! $self->param('reenter')){
|
|
90 #TODO Validate if existent file is ok.
|
|
91 $self->_preprocess_file($input_file, $output_file, $file_type) ||
|
|
92 throw "Error processing data file $input_file";
|
|
93 } else {
|
|
94 if(! -e $output_file){ warn "$output_file does not exist. May need to rerun from scratch."; }
|
|
95 }
|
|
96 $self->_input_file($output_file);
|
|
97
|
|
98 if(!$self->_results_file($self->param('results_file'))){
|
|
99 $self->_results_file($output_file.".".$analysis->logic_name);
|
|
100 }
|
|
101 } else { throw "No valid data file was given: ".$input_file; }
|
|
102
|
|
103 #Always require a control file...
|
|
104 my $control_file = $output_dir."/".$self->_control_file();
|
|
105 if(! -e $control_file){ throw "No valid control file was given: ".$control_file; }
|
|
106 $self->_control_file($control_file);
|
|
107
|
|
108 #May need to convert it...
|
|
109 if($file_type eq 'sam'){
|
|
110
|
|
111 my $input_file_bed = $self->_input_file;
|
|
112 $input_file_bed =~ s/\.sam/\.bed/;
|
|
113
|
|
114 #Mouse hack
|
|
115 if($self->_species eq 'mus_musculus'){
|
|
116 if(! $self->param('reenter')){
|
|
117 run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${input_file_bed}");
|
|
118 }
|
|
119 } else {
|
|
120 if(! $self->param('reenter')){
|
|
121 run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed >${input_file_bed}");
|
|
122 }
|
|
123 }
|
|
124 $self->_input_file($input_file_bed);
|
|
125
|
|
126 my $control_file_bed = $self->_control_file;
|
|
127 $control_file_bed =~ s/\.sam/\.bed/;
|
|
128 #Mouse Hack
|
|
129 if($self->_species eq 'mus_musculus'){
|
|
130 if(! $self->param('reenter')){
|
|
131 run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${control_file_bed}");
|
|
132 }
|
|
133 } else {
|
|
134 if(! $self->param('reenter')){
|
|
135 run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed >${control_file_bed}");
|
|
136 }
|
|
137 }
|
|
138 $self->_control_file($control_file_bed);
|
|
139
|
|
140 }
|
|
141
|
|
142 return 1;
|
|
143 }
|
|
144
|
|
145 sub run { # call SWEmbl
|
|
146 my $self = shift @_;
|
|
147
|
|
148 if($self->param('reenter')){ return 1; }
|
|
149
|
|
150 my $analysis = $self->_analysis;
|
|
151 #<CCAT path>/bin/CCAT <ChIP library read file name> <control library read file name>
|
|
152 # <chromosome length file name> <config file name> <project name>
|
|
153 my $bin_dir = $self->_bin_dir();
|
|
154 my $command = $bin_dir."/".$analysis->program_file .
|
|
155 " ".$self->_input_file() .
|
|
156 " ".$self->_control_file() .
|
|
157 " ". $self->_size_file() .
|
|
158 " ".$bin_dir."/ccat_config/".$analysis->parameters .
|
|
159 " " . $self->_results_file;
|
|
160 warn "Running analysis:\t$command";
|
|
161 run_system_cmd($command);
|
|
162
|
|
163 return 1;
|
|
164 }
|
|
165
|
|
166 sub write_output { # Store results
|
|
167 my $self = shift @_;
|
|
168
|
|
169 $self->_parse_result_file();
|
|
170
|
|
171 return 1;
|
|
172 }
|
|
173
|
|
174 sub _parse_result_file {
|
|
175
|
|
176 my $self = shift @_;
|
|
177
|
|
178 ### annotated features
|
|
179 my $fset = $self->_feature_set();
|
|
180
|
|
181 my $efgdba = $self->_efgdba();
|
|
182 my $sa = $efgdba->get_SliceAdaptor();
|
|
183
|
|
184 #Cache slices and features...
|
|
185 my %slice;
|
|
186 my @af;
|
|
187
|
|
188 my %cache_af;
|
|
189
|
|
190 open(FILE,$self->_results_file().".significant.region");
|
|
191 while(<FILE>){
|
|
192 chomp;
|
|
193
|
|
194 #Content of CCAT output file
|
|
195 my ($seqid,$summit,$start,$end,$chipreads,$ctrlreads,$fold,$fdr)= split("\t");
|
|
196 #Hardcode a minimum fdr... may pass as parameter
|
|
197 next if ($fdr>0.05);
|
|
198 my $score = $fold;
|
|
199
|
|
200 #This seqid may vary depending on the input given to SWEmbl...
|
|
201 # handle it manually at least for the moment... namely the sam seqid...
|
|
202 #Make sure to test thoroughly to see if it works...
|
|
203 #e.g. chromosome:GRCh37:15:1:102531392:1
|
|
204 if($seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/) { $seqid = $1; }
|
|
205 #In case UCSC input is used...
|
|
206 $seqid =~ s/^chr//i;
|
|
207
|
|
208 if($self->param('slice') && ($seqid ne $self->param('slice'))){
|
|
209 warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:".
|
|
210 $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n";
|
|
211 next;
|
|
212 }
|
|
213
|
|
214 #May have some sort of repeats(?). Since it is ordered with significance, ignore remaining hits.
|
|
215 next if(defined($cache_af{$seqid."_".$start}));
|
|
216 $cache_af{$seqid."_".$start} = 1;
|
|
217
|
|
218 #next if ($seqid =~ m/^M/);
|
|
219
|
|
220 # filtering is done as a post-processing e.g. FilterBlacklist.pm
|
|
221 #$summit = int($summit);#Round up?
|
|
222
|
|
223 unless (exists $slice{"$seqid"}) {
|
|
224 $slice{"$seqid"} = $sa->fetch_by_region(undef, $seqid);
|
|
225 }
|
|
226
|
|
227 if( ($start < 1) || ($end > $slice{"$seqid"}->end)){
|
|
228 warn "Feature being ignored due to coordinates out of slice: Region:".
|
|
229 $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n";
|
|
230 }
|
|
231
|
|
232 #Gracefully handle errors...
|
|
233 my $af;
|
|
234 eval{
|
|
235 $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new
|
|
236 (
|
|
237 -slice => $slice{"$seqid"},
|
|
238 -start => $start,
|
|
239 -end => $end,
|
|
240 -strand => 0,
|
|
241 -score => $score,
|
|
242 -summit => $summit,
|
|
243 -feature_set => $fset,
|
|
244 );
|
|
245 };
|
|
246 if($@) { warn($@); next; }
|
|
247 if(!$af) { warn("Could not create feature - Region:".
|
|
248 $seqid." Start:".$start." End:".$end." Score:".$score.
|
|
249 " Summit:".$summit); next; }
|
|
250
|
|
251 push(@af, $af);
|
|
252 }
|
|
253 close FILE;
|
|
254
|
|
255 # Batch store features...
|
|
256 if(scalar(@af>0)){
|
|
257 $efgdba->get_AnnotatedFeatureAdaptor->store(@af);
|
|
258 } else {
|
|
259 warn "No significant features detected!";
|
|
260 }
|
|
261 #Do this on a wrapup runnable...so it will only be visible after reads are loaded...
|
|
262 $fset->adaptor->set_imported_states_by_Set($fset);
|
|
263
|
|
264 # Status should not be set at this stage
|
|
265
|
|
266 }
|
|
267
|
|
268
|
|
269 #Private getter / setter to the feature set
|
|
270 sub _feature_set {
|
|
271 return $_[0]->_getter_setter('feature_set',$_[1]);
|
|
272 }
|
|
273
|
|
274 #Private getter / setter to the results file
|
|
275 sub _results_file {
|
|
276 return $_[0]->_getter_setter('results_file',$_[1]);
|
|
277 }
|
|
278
|
|
279 #Private getter / setter to the size file
|
|
280 sub _size_file {
|
|
281 return $_[0]->_getter_setter('size_file',$_[1]);
|
|
282 }
|
|
283
|
|
284
|
|
285 1;
|
|
286
|