comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 NAME
4
5 Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT
6
7 =head1 DESCRIPTION
8
9 'RunCCAT' Runs CCAT "broad peak" caller and stores peaks as an annotated feature set.
10 Assumes Files are organized and named with a specific convention
11 ($repository_folder)/experiment_name/cell-type_feature-type/
12 unless specific files are given
13
14 =cut
15
16 package Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT;
17
18 use warnings;
19 use strict;
20 use Bio::EnsEMBL::Funcgen::Utils::Helper;
21 use Bio::EnsEMBL::DBSQL::DBAdaptor;
22 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
23 use Bio::EnsEMBL::Funcgen::InputSet;
24 use Bio::EnsEMBL::Funcgen::DataSet;
25 use Bio::EnsEMBL::Funcgen::FeatureSet;
26 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
27 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
28 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd);
29
30
31 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl');
32
33 sub fetch_input { # fetch and preprocess the input file plus do some more checking
34 my $self = shift @_;
35
36 $self->SUPER::fetch_input();
37
38 my $efgdba = $self->_efgdba();
39 my $fsa = $efgdba->get_FeatureSetAdaptor();
40 if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){
41 throw "Feature Set was not Created";
42 }
43
44 my $bin_dir = $self->_bin_dir();
45
46 my $analysis = $self->_analysis();
47 #Use the definitions that are on the database
48
49 my $cell_type = $self->_cell_type()->name;
50 my $feature_type = $self->_feature_type()->name;
51 my $experiment_name = $self->_experiment_name();
52
53 my $file_type = $self->_file_type();
54
55 if(($file_type ne 'sam') && ($file_type ne 'bed') ){
56 throw "Only sam and bed currently supported for CCAT";
57 }
58
59 my $output_dir = $self->_output_dir();
60
61 my $size_file = $output_dir."/".$self->_set_name.".sizes";
62
63 #Get the size file... similar to the sam header...
64 open(FILE, $self->_sam_header);
65 #Consider having it pregenerated...
66 open(SIZES,">".$size_file);
67 while(<FILE>){
68 chomp;
69 /^(\S+)\s+(\d+)\s+/;
70 my $slice = $1;
71 my $slice_size = $2;
72 if(!$slice || !$slice_size){ throw " Could not process sam header line $_ "; }
73
74 #Mouse Hack!!
75 next if(($self->_species eq 'mus_musculus') && !($slice =~ /chromosome/));
76 print SIZES $slice."\t".$slice_size."\n";
77 }
78 close SIZES;
79 close FILE;
80
81 $self->_size_file($size_file);
82
83 my $input_dir = $self->_input_dir();
84
85 my $file_name = $self->_input_file();
86 my $input_file = $input_dir."/".$file_name;
87 if(-e $input_file){
88 my $output_file = $output_dir."/".$file_name;
89 if(! $self->param('reenter')){
90 #TODO Validate if existent file is ok.
91 $self->_preprocess_file($input_file, $output_file, $file_type) ||
92 throw "Error processing data file $input_file";
93 } else {
94 if(! -e $output_file){ warn "$output_file does not exist. May need to rerun from scratch."; }
95 }
96 $self->_input_file($output_file);
97
98 if(!$self->_results_file($self->param('results_file'))){
99 $self->_results_file($output_file.".".$analysis->logic_name);
100 }
101 } else { throw "No valid data file was given: ".$input_file; }
102
103 #Always require a control file...
104 my $control_file = $output_dir."/".$self->_control_file();
105 if(! -e $control_file){ throw "No valid control file was given: ".$control_file; }
106 $self->_control_file($control_file);
107
108 #May need to convert it...
109 if($file_type eq 'sam'){
110
111 my $input_file_bed = $self->_input_file;
112 $input_file_bed =~ s/\.sam/\.bed/;
113
114 #Mouse hack
115 if($self->_species eq 'mus_musculus'){
116 if(! $self->param('reenter')){
117 run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${input_file_bed}");
118 }
119 } else {
120 if(! $self->param('reenter')){
121 run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed >${input_file_bed}");
122 }
123 }
124 $self->_input_file($input_file_bed);
125
126 my $control_file_bed = $self->_control_file;
127 $control_file_bed =~ s/\.sam/\.bed/;
128 #Mouse Hack
129 if($self->_species eq 'mus_musculus'){
130 if(! $self->param('reenter')){
131 run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${control_file_bed}");
132 }
133 } else {
134 if(! $self->param('reenter')){
135 run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed >${control_file_bed}");
136 }
137 }
138 $self->_control_file($control_file_bed);
139
140 }
141
142 return 1;
143 }
144
145 sub run { # call SWEmbl
146 my $self = shift @_;
147
148 if($self->param('reenter')){ return 1; }
149
150 my $analysis = $self->_analysis;
151 #<CCAT path>/bin/CCAT <ChIP library read file name> <control library read file name>
152 # <chromosome length file name> <config file name> <project name>
153 my $bin_dir = $self->_bin_dir();
154 my $command = $bin_dir."/".$analysis->program_file .
155 " ".$self->_input_file() .
156 " ".$self->_control_file() .
157 " ". $self->_size_file() .
158 " ".$bin_dir."/ccat_config/".$analysis->parameters .
159 " " . $self->_results_file;
160 warn "Running analysis:\t$command";
161 run_system_cmd($command);
162
163 return 1;
164 }
165
166 sub write_output { # Store results
167 my $self = shift @_;
168
169 $self->_parse_result_file();
170
171 return 1;
172 }
173
174 sub _parse_result_file {
175
176 my $self = shift @_;
177
178 ### annotated features
179 my $fset = $self->_feature_set();
180
181 my $efgdba = $self->_efgdba();
182 my $sa = $efgdba->get_SliceAdaptor();
183
184 #Cache slices and features...
185 my %slice;
186 my @af;
187
188 my %cache_af;
189
190 open(FILE,$self->_results_file().".significant.region");
191 while(<FILE>){
192 chomp;
193
194 #Content of CCAT output file
195 my ($seqid,$summit,$start,$end,$chipreads,$ctrlreads,$fold,$fdr)= split("\t");
196 #Hardcode a minimum fdr... may pass as parameter
197 next if ($fdr>0.05);
198 my $score = $fold;
199
200 #This seqid may vary depending on the input given to SWEmbl...
201 # handle it manually at least for the moment... namely the sam seqid...
202 #Make sure to test thoroughly to see if it works...
203 #e.g. chromosome:GRCh37:15:1:102531392:1
204 if($seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/) { $seqid = $1; }
205 #In case UCSC input is used...
206 $seqid =~ s/^chr//i;
207
208 if($self->param('slice') && ($seqid ne $self->param('slice'))){
209 warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:".
210 $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n";
211 next;
212 }
213
214 #May have some sort of repeats(?). Since it is ordered with significance, ignore remaining hits.
215 next if(defined($cache_af{$seqid."_".$start}));
216 $cache_af{$seqid."_".$start} = 1;
217
218 #next if ($seqid =~ m/^M/);
219
220 # filtering is done as a post-processing e.g. FilterBlacklist.pm
221 #$summit = int($summit);#Round up?
222
223 unless (exists $slice{"$seqid"}) {
224 $slice{"$seqid"} = $sa->fetch_by_region(undef, $seqid);
225 }
226
227 if( ($start < 1) || ($end > $slice{"$seqid"}->end)){
228 warn "Feature being ignored due to coordinates out of slice: Region:".
229 $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n";
230 }
231
232 #Gracefully handle errors...
233 my $af;
234 eval{
235 $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new
236 (
237 -slice => $slice{"$seqid"},
238 -start => $start,
239 -end => $end,
240 -strand => 0,
241 -score => $score,
242 -summit => $summit,
243 -feature_set => $fset,
244 );
245 };
246 if($@) { warn($@); next; }
247 if(!$af) { warn("Could not create feature - Region:".
248 $seqid." Start:".$start." End:".$end." Score:".$score.
249 " Summit:".$summit); next; }
250
251 push(@af, $af);
252 }
253 close FILE;
254
255 # Batch store features...
256 if(scalar(@af>0)){
257 $efgdba->get_AnnotatedFeatureAdaptor->store(@af);
258 } else {
259 warn "No significant features detected!";
260 }
261 #Do this on a wrapup runnable...so it will only be visible after reads are loaded...
262 $fset->adaptor->set_imported_states_by_Set($fset);
263
264 # Status should not be set at this stage
265
266 }
267
268
269 #Private getter / setter to the feature set
270 sub _feature_set {
271 return $_[0]->_getter_setter('feature_set',$_[1]);
272 }
273
274 #Private getter / setter to the results file
275 sub _results_file {
276 return $_[0]->_getter_setter('results_file',$_[1]);
277 }
278
279 #Private getter / setter to the size file
280 sub _size_file {
281 return $_[0]->_getter_setter('size_file',$_[1]);
282 }
283
284
285 1;
286