Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunCCAT.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 NAME | |
4 | |
5 Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT | |
6 | |
7 =head1 DESCRIPTION | |
8 | |
9 'RunCCAT' Runs CCAT "broad peak" caller and stores peaks as an annotated feature set. | |
10 Assumes Files are organized and named with a specific convention | |
11 ($repository_folder)/experiment_name/cell-type_feature-type/ | |
12 unless specific files are given | |
13 | |
14 =cut | |
15 | |
16 package Bio::EnsEMBL::Funcgen::RunnableDB::RunCCAT; | |
17 | |
18 use warnings; | |
19 use strict; | |
20 use Bio::EnsEMBL::Funcgen::Utils::Helper; | |
21 use Bio::EnsEMBL::DBSQL::DBAdaptor; | |
22 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; | |
23 use Bio::EnsEMBL::Funcgen::InputSet; | |
24 use Bio::EnsEMBL::Funcgen::DataSet; | |
25 use Bio::EnsEMBL::Funcgen::FeatureSet; | |
26 use Bio::EnsEMBL::Funcgen::AnnotatedFeature; | |
27 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); | |
28 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd); | |
29 | |
30 | |
31 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl'); | |
32 | |
33 sub fetch_input { # fetch and preprocess the input file plus do some more checking | |
34 my $self = shift @_; | |
35 | |
36 $self->SUPER::fetch_input(); | |
37 | |
38 my $efgdba = $self->_efgdba(); | |
39 my $fsa = $efgdba->get_FeatureSetAdaptor(); | |
40 if(!$self->_feature_set($fsa->fetch_by_name($self->_feature_set_name()))){ | |
41 throw "Feature Set was not Created"; | |
42 } | |
43 | |
44 my $bin_dir = $self->_bin_dir(); | |
45 | |
46 my $analysis = $self->_analysis(); | |
47 #Use the definitions that are on the database | |
48 | |
49 my $cell_type = $self->_cell_type()->name; | |
50 my $feature_type = $self->_feature_type()->name; | |
51 my $experiment_name = $self->_experiment_name(); | |
52 | |
53 my $file_type = $self->_file_type(); | |
54 | |
55 if(($file_type ne 'sam') && ($file_type ne 'bed') ){ | |
56 throw "Only sam and bed currently supported for CCAT"; | |
57 } | |
58 | |
59 my $output_dir = $self->_output_dir(); | |
60 | |
61 my $size_file = $output_dir."/".$self->_set_name.".sizes"; | |
62 | |
63 #Get the size file... similar to the sam header... | |
64 open(FILE, $self->_sam_header); | |
65 #Consider having it pregenerated... | |
66 open(SIZES,">".$size_file); | |
67 while(<FILE>){ | |
68 chomp; | |
69 /^(\S+)\s+(\d+)\s+/; | |
70 my $slice = $1; | |
71 my $slice_size = $2; | |
72 if(!$slice || !$slice_size){ throw " Could not process sam header line $_ "; } | |
73 | |
74 #Mouse Hack!! | |
75 next if(($self->_species eq 'mus_musculus') && !($slice =~ /chromosome/)); | |
76 print SIZES $slice."\t".$slice_size."\n"; | |
77 } | |
78 close SIZES; | |
79 close FILE; | |
80 | |
81 $self->_size_file($size_file); | |
82 | |
83 my $input_dir = $self->_input_dir(); | |
84 | |
85 my $file_name = $self->_input_file(); | |
86 my $input_file = $input_dir."/".$file_name; | |
87 if(-e $input_file){ | |
88 my $output_file = $output_dir."/".$file_name; | |
89 if(! $self->param('reenter')){ | |
90 #TODO Validate if existent file is ok. | |
91 $self->_preprocess_file($input_file, $output_file, $file_type) || | |
92 throw "Error processing data file $input_file"; | |
93 } else { | |
94 if(! -e $output_file){ warn "$output_file does not exist. May need to rerun from scratch."; } | |
95 } | |
96 $self->_input_file($output_file); | |
97 | |
98 if(!$self->_results_file($self->param('results_file'))){ | |
99 $self->_results_file($output_file.".".$analysis->logic_name); | |
100 } | |
101 } else { throw "No valid data file was given: ".$input_file; } | |
102 | |
103 #Always require a control file... | |
104 my $control_file = $output_dir."/".$self->_control_file(); | |
105 if(! -e $control_file){ throw "No valid control file was given: ".$control_file; } | |
106 $self->_control_file($control_file); | |
107 | |
108 #May need to convert it... | |
109 if($file_type eq 'sam'){ | |
110 | |
111 my $input_file_bed = $self->_input_file; | |
112 $input_file_bed =~ s/\.sam/\.bed/; | |
113 | |
114 #Mouse hack | |
115 if($self->_species eq 'mus_musculus'){ | |
116 if(! $self->param('reenter')){ | |
117 run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${input_file_bed}"); | |
118 } | |
119 } else { | |
120 if(! $self->param('reenter')){ | |
121 run_system_cmd($bin_dir."/samtools view -Su ".$self->_input_file." | ${bin_dir}/bamToBed >${input_file_bed}"); | |
122 } | |
123 } | |
124 $self->_input_file($input_file_bed); | |
125 | |
126 my $control_file_bed = $self->_control_file; | |
127 $control_file_bed =~ s/\.sam/\.bed/; | |
128 #Mouse Hack | |
129 if($self->_species eq 'mus_musculus'){ | |
130 if(! $self->param('reenter')){ | |
131 run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed | grep 'chromosome' >${control_file_bed}"); | |
132 } | |
133 } else { | |
134 if(! $self->param('reenter')){ | |
135 run_system_cmd($bin_dir."/samtools view -Su ".$self->_control_file." | ${bin_dir}/bamToBed >${control_file_bed}"); | |
136 } | |
137 } | |
138 $self->_control_file($control_file_bed); | |
139 | |
140 } | |
141 | |
142 return 1; | |
143 } | |
144 | |
145 sub run { # call SWEmbl | |
146 my $self = shift @_; | |
147 | |
148 if($self->param('reenter')){ return 1; } | |
149 | |
150 my $analysis = $self->_analysis; | |
151 #<CCAT path>/bin/CCAT <ChIP library read file name> <control library read file name> | |
152 # <chromosome length file name> <config file name> <project name> | |
153 my $bin_dir = $self->_bin_dir(); | |
154 my $command = $bin_dir."/".$analysis->program_file . | |
155 " ".$self->_input_file() . | |
156 " ".$self->_control_file() . | |
157 " ". $self->_size_file() . | |
158 " ".$bin_dir."/ccat_config/".$analysis->parameters . | |
159 " " . $self->_results_file; | |
160 warn "Running analysis:\t$command"; | |
161 run_system_cmd($command); | |
162 | |
163 return 1; | |
164 } | |
165 | |
166 sub write_output { # Store results | |
167 my $self = shift @_; | |
168 | |
169 $self->_parse_result_file(); | |
170 | |
171 return 1; | |
172 } | |
173 | |
174 sub _parse_result_file { | |
175 | |
176 my $self = shift @_; | |
177 | |
178 ### annotated features | |
179 my $fset = $self->_feature_set(); | |
180 | |
181 my $efgdba = $self->_efgdba(); | |
182 my $sa = $efgdba->get_SliceAdaptor(); | |
183 | |
184 #Cache slices and features... | |
185 my %slice; | |
186 my @af; | |
187 | |
188 my %cache_af; | |
189 | |
190 open(FILE,$self->_results_file().".significant.region"); | |
191 while(<FILE>){ | |
192 chomp; | |
193 | |
194 #Content of CCAT output file | |
195 my ($seqid,$summit,$start,$end,$chipreads,$ctrlreads,$fold,$fdr)= split("\t"); | |
196 #Hardcode a minimum fdr... may pass as parameter | |
197 next if ($fdr>0.05); | |
198 my $score = $fold; | |
199 | |
200 #This seqid may vary depending on the input given to SWEmbl... | |
201 # handle it manually at least for the moment... namely the sam seqid... | |
202 #Make sure to test thoroughly to see if it works... | |
203 #e.g. chromosome:GRCh37:15:1:102531392:1 | |
204 if($seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/) { $seqid = $1; } | |
205 #In case UCSC input is used... | |
206 $seqid =~ s/^chr//i; | |
207 | |
208 if($self->param('slice') && ($seqid ne $self->param('slice'))){ | |
209 warn "Feature being ignored as it is not in specified slice ".$self->param('slice')." : Region:". | |
210 $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; | |
211 next; | |
212 } | |
213 | |
214 #May have some sort of repeats(?). Since it is ordered with significance, ignore remaining hits. | |
215 next if(defined($cache_af{$seqid."_".$start})); | |
216 $cache_af{$seqid."_".$start} = 1; | |
217 | |
218 #next if ($seqid =~ m/^M/); | |
219 | |
220 # filtering is done as a post-processing e.g. FilterBlacklist.pm | |
221 #$summit = int($summit);#Round up? | |
222 | |
223 unless (exists $slice{"$seqid"}) { | |
224 $slice{"$seqid"} = $sa->fetch_by_region(undef, $seqid); | |
225 } | |
226 | |
227 if( ($start < 1) || ($end > $slice{"$seqid"}->end)){ | |
228 warn "Feature being ignored due to coordinates out of slice: Region:". | |
229 $seqid." Start:".$start." End:".$end." Score:".$score." Summit:".$summit."\n"; | |
230 } | |
231 | |
232 #Gracefully handle errors... | |
233 my $af; | |
234 eval{ | |
235 $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new | |
236 ( | |
237 -slice => $slice{"$seqid"}, | |
238 -start => $start, | |
239 -end => $end, | |
240 -strand => 0, | |
241 -score => $score, | |
242 -summit => $summit, | |
243 -feature_set => $fset, | |
244 ); | |
245 }; | |
246 if($@) { warn($@); next; } | |
247 if(!$af) { warn("Could not create feature - Region:". | |
248 $seqid." Start:".$start." End:".$end." Score:".$score. | |
249 " Summit:".$summit); next; } | |
250 | |
251 push(@af, $af); | |
252 } | |
253 close FILE; | |
254 | |
255 # Batch store features... | |
256 if(scalar(@af>0)){ | |
257 $efgdba->get_AnnotatedFeatureAdaptor->store(@af); | |
258 } else { | |
259 warn "No significant features detected!"; | |
260 } | |
261 #Do this on a wrapup runnable...so it will only be visible after reads are loaded... | |
262 $fset->adaptor->set_imported_states_by_Set($fset); | |
263 | |
264 # Status should not be set at this stage | |
265 | |
266 } | |
267 | |
268 | |
269 #Private getter / setter to the feature set | |
270 sub _feature_set { | |
271 return $_[0]->_getter_setter('feature_set',$_[1]); | |
272 } | |
273 | |
274 #Private getter / setter to the results file | |
275 sub _results_file { | |
276 return $_[0]->_getter_setter('results_file',$_[1]); | |
277 } | |
278 | |
279 #Private getter / setter to the size file | |
280 sub _size_file { | |
281 return $_[0]->_getter_setter('size_file',$_[1]); | |
282 } | |
283 | |
284 | |
285 1; | |
286 |