Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/RunSWEmbl.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 NAME | |
4 | |
5 Bio::EnsEMBL::Funcgen::RunnableDB::RunSWEmbl | |
6 | |
7 =head1 DESCRIPTION | |
8 | |
9 'RunSWEmbl' Runs SWEmbl peak caller and stores peaks as an annotated feature set. | |
10 Assumes Files are organized and named with a specific convention | |
11 ($repository_folder)/experiment_name/cell-type_feature-type/ | |
12 unless specific files are given | |
13 | |
14 =cut | |
15 | |
16 package Bio::EnsEMBL::Funcgen::RunnableDB::RunSWEmbl; | |
17 | |
18 use warnings; | |
19 use strict; | |
20 use Bio::EnsEMBL::Funcgen::Utils::Helper; | |
21 use Bio::EnsEMBL::DBSQL::DBAdaptor; | |
22 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; | |
23 use Bio::EnsEMBL::Funcgen::InputSet; | |
24 use Bio::EnsEMBL::Funcgen::DataSet; | |
25 use Bio::EnsEMBL::Funcgen::FeatureSet; | |
26 use Bio::EnsEMBL::Funcgen::AnnotatedFeature; | |
27 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); | |
28 #use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (strip_param_args generate_slices_from_names strip_param_flags); | |
29 | |
30 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl'); | |
31 | |
32 #use Data::Dumper; | |
33 | |
# Fetch and pre-process the input data file, and check that everything the
# peak-calling step needs (feature set, analysis settings, control file) is
# in place. Returns 1 on success, throws on any missing prerequisite.
sub fetch_input {
  my $self = shift;

  $self->SUPER::fetch_input();

  # Consider using Bio::EnsEMBL::Helper to debug and/or log to a file.

  # The feature set is created by an earlier pipeline step (e.g. the setup
  # step); refuse to run without it.
  my $efg_db      = $self->_efgdba();
  my $set_adaptor = $efg_db->get_FeatureSetAdaptor();
  if ( !$self->_feature_set( $set_adaptor->fetch_by_name( $self->_feature_set_name() ) ) ) {
    throw "Feature Set was not Created";
  }

  # Use the program and parameter definitions stored on the analysis record.
  my $analysis = $self->_analysis();
  $self->_command( $analysis->program_file );
  $self->_command_options( $analysis->parameters );

  # NOTE(review): fetched as in the original code, but not used further in
  # this method.
  my $cell_type_name    = $self->_cell_type()->name;
  my $feature_type_name = $self->_feature_type()->name;
  my $experiment        = $self->_experiment_name();

  my $file_type  = $self->_file_type();
  my $output_dir = $self->_output_dir();
  my $input_dir  = $self->_input_dir();

  my $file_name  = $self->_input_file();
  my $input_path = "$input_dir/$file_name";

  if ( -e $input_path ) {
    my $output_path = "$output_dir/$file_name";

    # TODO Validate if existent file is ok.
    # TODO Add flag to enable override existent file...
    if ( $self->param('reenter') ) {
      warn "$output_path not found. May need to rerun from scratch"
        if !-e $output_path;
    }
    else {
      $self->_preprocess_file( $input_path, $output_path, $file_type )
        || throw "Error processing data file $input_path";
    }

    # Downstream steps read the pre-processed file, not the raw input.
    $self->_input_file($output_path);

    # Default the results file name unless one was passed in explicitly.
    if ( !$self->_results_file( $self->param('results_file') ) ) {
      $self->_results_file( $output_path . "." . $analysis->logic_name );
    }
  }
  else {
    throw "No valid data file was given: " . $input_path;
  }

  # A control (input) file is required unless explicitly skipped.
  if ( !$self->_skip_control() ) {
    my $control_path = $output_dir . "/" . $self->_control_file();
    if ( !-e $control_path ) {
      throw "No valid control file was given: " . $control_path;
    }
    $self->_control_file($control_path);
  }

  return 1;
}
93 | |
94 | |
95 | |
96 | |
# Run the SWEmbl peak caller on the pre-processed input (and, unless
# skipped, the control file). Returns 1 on success, throws if the file
# format is unsupported or the external command fails.
sub run {
  my $self = shift;

  # Nothing to do when re-entering a previously completed run.
  return 1 if $self->param('reenter');

  # Map the input file type to the corresponding SWEmbl format switch.
  # Ideally input will unify to sam or bam.
  my %format_switch_for = (
    bed => '-B',
    sam => '-S',
    #maq   => '-M',
    #eland => '-E',
    #bam   => '-?',
  );

  my $switch = $format_switch_for{ $self->_file_type() };
  throw("Could not identify valid SWEmbl input format") unless $switch;

  # Assemble the full command line from binary, format switch, input,
  # analysis options and output location.
  my $cmd = join ' ',
    $self->_bin_dir() . "/" . $self->_command(),
    $switch, '-z', '-i', $self->_input_file(),
    $self->_command_options(),
    '-o', $self->_results_file();

  $cmd .= " -r " . $self->_control_file()
    if $self->_control_file() && !$self->_skip_control();

  # warn "Running analysis:\t$cmd";
  # system() returns non-zero on failure.
  throw("FAILED to run $cmd") if system($cmd);

  return 1;
}
132 | |
# Store the SWEmbl results: parse the results file and write the peaks to
# the database as annotated features. Returns 1.
sub write_output {
  my $self = shift;

  # Registering the experiment in the db is now handled in
  # SetupPeaksPipeline.

  # TODO Add an option to only process certain slices, e.g.
  #   $self->_parse_result_file(@slices);
  $self->_parse_result_file();

  # NOTE: an earlier idea (kept as future work) was to load reads only
  # after peak calling: count reads in the input file, derive the relevant
  # slices, fan out one LoadReads job per slice via dataflow_output_id, and
  # semaphore a wrap-up job on the whole fan.

  return 1;
}
168 | |
169 | |
170 | |
# Parse the SWEmbl results file and batch-store the peaks as
# AnnotatedFeatures on this run's feature set.
#
# Each non-header line is whitespace-split; the fields used are the seq
# region id, start, end, score (col 7) and summit (col 10). Features on
# unknown slices, with coordinates outside the slice, or outside an
# explicitly requested 'slice' parameter are skipped with a warning.
sub _parse_result_file {
  my $self = shift;

  ### annotated features
  my $fset = $self->_feature_set();

  my $efgdba = $self->_efgdba();
  my $sa     = $efgdba->get_SliceAdaptor();

  # Cache slices by seq-region name and collect features for one batch store.
  my %slice;
  my @af;

  my $results_file = $self->_results_file();

  # Three-arg open with a lexical filehandle (the original used the unsafe
  # two-arg form with a bareword handle).
  open( my $results_fh, '<', $results_file )
    or throw "Could not open results file : " . $results_file;

  LINE: while ( my $line = <$results_fh> ) {
    chomp $line;

    # Ignore headers
    next LINE if ( $line =~ /^#/ || $line =~ /^Region/ );

    # Parse SWEmbl output here... not much sense in making a parser,
    # unless SWEmbl output is used elsewhere (or becomes more complex).
    my ( $seqid, $start, $end, undef, undef, undef, $score, undef, undef, $summit )
      = split( /\s+/, $line );

    # The seqid varies with the input given to SWEmbl; reduce a full Ensembl
    # slice name (e.g. chromosome:GRCh37:15:1:102531392:1) to the bare
    # seq-region name.
    if ( $seqid =~ /^\S*:\S*:(\S+):\S+:\S+:\S/ ) { $seqid = $1; }

    # In case UCSC input is used... careful: names may not match the ensembl db!
    $seqid =~ s/^chr//i;

    if ( $self->param('slice') && ( $seqid ne $self->param('slice') ) ) {
      warn "Feature being ignored as it is not in specified slice "
        . $self->param('slice') . " : Region:" . $seqid . " Start:" . $start
        . " End:" . $end . " Score:" . $score . " Summit:" . $summit . "\n";
      next LINE;
    }

    # Blacklist filtering is done as post-processing, e.g. FilterBlacklist.pm
    $summit = int($summit);    # Round up?

    unless ( exists $slice{$seqid} ) {
      $slice{$seqid} = $sa->fetch_by_region( undef, $seqid );
    }

    # BUGFIX: fetch_by_region returns undef for names that are not valid
    # Ensembl slices (e.g. special contigs); the original then died calling
    # ->end on undef below. Skip such features instead.
    if ( !defined $slice{$seqid} ) {
      warn "Feature being ignored as Region:" . $seqid
        . " is not a valid slice: Start:" . $start . " End:" . $end
        . " Score:" . $score . " Summit:" . $summit . "\n";
      next LINE;
    }

    if ( ( $start < 1 ) || ( $end > $slice{$seqid}->end ) ) {
      warn "Feature being ignored due to coordinates out of slice: Region:"
        . $seqid . " Start:" . $start . " End:" . $end . " Score:" . $score
        . " Summit:" . $summit . "\n";
      # BUGFIX: the warning claims the feature is ignored, but the original
      # code fell through and stored it anyway.
      next LINE;
    }

    # Gracefully handle feature-construction errors.
    my $af;
    eval {
      $af = Bio::EnsEMBL::Funcgen::AnnotatedFeature->new(
        -slice       => $slice{$seqid},
        -start       => $start,
        -end         => $end,
        -strand      => 0,
        -score       => $score,
        -summit      => $summit,
        -feature_set => $fset,
      );
    };
    if ($@) { warn($@); next LINE; }
    if ( !$af ) {
      warn(   "Could not create feature - Region:" . $seqid . " Start:" . $start
            . " End:" . $end . " Score:" . $score . " Summit:" . $summit );
      next LINE;
    }

    push( @af, $af );
  }
  close $results_fh;

  # Batch store features...
  $efgdba->get_AnnotatedFeatureAdaptor->store(@af);

  # Ideally done on a wrap-up runnable, so states only become visible after
  # reads are loaded...
  $fset->adaptor->set_imported_states_by_Set($fset);

  return;
}
277 | |
278 | |
279 #Private getter / setter to the feature set | |
# Private getter/setter for the FeatureSet the peaks are stored on.
sub _feature_set {
  my ( $self, $value ) = @_;
  return $self->_getter_setter( 'feature_set', $value );
}
283 | |
284 #Private getter / setter to the command to be executed | |
# Private getter/setter for the peak-caller command to be executed.
sub _command {
  my ( $self, $value ) = @_;
  return $self->_getter_setter( 'command', $value );
}
288 | |
289 #Private getter / setter to the command options | |
# Private getter/setter for the command-line options of the peak caller.
sub _command_options {
  my ( $self, $value ) = @_;
  return $self->_getter_setter( 'command_options', $value );
}
293 | |
294 #Private getter / setter to the results file | |
# Private getter/setter for the path of the SWEmbl results file.
sub _results_file {
  my ( $self, $value ) = @_;
  return $self->_getter_setter( 'results_file', $value );
}
298 | |
299 1; | |
300 |