Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Nimblegen.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # | |
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Nimblegen | |
3 # | |
4 | |
5 =head1 LICENSE | |
6 | |
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
8 Genome Research Limited. All rights reserved. | |
9 | |
10 This software is distributed under a modified Apache license. | |
11 For license details, please see | |
12 | |
13 http://www.ensembl.org/info/about/code_licence.html | |
14 | |
15 =head1 CONTACT | |
16 | |
17 Please email comments or questions to the public Ensembl | |
18 developers list at <ensembl-dev@ebi.ac.uk>. | |
19 | |
20 Questions may also be sent to the Ensembl help desk at | |
21 <helpdesk@ensembl.org>. | |
22 | |
23 =head1 NAME | |
24 | |
25 Bio::EnsEMBL::Funcgen::Parsers::Nimblegen | |
26 | |
27 =head1 SYNOPSIS | |
28 | |
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Nimblegen"; | |
30 push @INC, $parser_type; | |
31 my $imp = $class->SUPER::new(@_); | |
32 | |
33 | |
34 =head1 DESCRIPTION | |
35 | |
36 This is a parser class which should not be instatiated directly, it | |
37 normally set by the Importer as the parent class. Nimblegen contains meta | |
38 data and methods specific to NimbleGen arrays to aid parsing and importing of | |
39 experimental data. | |
40 | |
41 =cut | |
42 | |
43 package Bio::EnsEMBL::Funcgen::Parsers::Nimblegen; | |
44 | |
45 use Bio::EnsEMBL::Funcgen::Array; | |
46 use Bio::EnsEMBL::Funcgen::ProbeSet; | |
47 use Bio::EnsEMBL::Funcgen::Probe; | |
48 use Bio::EnsEMBL::Funcgen::ProbeFeature; | |
49 use Bio::EnsEMBL::Funcgen::FeatureType; | |
50 use Bio::EnsEMBL::Funcgen::ExperimentalChip; | |
51 use Bio::EnsEMBL::Funcgen::ArrayChip; | |
52 use Bio::EnsEMBL::Funcgen::Channel; | |
53 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); | |
54 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file); | |
55 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
56 use Bio::EnsEMBL::Funcgen::Parsers::MAGE; | |
57 | |
58 use strict; | |
59 | |
60 use vars qw(@ISA); | |
61 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::MAGE); | |
62 | |
63 =head2 new | |
64 | |
65 Example : my $self = $class->SUPER::new(@_); | |
66 Description: Constructor method for Nimblegen class | |
67 Returntype : Bio::EnsEMBL::Funcgen::Parser::Nimblegen | |
68 Exceptions : throws if Experiment name not defined or if caller is not Importer | |
69 Caller : Bio::EnsEMBL::Funcgen::Importer | |
70 Status : at risk | |
71 | |
72 =cut | |
73 | |
74 | |
75 sub new{ | |
76 my $caller = shift; | |
77 | |
78 my $class = ref($caller) || $caller; | |
79 my $self = $class->SUPER::new(@_); | |
80 | |
81 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); | |
82 | |
83 #should we provide args override for all of these? | |
84 | |
85 $self->{'config'} = | |
86 {( | |
87 #order of these data arrays is important! | |
88 #Remove these method arrays, snd just run them serially? | |
89 array_data => ['experiment'],#Rename this!! | |
90 probe_data => ["probe"], | |
91 results_data => ["and_import_results"], | |
92 sample_key_fields => ['DESIGN_ID', 'CHIP_ID', 'DYE', 'PROMOT_SAMPLE_TYPE'],# 'SAMPLE_LABEL'],label now optional | |
93 # 'SAMPLE_DESCRIPTION removed due to naming disparities | |
94 ndf_fields => ['CONTAINER', 'PROBE_SEQUENCE', 'MISMATCH','FEATURE_ID', 'PROBE_ID'],#MISMATCH is always 0! | |
95 pos_fields => ['CHROMOSOME', 'PROBE_ID', 'POSITION', 'COUNT'], | |
96 result_fields => ['PROBE_ID', 'PM', 'X', 'Y'], | |
97 notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'], | |
98 norm_method => 'VSN_GLOG', | |
99 dye_freqs => {( | |
100 Cy5 => 635, | |
101 Cy3 => 532, | |
102 )}, | |
103 | |
104 | |
105 #Need to make these definable? | |
106 #have protocolfile arg and just parse tab2mage protocol section format | |
107 protocols => {( | |
108 grow => {( | |
109 accession => 'GROW_NIMB', | |
110 name => 'GROW NIMBLEGEN CULTURE CONDITIONS', | |
111 text => 'Nimblegen culture conditions description here. Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', | |
112 paramters => undef, | |
113 )}, | |
114 treatment => {( | |
115 accession => 'CROSSLINK_NIMB', | |
116 name => 'NIMBLEGEN CHROMATIN PREPARATION', | |
117 text => 'Nimblegen X-linking and DNA extraction protocol.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', | |
118 paramters => undef, | |
119 )}, | |
120 extraction => {( | |
121 accession => 'CHROMATIN_IP_NIMB', | |
122 name => 'NIMBLEGEN CHROMATIN IMMUNOPRECIPITATION and DNA RECOVERY', | |
123 text => 'Nimblegen chromatin immunoprecipitation and DNA extraction protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', | |
124 paramters => undef, | |
125 )}, | |
126 labeling => {( | |
127 accession => 'LABELLING_NIMB', | |
128 name => 'NIMBLEGEN LABELLING', | |
129 text => 'Nimblegen labelling protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', | |
130 paramteres => undef, | |
131 )}, | |
132 hybridization => {( | |
133 accession => 'HYBRIDISATION_NIMB', | |
134 name => 'NIMBLEGEN HYBRIDISATION', | |
135 text => 'Nimblegen chip hybridisation protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', | |
136 parameters => undef, | |
137 )}, | |
138 scanning => {( | |
139 accession => 'SCANNING_NIMB', | |
140 name => 'NIMBLESCAN', | |
141 text => 'Nimblegen Nimblescan protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.', | |
142 paramters => undef, | |
143 )}, | |
144 )}, | |
145 | |
146 )}; | |
147 | |
148 return $self; | |
149 } | |
150 | |
151 | |
152 =head2 set_config | |
153 | |
154 Example : my $self->set_config; | |
155 Description: Sets attribute dependent config | |
156 Returntype : None | |
157 Exceptions : None | |
158 Caller : Bio::EnsEMBL::Funcgen::Importer | |
159 Status : at risk | |
160 | |
161 =cut | |
162 | |
163 | |
164 sub set_config{ | |
165 my $self = shift; | |
166 | |
167 #This should be general config for all types of import | |
168 #dirs are not set in config to enable generic get_dir method access | |
169 #This is really just setting paths rather than config rename? | |
170 | |
171 #This is generic for all imports | |
172 | |
173 | |
174 | |
175 if($self->{'old_dvd_format'}){ | |
176 $self->{'design_dir'} = $self->get_dir('input').'/DesignFiles'; | |
177 }else{ | |
178 $self->{'design_dir'} = $self->get_dir('input').'/Design_information'; | |
179 } | |
180 | |
181 | |
182 if($self->{'old_dvd_format'}){ | |
183 $self->{'config'}{'notes_file'} = $self->get_dir('input').'/DesignNotes.txt'; | |
184 }else{ | |
185 $self->{'config'}{'notes_file'} = $self->get_dir('design').'/DesignNotes.txt'; | |
186 } | |
187 | |
188 $self->{'config'}{'chip_file'} = $self->get_dir('input').'/SampleKey.txt'; | |
189 | |
190 | |
191 | |
192 #Experiment(output) specific | |
193 #This should already be set in the run script | |
194 #As we could get log write errors before we have created the output dir otherwise | |
195 $self->{'output_dir'} ||= $self->get_dir("data").'/output/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name(); | |
196 | |
197 $self->{'config'}{'tab2mage_file'} = $self->get_dir('output').'/E-TABM-'.$self->name().'.txt'; | |
198 | |
199 $self->{'config'}{'mage_xml_file'} = $self->get_dir('output').'/{UNASSIGNED}.xml'; | |
200 | |
201 if($self->{'old_dvd_format'}){ | |
202 $self->{'results_dir'} = $self->get_dir('input').'/PairData'; | |
203 }else{ | |
204 $self->{'results_dir'} = $self->get_dir('input').'/Raw_data_files'; | |
205 } | |
206 | |
207 return; | |
208 } | |
209 | |
210 | |
211 | |
212 | |
213 =head2 read_array_data | |
214 | |
215 Example : $imp->read_array_data(); | |
216 Description: Parses NimbleGen DesignNotes.txt files to create and store new Arrays | |
217 Returntype : none | |
218 Exceptions : None | |
219 Caller : general | |
220 Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats? | |
221 | |
222 =cut | |
223 | |
224 | |
225 sub read_array_data{ | |
226 my ($self, $notes_file) = @_; | |
227 | |
228 | |
229 $notes_file ||= $self->get_config('notes_file'); | |
230 my ($line, $array, $array_chip, @data, %hpos); | |
231 my $oa_adaptor = $self->db->get_ArrayAdaptor(); | |
232 my $ac_adaptor = $self->db->get_ArrayChipAdaptor(); | |
233 | |
234 #Slurp file to string, sets local delimtter to null and subs new lines | |
235 my $fh = open_file($notes_file); | |
236 #($design_desc = do { local ($/); <$fh>;}) =~ s/\r*\n$//; | |
237 #close($fh); | |
238 | |
239 #Would be better if we only import the design info for the chips listed in the SampleKey.txt file | |
240 #Some cyclical dependency going on here :| | |
241 | |
242 while ($line = <$fh>){ | |
243 | |
244 $line =~ s/\r*\n//;#chump | |
245 @data = split/\t/o, $line; | |
246 | |
247 #We need to have a DESIGN vendor type? | |
248 #also need to be able to set file path independently of config | |
249 | |
250 if($. == 1){ | |
251 %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))}; | |
252 next; | |
253 } | |
254 | |
255 ### CREATE AND STORE Array and ArrayChips | |
256 if(! defined $array ){ | |
257 #This is treating each array chip as a separate array, unless arrayset is defined | |
258 #AT present we have no way of differentiating between different array_chips on same array???!!! | |
259 #Need to add functionality afterwards to collate array_chips into single array | |
260 | |
261 #This will use a stored array if present | |
262 | |
263 $array = Bio::EnsEMBL::Funcgen::Array->new | |
264 ( | |
265 -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}], | |
266 -FORMAT => uc($self->format()), | |
267 -VENDOR => uc($self->vendor()), | |
268 -TYPE => 'OLIGO', | |
269 -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here | |
270 ); | |
271 | |
272 ($array) = @{$oa_adaptor->store($array)}; | |
273 | |
274 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( | |
275 -ARRAY_ID => $array->dbID(), | |
276 -NAME => $data[$hpos{'DESIGN_NAME'}], | |
277 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], | |
278 #add description? | |
279 ); | |
280 | |
281 #This will use a stored array_chip if present | |
282 ($array_chip) = @{$ac_adaptor->store($array_chip)}; | |
283 $array->add_ArrayChip($array_chip); | |
284 | |
285 } | |
286 elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){ | |
287 | |
288 $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].") for same Array:\t".$array->name()."\n"); | |
289 | |
290 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( | |
291 -ARRAY_ID => $array->dbID(), | |
292 -NAME => $data[$hpos{'DESIGN_NAME'}], | |
293 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], | |
294 ); | |
295 | |
296 ($array_chip) = @{$ac_adaptor->store($array_chip)}; | |
297 $array->add_ArrayChip($array_chip); | |
298 | |
299 } | |
300 elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){ | |
301 throw("Found experiment with more than one design without -array_set"); | |
302 } | |
303 } | |
304 | |
305 | |
306 $self->add_Array($array); | |
307 | |
308 close($fh); | |
309 | |
310 return; | |
311 | |
312 } | |
313 | |
314 | |
315 =head2 read_experiment_data | |
316 | |
317 Example : $imp->read_array_chip_data(); | |
318 Description: Parses and imports array & experimental chip meta data/objects | |
319 Returntype : none | |
320 Exceptions : throws if more than one array/design found and not an "array set" | |
321 Caller : Importer | |
322 Status : At risk | |
323 | |
324 =cut | |
325 | |
326 sub read_experiment_data{ | |
327 my $self = shift; | |
328 | |
329 $self->read_array_data(); | |
330 | |
331 my $t2m_file = $self->init_tab2mage_export() if $self->{'write_mage'}; | |
332 | |
333 | |
334 my ($design_desc, $line, $tmp_uid, $channel, $echip, $sample_label); | |
335 my ($sample_desc, %hpos, @data, %uid_reps, %did_reps, %sample_reps); | |
336 my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor(); | |
337 my $chan_adaptor = $self->db->get_ChannelAdaptor(); | |
338 my $br_cnt = 1; | |
339 my $tr_cnt = 1; | |
340 | |
341 #Currently 1 design = 1 chip = 1 array /DVD | |
342 #Different designs are not currently collated into a chip_set/array in any ordered manner | |
343 #Register each design as an array and an array_chip | |
344 #May want to group array_chips into array/chip sets by association though the API | |
345 | |
346 | |
347 warn("Harcoded for one array(can have multiple chips from the same array) per experiment\n"); | |
348 my $fh = open_file($self->get_config("chip_file")); | |
349 $self->log("Reading chip data"); | |
350 | |
351 | |
352 #warn "Do we need to validate each line here against the header array?"; | |
353 | |
354 while ($line = <$fh>){ | |
355 next if $line =~ /^\s+\r*\n/; | |
356 $line =~ s/\r*\n//;#chump | |
357 @data = split/\t/o, $line; | |
358 #we could validate line against scalar of header array | |
359 #ORD_ID CHIP_ID DYE DESIGN_NAME DESIGN_ID SAMPLE_LABEL SAMPLE_SPECIES SAMPLE_DESCRIPTION TISSUE_TREATMENT PROMOT_SAMPLE_TYPE | |
360 if ($. == 1){ | |
361 %hpos = %{$self->set_header_hash(\@data, $self->get_config('sample_key_fields'))}; | |
362 | |
363 #we need to set the sample description field name, as it can vary :((( | |
364 @data = grep(/SAMPLE_DESCRIPTION/, keys %hpos); | |
365 $sample_desc = $data[0]; | |
366 | |
367 throw("More than one sample description(@data) in ".$self->get_config("chip_file")."\n") if(scalar @data >1); | |
368 next; | |
369 } | |
370 | |
371 #Need to handle array class here i.e. two channel arrays will have two lines | |
372 #validate species here | |
373 #look up alias from registry and match to self->species | |
374 #registry may not be loaded for local installation | |
375 | |
376 | |
377 ### CREATE AND STORE ExperimentalChips | |
378 if ((! $tmp_uid) || ($data[$hpos{'CHIP_ID'}] ne $tmp_uid)){ | |
379 | |
380 #Test both channels are available, i.e. the SampleKey has two TOTAL channels | |
381 if($echip){ | |
382 | |
383 for my $type('TOTAL', 'EXPERIMENTAL'){ | |
384 | |
385 my $test_chan = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID()); | |
386 throw("ExperimentalChip(".$echip->unique_id(). | |
387 ") does not have a $type channel, please check the SampleKey.txt file") if ! $test_chan; | |
388 | |
389 } | |
390 } | |
391 | |
392 $tmp_uid = $data[$hpos{'CHIP_ID'}]; | |
393 $echip = $ec_adaptor->fetch_by_unique_id_vendor($data[$hpos{'CHIP_ID'}], 'NIMBLEGEN'); | |
394 | |
395 if($echip){ | |
396 | |
397 if(! $self->recovery()){ | |
398 throw("ExperimentalChip(".$echip->unqiue_id(). | |
399 " already exists in the database\nMaybe you want to recover?"); | |
400 } | |
401 }else{ | |
402 | |
403 $echip = Bio::EnsEMBL::Funcgen::ExperimentalChip->new | |
404 ( | |
405 -EXPERIMENT_ID => $self->experiment->dbID(), | |
406 -DESCRIPTION => $data[$hpos{$sample_desc}], | |
407 -FEATURE_TYPE => $self->feature_type, | |
408 -CELL_TYPE => $self->cell_type, | |
409 -ARRAY_CHIP_ID => $self->arrays->[0]->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])->dbID(), | |
410 -UNIQUE_ID => $data[$hpos{'CHIP_ID'}], | |
411 #-BIOLOGICAL_REPLICATE => , | |
412 #-TECHNICAL_REPLICATE => , | |
413 ); | |
414 | |
415 ($echip) = @{$ec_adaptor->store($echip)}; | |
416 $self->experiment->add_ExperimentalChip($echip); | |
417 } | |
418 } | |
419 | |
420 | |
421 ### CREATE AND STORE Channels | |
422 my $type = uc($data[$hpos{'PROMOT_SAMPLE_TYPE'}]); | |
423 my $sample_label = (! exists $hpos{'SAMPLE_LABEL'}) ? '' : $data[$hpos{'SAMPLE_LABEL'}]; | |
424 | |
425 | |
426 $type = 'TOTAL' if ($type ne 'EXPERIMENTAL'); | |
427 $channel = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID()); | |
428 | |
429 if($channel){ | |
430 if(! $self->recovery()){ | |
431 throw("Channel(".$echip->unqiue_id().":".uc($data[$hpos{'PROMOT_SAMPLE_TYPE'}]). | |
432 " already exists in the database\nMaybe you want to recover?"); | |
433 }else{ | |
434 #push @{$self->{'_rollback_ids'}}, $channel->dbID(); | |
435 #No point in doing this as all Channels mey be pre-registered in recovery mode | |
436 #Hence all will be rolled back | |
437 } | |
438 }else{ | |
439 #Handles single/mutli | |
440 | |
441 $channel = Bio::EnsEMBL::Funcgen::Channel->new | |
442 ( | |
443 -EXPERIMENTAL_CHIP_ID => $echip->dbID(), | |
444 -DYE => $data[$hpos{'DYE'}], | |
445 -SAMPLE_ID => $sample_label, | |
446 -TYPE => $type, | |
447 ); | |
448 | |
449 #-SPECIES => $self->species(),#on channel/sample to enable multi-species chip/experiment | |
450 #would never happen on one chip? May happen between chips in one experiment | |
451 | |
452 ($channel) = @{$chan_adaptor->store($channel)}; | |
453 | |
454 } | |
455 | |
456 #we need to build the channel level tab2mage line here | |
457 | |
458 #For each BR there will be two sample_labels, one for each channel | |
459 #These will be used across multiple chips. | |
460 #If two chips the same design ID and the same sample labels, then we have a technical replicate | |
461 #else if they have different sample labels then we have another biological replicate | |
462 #We have a problem of associating channels to the same BR with differing sample labels | |
463 #This is solved by checking whether the chip ID has already been registered in a BR | |
464 | |
465 #This fails if more than one sample label is used for any given BR | |
466 #This will result in the BR being split into the number of unique sample label pairs(Experimental/Control channel) | |
467 #This also fails if the same sample label has been used for two different BRs | |
468 | |
469 if($self->{'write_mage'}){ | |
470 #my $sample_name = ($sample_label eq '') ? '???' : substr($sample_label, 0, (length($sample_label)-1)); | |
471 my $ctype_name = (defined $self->cell_type()) ? $self->cell_type->name() : '???'; | |
472 my $ftype_name = (defined $self->feature_type()) ? $self->feature_type->name() : '???'; | |
473 my $ctype_desc = (defined $self->cell_type()) ? $self->cell_type->description() : '???'; | |
474 | |
475 #define reps | |
476 | |
477 | |
478 #we need one to get the biorep based on the sample label and making sure the unique ID are the same | |
479 #we need to define the tech rep by matching the sample label and the making sure the design_id isn't already used | |
480 | |
481 #Is this doing the BR assignment properly? | |
482 | |
483 if(exists $sample_reps{$sample_label}){#Found chip in a previously seen BR | |
484 #Register the BR of this chip ID | |
485 $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'} = $sample_reps{$sample_label}; | |
486 | |
487 } | |
488 elsif(exists $uid_reps{$data[$hpos{'CHIP_ID'}]}){#Found the other channel | |
489 $sample_reps{$sample_label} = $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'}; | |
490 } | |
491 else{#assign new br | |
492 $sample_reps{$sample_label} = $br_cnt; #Assign BR to sample label | |
493 $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'} = $br_cnt; #Assign BR to chip id | |
494 $br_cnt++; | |
495 } | |
496 | |
497 | |
498 | |
499 #Something is going awry here. The TR is not being reset for some new BRs | |
500 | |
501 if(! exists $uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'}){ | |
502 #we only assign a new tr here if this design has not been seen in any of the reps | |
503 #i.e. we need to get the first tr which does not contain this design_id | |
504 | |
505 my $create_rep = 1; | |
506 my $tr; | |
507 my @chip_ids; | |
508 my $br = $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'}; | |
509 | |
510 foreach my $chip_id(keys %uid_reps){ | |
511 | |
512 push @chip_ids, $chip_id if($uid_reps{$chip_id}{'br'} == $br); | |
513 } | |
514 | |
515 #This is looping through all the TRs for all the design IDs | |
516 | |
517 foreach my $rep(sort keys %did_reps){ | |
518 #Doesn't exist for the given BR? | |
519 #So we need to get all the chip_ids for a given br | |
520 #Check wether it exists and wether it exists in did_reps and check wether the chip_id value is part of the BR set | |
521 #else we add it | |
522 | |
523 if(! exists $did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]}){ | |
524 #Not seen in a TR of this $rep yet | |
525 $create_rep = 0; | |
526 }elsif(! grep(/$did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]}/, @chip_ids)){ | |
527 #Not seen in this BR with this TR $rep | |
528 $create_rep = 0; | |
529 } | |
530 | |
531 if(! $create_rep){ | |
532 #Design ID not seen so add to this TR | |
533 $did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]} = $data[$hpos{'CHIP_ID'}]; #don't really need to assign this | |
534 $tr = $rep; | |
535 last;#do not remove this or we get wierd TR incrementing | |
536 } | |
537 } | |
538 | |
539 | |
540 if($create_rep){ | |
541 #Get the next TR value for this given BR | |
542 my @trs; | |
543 | |
544 foreach my $rep(keys %did_reps){ | |
545 | |
546 foreach my $chip_id(values %{$did_reps{$rep}}){ | |
547 | |
548 #Push TR if chip_id is present in this BR | |
549 push @trs, $rep if(grep(/$chip_id/, @chip_ids)); | |
550 } | |
551 } | |
552 ($tr) = sort {$b<=>$a} @trs; | |
553 | |
554 $tr ||=0; | |
555 $tr++; | |
556 | |
557 #register design ID to chip ID mapping for this TR | |
558 $did_reps{$tr}{$data[$hpos{'DESIGN_ID'}]} = $data[$hpos{'CHIP_ID'}]; | |
559 } | |
560 | |
561 #register TR for this chip ID | |
562 $uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'} = $tr; | |
563 } | |
564 | |
565 my $br = $self->experiment->name().'_BR'. $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'}; | |
566 my $tr = $br.'_TR'.$uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'}; | |
567 | |
568 | |
569 #File[raw] | |
570 my $tsm_line = $echip->unique_id().'_'.$self->get_config('dye_freqs')->{$data[$hpos{'DYE'}]}.'_pair.txt'; | |
571 #Array[accession] # Should this be left blank for AE accession? | |
572 $tsm_line .= "\t".$data[$hpos{'DESIGN_ID'}]; | |
573 #Array[serial] | |
574 $tsm_line .= "\t".$echip->unique_id(); | |
575 | |
576 #Protocol(s)[grow][treatment][extraction][labelling][hybridisation][scanning] | |
577 foreach my $protocol(sort (keys %{$self->get_config('protocols')})){ | |
578 $tsm_line .= "\t".$self->get_config('protocols')->{$protocol}->{'accession'}; | |
579 } | |
580 | |
581 | |
582 #BioSource | |
583 $tsm_line .= "\t$ctype_name"; | |
584 #Sample | |
585 $tsm_line .= "\t$br"; | |
586 #Extract | |
587 $tsm_line .= "\t$tr"; | |
588 | |
589 #LabeledExtract & Immunoprecipitate | |
590 if($type eq 'EXPERIMENTAL'){ | |
591 $tsm_line .= "\t$sample_label - IP of $tr with anti $ftype_name (Ab vendor, Ab ID)"; | |
592 $tsm_line .= "\t$tr IP"; | |
593 }else{ | |
594 $tsm_line .= "\t$sample_label - Input control DNA of $tr\t"; | |
595 } | |
596 | |
597 #Hybridization | |
598 #U2OS BR1_TR1 ChIP H3KAc 46092 hyb | |
599 $tsm_line .= "\t$ctype_name $tr ChIP $ftype_name ".$echip->unique_id().' hyb'; | |
600 | |
601 #BioSourceMaterial SampleMaterial ExtractMaterial LabeledExtractMaterial | |
602 $tsm_line .= "\tcell\tgenomic_DNA\tgenomic_DNA\tsynthetic_DNA"; | |
603 | |
604 #Dye | |
605 $tsm_line .= "\t".$data[$hpos{'DYE'}]; | |
606 | |
607 #BioMaterialCharacteristics[Organism] | |
608 $tsm_line .= "\t".$self->species(); | |
609 | |
610 #BioMaterialCharacteristics[BioSourceType] | |
611 $tsm_line .= "\tfrozen_sample"; | |
612 | |
613 #BioMaterialCharacteristics[StrainOrLine] | |
614 $tsm_line .= "\t$ctype_name"; | |
615 | |
616 #BioMaterialCharacteristics[CellType] | |
617 $tsm_line .= "\t$ctype_name"; | |
618 | |
619 #BioMaterialCharacteristics[Sex] | |
620 $tsm_line .= "\t???"; | |
621 #FactorValue[StrainOrLine] | |
622 $tsm_line .= "\t$ctype_name"; | |
623 #FactorValue[Immunoprecipitate] | |
624 $tsm_line .= ($type eq 'EXPERIMENTAL') ? "\tanti-${ftype_name} antibody\n" : "\t\n"; | |
625 | |
626 print $t2m_file $tsm_line; | |
627 | |
628 } | |
629 | |
630 } | |
631 | |
632 close($t2m_file) if $self->{'write_mage'}; | |
633 close($fh); | |
634 | |
635 return; | |
636 } | |
637 | |
638 | |
639 | |
640 | |
641 =head2 read_probe_data | |
642 | |
643 Example : $imp->read_probe_data(); | |
644 Description: Parses and imports probes, probe sets and features of a given array | |
645 No duplicate handling or probe caching is performed due to memory | |
646 issues, this is done in resolve_probe_data. | |
647 Returntype : none | |
648 Exceptions : none | |
649 Caller : Importer | |
650 Status : Medium | |
651 | |
652 =cut | |
653 | |
654 | |
655 #Assumes one chip_design per experimental set. | |
656 sub read_probe_data{ | |
657 my ($self) = shift; | |
658 | |
659 my ($fh, $line, @data, %hpos, %probe_pos);#, %duplicate_probes); | |
660 $self->log("Parsing and importing ".$self->vendor()." probe data (".localtime().")", 1); | |
661 | |
662 ### Read in | |
663 #ndf file: probe_set, probe and probe_feature(.err contains multiple mappings) | |
664 #pos file: probe chromosome locations | |
665 | |
666 #Need to change how probe_names are generated for nimblegen? | |
667 #native probe_ids may not be unique, but should be when combined with the seq_id which is currently being used as the xref_id | |
668 #Handle with API!! | |
669 | |
670 #READ REGION POSITIONS | |
671 #We need to handle different coord systems and possibly different assmemblies | |
672 my $slice_a = $self->db->get_SliceAdaptor(); | |
673 my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name('chromosome'); | |
674 | |
675 | |
676 #TIED FILE CACHE!!! | |
677 #We need to rebuild the cache from the DB before we start adding new probe info | |
678 #We only need to rebuild cache if we find a chip that hasn't been imported? | |
679 #No, we just need to import without cache, then re-do the resolve step | |
680 #Are we still going to get disconnects when we dump the cache? | |
681 | |
682 | |
683 #warn "Read probe data should only read in the array chips which are specified by the ExperimentalChip? Not just what is present in the DesignNotes file?"; | |
684 | |
685 | |
686 foreach my $array(@{$self->arrays()}){ | |
687 | |
688 foreach my $achip(@{$array->get_ArrayChips()}){ | |
689 | |
690 my (@log, %probe_pos, $fasta_file, $f_out); | |
691 #do we need to fetch probe by seq and array? | |
692 #this would also id non-unique seqs in design | |
693 | |
694 #warn "We need to account for different cs feature amppings here | |
695 | |
696 if($achip->has_status('IMPORTED')){ | |
697 $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id()); | |
698 next; | |
699 }elsif($self->recovery()){ | |
700 $self->log("Rolling back ArrayChip:\t".$achip->design_id()); | |
701 $self->rollback_ArrayChips([$achip]); | |
702 } | |
703 | |
704 $self->log("Importing ArrayChip:".$achip->design_id()); | |
705 | |
706 #Always use pos file, ndf file cannot be guranteed to contain all location info | |
707 #pos file also gives a key to which probes should be considered 'EXPERIMENTAL' | |
708 | |
709 #CACHE PROBE POSITIONS | |
710 $fh = open_file($self->get_dir("design")."/".$achip->name().".pos"); | |
711 | |
712 #don't % = map ! Takes a lot longer than a while ;) | |
713 while($line = <$fh>){ | |
714 $line =~ s/\r*\n//o;#Not using last element | |
715 @data = split/\t/o, $line; | |
716 | |
717 #SEQ_ID CHROMOSOME PROBE_ID POSITION COUNT | |
718 if ($. == 1){ | |
719 %hpos = %{$self->set_header_hash(\@data, $self->get_config('pos_fields'))}; | |
720 next; | |
721 } | |
722 | |
723 #Skip probe if there is a duplicate | |
724 if(exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){ | |
725 | |
726 if($data[$hpos{'CHROMOSOME'}] eq $probe_pos{$data[$hpos{'PROBE_ID'}]}->{chr} && | |
727 ($data[$hpos{'POSITION'}]+1) eq $probe_pos{$data[$hpos{'PROBE_ID'}]}->{start}){ | |
728 #log or warn here? | |
729 | |
730 #Not matching probe length here | |
731 | |
732 next; | |
733 #Do we need to skip this in the ndf file too? | |
734 | |
735 } | |
736 else{ | |
737 throw("Found duplicate mapping for ".$data[$hpos{'PROBE_ID'}]. | |
738 " need implement duplicate logging/cleaning"); | |
739 } | |
740 #need to build duplicate hash to clean elements from hash | |
741 # $duplicate_probes{$data[$hpos{'PROBE_ID'}]} = 1; | |
742 #next; | |
743 } | |
744 | |
745 | |
746 my $random = 0; | |
747 | |
748 if(! $self->cache_slice($data[$hpos{'CHROMOSOME'}])){ | |
749 push @log, "Skipping feature import for probe ".$data[$hpos{'PROBE_ID'}]." with non-standard region ".$data[$hpos{'CHROMOSOME'}]; | |
750 | |
751 #should we try and resolve the random chrs here? | |
752 #at least store probe/set/result and skip feature | |
753 #can we just import as chr with no start postition? | |
754 | |
755 #if ($data[$hpos{'CHROMOSOME'}] =~ /_random/){ | |
756 | |
757 # if(! $self->cache_slice($data[$hpos{'CHROMOSOME'}])){ | |
758 #push @log, "Skipping probe ".$data[$hpos{'PROBE_ID'}]." with non-standard region ".$data[$hpos{'CHROMOSOME'}]; | |
759 #}else{ | |
760 #we should really log this in a seprate file to avoid overloading the lgo file | |
761 # push @log, "Importing random probe ".$data[$hpos{'PROBE_ID'}]." on ".$data[$hpos{'CHROMOSOME'}]." omitting position"; | |
762 | |
763 #} | |
764 | |
765 #} | |
766 | |
767 } | |
768 | |
769 #This is not handling probes with random chrs | |
770 | |
771 $probe_pos{$data[$hpos{'PROBE_ID'}]} = {( | |
772 chr => $data[$hpos{'CHROMOSOME'}], | |
773 start => ($data[$hpos{'POSITION'}] +1),#default UCSC->Ensembl coord conversion | |
774 )}; | |
775 | |
776 } | |
777 | |
778 | |
779 | |
780 | |
781 #Remove duplicate probes | |
782 | |
783 $self->log("Built position cache from : ".$achip->name().".pos", 1); | |
784 close($fh); | |
785 | |
786 $self->log("Importing design probes from : ".$achip->name().".ndf"); | |
787 #OPEN PROBE IN/OUT FILES | |
788 $fh = open_file($self->get_dir("design")."/".$achip->name().".ndf"); | |
789 #Need to set these paths in each achip hash, file names could be tablename.chip_id.txt | |
790 | |
791 #Need to add dbname/port/species/vendor to this path? | |
792 #.efg DropDatabase should also clean the fasta dumps and caches for a given DB | |
793 | |
794 if($self->dump_fasta()){ | |
795 $fasta_file = $self->get_dir('fastas').'/'.$achip->name().".fasta"; | |
796 $self->backup_file($fasta_file); | |
797 $f_out = open_file($fasta_file, '>'); | |
798 } | |
799 | |
800 | |
801 my ($length, $ops, $op, $of, %pfs); | |
802 | |
803 #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap | |
804 my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("VendorMap"); | |
805 | |
806 | |
807 my $strand = 0; #default for nimblegen, should be config hash? | |
808 | |
809 #my $cig_line = "50M"; #default for nimblegen, should be config hash? | |
810 #probe length can change within design, should be built from length | |
811 | |
812 | |
813 my $fasta = ""; | |
814 | |
815 #$self->Timer()->mark("Starting probe loop"); | |
816 | |
817 | |
818 #This is leaking about 30-60MB for each normal density chip? | |
819 #need Devel::Monitor here? | |
820 | |
821 | |
822 while($line = <$fh>){ | |
823 $line =~ s/\r*\n//; | |
824 @data = split/\t/o, $line; | |
825 my $loc = ""; | |
826 my $class = "EXPERIMENTAL"; | |
827 | |
828 #PROBE_DESIGN_ID CONTAINER DESIGN_NOTE SELECTION_CRITERIA SEQ_ID PROBE_SEQUENCE MISMATCH MATCH_INDEX FEATURE_ID ROW_NUM COL_NUM PROBE_CLASS PROBE_ID POSITION DESIGN_ID X Y | |
829 #2067_0025_0001 BLOCK1 0 chrX TTAGTTTAAAATAAACAAAAAGATACTCTCTGGTTATTAAATCAATTTCT 0 52822449 52822449 1 25 experimental chrXP10404896 10404896 2067 25 1 | |
830 | |
831 if ($. == 1){ | |
832 %hpos = %{$self->set_header_hash(\@data, $self->get_config('ndf_fields'))}; | |
833 next; | |
834 } | |
835 | |
836 | |
837 if (! exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){ | |
838 push @log, "Skipping non-experimental probe:\t".$data[$hpos{'PROBE_ID'}]; | |
839 next; | |
840 } | |
841 | |
842 #Which non-experimental probes might we want to store? | |
843 #if($data[$hpos{'CONTAINER'}] =~ /control/io){ | |
844 # $class = "CONTROL"; | |
845 #} | |
846 #elsif($data[$hpos{'CONTAINER'}] =~ /random/io){ | |
847 # $class = "RANDOM"; | |
848 #} | |
849 #elsif($data[$hpos{'PROBE_CLASS'}] !~ /experimental/io){ | |
850 # $class = "OTHER"; | |
851 #} | |
852 #elsif(! exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){ #HACKY HACKY HACK HACK!! Needed for valid region retrival | |
853 # $class = "OTHER"; | |
854 #} | |
855 #SPIKE INS? | |
856 | |
857 | |
858 | |
859 #This assumes all probes in feature/probeset are next to each other!!!!!!!!!! | |
860 | |
861 | |
862 if($data[$hpos{'FEATURE_ID'}] != $data[$hpos{'MATCH_INDEX'}]){#Probe set data | |
863 #print "Generating new probeset:\tFeature id:\t".$data[$hpos{'FEATURE_ID'}]."\tmatchindex:\t".$data[$hpos{'MATCH_INDEX'}]."\n"; | |
864 | |
865 if($ops && ($data[$hpos{'FEATURE_ID'}] ne $ops->name())){ | |
866 #THis is where we chose to update/validate | |
867 #Do we need to pass probes if they're already stored..may aswell to reduce mysql load? | |
868 #No point as we have to query anyway | |
869 $self->store_set_probes_features($achip->dbID(), \%pfs, $ops); | |
870 throw("ops still defined in caller") if defined $ops; | |
871 } | |
872 | |
873 $ops = Bio::EnsEMBL::Funcgen::ProbeSet->new( | |
874 -NAME => $data[$hpos{'FEATURE_ID'}], | |
875 -SIZE => undef, | |
876 -FAMILY => $data[$hpos{'CONTAINER'}], | |
877 #xref_id => $data[$hpos{'SEQ_ID'}],#Need to populate xref table | |
878 ); | |
879 | |
880 #should we store straight away or build a probeset/probe/feature set, and then store and validate in turn? | |
881 #Store directly have separate method to validate and update? | |
882 #would need to check if one exists before storing anyway, else we could potentially duplicate the same probe/probeset from a different array | |
883 #remember for affy we need duplicate probe records with identical probe ids, probeset records unique across all arrays | |
884 | |
885 undef %pfs | |
886 } | |
887 elsif($. > 2){#may have previous ops set, but next has no ops, or maybe just no ops's at all | |
888 $self->store_set_probes_features($achip->dbID(), \%pfs, $ops); | |
889 throw("ops still defined in caller") if defined $ops; | |
890 } | |
891 | |
892 | |
893 ###PROBES | |
894 #should we cat $xref_id to $probe_id here to generate unique id? | |
895 #would be messy to handle in the code, but would have to somewhere(in the retrieval code) | |
896 | |
897 $length = length($data[$hpos{'PROBE_SEQUENCE'}]); | |
898 #$probe_string .= "\t${psid}\t".$data[$hpos{'PROBE_ID'}]."\t${length}\t$ac_id\t${class}\n"; | |
899 | |
900 $op = Bio::EnsEMBL::Funcgen::Probe->new( | |
901 -NAME => $data[$hpos{'PROBE_ID'}], | |
902 -LENGTH => $length, | |
903 -ARRAY => $array, | |
904 -ARRAY_CHIP_ID => $achip->dbID(), | |
905 -CLASS => $class, | |
906 ); | |
907 | |
908 | |
909 %{$pfs{$data[$hpos{'PROBE_ID'}]}} = ( | |
910 probe => $op, | |
911 features => [], | |
912 ); | |
913 | |
914 | |
915 ###PROBE FEATURES | |
916 #How can we be certain that we have the same mapping in the DB? | |
917 #Put checks in here for build? | |
918 #Need to handle controls/randoms here | |
919 #won't have features but will have results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | |
920 #The format of the pos file looks like it should have all the data required, but | |
921 # chromsome is missing, first undef :( | |
922 #Need to use $pos here instead of .pos file | |
923 #However have problems defining probe class, as not populated in test set | |
924 #derive from container! :( | |
925 #Ignore controls/random as they won't have a region | |
926 #Also need to handle multiple mappings? | |
927 #As results reference probes, no features, can have multiple features on different builds | |
928 | |
929 #if($class eq "EXPERIMENTAL"){ | |
930 | |
931 #if(exists $regions{$data[$hpos{'SEQ_ID'}]}){ | |
932 # $fid++; | |
933 # $pf_string .= "\t".$regions{$data[$hpos{'SEQ_ID'}]}{'seq_region_id'}."\t".$data[$hpos{'POSITION'}]."\t". | |
934 # ($data[$hpos{'POSITION'}] + $length)."\t${strand}\t".$regions{$data[$hpos{'SEQ_ID'}]}{'coord_system_id'}. | |
935 # "\t${pid}\t${anal_id}\t".$data[$hpos{'MISMATCH'}]."\t${cig_line}\n"; | |
936 #$loc .= $regions{$data[$hpos{'SEQ_ID'}]}{'seq_region_id'}.":".$data[$hpos{'POSITION'}]. | |
937 # "-".($data[$hpos{'POSITION'}] + $length).";" if ($self->{'_dump_fasta'}); | |
938 # } | |
939 # else{ | |
940 # die("No regions defined for ".$data[$hpos{'SEQ_ID'}]." ".$data[$hpos{'PROBE_ID'}]. | |
941 #" with family ".$data[$hpos{'CONTAINER'}]); | |
942 # } | |
943 | |
944 | |
945 | |
946 | |
947 if ($self->dump_fasta()){ | |
948 #(my $chr = $probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'}) =~ s/chr//; | |
949 | |
950 #$loc .= $chr.":".$probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}."-". | |
951 # ($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}+ $length).";"; | |
952 | |
953 #$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}]. | |
954 # "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; | |
955 | |
956 | |
957 | |
958 #filter controls/randoms? Or would it be sensible to see where they map | |
959 #wrap seq here? | |
960 #$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; | |
961 | |
962 | |
963 #To use this for mapping, we really need the dbID nr fasta | |
964 #This can be generated after the import, or maybe during resolve? | |
965 #This is also currently done on a chip level, where as the cache is resolved at the array level | |
966 #We could simply cat the files before resolving the fasta file | |
967 #Need to do this otherwise we risk overwriting the fasta file with incomplete data. | |
968 #Can we validate sequence across probes with same name in this step? | |
969 #Just use probe name for now. | |
970 | |
971 #We could cat and sort the fastas to make sure we have the same sequences | |
972 #Need to dump the design_id in the fasta header | |
973 #This would also reduce IO on the DB as identical probe will be consecutive, hence just one query to get the id. | |
974 | |
975 #Changed th format an content of this to facilitate dbID nr fasta file generation and sequence validation | |
976 | |
977 $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$achip->design_id."\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; | |
978 | |
979 #Print fasta every 10000 lines | |
980 if(! ($. % 10000)){ | |
981 print $f_out $fasta; | |
982 $fasta = ''; | |
983 } | |
984 } | |
985 | |
986 | |
987 #Hack!!!!!! Still importing probe (and result?) | |
988 next if(! $self->cache_slice($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'})); | |
989 #warn("Skipping non standard probe (".$data[$hpos{'PROBE_ID'}].") with location:\t$loc\n"); | |
990 | |
991 | |
992 $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new | |
993 ( | |
994 -START => $probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}, | |
995 -END =>($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'} + $length), | |
996 -STRAND => $strand, | |
997 -SLICE => $self->cache_slice($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'}), | |
998 -ANALYSIS => $anal, | |
999 -MISMATCHCOUNT => $data[$hpos{'MISMATCH'}],#Is this always 0 for import? remove from header hash? | |
1000 -PROBE => undef, #Need to update this in the store method | |
1001 ); | |
1002 | |
1003 | |
1004 push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of; | |
1005 | |
1006 } | |
1007 | |
1008 #need to store last data here | |
1009 $self->store_set_probes_features($achip->dbID(), \%pfs, $ops); | |
1010 $self->log(join("\n", @log)); | |
1011 $achip->adaptor->store_status("IMPORTED", $achip); | |
1012 $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED"); | |
1013 | |
1014 if ($self->dump_fasta()){ | |
1015 print $f_out $fasta; | |
1016 close($f_out); | |
1017 } | |
1018 | |
1019 $self->log("Imported design from:\t".$achip->name().".ndf", 1); | |
1020 | |
1021 | |
1022 | |
1023 #$self->{'_probe_cache'} = undef;#As we can't get Y and Y info from the DB, this is only possible as the results files contain X and Y info | |
1024 } | |
1025 | |
1026 #Should we build hash of probe_names:probe_feature_ids here for results import | |
1027 #Should we dump this as a lookup file for easier recoverability | |
1028 #This is the biggest step in the import | |
1029 #Building the hash would be fastest but least recoverable | |
1030 #Would have to write recover statments for each step i.e. build the most recent data structure required for the next import step | |
1031 #Individual queries for each result would take ages | |
1032 #This is all assuming there are no random records in the table i.e. ID series is linear with no gaps. | |
1033 | |
1034 | |
1035 #Could throw a random validation check in every X entries? | |
1036 #This would only work for non-parallel imports | |
1037 #periodic import when hit new probe_set, but no new data printed | |
1038 | |
1039 } | |
1040 | |
1041 | |
1042 $self->log("Finished parsing probe data"); | |
1043 #Total probe_sets:\t$psid\n". | |
1044 # "Total probes:\t$pid\nTotal probe_features:\t$fid"); | |
1045 | |
1046 | |
1047 $self->resolve_probe_data(); | |
1048 | |
1049 return; | |
1050 } | |
1051 | |
1052 | |
1053 | |
1054 | |
1055 =head2 read_and_import_results_data | |
1056 | |
1057 Example : $imp->read_results_data(); | |
1058 Description: Parses and dumps raw results to file | |
1059 Returntype : none | |
1060 Exceptions : none | |
1061 Caller : Importer | |
1062 Status : at risk | |
1063 | |
1064 =cut | |
1065 | |
1066 | |
1067 sub read_and_import_results_data{ | |
1068 my $self = shift; | |
1069 | |
1070 $self->log("Parsing ".$self->vendor()." results"); | |
1071 my (@header, @data, @design_ids, @lines); | |
1072 my ($fh, $pid, $line, $file); | |
1073 my $anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("RawValue"); | |
1074 my $result_set = $self->get_import_ResultSet($anal, 'channel'); | |
1075 | |
1076 | |
1077 if ($result_set) { #we have some new data to import | |
1078 | |
1079 foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) { | |
1080 | |
1081 #if( ! $echip->has_status('IMPORTED')){ | |
1082 | |
1083 foreach my $chan (@{$echip->get_Channels()}) { | |
1084 | |
1085 if ( ! $chan->has_status('IMPORTED')) { | |
1086 | |
1087 #we need to set write_mage here | |
1088 | |
1089 my $array = $echip->get_ArrayChip->get_Array(); | |
1090 | |
1091 $self->get_probe_cache_by_Array($array) || throw('Failed to get the probe cache handle for results import, resolve cache here?'); | |
1092 my ($probe_elem, $score_elem, %hpos); | |
1093 my $cnt = 0; | |
1094 my $r_string = ""; | |
1095 my $chan_name = $echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()}; | |
1096 my $cc_id = $result_set->get_chip_channel_id($chan->dbID()); | |
1097 | |
1098 | |
1099 #if ($self->recovery()) { | |
1100 # $self->log("Rolling back results for channel:\t${chan_name}"); | |
1101 # $self->db->rollback_results($cc_id); | |
1102 # } | |
1103 | |
1104 #open/backup output | |
1105 my $out_file = $self->get_dir("raw")."/result.".$chan_name.".txt"; | |
1106 $self->backup_file($out_file); | |
1107 my $r_out = open_file($out_file, '>'); | |
1108 | |
1109 (my $alt_chan_name = $chan_name) =~ s/\_/\_1h\_/; | |
1110 my $found = 0; | |
1111 | |
1112 FILE: foreach my $name($chan_name, $alt_chan_name){ | |
1113 | |
1114 foreach my $suffix ("_pair.txt", ".pair", ".txt") { | |
1115 | |
1116 $file = $self->get_dir("results")."/".$name.$suffix; | |
1117 | |
1118 if (-f $file) { | |
1119 $found = 1; | |
1120 last FILE; | |
1121 } | |
1122 } | |
1123 } | |
1124 | |
1125 throw("Could not find result file for Channel(${chan_name}) in ".$self->get_dir('results')) if ! $found; | |
1126 | |
1127 #open/slurp input | |
1128 $self->log("Reading result for channel $chan_name:\t$file", 1); | |
1129 $fh = open_file($file); | |
1130 @lines = <$fh>; | |
1131 close($fh); | |
1132 | |
1133 | |
1134 ###PROCESS HEADER | |
1135 | |
1136 foreach my $i (0..$#lines) { | |
1137 | |
1138 if ($lines[$i] =~ /PROBE_ID/o) { | |
1139 $lines[$i] =~ s/\r*\n//o; | |
1140 @data = split/\t/o, $lines[$i]; | |
1141 | |
1142 %hpos = %{$self->set_header_hash(\@data, $self->get_config('result_fields'))}; | |
1143 | |
1144 #remove header | |
1145 splice @lines, $i, 1; | |
1146 | |
1147 last; #finished processing header | |
1148 } | |
1149 } | |
1150 | |
1151 #we need to sort the result files based on the unique key(name at present, should replace with seq at some point) | |
1152 @lines = sort {(split/\t/o, $a)[$hpos{'PROBE_ID'}] cmp (split/\t/o, $b)[$hpos{'PROBE_ID'}]} @lines; | |
1153 | |
1154 $self->log('Parsing results', 1); | |
1155 | |
1156 foreach $line(@lines) { | |
1157 | |
1158 #can we preprocess effectively? | |
1159 next if $line =~ /^#/; | |
1160 next if $line =~ /NGS_CONTROLS/; | |
1161 next if $line =~ /V_CODE/; | |
1162 next if $line =~ /H_CODE/; | |
1163 next if $line =~ /RANDOM/; | |
1164 | |
1165 $line =~ s/\r*\n//o; | |
1166 @data = split/\t/o, $line; | |
1167 | |
1168 ###PROCESS HEADER | |
1169 #if ($line =~ /PROBE_ID/o){ | |
1170 # | |
1171 # %hpos = %{$self->set_header_hash(\@data, $self->get_config('result_fields'))}; | |
1172 # next;#finished processing header | |
1173 #} | |
1174 | |
1175 ###PROCESS DATA | |
1176 #Is this string concat causing the slow down, would it befaster to use an array and print a join? | |
1177 | |
1178 if ($pid = $self->get_probe_id_by_name_Array($data[$hpos{'PROBE_ID'}], $array)) { | |
1179 $cnt ++; | |
1180 $r_string .= '\N'."\t${pid}\t".$data[$hpos{'PM'}]."\t${cc_id}\t".$data[$hpos{'X'}]."\t".$data[$hpos{'Y'}]."\n"; | |
1181 } else { | |
1182 warn "Found unfiltered non-experimental probe in input $data[$hpos{'PROBE_ID'}]"; | |
1183 } | |
1184 | |
1185 ###PRINT SOME RESULTS | |
1186 if ($cnt > 10000) { | |
1187 $cnt = 0; | |
1188 print $r_out $r_string; | |
1189 $r_string =""; | |
1190 #could we fork here and import in the background? | |
1191 } | |
1192 | |
1193 } | |
1194 #PRINT/CLOSE Channel file | |
1195 print $r_out $r_string; | |
1196 close($r_out); | |
1197 $self->log("Finished parsing $chan_name result", 1); | |
1198 | |
1199 #Import directly here to avoid having to reparse all results if we crash!!!! | |
1200 $self->log("Importing:\t$out_file"); | |
1201 $self->db->load_table_data("result", $out_file); | |
1202 $self->log("Finished importing:\t$out_file", 1); | |
1203 $chan->adaptor->store_status('IMPORTED', $chan); | |
1204 | |
1205 | |
1206 } | |
1207 } | |
1208 } | |
1209 } | |
1210 else { | |
1211 $self->log("Skipping results parse and import"); | |
1212 } | |
1213 | |
1214 $self->log("Finished parsing and importing results"); | |
1215 | |
1216 return; | |
1217 } | |
1218 | |
1219 | |
1220 | |
1221 1; |