0
|
1 #
|
|
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Nimblegen
|
|
3 #
|
|
4
|
|
5 =head1 LICENSE
|
|
6
|
|
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
8 Genome Research Limited. All rights reserved.
|
|
9
|
|
10 This software is distributed under a modified Apache license.
|
|
11 For license details, please see
|
|
12
|
|
13 http://www.ensembl.org/info/about/code_licence.html
|
|
14
|
|
15 =head1 CONTACT
|
|
16
|
|
17 Please email comments or questions to the public Ensembl
|
|
18 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
19
|
|
20 Questions may also be sent to the Ensembl help desk at
|
|
21 <helpdesk@ensembl.org>.
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Bio::EnsEMBL::Funcgen::Parsers::Nimblegen
|
|
26
|
|
27 =head1 SYNOPSIS
|
|
28
|
|
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Nimblegen";
|
|
30 push @INC, $parser_type;
|
|
31 my $imp = $class->SUPER::new(@_);
|
|
32
|
|
33
|
|
34 =head1 DESCRIPTION
|
|
35
|
|
36 This is a parser class which should not be instatiated directly, it
|
|
37 normally set by the Importer as the parent class. Nimblegen contains meta
|
|
38 data and methods specific to NimbleGen arrays to aid parsing and importing of
|
|
39 experimental data.
|
|
40
|
|
41 =cut
|
|
42
|
|
43 package Bio::EnsEMBL::Funcgen::Parsers::Nimblegen;
|
|
44
|
|
45 use Bio::EnsEMBL::Funcgen::Array;
|
|
46 use Bio::EnsEMBL::Funcgen::ProbeSet;
|
|
47 use Bio::EnsEMBL::Funcgen::Probe;
|
|
48 use Bio::EnsEMBL::Funcgen::ProbeFeature;
|
|
49 use Bio::EnsEMBL::Funcgen::FeatureType;
|
|
50 use Bio::EnsEMBL::Funcgen::ExperimentalChip;
|
|
51 use Bio::EnsEMBL::Funcgen::ArrayChip;
|
|
52 use Bio::EnsEMBL::Funcgen::Channel;
|
|
53 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
|
|
54 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file);
|
|
55 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
|
|
56 use Bio::EnsEMBL::Funcgen::Parsers::MAGE;
|
|
57
|
|
58 use strict;
|
|
59
|
|
60 use vars qw(@ISA);
|
|
61 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::MAGE);
|
|
62
|
|
63 =head2 new
|
|
64
|
|
65 Example : my $self = $class->SUPER::new(@_);
|
|
66 Description: Constructor method for Nimblegen class
|
|
67 Returntype : Bio::EnsEMBL::Funcgen::Parser::Nimblegen
|
|
68 Exceptions : throws if Experiment name not defined or if caller is not Importer
|
|
69 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
70 Status : at risk
|
|
71
|
|
72 =cut
|
|
73
|
|
74
|
|
75 sub new{
|
|
76 my $caller = shift;
|
|
77
|
|
78 my $class = ref($caller) || $caller;
|
|
79 my $self = $class->SUPER::new(@_);
|
|
80
|
|
81 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
|
|
82
|
|
83 #should we provide args override for all of these?
|
|
84
|
|
85 $self->{'config'} =
|
|
86 {(
|
|
87 #order of these data arrays is important!
|
|
88 #Remove these method arrays, snd just run them serially?
|
|
89 array_data => ['experiment'],#Rename this!!
|
|
90 probe_data => ["probe"],
|
|
91 results_data => ["and_import_results"],
|
|
92 sample_key_fields => ['DESIGN_ID', 'CHIP_ID', 'DYE', 'PROMOT_SAMPLE_TYPE'],# 'SAMPLE_LABEL'],label now optional
|
|
93 # 'SAMPLE_DESCRIPTION removed due to naming disparities
|
|
94 ndf_fields => ['CONTAINER', 'PROBE_SEQUENCE', 'MISMATCH','FEATURE_ID', 'PROBE_ID'],#MISMATCH is always 0!
|
|
95 pos_fields => ['CHROMOSOME', 'PROBE_ID', 'POSITION', 'COUNT'],
|
|
96 result_fields => ['PROBE_ID', 'PM', 'X', 'Y'],
|
|
97 notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'],
|
|
98 norm_method => 'VSN_GLOG',
|
|
99 dye_freqs => {(
|
|
100 Cy5 => 635,
|
|
101 Cy3 => 532,
|
|
102 )},
|
|
103
|
|
104
|
|
105 #Need to make these definable?
|
|
106 #have protocolfile arg and just parse tab2mage protocol section format
|
|
107 protocols => {(
|
|
108 grow => {(
|
|
109 accession => 'GROW_NIMB',
|
|
110 name => 'GROW NIMBLEGEN CULTURE CONDITIONS',
|
|
111 text => 'Nimblegen culture conditions description here. Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.',
|
|
112 paramters => undef,
|
|
113 )},
|
|
114 treatment => {(
|
|
115 accession => 'CROSSLINK_NIMB',
|
|
116 name => 'NIMBLEGEN CHROMATIN PREPARATION',
|
|
117 text => 'Nimblegen X-linking and DNA extraction protocol.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.',
|
|
118 paramters => undef,
|
|
119 )},
|
|
120 extraction => {(
|
|
121 accession => 'CHROMATIN_IP_NIMB',
|
|
122 name => 'NIMBLEGEN CHROMATIN IMMUNOPRECIPITATION and DNA RECOVERY',
|
|
123 text => 'Nimblegen chromatin immunoprecipitation and DNA extraction protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.',
|
|
124 paramters => undef,
|
|
125 )},
|
|
126 labeling => {(
|
|
127 accession => 'LABELLING_NIMB',
|
|
128 name => 'NIMBLEGEN LABELLING',
|
|
129 text => 'Nimblegen labelling protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.',
|
|
130 paramteres => undef,
|
|
131 )},
|
|
132 hybridization => {(
|
|
133 accession => 'HYBRIDISATION_NIMB',
|
|
134 name => 'NIMBLEGEN HYBRIDISATION',
|
|
135 text => 'Nimblegen chip hybridisation protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.',
|
|
136 parameters => undef,
|
|
137 )},
|
|
138 scanning => {(
|
|
139 accession => 'SCANNING_NIMB',
|
|
140 name => 'NIMBLESCAN',
|
|
141 text => 'Nimblegen Nimblescan protocol here.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.Padding text here to avoid description too short warnings.',
|
|
142 paramters => undef,
|
|
143 )},
|
|
144 )},
|
|
145
|
|
146 )};
|
|
147
|
|
148 return $self;
|
|
149 }
|
|
150
|
|
151
|
|
152 =head2 set_config
|
|
153
|
|
154 Example : my $self->set_config;
|
|
155 Description: Sets attribute dependent config
|
|
156 Returntype : None
|
|
157 Exceptions : None
|
|
158 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
159 Status : at risk
|
|
160
|
|
161 =cut
|
|
162
|
|
163
|
|
164 sub set_config{
|
|
165 my $self = shift;
|
|
166
|
|
167 #This should be general config for all types of import
|
|
168 #dirs are not set in config to enable generic get_dir method access
|
|
169 #This is really just setting paths rather than config rename?
|
|
170
|
|
171 #This is generic for all imports
|
|
172
|
|
173
|
|
174
|
|
175 if($self->{'old_dvd_format'}){
|
|
176 $self->{'design_dir'} = $self->get_dir('input').'/DesignFiles';
|
|
177 }else{
|
|
178 $self->{'design_dir'} = $self->get_dir('input').'/Design_information';
|
|
179 }
|
|
180
|
|
181
|
|
182 if($self->{'old_dvd_format'}){
|
|
183 $self->{'config'}{'notes_file'} = $self->get_dir('input').'/DesignNotes.txt';
|
|
184 }else{
|
|
185 $self->{'config'}{'notes_file'} = $self->get_dir('design').'/DesignNotes.txt';
|
|
186 }
|
|
187
|
|
188 $self->{'config'}{'chip_file'} = $self->get_dir('input').'/SampleKey.txt';
|
|
189
|
|
190
|
|
191
|
|
192 #Experiment(output) specific
|
|
193 #This should already be set in the run script
|
|
194 #As we could get log write errors before we have created the output dir otherwise
|
|
195 $self->{'output_dir'} ||= $self->get_dir("data").'/output/'.$self->{'param_species'}.'/'.$self->vendor().'/'.$self->name();
|
|
196
|
|
197 $self->{'config'}{'tab2mage_file'} = $self->get_dir('output').'/E-TABM-'.$self->name().'.txt';
|
|
198
|
|
199 $self->{'config'}{'mage_xml_file'} = $self->get_dir('output').'/{UNASSIGNED}.xml';
|
|
200
|
|
201 if($self->{'old_dvd_format'}){
|
|
202 $self->{'results_dir'} = $self->get_dir('input').'/PairData';
|
|
203 }else{
|
|
204 $self->{'results_dir'} = $self->get_dir('input').'/Raw_data_files';
|
|
205 }
|
|
206
|
|
207 return;
|
|
208 }
|
|
209
|
|
210
|
|
211
|
|
212
|
|
213 =head2 read_array_data
|
|
214
|
|
215 Example : $imp->read_array_data();
|
|
216 Description: Parses NimbleGen DesignNotes.txt files to create and store new Arrays
|
|
217 Returntype : none
|
|
218 Exceptions : None
|
|
219 Caller : general
|
|
220 Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats?
|
|
221
|
|
222 =cut
|
|
223
|
|
224
|
|
225 sub read_array_data{
|
|
226 my ($self, $notes_file) = @_;
|
|
227
|
|
228
|
|
229 $notes_file ||= $self->get_config('notes_file');
|
|
230 my ($line, $array, $array_chip, @data, %hpos);
|
|
231 my $oa_adaptor = $self->db->get_ArrayAdaptor();
|
|
232 my $ac_adaptor = $self->db->get_ArrayChipAdaptor();
|
|
233
|
|
234 #Slurp file to string, sets local delimtter to null and subs new lines
|
|
235 my $fh = open_file($notes_file);
|
|
236 #($design_desc = do { local ($/); <$fh>;}) =~ s/\r*\n$//;
|
|
237 #close($fh);
|
|
238
|
|
239 #Would be better if we only import the design info for the chips listed in the SampleKey.txt file
|
|
240 #Some cyclical dependency going on here :|
|
|
241
|
|
242 while ($line = <$fh>){
|
|
243
|
|
244 $line =~ s/\r*\n//;#chump
|
|
245 @data = split/\t/o, $line;
|
|
246
|
|
247 #We need to have a DESIGN vendor type?
|
|
248 #also need to be able to set file path independently of config
|
|
249
|
|
250 if($. == 1){
|
|
251 %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))};
|
|
252 next;
|
|
253 }
|
|
254
|
|
255 ### CREATE AND STORE Array and ArrayChips
|
|
256 if(! defined $array ){
|
|
257 #This is treating each array chip as a separate array, unless arrayset is defined
|
|
258 #AT present we have no way of differentiating between different array_chips on same array???!!!
|
|
259 #Need to add functionality afterwards to collate array_chips into single array
|
|
260
|
|
261 #This will use a stored array if present
|
|
262
|
|
263 $array = Bio::EnsEMBL::Funcgen::Array->new
|
|
264 (
|
|
265 -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}],
|
|
266 -FORMAT => uc($self->format()),
|
|
267 -VENDOR => uc($self->vendor()),
|
|
268 -TYPE => 'OLIGO',
|
|
269 -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here
|
|
270 );
|
|
271
|
|
272 ($array) = @{$oa_adaptor->store($array)};
|
|
273
|
|
274 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
|
|
275 -ARRAY_ID => $array->dbID(),
|
|
276 -NAME => $data[$hpos{'DESIGN_NAME'}],
|
|
277 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
|
|
278 #add description?
|
|
279 );
|
|
280
|
|
281 #This will use a stored array_chip if present
|
|
282 ($array_chip) = @{$ac_adaptor->store($array_chip)};
|
|
283 $array->add_ArrayChip($array_chip);
|
|
284
|
|
285 }
|
|
286 elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){
|
|
287
|
|
288 $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].") for same Array:\t".$array->name()."\n");
|
|
289
|
|
290 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
|
|
291 -ARRAY_ID => $array->dbID(),
|
|
292 -NAME => $data[$hpos{'DESIGN_NAME'}],
|
|
293 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
|
|
294 );
|
|
295
|
|
296 ($array_chip) = @{$ac_adaptor->store($array_chip)};
|
|
297 $array->add_ArrayChip($array_chip);
|
|
298
|
|
299 }
|
|
300 elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){
|
|
301 throw("Found experiment with more than one design without -array_set");
|
|
302 }
|
|
303 }
|
|
304
|
|
305
|
|
306 $self->add_Array($array);
|
|
307
|
|
308 close($fh);
|
|
309
|
|
310 return;
|
|
311
|
|
312 }
|
|
313
|
|
314
|
|
315 =head2 read_experiment_data
|
|
316
|
|
317 Example : $imp->read_array_chip_data();
|
|
318 Description: Parses and imports array & experimental chip meta data/objects
|
|
319 Returntype : none
|
|
320 Exceptions : throws if more than one array/design found and not an "array set"
|
|
321 Caller : Importer
|
|
322 Status : At risk
|
|
323
|
|
324 =cut
|
|
325
|
|
326 sub read_experiment_data{
|
|
327 my $self = shift;
|
|
328
|
|
329 $self->read_array_data();
|
|
330
|
|
331 my $t2m_file = $self->init_tab2mage_export() if $self->{'write_mage'};
|
|
332
|
|
333
|
|
334 my ($design_desc, $line, $tmp_uid, $channel, $echip, $sample_label);
|
|
335 my ($sample_desc, %hpos, @data, %uid_reps, %did_reps, %sample_reps);
|
|
336 my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor();
|
|
337 my $chan_adaptor = $self->db->get_ChannelAdaptor();
|
|
338 my $br_cnt = 1;
|
|
339 my $tr_cnt = 1;
|
|
340
|
|
341 #Currently 1 design = 1 chip = 1 array /DVD
|
|
342 #Different designs are not currently collated into a chip_set/array in any ordered manner
|
|
343 #Register each design as an array and an array_chip
|
|
344 #May want to group array_chips into array/chip sets by association though the API
|
|
345
|
|
346
|
|
347 warn("Harcoded for one array(can have multiple chips from the same array) per experiment\n");
|
|
348 my $fh = open_file($self->get_config("chip_file"));
|
|
349 $self->log("Reading chip data");
|
|
350
|
|
351
|
|
352 #warn "Do we need to validate each line here against the header array?";
|
|
353
|
|
354 while ($line = <$fh>){
|
|
355 next if $line =~ /^\s+\r*\n/;
|
|
356 $line =~ s/\r*\n//;#chump
|
|
357 @data = split/\t/o, $line;
|
|
358 #we could validate line against scalar of header array
|
|
359 #ORD_ID CHIP_ID DYE DESIGN_NAME DESIGN_ID SAMPLE_LABEL SAMPLE_SPECIES SAMPLE_DESCRIPTION TISSUE_TREATMENT PROMOT_SAMPLE_TYPE
|
|
360 if ($. == 1){
|
|
361 %hpos = %{$self->set_header_hash(\@data, $self->get_config('sample_key_fields'))};
|
|
362
|
|
363 #we need to set the sample description field name, as it can vary :(((
|
|
364 @data = grep(/SAMPLE_DESCRIPTION/, keys %hpos);
|
|
365 $sample_desc = $data[0];
|
|
366
|
|
367 throw("More than one sample description(@data) in ".$self->get_config("chip_file")."\n") if(scalar @data >1);
|
|
368 next;
|
|
369 }
|
|
370
|
|
371 #Need to handle array class here i.e. two channel arrays will have two lines
|
|
372 #validate species here
|
|
373 #look up alias from registry and match to self->species
|
|
374 #registry may not be loaded for local installation
|
|
375
|
|
376
|
|
377 ### CREATE AND STORE ExperimentalChips
|
|
378 if ((! $tmp_uid) || ($data[$hpos{'CHIP_ID'}] ne $tmp_uid)){
|
|
379
|
|
380 #Test both channels are available, i.e. the SampleKey has two TOTAL channels
|
|
381 if($echip){
|
|
382
|
|
383 for my $type('TOTAL', 'EXPERIMENTAL'){
|
|
384
|
|
385 my $test_chan = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID());
|
|
386 throw("ExperimentalChip(".$echip->unique_id().
|
|
387 ") does not have a $type channel, please check the SampleKey.txt file") if ! $test_chan;
|
|
388
|
|
389 }
|
|
390 }
|
|
391
|
|
392 $tmp_uid = $data[$hpos{'CHIP_ID'}];
|
|
393 $echip = $ec_adaptor->fetch_by_unique_id_vendor($data[$hpos{'CHIP_ID'}], 'NIMBLEGEN');
|
|
394
|
|
395 if($echip){
|
|
396
|
|
397 if(! $self->recovery()){
|
|
398 throw("ExperimentalChip(".$echip->unqiue_id().
|
|
399 " already exists in the database\nMaybe you want to recover?");
|
|
400 }
|
|
401 }else{
|
|
402
|
|
403 $echip = Bio::EnsEMBL::Funcgen::ExperimentalChip->new
|
|
404 (
|
|
405 -EXPERIMENT_ID => $self->experiment->dbID(),
|
|
406 -DESCRIPTION => $data[$hpos{$sample_desc}],
|
|
407 -FEATURE_TYPE => $self->feature_type,
|
|
408 -CELL_TYPE => $self->cell_type,
|
|
409 -ARRAY_CHIP_ID => $self->arrays->[0]->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])->dbID(),
|
|
410 -UNIQUE_ID => $data[$hpos{'CHIP_ID'}],
|
|
411 #-BIOLOGICAL_REPLICATE => ,
|
|
412 #-TECHNICAL_REPLICATE => ,
|
|
413 );
|
|
414
|
|
415 ($echip) = @{$ec_adaptor->store($echip)};
|
|
416 $self->experiment->add_ExperimentalChip($echip);
|
|
417 }
|
|
418 }
|
|
419
|
|
420
|
|
421 ### CREATE AND STORE Channels
|
|
422 my $type = uc($data[$hpos{'PROMOT_SAMPLE_TYPE'}]);
|
|
423 my $sample_label = (! exists $hpos{'SAMPLE_LABEL'}) ? '' : $data[$hpos{'SAMPLE_LABEL'}];
|
|
424
|
|
425
|
|
426 $type = 'TOTAL' if ($type ne 'EXPERIMENTAL');
|
|
427 $channel = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID());
|
|
428
|
|
429 if($channel){
|
|
430 if(! $self->recovery()){
|
|
431 throw("Channel(".$echip->unqiue_id().":".uc($data[$hpos{'PROMOT_SAMPLE_TYPE'}]).
|
|
432 " already exists in the database\nMaybe you want to recover?");
|
|
433 }else{
|
|
434 #push @{$self->{'_rollback_ids'}}, $channel->dbID();
|
|
435 #No point in doing this as all Channels mey be pre-registered in recovery mode
|
|
436 #Hence all will be rolled back
|
|
437 }
|
|
438 }else{
|
|
439 #Handles single/mutli
|
|
440
|
|
441 $channel = Bio::EnsEMBL::Funcgen::Channel->new
|
|
442 (
|
|
443 -EXPERIMENTAL_CHIP_ID => $echip->dbID(),
|
|
444 -DYE => $data[$hpos{'DYE'}],
|
|
445 -SAMPLE_ID => $sample_label,
|
|
446 -TYPE => $type,
|
|
447 );
|
|
448
|
|
449 #-SPECIES => $self->species(),#on channel/sample to enable multi-species chip/experiment
|
|
450 #would never happen on one chip? May happen between chips in one experiment
|
|
451
|
|
452 ($channel) = @{$chan_adaptor->store($channel)};
|
|
453
|
|
454 }
|
|
455
|
|
456 #we need to build the channel level tab2mage line here
|
|
457
|
|
458 #For each BR there will be two sample_labels, one for each channel
|
|
459 #These will be used across multiple chips.
|
|
460 #If two chips the same design ID and the same sample labels, then we have a technical replicate
|
|
461 #else if they have different sample labels then we have another biological replicate
|
|
462 #We have a problem of associating channels to the same BR with differing sample labels
|
|
463 #This is solved by checking whether the chip ID has already been registered in a BR
|
|
464
|
|
465 #This fails if more than one sample label is used for any given BR
|
|
466 #This will result in the BR being split into the number of unique sample label pairs(Experimental/Control channel)
|
|
467 #This also fails if the same sample label has been used for two different BRs
|
|
468
|
|
469 if($self->{'write_mage'}){
|
|
470 #my $sample_name = ($sample_label eq '') ? '???' : substr($sample_label, 0, (length($sample_label)-1));
|
|
471 my $ctype_name = (defined $self->cell_type()) ? $self->cell_type->name() : '???';
|
|
472 my $ftype_name = (defined $self->feature_type()) ? $self->feature_type->name() : '???';
|
|
473 my $ctype_desc = (defined $self->cell_type()) ? $self->cell_type->description() : '???';
|
|
474
|
|
475 #define reps
|
|
476
|
|
477
|
|
478 #we need one to get the biorep based on the sample label and making sure the unique ID are the same
|
|
479 #we need to define the tech rep by matching the sample label and the making sure the design_id isn't already used
|
|
480
|
|
481 #Is this doing the BR assignment properly?
|
|
482
|
|
483 if(exists $sample_reps{$sample_label}){#Found chip in a previously seen BR
|
|
484 #Register the BR of this chip ID
|
|
485 $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'} = $sample_reps{$sample_label};
|
|
486
|
|
487 }
|
|
488 elsif(exists $uid_reps{$data[$hpos{'CHIP_ID'}]}){#Found the other channel
|
|
489 $sample_reps{$sample_label} = $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'};
|
|
490 }
|
|
491 else{#assign new br
|
|
492 $sample_reps{$sample_label} = $br_cnt; #Assign BR to sample label
|
|
493 $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'} = $br_cnt; #Assign BR to chip id
|
|
494 $br_cnt++;
|
|
495 }
|
|
496
|
|
497
|
|
498
|
|
499 #Something is going awry here. The TR is not being reset for some new BRs
|
|
500
|
|
501 if(! exists $uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'}){
|
|
502 #we only assign a new tr here if this design has not been seen in any of the reps
|
|
503 #i.e. we need to get the first tr which does not contain this design_id
|
|
504
|
|
505 my $create_rep = 1;
|
|
506 my $tr;
|
|
507 my @chip_ids;
|
|
508 my $br = $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'};
|
|
509
|
|
510 foreach my $chip_id(keys %uid_reps){
|
|
511
|
|
512 push @chip_ids, $chip_id if($uid_reps{$chip_id}{'br'} == $br);
|
|
513 }
|
|
514
|
|
515 #This is looping through all the TRs for all the design IDs
|
|
516
|
|
517 foreach my $rep(sort keys %did_reps){
|
|
518 #Doesn't exist for the given BR?
|
|
519 #So we need to get all the chip_ids for a given br
|
|
520 #Check wether it exists and wether it exists in did_reps and check wether the chip_id value is part of the BR set
|
|
521 #else we add it
|
|
522
|
|
523 if(! exists $did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]}){
|
|
524 #Not seen in a TR of this $rep yet
|
|
525 $create_rep = 0;
|
|
526 }elsif(! grep(/$did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]}/, @chip_ids)){
|
|
527 #Not seen in this BR with this TR $rep
|
|
528 $create_rep = 0;
|
|
529 }
|
|
530
|
|
531 if(! $create_rep){
|
|
532 #Design ID not seen so add to this TR
|
|
533 $did_reps{$rep}{$data[$hpos{'DESIGN_ID'}]} = $data[$hpos{'CHIP_ID'}]; #don't really need to assign this
|
|
534 $tr = $rep;
|
|
535 last;#do not remove this or we get wierd TR incrementing
|
|
536 }
|
|
537 }
|
|
538
|
|
539
|
|
540 if($create_rep){
|
|
541 #Get the next TR value for this given BR
|
|
542 my @trs;
|
|
543
|
|
544 foreach my $rep(keys %did_reps){
|
|
545
|
|
546 foreach my $chip_id(values %{$did_reps{$rep}}){
|
|
547
|
|
548 #Push TR if chip_id is present in this BR
|
|
549 push @trs, $rep if(grep(/$chip_id/, @chip_ids));
|
|
550 }
|
|
551 }
|
|
552 ($tr) = sort {$b<=>$a} @trs;
|
|
553
|
|
554 $tr ||=0;
|
|
555 $tr++;
|
|
556
|
|
557 #register design ID to chip ID mapping for this TR
|
|
558 $did_reps{$tr}{$data[$hpos{'DESIGN_ID'}]} = $data[$hpos{'CHIP_ID'}];
|
|
559 }
|
|
560
|
|
561 #register TR for this chip ID
|
|
562 $uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'} = $tr;
|
|
563 }
|
|
564
|
|
565 my $br = $self->experiment->name().'_BR'. $uid_reps{$data[$hpos{'CHIP_ID'}]}{'br'};
|
|
566 my $tr = $br.'_TR'.$uid_reps{$data[$hpos{'CHIP_ID'}]}{'tr'};
|
|
567
|
|
568
|
|
569 #File[raw]
|
|
570 my $tsm_line = $echip->unique_id().'_'.$self->get_config('dye_freqs')->{$data[$hpos{'DYE'}]}.'_pair.txt';
|
|
571 #Array[accession] # Should this be left blank for AE accession?
|
|
572 $tsm_line .= "\t".$data[$hpos{'DESIGN_ID'}];
|
|
573 #Array[serial]
|
|
574 $tsm_line .= "\t".$echip->unique_id();
|
|
575
|
|
576 #Protocol(s)[grow][treatment][extraction][labelling][hybridisation][scanning]
|
|
577 foreach my $protocol(sort (keys %{$self->get_config('protocols')})){
|
|
578 $tsm_line .= "\t".$self->get_config('protocols')->{$protocol}->{'accession'};
|
|
579 }
|
|
580
|
|
581
|
|
582 #BioSource
|
|
583 $tsm_line .= "\t$ctype_name";
|
|
584 #Sample
|
|
585 $tsm_line .= "\t$br";
|
|
586 #Extract
|
|
587 $tsm_line .= "\t$tr";
|
|
588
|
|
589 #LabeledExtract & Immunoprecipitate
|
|
590 if($type eq 'EXPERIMENTAL'){
|
|
591 $tsm_line .= "\t$sample_label - IP of $tr with anti $ftype_name (Ab vendor, Ab ID)";
|
|
592 $tsm_line .= "\t$tr IP";
|
|
593 }else{
|
|
594 $tsm_line .= "\t$sample_label - Input control DNA of $tr\t";
|
|
595 }
|
|
596
|
|
597 #Hybridization
|
|
598 #U2OS BR1_TR1 ChIP H3KAc 46092 hyb
|
|
599 $tsm_line .= "\t$ctype_name $tr ChIP $ftype_name ".$echip->unique_id().' hyb';
|
|
600
|
|
601 #BioSourceMaterial SampleMaterial ExtractMaterial LabeledExtractMaterial
|
|
602 $tsm_line .= "\tcell\tgenomic_DNA\tgenomic_DNA\tsynthetic_DNA";
|
|
603
|
|
604 #Dye
|
|
605 $tsm_line .= "\t".$data[$hpos{'DYE'}];
|
|
606
|
|
607 #BioMaterialCharacteristics[Organism]
|
|
608 $tsm_line .= "\t".$self->species();
|
|
609
|
|
610 #BioMaterialCharacteristics[BioSourceType]
|
|
611 $tsm_line .= "\tfrozen_sample";
|
|
612
|
|
613 #BioMaterialCharacteristics[StrainOrLine]
|
|
614 $tsm_line .= "\t$ctype_name";
|
|
615
|
|
616 #BioMaterialCharacteristics[CellType]
|
|
617 $tsm_line .= "\t$ctype_name";
|
|
618
|
|
619 #BioMaterialCharacteristics[Sex]
|
|
620 $tsm_line .= "\t???";
|
|
621 #FactorValue[StrainOrLine]
|
|
622 $tsm_line .= "\t$ctype_name";
|
|
623 #FactorValue[Immunoprecipitate]
|
|
624 $tsm_line .= ($type eq 'EXPERIMENTAL') ? "\tanti-${ftype_name} antibody\n" : "\t\n";
|
|
625
|
|
626 print $t2m_file $tsm_line;
|
|
627
|
|
628 }
|
|
629
|
|
630 }
|
|
631
|
|
632 close($t2m_file) if $self->{'write_mage'};
|
|
633 close($fh);
|
|
634
|
|
635 return;
|
|
636 }
|
|
637
|
|
638
|
|
639
|
|
640
|
|
641 =head2 read_probe_data
|
|
642
|
|
643 Example : $imp->read_probe_data();
|
|
644 Description: Parses and imports probes, probe sets and features of a given array
|
|
645 No duplicate handling or probe caching is performed due to memory
|
|
646 issues, this is done in resolve_probe_data.
|
|
647 Returntype : none
|
|
648 Exceptions : none
|
|
649 Caller : Importer
|
|
650 Status : Medium
|
|
651
|
|
652 =cut
|
|
653
|
|
654
|
|
655 #Assumes one chip_design per experimental set.
|
|
656 sub read_probe_data{
|
|
657 my ($self) = shift;
|
|
658
|
|
659 my ($fh, $line, @data, %hpos, %probe_pos);#, %duplicate_probes);
|
|
660 $self->log("Parsing and importing ".$self->vendor()." probe data (".localtime().")", 1);
|
|
661
|
|
662 ### Read in
|
|
663 #ndf file: probe_set, probe and probe_feature(.err contains multiple mappings)
|
|
664 #pos file: probe chromosome locations
|
|
665
|
|
666 #Need to change how probe_names are generated for nimblegen?
|
|
667 #native probe_ids may not be unique, but should be when combined with the seq_id which is currently being used as the xref_id
|
|
668 #Handle with API!!
|
|
669
|
|
670 #READ REGION POSITIONS
|
|
671 #We need to handle different coord systems and possibly different assmemblies
|
|
672 my $slice_a = $self->db->get_SliceAdaptor();
|
|
673 my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name('chromosome');
|
|
674
|
|
675
|
|
676 #TIED FILE CACHE!!!
|
|
677 #We need to rebuild the cache from the DB before we start adding new probe info
|
|
678 #We only need to rebuild cache if we find a chip that hasn't been imported?
|
|
679 #No, we just need to import without cache, then re-do the resolve step
|
|
680 #Are we still going to get disconnects when we dump the cache?
|
|
681
|
|
682
|
|
683 #warn "Read probe data should only read in the array chips which are specified by the ExperimentalChip? Not just what is present in the DesignNotes file?";
|
|
684
|
|
685
|
|
686 foreach my $array(@{$self->arrays()}){
|
|
687
|
|
688 foreach my $achip(@{$array->get_ArrayChips()}){
|
|
689
|
|
690 my (@log, %probe_pos, $fasta_file, $f_out);
|
|
691 #do we need to fetch probe by seq and array?
|
|
692 #this would also id non-unique seqs in design
|
|
693
|
|
694 #warn "We need to account for different cs feature amppings here
|
|
695
|
|
696 if($achip->has_status('IMPORTED')){
|
|
697 $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id());
|
|
698 next;
|
|
699 }elsif($self->recovery()){
|
|
700 $self->log("Rolling back ArrayChip:\t".$achip->design_id());
|
|
701 $self->rollback_ArrayChips([$achip]);
|
|
702 }
|
|
703
|
|
704 $self->log("Importing ArrayChip:".$achip->design_id());
|
|
705
|
|
706 #Always use pos file, ndf file cannot be guranteed to contain all location info
|
|
707 #pos file also gives a key to which probes should be considered 'EXPERIMENTAL'
|
|
708
|
|
709 #CACHE PROBE POSITIONS
|
|
710 $fh = open_file($self->get_dir("design")."/".$achip->name().".pos");
|
|
711
|
|
712 #don't % = map ! Takes a lot longer than a while ;)
|
|
713 while($line = <$fh>){
|
|
714 $line =~ s/\r*\n//o;#Not using last element
|
|
715 @data = split/\t/o, $line;
|
|
716
|
|
717 #SEQ_ID CHROMOSOME PROBE_ID POSITION COUNT
|
|
718 if ($. == 1){
|
|
719 %hpos = %{$self->set_header_hash(\@data, $self->get_config('pos_fields'))};
|
|
720 next;
|
|
721 }
|
|
722
|
|
723 #Skip probe if there is a duplicate
|
|
724 if(exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){
|
|
725
|
|
726 if($data[$hpos{'CHROMOSOME'}] eq $probe_pos{$data[$hpos{'PROBE_ID'}]}->{chr} &&
|
|
727 ($data[$hpos{'POSITION'}]+1) eq $probe_pos{$data[$hpos{'PROBE_ID'}]}->{start}){
|
|
728 #log or warn here?
|
|
729
|
|
730 #Not matching probe length here
|
|
731
|
|
732 next;
|
|
733 #Do we need to skip this in the ndf file too?
|
|
734
|
|
735 }
|
|
736 else{
|
|
737 throw("Found duplicate mapping for ".$data[$hpos{'PROBE_ID'}].
|
|
738 " need implement duplicate logging/cleaning");
|
|
739 }
|
|
740 #need to build duplicate hash to clean elements from hash
|
|
741 # $duplicate_probes{$data[$hpos{'PROBE_ID'}]} = 1;
|
|
742 #next;
|
|
743 }
|
|
744
|
|
745
|
|
746 my $random = 0;
|
|
747
|
|
748 if(! $self->cache_slice($data[$hpos{'CHROMOSOME'}])){
|
|
749 push @log, "Skipping feature import for probe ".$data[$hpos{'PROBE_ID'}]." with non-standard region ".$data[$hpos{'CHROMOSOME'}];
|
|
750
|
|
751 #should we try and resolve the random chrs here?
|
|
752 #at least store probe/set/result and skip feature
|
|
753 #can we just import as chr with no start postition?
|
|
754
|
|
755 #if ($data[$hpos{'CHROMOSOME'}] =~ /_random/){
|
|
756
|
|
757 # if(! $self->cache_slice($data[$hpos{'CHROMOSOME'}])){
|
|
758 #push @log, "Skipping probe ".$data[$hpos{'PROBE_ID'}]." with non-standard region ".$data[$hpos{'CHROMOSOME'}];
|
|
759 #}else{
|
|
760 #we should really log this in a seprate file to avoid overloading the lgo file
|
|
761 # push @log, "Importing random probe ".$data[$hpos{'PROBE_ID'}]." on ".$data[$hpos{'CHROMOSOME'}]." omitting position";
|
|
762
|
|
763 #}
|
|
764
|
|
765 #}
|
|
766
|
|
767 }
|
|
768
|
|
769 #This is not handling probes with random chrs
|
|
770
|
|
771 $probe_pos{$data[$hpos{'PROBE_ID'}]} = {(
|
|
772 chr => $data[$hpos{'CHROMOSOME'}],
|
|
773 start => ($data[$hpos{'POSITION'}] +1),#default UCSC->Ensembl coord conversion
|
|
774 )};
|
|
775
|
|
776 }
|
|
777
|
|
778
|
|
779
|
|
780
|
|
781 #Remove duplicate probes
|
|
782
|
|
783 $self->log("Built position cache from : ".$achip->name().".pos", 1);
|
|
784 close($fh);
|
|
785
|
|
786 $self->log("Importing design probes from : ".$achip->name().".ndf");
|
|
787 #OPEN PROBE IN/OUT FILES
|
|
788 $fh = open_file($self->get_dir("design")."/".$achip->name().".ndf");
|
|
789 #Need to set these paths in each achip hash, file names could be tablename.chip_id.txt
|
|
790
|
|
791 #Need to add dbname/port/species/vendor to this path?
|
|
792 #.efg DropDatabase should also clean the fasta dumps and caches for a given DB
|
|
793
|
|
794 if($self->dump_fasta()){
|
|
795 $fasta_file = $self->get_dir('fastas').'/'.$achip->name().".fasta";
|
|
796 $self->backup_file($fasta_file);
|
|
797 $f_out = open_file($fasta_file, '>');
|
|
798 }
|
|
799
|
|
800
|
|
801 my ($length, $ops, $op, $of, %pfs);
|
|
802
|
|
803 #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap
|
|
804 my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("VendorMap");
|
|
805
|
|
806
|
|
807 my $strand = 0; #default for nimblegen, should be config hash?
|
|
808
|
|
809 #my $cig_line = "50M"; #default for nimblegen, should be config hash?
|
|
810 #probe length can change within design, should be built from length
|
|
811
|
|
812
|
|
813 my $fasta = "";
|
|
814
|
|
815 #$self->Timer()->mark("Starting probe loop");
|
|
816
|
|
817
|
|
818 #This is leaking about 30-60MB for each normal density chip?
|
|
819 #need Devel::Monitor here?
|
|
820
|
|
821
|
|
822 while($line = <$fh>){
|
|
823 $line =~ s/\r*\n//;
|
|
824 @data = split/\t/o, $line;
|
|
825 my $loc = "";
|
|
826 my $class = "EXPERIMENTAL";
|
|
827
|
|
828 #PROBE_DESIGN_ID CONTAINER DESIGN_NOTE SELECTION_CRITERIA SEQ_ID PROBE_SEQUENCE MISMATCH MATCH_INDEX FEATURE_ID ROW_NUM COL_NUM PROBE_CLASS PROBE_ID POSITION DESIGN_ID X Y
|
|
829 #2067_0025_0001 BLOCK1 0 chrX TTAGTTTAAAATAAACAAAAAGATACTCTCTGGTTATTAAATCAATTTCT 0 52822449 52822449 1 25 experimental chrXP10404896 10404896 2067 25 1
|
|
830
|
|
831 if ($. == 1){
|
|
832 %hpos = %{$self->set_header_hash(\@data, $self->get_config('ndf_fields'))};
|
|
833 next;
|
|
834 }
|
|
835
|
|
836
|
|
837 if (! exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){
|
|
838 push @log, "Skipping non-experimental probe:\t".$data[$hpos{'PROBE_ID'}];
|
|
839 next;
|
|
840 }
|
|
841
|
|
842 #Which non-experimental probes might we want to store?
|
|
843 #if($data[$hpos{'CONTAINER'}] =~ /control/io){
|
|
844 # $class = "CONTROL";
|
|
845 #}
|
|
846 #elsif($data[$hpos{'CONTAINER'}] =~ /random/io){
|
|
847 # $class = "RANDOM";
|
|
848 #}
|
|
849 #elsif($data[$hpos{'PROBE_CLASS'}] !~ /experimental/io){
|
|
850 # $class = "OTHER";
|
|
851 #}
|
|
852 #elsif(! exists $probe_pos{$data[$hpos{'PROBE_ID'}]}){ #HACKY HACKY HACK HACK!! Needed for valid region retrival
|
|
853 # $class = "OTHER";
|
|
854 #}
|
|
855 #SPIKE INS?
|
|
856
|
|
857
|
|
858
|
|
859 #This assumes all probes in feature/probeset are next to each other!!!!!!!!!!
|
|
860
|
|
861
|
|
862 if($data[$hpos{'FEATURE_ID'}] != $data[$hpos{'MATCH_INDEX'}]){#Probe set data
|
|
863 #print "Generating new probeset:\tFeature id:\t".$data[$hpos{'FEATURE_ID'}]."\tmatchindex:\t".$data[$hpos{'MATCH_INDEX'}]."\n";
|
|
864
|
|
865 if($ops && ($data[$hpos{'FEATURE_ID'}] ne $ops->name())){
|
|
866 #THis is where we chose to update/validate
|
|
867 #Do we need to pass probes if they're already stored..may aswell to reduce mysql load?
|
|
868 #No point as we have to query anyway
|
|
869 $self->store_set_probes_features($achip->dbID(), \%pfs, $ops);
|
|
870 throw("ops still defined in caller") if defined $ops;
|
|
871 }
|
|
872
|
|
873 $ops = Bio::EnsEMBL::Funcgen::ProbeSet->new(
|
|
874 -NAME => $data[$hpos{'FEATURE_ID'}],
|
|
875 -SIZE => undef,
|
|
876 -FAMILY => $data[$hpos{'CONTAINER'}],
|
|
877 #xref_id => $data[$hpos{'SEQ_ID'}],#Need to populate xref table
|
|
878 );
|
|
879
|
|
880 #should we store straight away or build a probeset/probe/feature set, and then store and validate in turn?
|
|
881 #Store directly have separate method to validate and update?
|
|
882 #would need to check if one exists before storing anyway, else we could potentially duplicate the same probe/probeset from a different array
|
|
883 #remember for affy we need duplicate probe records with identical probe ids, probeset records unique across all arrays
|
|
884
|
|
885 undef %pfs
|
|
886 }
|
|
887 elsif($. > 2){#may have previous ops set, but next has no ops, or maybe just no ops's at all
|
|
888 $self->store_set_probes_features($achip->dbID(), \%pfs, $ops);
|
|
889 throw("ops still defined in caller") if defined $ops;
|
|
890 }
|
|
891
|
|
892
|
|
893 ###PROBES
|
|
894 #should we cat $xref_id to $probe_id here to generate unique id?
|
|
895 #would be messy to handle in the code, but would have to somewhere(in the retrieval code)
|
|
896
|
|
897 $length = length($data[$hpos{'PROBE_SEQUENCE'}]);
|
|
898 #$probe_string .= "\t${psid}\t".$data[$hpos{'PROBE_ID'}]."\t${length}\t$ac_id\t${class}\n";
|
|
899
|
|
900 $op = Bio::EnsEMBL::Funcgen::Probe->new(
|
|
901 -NAME => $data[$hpos{'PROBE_ID'}],
|
|
902 -LENGTH => $length,
|
|
903 -ARRAY => $array,
|
|
904 -ARRAY_CHIP_ID => $achip->dbID(),
|
|
905 -CLASS => $class,
|
|
906 );
|
|
907
|
|
908
|
|
909 %{$pfs{$data[$hpos{'PROBE_ID'}]}} = (
|
|
910 probe => $op,
|
|
911 features => [],
|
|
912 );
|
|
913
|
|
914
|
|
915 ###PROBE FEATURES
|
|
916 #How can we be certain that we have the same mapping in the DB?
|
|
917 #Put checks in here for build?
|
|
918 #Need to handle controls/randoms here
|
|
919 #won't have features but will have results!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
|
920 #The format of the pos file looks like it should have all the data required, but
|
|
921 # chromsome is missing, first undef :(
|
|
922 #Need to use $pos here instead of .pos file
|
|
923 #However have problems defining probe class, as not populated in test set
|
|
924 #derive from container! :(
|
|
925 #Ignore controls/random as they won't have a region
|
|
926 #Also need to handle multiple mappings?
|
|
927 #As results reference probes, no features, can have multiple features on different builds
|
|
928
|
|
929 #if($class eq "EXPERIMENTAL"){
|
|
930
|
|
931 #if(exists $regions{$data[$hpos{'SEQ_ID'}]}){
|
|
932 # $fid++;
|
|
933 # $pf_string .= "\t".$regions{$data[$hpos{'SEQ_ID'}]}{'seq_region_id'}."\t".$data[$hpos{'POSITION'}]."\t".
|
|
934 # ($data[$hpos{'POSITION'}] + $length)."\t${strand}\t".$regions{$data[$hpos{'SEQ_ID'}]}{'coord_system_id'}.
|
|
935 # "\t${pid}\t${anal_id}\t".$data[$hpos{'MISMATCH'}]."\t${cig_line}\n";
|
|
936 #$loc .= $regions{$data[$hpos{'SEQ_ID'}]}{'seq_region_id'}.":".$data[$hpos{'POSITION'}].
|
|
937 # "-".($data[$hpos{'POSITION'}] + $length).";" if ($self->{'_dump_fasta'});
|
|
938 # }
|
|
939 # else{
|
|
940 # die("No regions defined for ".$data[$hpos{'SEQ_ID'}]." ".$data[$hpos{'PROBE_ID'}].
|
|
941 #" with family ".$data[$hpos{'CONTAINER'}]);
|
|
942 # }
|
|
943
|
|
944
|
|
945
|
|
946
|
|
947 if ($self->dump_fasta()){
|
|
948 #(my $chr = $probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'}) =~ s/chr//;
|
|
949
|
|
950 #$loc .= $chr.":".$probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}."-".
|
|
951 # ($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'}+ $length).";";
|
|
952
|
|
953 #$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}].
|
|
954 # "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n";
|
|
955
|
|
956
|
|
957
|
|
958 #filter controls/randoms? Or would it be sensible to see where they map
|
|
959 #wrap seq here?
|
|
960 #$fasta .= ">".$data[$hpos{'PROBE_ID'}]."\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n";
|
|
961
|
|
962
|
|
963 #To use this for mapping, we really need the dbID nr fasta
|
|
964 #This can be generated after the import, or maybe during resolve?
|
|
965 #This is also currently done on a chip level, where as the cache is resolved at the array level
|
|
966 #We could simply cat the files before resolving the fasta file
|
|
967 #Need to do this otherwise we risk overwriting the fasta file with incomplete data.
|
|
968 #Can we validate sequence across probes with same name in this step?
|
|
969 #Just use probe name for now.
|
|
970
|
|
971 #We could cat and sort the fastas to make sure we have the same sequences
|
|
972 #Need to dump the design_id in the fasta header
|
|
973 #This would also reduce IO on the DB as identical probe will be consecutive, hence just one query to get the id.
|
|
974
|
|
975 #Changed th format an content of this to facilitate dbID nr fasta file generation and sequence validation
|
|
976
|
|
977 $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$achip->design_id."\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n";
|
|
978
|
|
979 #Print fasta every 10000 lines
|
|
980 if(! ($. % 10000)){
|
|
981 print $f_out $fasta;
|
|
982 $fasta = '';
|
|
983 }
|
|
984 }
|
|
985
|
|
986
|
|
987 #Hack!!!!!! Still importing probe (and result?)
|
|
988 next if(! $self->cache_slice($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'}));
|
|
989 #warn("Skipping non standard probe (".$data[$hpos{'PROBE_ID'}].") with location:\t$loc\n");
|
|
990
|
|
991
|
|
992 $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new
|
|
993 (
|
|
994 -START => $probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'},
|
|
995 -END =>($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'start'} + $length),
|
|
996 -STRAND => $strand,
|
|
997 -SLICE => $self->cache_slice($probe_pos{$data[$hpos{'PROBE_ID'}]}->{'chr'}),
|
|
998 -ANALYSIS => $anal,
|
|
999 -MISMATCHCOUNT => $data[$hpos{'MISMATCH'}],#Is this always 0 for import? remove from header hash?
|
|
1000 -PROBE => undef, #Need to update this in the store method
|
|
1001 );
|
|
1002
|
|
1003
|
|
1004 push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of;
|
|
1005
|
|
1006 }
|
|
1007
|
|
1008 #need to store last data here
|
|
1009 $self->store_set_probes_features($achip->dbID(), \%pfs, $ops);
|
|
1010 $self->log(join("\n", @log));
|
|
1011 $achip->adaptor->store_status("IMPORTED", $achip);
|
|
1012 $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED");
|
|
1013
|
|
1014 if ($self->dump_fasta()){
|
|
1015 print $f_out $fasta;
|
|
1016 close($f_out);
|
|
1017 }
|
|
1018
|
|
1019 $self->log("Imported design from:\t".$achip->name().".ndf", 1);
|
|
1020
|
|
1021
|
|
1022
|
|
1023 #$self->{'_probe_cache'} = undef;#As we can't get Y and Y info from the DB, this is only possible as the results files contain X and Y info
|
|
1024 }
|
|
1025
|
|
1026 #Should we build hash of probe_names:probe_feature_ids here for results import
|
|
1027 #Should we dump this as a lookup file for easier recoverability
|
|
1028 #This is the biggest step in the import
|
|
1029 #Building the hash would be fastest but least recoverable
|
|
1030 #Would have to write recover statments for each step i.e. build the most recent data structure required for the next import step
|
|
1031 #Individual queries for each result would take ages
|
|
1032 #This is all assuming there are no random records in the table i.e. ID series is linear with no gaps.
|
|
1033
|
|
1034
|
|
1035 #Could throw a random validation check in every X entries?
|
|
1036 #This would only work for non-parallel imports
|
|
1037 #periodic import when hit new probe_set, but no new data printed
|
|
1038
|
|
1039 }
|
|
1040
|
|
1041
|
|
1042 $self->log("Finished parsing probe data");
|
|
1043 #Total probe_sets:\t$psid\n".
|
|
1044 # "Total probes:\t$pid\nTotal probe_features:\t$fid");
|
|
1045
|
|
1046
|
|
1047 $self->resolve_probe_data();
|
|
1048
|
|
1049 return;
|
|
1050 }
|
|
1051
|
|
1052
|
|
1053
|
|
1054
|
|
1055 =head2 read_and_import_results_data
|
|
1056
|
|
1057 Example : $imp->read_results_data();
|
|
1058 Description: Parses and dumps raw results to file
|
|
1059 Returntype : none
|
|
1060 Exceptions : none
|
|
1061 Caller : Importer
|
|
1062 Status : at risk
|
|
1063
|
|
1064 =cut
|
|
1065
|
|
1066
|
|
1067 sub read_and_import_results_data{
|
|
1068 my $self = shift;
|
|
1069
|
|
1070 $self->log("Parsing ".$self->vendor()." results");
|
|
1071 my (@header, @data, @design_ids, @lines);
|
|
1072 my ($fh, $pid, $line, $file);
|
|
1073 my $anal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("RawValue");
|
|
1074 my $result_set = $self->get_import_ResultSet($anal, 'channel');
|
|
1075
|
|
1076
|
|
1077 if ($result_set) { #we have some new data to import
|
|
1078
|
|
1079 foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) {
|
|
1080
|
|
1081 #if( ! $echip->has_status('IMPORTED')){
|
|
1082
|
|
1083 foreach my $chan (@{$echip->get_Channels()}) {
|
|
1084
|
|
1085 if ( ! $chan->has_status('IMPORTED')) {
|
|
1086
|
|
1087 #we need to set write_mage here
|
|
1088
|
|
1089 my $array = $echip->get_ArrayChip->get_Array();
|
|
1090
|
|
1091 $self->get_probe_cache_by_Array($array) || throw('Failed to get the probe cache handle for results import, resolve cache here?');
|
|
1092 my ($probe_elem, $score_elem, %hpos);
|
|
1093 my $cnt = 0;
|
|
1094 my $r_string = "";
|
|
1095 my $chan_name = $echip->unique_id()."_".$self->get_config('dye_freqs')->{$chan->dye()};
|
|
1096 my $cc_id = $result_set->get_chip_channel_id($chan->dbID());
|
|
1097
|
|
1098
|
|
1099 #if ($self->recovery()) {
|
|
1100 # $self->log("Rolling back results for channel:\t${chan_name}");
|
|
1101 # $self->db->rollback_results($cc_id);
|
|
1102 # }
|
|
1103
|
|
1104 #open/backup output
|
|
1105 my $out_file = $self->get_dir("raw")."/result.".$chan_name.".txt";
|
|
1106 $self->backup_file($out_file);
|
|
1107 my $r_out = open_file($out_file, '>');
|
|
1108
|
|
1109 (my $alt_chan_name = $chan_name) =~ s/\_/\_1h\_/;
|
|
1110 my $found = 0;
|
|
1111
|
|
1112 FILE: foreach my $name($chan_name, $alt_chan_name){
|
|
1113
|
|
1114 foreach my $suffix ("_pair.txt", ".pair", ".txt") {
|
|
1115
|
|
1116 $file = $self->get_dir("results")."/".$name.$suffix;
|
|
1117
|
|
1118 if (-f $file) {
|
|
1119 $found = 1;
|
|
1120 last FILE;
|
|
1121 }
|
|
1122 }
|
|
1123 }
|
|
1124
|
|
1125 throw("Could not find result file for Channel(${chan_name}) in ".$self->get_dir('results')) if ! $found;
|
|
1126
|
|
1127 #open/slurp input
|
|
1128 $self->log("Reading result for channel $chan_name:\t$file", 1);
|
|
1129 $fh = open_file($file);
|
|
1130 @lines = <$fh>;
|
|
1131 close($fh);
|
|
1132
|
|
1133
|
|
1134 ###PROCESS HEADER
|
|
1135
|
|
1136 foreach my $i (0..$#lines) {
|
|
1137
|
|
1138 if ($lines[$i] =~ /PROBE_ID/o) {
|
|
1139 $lines[$i] =~ s/\r*\n//o;
|
|
1140 @data = split/\t/o, $lines[$i];
|
|
1141
|
|
1142 %hpos = %{$self->set_header_hash(\@data, $self->get_config('result_fields'))};
|
|
1143
|
|
1144 #remove header
|
|
1145 splice @lines, $i, 1;
|
|
1146
|
|
1147 last; #finished processing header
|
|
1148 }
|
|
1149 }
|
|
1150
|
|
1151 #we need to sort the result files based on the unique key(name at present, should replace with seq at some point)
|
|
1152 @lines = sort {(split/\t/o, $a)[$hpos{'PROBE_ID'}] cmp (split/\t/o, $b)[$hpos{'PROBE_ID'}]} @lines;
|
|
1153
|
|
1154 $self->log('Parsing results', 1);
|
|
1155
|
|
1156 foreach $line(@lines) {
|
|
1157
|
|
1158 #can we preprocess effectively?
|
|
1159 next if $line =~ /^#/;
|
|
1160 next if $line =~ /NGS_CONTROLS/;
|
|
1161 next if $line =~ /V_CODE/;
|
|
1162 next if $line =~ /H_CODE/;
|
|
1163 next if $line =~ /RANDOM/;
|
|
1164
|
|
1165 $line =~ s/\r*\n//o;
|
|
1166 @data = split/\t/o, $line;
|
|
1167
|
|
1168 ###PROCESS HEADER
|
|
1169 #if ($line =~ /PROBE_ID/o){
|
|
1170 #
|
|
1171 # %hpos = %{$self->set_header_hash(\@data, $self->get_config('result_fields'))};
|
|
1172 # next;#finished processing header
|
|
1173 #}
|
|
1174
|
|
1175 ###PROCESS DATA
|
|
1176 #Is this string concat causing the slow down, would it befaster to use an array and print a join?
|
|
1177
|
|
1178 if ($pid = $self->get_probe_id_by_name_Array($data[$hpos{'PROBE_ID'}], $array)) {
|
|
1179 $cnt ++;
|
|
1180 $r_string .= '\N'."\t${pid}\t".$data[$hpos{'PM'}]."\t${cc_id}\t".$data[$hpos{'X'}]."\t".$data[$hpos{'Y'}]."\n";
|
|
1181 } else {
|
|
1182 warn "Found unfiltered non-experimental probe in input $data[$hpos{'PROBE_ID'}]";
|
|
1183 }
|
|
1184
|
|
1185 ###PRINT SOME RESULTS
|
|
1186 if ($cnt > 10000) {
|
|
1187 $cnt = 0;
|
|
1188 print $r_out $r_string;
|
|
1189 $r_string ="";
|
|
1190 #could we fork here and import in the background?
|
|
1191 }
|
|
1192
|
|
1193 }
|
|
1194 #PRINT/CLOSE Channel file
|
|
1195 print $r_out $r_string;
|
|
1196 close($r_out);
|
|
1197 $self->log("Finished parsing $chan_name result", 1);
|
|
1198
|
|
1199 #Import directly here to avoid having to reparse all results if we crash!!!!
|
|
1200 $self->log("Importing:\t$out_file");
|
|
1201 $self->db->load_table_data("result", $out_file);
|
|
1202 $self->log("Finished importing:\t$out_file", 1);
|
|
1203 $chan->adaptor->store_status('IMPORTED', $chan);
|
|
1204
|
|
1205
|
|
1206 }
|
|
1207 }
|
|
1208 }
|
|
1209 }
|
|
1210 else {
|
|
1211 $self->log("Skipping results parse and import");
|
|
1212 }
|
|
1213
|
|
1214 $self->log("Finished parsing and importing results");
|
|
1215
|
|
1216 return;
|
|
1217 }
|
|
1218
|
|
1219
|
|
1220
|
|
1221 1;
|