comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Sanger.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Sanger
3 #
4
5 =head1 LICENSE
6
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
8 Genome Research Limited. All rights reserved.
9
10 This software is distributed under a modified Apache license.
11 For license details, please see
12
13 http://www.ensembl.org/info/about/code_licence.html
14
15 =head1 CONTACT
16
17 Please email comments or questions to the public Ensembl
18 developers list at <ensembl-dev@ebi.ac.uk>.
19
20 Questions may also be sent to the Ensembl help desk at
21 <helpdesk@ensembl.org>.
22
23 =head1 NAME
24
25 Bio::EnsEMBL::Funcgen::Parsers::Sanger
26
27 =head1 SYNOPSIS
28
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Sanger";
30 push @INC, $parser_type;
31 my $imp = $class->SUPER::new(@_);
32
33
34 =head1 DESCRIPTION
35
36 This is a definitions class which should not be instatiated directly, it
37 normally inherited from the Importer. Sanger contains meta data and methods
38 specific to Sanger PCR arrays to aid parsing and importing of experimental data.
39
40 =cut
41
42 package Bio::EnsEMBL::Funcgen::Parsers::Sanger;
43
44 use Bio::EnsEMBL::Funcgen::Array;
45 use Bio::EnsEMBL::Funcgen::ProbeSet;
46 use Bio::EnsEMBL::Funcgen::Probe;
47 use Bio::EnsEMBL::Funcgen::ProbeFeature;
48 use Bio::EnsEMBL::Funcgen::FeatureType;
49 use Bio::EnsEMBL::Funcgen::ExperimentalChip;
50 use Bio::EnsEMBL::Funcgen::ArrayChip;
51 use Bio::EnsEMBL::Funcgen::Channel;
52 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
53 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file);
54 use Bio::EnsEMBL::Funcgen::Utils::Helper;
55 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
56 use strict;
57
58 use vars qw(@ISA);
59 @ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
60
61 =head2 new
62
63 Example : my $self = $class->SUPER::new(@_);
64 Description: Constructor method for Sanger class
65 Returntype : Bio::EnsEMBL::Funcgen::Parsers::Sanger
66 Exceptions : throws if Experiment name not defined or if caller is not Importer
67 Caller : Bio::EnsEMBL::Funcgen::Importer
68 Status : at risk
69
70 =cut
71
72
73 sub new{
74 my $caller = shift;
75
76 my $class = ref($caller) || $caller;
77 my $self = $class->SUPER::new();
78
79 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
80
81 $self->{'config'} = {(
82 #order of these data arrays is important!
83 array_data => [], #["array_chip"],
84 probe_data => ["array_probe"],
85 results_data => ["and_import_result"],
86 #import_methods => [],
87 #data paths here?
88 norm_method => undef,
89 #is this disabling -input_dir override option?
90 )};
91
92
93
94 return $self;
95 }
96
97 =head2 set_config
98
99 Example : $imp->set_config();
100 Description: Sets a attribute dependent variables
101 Returntype : none
102 Exceptions : None
103 Caller : Importer
104 Status : At risk
105
106 =cut
107
108 sub set_config{
109 my ($self) = @_;
110
111 #placeholder method for setting any attr dependant vars e.g. file paths etc.
112
113
114 return;
115 }
116
117
118 sub read_array_probe_data{
119 my ($self, $array_file) = @_;
120
121 warn("Remove hard coding for Sanger array import, and accomodate adf format");
122
123
124 $array_file ||= $self->array_file();
125 my ($line, $fh, @list, $array_file_format, $cmd);
126 my ($op, $of, $imported, $fimported, $fanal);
127 my $oa_adaptor = $self->db->get_ArrayAdaptor();
128 my $op_adaptor = $self->db->get_ProbeAdaptor();
129 my $of_adaptor = $self->db->get_ProbeFeatureAdaptor();
130 my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor();
131 my $ac_adaptor = $self->db->get_ArrayChipAdaptor();
132 my $slice_adaptor = $self->db->get_SliceAdaptor();
133 my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("SangerPCR")->dbID();
134 #have LiftOver? Could then use liftover in pipeline to redo mappings
135
136 #store now checks whether already stored and updates array chips accordingly
137 my $array = Bio::EnsEMBL::Funcgen::Array->new
138 (
139 -NAME => $self->array_name(),
140 -FORMAT => uc($self->format()),
141 -VENDOR => uc($self->vendor()),
142 -TYPE => 'PCR',
143 -DESCRIPTION => "Sanger ENCODE PCR array 3.1.1",
144 );
145
146 ($array) = @{$oa_adaptor->store($array)};
147
148 #This is treating each array chip as a separate array, unless arrayset is defined
149 #AT present we have no way of differentiating between different array_chips on same array???!!!
150 #Need to add functionality afterwards to collate array_chips into single array
151 my $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
152 -NAME => $array->name(),
153 -DESIGN_ID => $array->name(),
154 -ARRAY_ID =>$array->dbID(),
155 );
156
157 ($array_chip) = @{$ac_adaptor->store($array_chip)};
158 $array->add_ArrayChip($array_chip);
159 $self->add_Array($array);
160
161
162 #we also need to test wether the array as been imported as well as the mappings
163 #THis needs to use coord_sys-id not schema_build!! Duplcaite entries for different schema_builds
164 #with same assembly
165
166 my $dnadb_cs = $self->db->dnadb->get_CoordSystemAdaptor->fetch_by_name('chromosome');
167 my $fg_cs = $self->db->get_FGCoordSystemAdaptor->validate_and_store_coord_system($dnadb_cs);
168
169
170 #This fails if we're pointing to an old DB during the release cycle. Will be fine if we manage to cs mapping dynamically
171
172
173 if ($array_chip->has_status('IMPORTED')) {
174 $imported = 1;
175 $self->log("Skipping ArrayChip probe import (".$array_chip->name().") already fully imported");
176
177 #need to build cache here, from file first else from DB????
178 #This is required for feature only imports
179 #as we won't have the probe dbID available
180
181 if(! $self->get_probe_cache_by_Array($array)){
182 $self->get_probe_cache_by_Array($array, 1);
183 }
184
185
186
187 } elsif ($self->recovery()) {
188 $self->log("Rolling back partially imported ArrayChip:\t".$array_chip->name());
189 $self->db->rollback_ArrayChip([$array_chip]); #This should really remove all CS imports too?
190 }
191
192
193 #should never really have CS imports if not IMPORTED
194 #there is however the potential to trash a lot of data if we were to remove the CS importes by mistake
195 #do we need to check whether any other sets are using the data?
196 #we have to check for result using relevant cs_id and cc_id
197 #no removal of probes is the key thing here as nothing is dependent on the feature_ids
198 #get all result sets by array chip? or get all ExperimentalChips by array chip
199 #would have to be result set as we would find our own ecs. May find our own rset
200
201
202 throw('This needs updating');
203
204 if ($array_chip->has_status('IMPORTED_CS_'.$fg_cs->dbID())) {
205 $fimported = 1;
206 $self->log("Skipping ArrayChip feature import (".$array_chip->name().") already fully imported for ".$self->data_version());
207 } elsif ($self->recovery()) {
208 $self->log("Rolling back partially imported ArrayChip features:\t".$array_chip->name());
209 $self->db->rollback_ArrayChip_features($array_chip, $fg_cs);
210 }
211
212
213 #need to check whether already imported on specified schema_build
214 #check for appropriate file given format in input dir or take path
215
216 #if (! $fimported) {#now need to do this irrespective of import status due to x y requirements
217 #need only do this once, i.e. if the cache isn't defined yet
218 #this is assuming cache will be built properly
219 #may cause problems if not cleaned up properly after use.
220
221 #ignore xy requirements for now, these should be associated with results file
222
223
224
225 #if (! defined $self->{'_probe_cache'}) {
226 if (! $fimported) {
227
228
229
230 if (! $array_file) {
231
232 if (! defined $self->get_dir('input')) {
233 throw("No input_dir defined, if you are running in a non Experiment context please use -array_file");
234 }
235
236 #hacky ..do better?
237 for my $suffix ("gff", "adf") {
238 $cmd = $self->get_dir('input')."/".$self->array_name()."*".$suffix;
239 @list = `ls $cmd 2>/dev/null`;
240
241 if ((scalar(@list) == 1) &&
242 ($list[0] !~ /No such file or directory/o)) { ###this is only printed to STDERR?
243
244 if (! defined $array_file) {
245 $array_file = $list[0];
246 } else {
247 throw("Found more than one array file : $array_file\t$list[0]\nSpecify one with -array_file");
248 }
249 }
250 }
251
252 throw("Cannot find array file. Specify one with -array_file") if (! defined $array_file);
253 }
254
255
256 if ($array_file =~ /gff/io) {
257 $array_file_format = "GFF";
258 } elsif ($array_file =~ /adf/io) {
259 $array_file_format = "ADF";
260 throw("Does not yet accomodate Sanger adf format");
261 } else {
262 throw("Could not determine array file format: $array_file");
263 }
264
265
266 #if (! $fimported) {
267 $fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name(($array_file_format eq "ADF") ? "VendorMap" : "LiftOver");
268 #}
269
270 $self->log("Parsing ".$self->vendor()." array data (".localtime().")");
271 $fh = open_file($array_file);
272 my @lines = <$fh>;
273 close($fh);
274
275
276
277 my ($chr, $start, $end, $strand, $pid);#, $x, $y, $meta_x, $meta_y, @xy);
278
279 #avoid mutliple calls for same array
280 my $ac_dbid = $array->get_ArrayChip_by_design_id($array->name())->dbID();
281
282 #sort file to enable probe cache method for new feature imports
283 @lines = sort {(split/\t|\;/o, $a)[8] cmp (split/\t|\;/o, $b)[8]} @lines;
284
285 #This is not sorting properly!!
286
287 #my @tmp = map ((split/\t|\;/o, $_)[8], @lines);
288 #@tmp = sort @tmp;
289
290
291 #$self->log('Tmp sorted array is :\n'.join("\n", @tmp)."\n");
292
293
294
295
296 foreach $line(@lines) {
297 $line =~ s/\r*\n//;
298
299 #($chr, $start, $end, $ratio, $pid) = split/\t/o, $line;
300 #($chr, undef, undef, $start, $end, undef, $strand, undef, $pid, $x, $y, $meta_x, $meta_y) = split/\t|\;/o, $line;
301 ($chr, undef, undef, $start, $end, undef, $strand, undef, $pid) = split/\t|\;/o, $line;
302
303
304 if($self->ucsc_coords){
305 $start += 1;
306 }
307
308
309 #$meta_x =~ s/META_X=//;
310 #$x =~ s/X=//;
311 #$x = $x + (($meta_x -1)*26);
312 #$meta_y =~ s/META_Y=//;
313 #$y =~ s/Y=//;
314 #$y = $y + (($meta_y -1)*25);
315 $pid =~ s/reporter_id=//o;
316 $chr =~ s/chr//;
317 $strand = ($strand eq "+") ? 0 : 1;
318
319 #Hack!!!!!! This is still maintaining the probe entry (and result?)
320 if (! $self->cache_slice($chr)) {
321 warn("-- Skipping non standard probe (${pid}) with location:\t${chr}:${start}-${end}\n");
322 next;
323 }
324
325
326 #need to parse dependant on file format
327 #also need to account for duplicate probes on grid
328
329 #need to test for imprted here for rebuilding the probe_info cache
330 #this will result in always using first x y for the inital import (i.e. skip any probe already in cache)
331 #or using last x y for previosuly imported as we can't check the cache as it will already be there
332 #could check for x y
333 #should always check x y as this will also implicitly check if it is in the cache
334
335 #if (! $self->get_probe_id_by_name($pid)) { #already present in cache
336 #if(! (@xy = @{$self->get_probe_x_y_by_name($pid)})){
337
338 #can we not use store_set_probes_features
339 #would have to add x y to probe, which is not logical as probe can have many x y's
340 #keep like this and just change cache_probe_info
341
342 if (! $imported) {
343 #when we utilise array coords, we need to look up probe cache and store again with new coords
344 #we're currently storing duplicates i.e. different ids with for same probe
345 #when we should be storing two records for the same probe/id
346 #the criteria for this will be different for each vendor, may have to check container etc for NimbleGen
347
348 $op = Bio::EnsEMBL::Funcgen::Probe->new(
349 -NAME => $pid,
350 -LENGTH => ($end - $start),
351 -ARRAY => $array,
352 -ARRAY_CHIP_ID => $ac_dbid,
353 -CLASS => 'EXPERIMENTAL',
354 );
355
356 ($op) = @{$op_adaptor->store($op)};
357 #$self->cache_probe_info($pid, $op->dbID, $x, $y);
358 } else {
359 #update XY cache for previously imported array
360 #$self->cache_probe_info($pid, $self->get_probe_id_by_name($pid), $x, $y);
361 }
362
363 #if (! $fimported) {
364 $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
365 -START => $start,
366 -END => $end,
367 -STRAND => $strand,
368 -SLICE => $self->cache_slice($chr),
369 -ANALYSIS => $fanal,
370 -MISMATCHCOUNT => 0,
371 -PROBE_ID => ($imported) ?
372 $self->get_probe_id_by_name_Array($pid, $array) : $op->dbID(),
373 );
374
375 #get_probe_id will throw if not in cache, which means that we have an unimported probe
376 #for an ArrayChip which is flagged as imported, must have been omitted from the import deisgn
377 #probably a manual fix required. Can we log these and write an update/repair script.
378
379 $of_adaptor->store($of);
380 #}
381
382 #} else {
383 #warn("Sanger does not accomodate on plate duplicates yet, result are not linked to X Y coords, using first coords for probe if present in results for $pid\n");ĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦ
384 #}
385 }
386
387 $array_chip->adaptor->set_status('IMPORTED_CS_'.$fg_cs->dbID(), $array_chip) if ! $fimported;
388 $self->log("ArrayChip:\t".$array_chip->design_id()." has been IMPORTED_CS_".$fg_cs->dbID());
389
390 }
391
392
393
394 if (! $imported) {
395 $array_chip->adaptor->set_status('IMPORTED', $array_chip);
396 $self->log("ArrayChip:\t".$array_chip->design_id()." has been IMPORTED");
397 $self->resolve_probe_data();
398 }
399
400 $self->log("Finished parsing ".$self->vendor()." array/probe data (".localtime().")");
401 #warn("Finished parsing ".$self->vendor()." array/probe data (".localtime().")");
402
403 return;
404 }
405
406 =head2 read_and_import_result_data
407
408 Example : $imp->read_and_import_result_data();
409 Description: Parses and imports result for the sanger PCR array platform
410 Returntype : none
411 Exceptions : none
412 Caller : Importer
413 Status : At risk
414
415 =cut
416
417 sub read_and_import_result_data{
418 my $self = shift;
419
420 #change this to read_gff_chip_results
421 #as opposed to gff channel results
422 #This should also use the default logic names for the Vendor, or take a user defined list
423 $self->log("Reading ".$self->vendor()." result data (".localtime().")");
424
425 my ($file, $chip_uid, $line, $echip);
426 my ($ratio, $pid, %chip_files, %roll_back);
427 my $of_adaptor = $self->db->get_ProbeFeatureAdaptor();
428 my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor();
429 my $chan_adaptor = $self->db->get_ChannelAdaptor();
430 my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("SangerPCR");
431 my $result_adaptor = $self->db->get_ResultSetAdaptor();
432 #this is done to avoid having to self->array_name in loop, will make multiple array loop easier
433 my $array = ${$self->arrays()}[0];
434
435 #This works a little differently as we're not parsing a meta file
436 #so the echips haven't been added yet.
437 #This is treating each array chip as a separate array, unless arrayset is defined
438 #AT present we have no way of differentiating between different array_chips on same array???!!!
439 #Need to add functionality afterwards to collate array_chips into single array
440
441 #First add the echips to the Experiment
442
443 if (! @{$self->result_files()}) {
444 my $list = "ls ".$self->input_dir().'/[0-9]*-[0-9a-zA-Z]*\.all\.*';
445 my @rfiles = `$list`;
446 $self->result_files(\@rfiles);
447 }
448
449
450 foreach $file(@{$self->result_files()}) {
451 chomp $file;
452 ($chip_uid = $file) =~ s/.*\///;
453 $chip_uid =~ s/\..*//;
454
455 $self->log("Found SANGER results file for $chip_uid:\t$file");
456 $chip_files{$chip_uid} = $file;
457
458
459 $echip = $ec_adaptor->fetch_by_unique_id_vendor($chip_uid, 'SANGER');
460
461 #this should throw if not recovery
462 #Nee to check Nimbelgen methods
463
464 if ($echip) {
465
466 if (! $self->recovery()) {
467 throw("ExperimentalChip(".$echip->unqiue_id().") already exists in the database\nMaybe you want to recover?");
468 }else{
469 #log pre-reg'd chips for rollback
470 $roll_back{$echip->dbID()} = 1;
471 }
472 } else {
473
474 $echip = Bio::EnsEMBL::Funcgen::ExperimentalChip->new
475 (
476 -EXPERIMENT_ID => $self->experiment->dbID(),
477 -ARRAY_CHIP_ID => $self->arrays->[0]->get_ArrayChip_by_design_id($array->name())->dbID(),
478 -UNIQUE_ID => $chip_uid,
479 );
480
481 ($echip) = @{$ec_adaptor->store($echip)};
482 $self->experiment->add_ExperimentalChip($echip); #if we need a contains method in here , always add!!
483 }
484
485 #do we need DUMMY entries any more?
486
487 #sub this passing the echip?
488 foreach my $type ('DUMMY_TOTAL', 'DUMMY_EXPERIMENTAL') {
489
490 my $channel = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID());
491
492 if ($channel) {
493 if (! $self->recovery()) {
494 throw("Channel(".$echip->unique_id().":$type) already exists in the database\nMaybe you want to recover?");
495 }
496 } else {
497
498 $channel = Bio::EnsEMBL::Funcgen::Channel->new
499 (
500 -EXPERIMENTAL_CHIP_ID => $echip->dbID(),
501 -TYPE => $type,
502 );
503
504 ($channel) = @{$chan_adaptor->store($channel)};
505 }
506 }
507 }
508
509
510
511 #Now get rset using experiment echips
512 my $rset = $self->get_import_ResultSet($analysis, 'experimental_chip');
513
514 if ($rset) { #we have some new data
515
516 foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) {
517
518 if ($echip->has_status('IMPORTED_SangerPCR', $echip)) {
519 $self->log("ExperimentalChip(".$echip->unique_id().") has already been imported");
520 } else {
521
522 my $cc_id = $rset->get_chip_channel_id($echip->dbID());
523
524 if ($self->recovery() && $roll_back{$echip->dbID()}){
525 $self->log("Rolling back results for ExperimentalChip:\t".$echip->unique_id());
526 $self->rollback_results($cc_id);
527 }
528
529 $self->log("Reading SANGER result file for ".$echip->unique_id().":\t".$chip_files{$echip->unique_id()});
530 $self->get_probe_cache_by_Array($array) || throw('Failed to reset probe cache handle');
531 my $fh = open_file($chip_files{$echip->unique_id()});
532 my @lines = <$fh>;
533 close($fh);
534
535 my $rfile_path = $self->get_dir("norm")."/result.SangerPCR.".$echip->unique_id().".txt";
536 my $rfile = open_file($rfile_path, '>');
537 my $r_string = "";
538
539
540 @lines = sort {(split/\t|\:/o, $a)[5] cmp (split/\t|\:/o, $b)[5]} @lines;
541
542 foreach my $line (@lines) {
543 $line =~ s/\r*\n//o;
544
545 ($ratio, undef, $pid) = (split/\t|\:/o, $line)[3..5];
546 $pid =~ s/.*://o;
547
548 $ratio = '\N' if $ratio eq 'NA'; #NULL is still useful info to store in result
549 #my ($x, $y) = @{$self->get_probe_x_y_by_name($pid)};
550
551 #this is throwing away the encode region which could be used for the probeset/family?
552 $r_string .= '\N'."\t".$self->get_probe_id_by_name_Array($pid, $array)."\t${ratio}\t${cc_id}\t".'\N'."\t".'\N'."\n";#${x}\t${y}\n";
553 }
554
555 print $rfile $r_string;
556 close($rfile);
557
558 $self->log("Importing:\t$rfile_path");
559 $self->db->load_table_data("result", $rfile_path);
560 $self->log("Finished importing:\t$rfile_path");
561 $echip->adaptor->set_status('IMPORTED_SangerPCR', $echip);
562 }
563 }
564
565
566
567 } else {
568 $self->log("No new data, skipping result parse");
569 }
570
571 $self->log("Finished reading and importing ".$self->vendor()." result data (".localtime().")");
572 return;
573 }
574
575
576
577 1;