comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/ArrayDesign.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
3 #
4
5 =head1 LICENSE
6
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
8 Genome Research Limited. All rights reserved.
9
10 This software is distributed under a modified Apache license.
11 For license details, please see
12
13 http://www.ensembl.org/info/about/code_licence.html
14
15 =head1 CONTACT
16
17 Please email comments or questions to the public Ensembl
18 developers list at <ensembl-dev@ebi.ac.uk>.
19
20 Questions may also be sent to the Ensembl help desk at
21 <helpdesk@ensembl.org>.
22
23 =head1 NAME
24
25 Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
26
27 =head1 SYNOPSIS
28
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign";
30 push @INC, $parser_type;
31 my $imp = $class->SUPER::new(@_); my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params);
32
33 $imp->set_config();
34
35
36 =head1 DESCRIPTION
37
38 This is a definitions class which should not be instatiated directly, it
39 normally inherited from the Importer. ArrayDesign contains meta data and methods
40 specific to handling array designs only (i.e. no experimental data), which have
41 been produced from the eFG array design software.
42
43 =cut
44
45 package Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign;
46
47 use Bio::EnsEMBL::Funcgen::Array;
48 use Bio::EnsEMBL::Funcgen::ProbeSet;
49 use Bio::EnsEMBL::Funcgen::Probe;
50 use Bio::EnsEMBL::Funcgen::ProbeFeature;
51 use Bio::EnsEMBL::Funcgen::FeatureType;
52 use Bio::EnsEMBL::Funcgen::ExperimentalChip;
53 use Bio::EnsEMBL::Funcgen::ArrayChip;
54 use Bio::EnsEMBL::Funcgen::Channel;
55 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
56 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file);
57 use Bio::EnsEMBL::Funcgen::Utils::Helper;
58 use strict;
59
60 use vars qw(@ISA);
61 @ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
62
63
64
65 =head2 new
66
67 Example : my $self = $class->SUPER::new(@_);
68 Description: Constructor method for ArrayDesign class
69 Returntype : Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
70 Exceptions : throws if Experiment name not defined or if caller is not Importer
71 Caller : Bio::EnsEMBL::Funcgen::Importer
72 Status : at risk
73
74 =cut
75
76
77 sub new{
78 my $caller = shift;
79
80 my $class = ref($caller) || $caller;
81 my $self = $class->SUPER::new();
82
83 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
84
85
86 $self->{'config'} =
87 {(
88 probe_data => ["probe"],
89 prb_fields => ['SEQ_ID', 'POSITION', 'LENGTH', 'PROBE_SEQUENCE', 'PROBE_ID', 'UNIQUENESS_SCORE', 'TM', 'MAS_CYCLES'],
90 notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'],
91 )};
92
93 return $self;
94 }
95
96
97
98 =head2 set_config
99
100 Example : my $self->set_config;
101 Description: Sets attribute dependent config
102 Returntype : None
103 Exceptions : None
104 Caller : Bio::EnsEMBL::Funcgen::Importer
105 Status : at risk
106
107 =cut
108
109 sub set_config{
110 my ($self) = @_;
111
112 #placeholder method
113 #set paths
114
115 return;
116 }
117
118
119 =head2 read_array_data
120
121 Example : $imp->read_array_data();
122 Description: Parses NimbleGen style DesignNotes.txt format files to create and store new Arrays
123 Returntype : none
124 Exceptions : None
125 Caller : general
126 Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats?
127
128 =cut
129
130 #this is currently OLIGO specific.
131
132 sub read_array_data{
133 my ($self, $design_notes) = @_;
134
135 $self->log("Reading and importing array data");
136
137 throw('You need to pass the path to a DesignNotes.txt file') if ! defined $design_notes;
138 $self->{'design_notes'} = $design_notes;
139
140 my ($line, $array, $array_chip, @data, %hpos);
141 my $oa_adaptor = $self->db->get_ArrayAdaptor();
142 my $ac_adaptor = $self->db->get_ArrayChipAdaptor();
143 my $fh = open_file("<", $self->{'design_notes'});
144
145 while ($line = <$fh>){
146
147 $line =~ s/\r*\n//;#chump
148 @data = split/\t/o, $line;
149
150
151
152
153 #We need to have a DESIGN vendor type?
154 #also need to be able to set file path independently of config
155
156 if($. == 1){
157 %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))};
158 next;
159 }
160
161 ### CREATE AND STORE Array and ArrayChips
162 if(! defined $array ){
163 #This is treating each array chip as a separate array, unless arrayset is defined
164 #AT present we have no way of differentiating between different array_chips on same array???!!!
165 #Need to add functionality afterwards to collate array_chips into single array
166
167 #This will use a stored array if present
168
169 $array = Bio::EnsEMBL::Funcgen::Array->new
170 (
171 -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}],
172 -FORMAT => uc($self->format()),
173 -VENDOR => uc($self->vendor()),
174 -TYPE => 'OLIGO',
175 -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here
176 );
177
178 ($array) = @{$oa_adaptor->store($array)};
179
180
181 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
182 -ARRAY_ID => $array->dbID(),
183 -NAME => $data[$hpos{'DESIGN_NAME'}],
184 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
185 #add description?
186 );
187
188 #This will use a stored array_chip if present
189 ($array_chip) = @{$ac_adaptor->store($array_chip)};
190 $array->add_ArrayChip($array_chip);
191
192 }
193 elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){
194
195 $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n");
196
197 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
198 -ARRAY_ID => $array->dbID(),
199 -NAME => $data[$hpos{'DESIGN_NAME'}],
200 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
201 );
202
203 ($array_chip) = @{$ac_adaptor->store($array_chip)};
204 $array->add_ArrayChip($array_chip);
205
206 }
207 elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){
208 throw("Found experiment with more than one design without -array_set");
209 }
210 }
211
212
213 $self->add_Array($array);
214
215 close($fh);
216
217 return;
218
219 }
220
221
222
223
224
225 =head2 read_probe_data
226
227 Example : $imp->read_probe_data();
228 Description: Parses and imports probes, probe sets and features for a given array design
229 Returntype : none
230 Exceptions : throws is not tiling format
231 Caller : Importer
232 Status : at risk
233
234 =cut
235
236
237 #Assumes one chip_design per experimental set.
238 sub read_probe_data{
239 my ($self, $array_file) = @_;
240
241 $self->log("Reading and importing probe data");
242
243
244 my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes);
245 my $aa = $self->db->get_AnalysisAdaptor();
246 my $manal = $aa->fetch_by_logic_name('MASCycles');
247 my $uanal = $aa->fetch_by_logic_name('UScore');
248 my $tmanal= $aa->fetch_by_logic_name('NimblegenTM');
249
250 $array_file ||= $self->array_file();
251 $self->log("Parsing ".$self->vendor()." probe data (".localtime().")");
252 throw("ArrayDesign only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED');
253
254 ### Read in
255 # eFG prb file, not chiip info yet so only one ArrayChip per design
256 # potential to have pos file here for probes built on generic slices of genome
257
258 #We need to handle different coord systems and possibly different assmemblies
259 my $slice_a = $self->db->get_SliceAdaptor();
260 my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version(
261 'chromosome',
262 $self->db->_get_schema_build($self->db->dnadb())
263 );
264
265
266 #sanity check we're only dealing with one array/chip
267 my @arrays = @{$self->arrays()};
268 if(scalar(@arrays) != 1){
269 throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'});
270 }
271
272 my @achips = @{$arrays[0]->get_ArrayChips()};
273 if(scalar(@achips) != 1){
274 throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'});
275 }
276
277 my $achip = $achips[0];
278
279 #foreach my $array(@{$self->arrays()}){
280
281
282 # foreach my $achip(@{$array->get_ArrayChips()}){
283
284 $self->log("Importing array design(".$achip->name().") from ".$array_file);
285
286
287 if($achip->has_status('IMPORTED')){
288 $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id());
289 return;
290 }elsif($self->recovery()){
291 $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id());
292 $self->db->rollback_ArrayChip([$achip]);
293 }
294
295 $self->log("Importing ArrayChip:".$achip->design_id());
296
297
298 #OPEN PROBE IN/OUT FILES
299 $fh = open_file("<", $array_file);
300 my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta") if($self->{'_dump_fasta'});
301 my ($op, $of, %pfs);
302
303 #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap
304 my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##???
305 my $strand = 0; #default for TileMap, should be config hash?
306 my $fasta = "";
307
308 while($line = <$fh>){
309 $line =~ s/\r*\n//;
310 @data = split/\t/o, $line;
311 my $loc = "";
312
313 #SEQ_ID WINDOW_START WINDOW_END POSITION LENGTH PROBE_SEQUENCE TM UNIQUENESS_SCORE MAS_CYCLES
314 #X 3000001 3000100 3000041 56 TGACATCTTCAGTTCTTTACATAGTTTTCATATTAGTCCTCTATCAGATGTGGAGT 73.09 132 15
315
316
317 if ($. == 1){
318 %hpos = %{$self->set_header_hash(\@data, $self->get_config('prb_fields'))};
319 next;
320 }
321
322
323 #This assumes tiling format with no feature/probe sets
324
325
326 if(%pfs){
327 $self->store_set_probes_features($achip->dbID(), \%pfs);
328 undef %pfs;
329 }
330
331
332 #PROBE
333 $op = Bio::EnsEMBL::Funcgen::Probe->new(
334 -NAME => $data[$hpos{'PROBE_ID'}],
335 -LENGTH => $data[$hpos{'LENGTH'}],
336 -ARRAY => $arrays[0],
337 -ARRAY_CHIP_ID => $achip->dbID(),
338 -CLASS => 'DESIGN',
339 );
340
341 $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]);
342 $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]);
343 $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]);
344
345 #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs
346 #options:
347 #associate cs dependent scores with features
348 #we are duplicating cs in probe_design table
349
350 #do we need another object? ProbeDesign
351 #-cs would be empty for all but uscore
352 #-mas_cycles analysis_id
353 #-uscore analysis_id cs_id
354 #-tm analysis_id (anal most likely wont change so highly redundant)
355 #this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other
356 #would however provide for extensible design attributes
357 #would only need one tm and mas_cycles for each probe irrespective of cs
358 #could have separate table probe_design_feature?
359 #mmm doesn't have location, just cs
360 #just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL')
361 #or just make uscore method dependent on cs.
362 #or split table?
363 #ProbeDesign::add_analysis_score
364 #ProbeDesign::add_coord_sys_analysis_score
365
366 #can we add this directly to the probe?
367 #separate retrieval in ProbeAdaptor so we're not joining everytime
368 #this would require generic get_analysis/analysis_coord_system_attribute method
369
370
371
372 %{$pfs{$data[$hpos{'PROBE_ID'}]}} = (
373 probe => $op,
374 features => [],
375 );
376
377
378 #PROBE FEATURE
379 if(! $self->cache_slice($data[$hpos{'SEQ_ID'}])){
380 warn("Skipping non-standard probe chromosome");
381 undef %pfs;
382 next;
383 }
384
385 my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]);
386
387 if ($self->{'_dump_fasta'}){
388 $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};";
389 }
390
391
392 $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new
393 (
394 -START => $data[$hpos{'POSITION'}],
395 -END => $end,
396 -STRAND => $strand,
397 -SLICE => $self->cache_slice($data[$hpos{'SEQ_ID'}]),
398 -ANALYSIS => $anal,
399 -MISMATCHCOUNT => 0,
400 -CIGAR_LINE => $data[$hpos{'LENGTH'}].'M',
401 -PROBE => undef,#Need to update this in the store method
402 );
403
404
405 push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of;
406
407 if($self->{'_dump_fasta'}){
408 #filter controls/randoms? Or would it be sensible to see where they map
409 #wrap seq here?
410 $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}].
411 "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n";
412 }
413 }
414
415 #need to store last data here
416 $self->store_set_probes_features($achip->dbID(), \%pfs);
417 $self->log(join("\n", @log));
418 $achip->adaptor->set_status("IMPORTED", $achip);
419 $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED");
420
421 if ($self->{'_dump_fasta'}){
422 print $f_out $fasta if($self->{'_dump_fasta'});
423 close($f_out);
424 }
425
426 $self->log("Finished parsing probe data");
427 #Total probe_sets:\t$psid\n".
428 # "Total probes:\t$pid\nTotal probe_features:\t$fid");
429
430 return;
431 }
432
433
434 1;