0
|
1 #
|
|
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
|
|
3 #
|
|
4
|
|
5 =head1 LICENSE
|
|
6
|
|
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
8 Genome Research Limited. All rights reserved.
|
|
9
|
|
10 This software is distributed under a modified Apache license.
|
|
11 For license details, please see
|
|
12
|
|
13 http://www.ensembl.org/info/about/code_licence.html
|
|
14
|
|
15 =head1 CONTACT
|
|
16
|
|
17 Please email comments or questions to the public Ensembl
|
|
18 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
19
|
|
20 Questions may also be sent to the Ensembl help desk at
|
|
21 <helpdesk@ensembl.org>.
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
|
|
26
|
|
27 =head1 SYNOPSIS
|
|
28
|
|
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign";
|
|
30 push @INC, $parser_type;
|
|
31 my $imp = $class->SUPER::new(@_); my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params);
|
|
32
|
|
33 $imp->set_config();
|
|
34
|
|
35
|
|
36 =head1 DESCRIPTION
|
|
37
|
|
38 This is a definitions class which should not be instatiated directly, it
|
|
39 normally inherited from the Importer. ArrayDesign contains meta data and methods
|
|
40 specific to handling array designs only (i.e. no experimental data), which have
|
|
41 been produced from the eFG array design software.
|
|
42
|
|
43 =cut
|
|
44
|
|
45 package Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign;
|
|
46
|
|
47 use Bio::EnsEMBL::Funcgen::Array;
|
|
48 use Bio::EnsEMBL::Funcgen::ProbeSet;
|
|
49 use Bio::EnsEMBL::Funcgen::Probe;
|
|
50 use Bio::EnsEMBL::Funcgen::ProbeFeature;
|
|
51 use Bio::EnsEMBL::Funcgen::FeatureType;
|
|
52 use Bio::EnsEMBL::Funcgen::ExperimentalChip;
|
|
53 use Bio::EnsEMBL::Funcgen::ArrayChip;
|
|
54 use Bio::EnsEMBL::Funcgen::Channel;
|
|
55 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
|
|
56 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file);
|
|
57 use Bio::EnsEMBL::Funcgen::Utils::Helper;
|
|
58 use strict;
|
|
59
|
|
60 use vars qw(@ISA);
|
|
61 @ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
|
|
62
|
|
63
|
|
64
|
|
65 =head2 new
|
|
66
|
|
67 Example : my $self = $class->SUPER::new(@_);
|
|
68 Description: Constructor method for ArrayDesign class
|
|
69 Returntype : Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
|
|
70 Exceptions : throws if Experiment name not defined or if caller is not Importer
|
|
71 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
72 Status : at risk
|
|
73
|
|
74 =cut
|
|
75
|
|
76
|
|
77 sub new{
|
|
78 my $caller = shift;
|
|
79
|
|
80 my $class = ref($caller) || $caller;
|
|
81 my $self = $class->SUPER::new();
|
|
82
|
|
83 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
|
|
84
|
|
85
|
|
86 $self->{'config'} =
|
|
87 {(
|
|
88 probe_data => ["probe"],
|
|
89 prb_fields => ['SEQ_ID', 'POSITION', 'LENGTH', 'PROBE_SEQUENCE', 'PROBE_ID', 'UNIQUENESS_SCORE', 'TM', 'MAS_CYCLES'],
|
|
90 notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'],
|
|
91 )};
|
|
92
|
|
93 return $self;
|
|
94 }
|
|
95
|
|
96
|
|
97
|
|
98 =head2 set_config
|
|
99
|
|
100 Example : my $self->set_config;
|
|
101 Description: Sets attribute dependent config
|
|
102 Returntype : None
|
|
103 Exceptions : None
|
|
104 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
105 Status : at risk
|
|
106
|
|
107 =cut
|
|
108
|
|
109 sub set_config{
|
|
110 my ($self) = @_;
|
|
111
|
|
112 #placeholder method
|
|
113 #set paths
|
|
114
|
|
115 return;
|
|
116 }
|
|
117
|
|
118
|
|
119 =head2 read_array_data
|
|
120
|
|
121 Example : $imp->read_array_data();
|
|
122 Description: Parses NimbleGen style DesignNotes.txt format files to create and store new Arrays
|
|
123 Returntype : none
|
|
124 Exceptions : None
|
|
125 Caller : general
|
|
126 Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats?
|
|
127
|
|
128 =cut
|
|
129
|
|
130 #this is currently OLIGO specific.
|
|
131
|
|
132 sub read_array_data{
|
|
133 my ($self, $design_notes) = @_;
|
|
134
|
|
135 $self->log("Reading and importing array data");
|
|
136
|
|
137 throw('You need to pass the path to a DesignNotes.txt file') if ! defined $design_notes;
|
|
138 $self->{'design_notes'} = $design_notes;
|
|
139
|
|
140 my ($line, $array, $array_chip, @data, %hpos);
|
|
141 my $oa_adaptor = $self->db->get_ArrayAdaptor();
|
|
142 my $ac_adaptor = $self->db->get_ArrayChipAdaptor();
|
|
143 my $fh = open_file("<", $self->{'design_notes'});
|
|
144
|
|
145 while ($line = <$fh>){
|
|
146
|
|
147 $line =~ s/\r*\n//;#chump
|
|
148 @data = split/\t/o, $line;
|
|
149
|
|
150
|
|
151
|
|
152
|
|
153 #We need to have a DESIGN vendor type?
|
|
154 #also need to be able to set file path independently of config
|
|
155
|
|
156 if($. == 1){
|
|
157 %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))};
|
|
158 next;
|
|
159 }
|
|
160
|
|
161 ### CREATE AND STORE Array and ArrayChips
|
|
162 if(! defined $array ){
|
|
163 #This is treating each array chip as a separate array, unless arrayset is defined
|
|
164 #AT present we have no way of differentiating between different array_chips on same array???!!!
|
|
165 #Need to add functionality afterwards to collate array_chips into single array
|
|
166
|
|
167 #This will use a stored array if present
|
|
168
|
|
169 $array = Bio::EnsEMBL::Funcgen::Array->new
|
|
170 (
|
|
171 -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}],
|
|
172 -FORMAT => uc($self->format()),
|
|
173 -VENDOR => uc($self->vendor()),
|
|
174 -TYPE => 'OLIGO',
|
|
175 -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here
|
|
176 );
|
|
177
|
|
178 ($array) = @{$oa_adaptor->store($array)};
|
|
179
|
|
180
|
|
181 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
|
|
182 -ARRAY_ID => $array->dbID(),
|
|
183 -NAME => $data[$hpos{'DESIGN_NAME'}],
|
|
184 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
|
|
185 #add description?
|
|
186 );
|
|
187
|
|
188 #This will use a stored array_chip if present
|
|
189 ($array_chip) = @{$ac_adaptor->store($array_chip)};
|
|
190 $array->add_ArrayChip($array_chip);
|
|
191
|
|
192 }
|
|
193 elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){
|
|
194
|
|
195 $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n");
|
|
196
|
|
197 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
|
|
198 -ARRAY_ID => $array->dbID(),
|
|
199 -NAME => $data[$hpos{'DESIGN_NAME'}],
|
|
200 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
|
|
201 );
|
|
202
|
|
203 ($array_chip) = @{$ac_adaptor->store($array_chip)};
|
|
204 $array->add_ArrayChip($array_chip);
|
|
205
|
|
206 }
|
|
207 elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){
|
|
208 throw("Found experiment with more than one design without -array_set");
|
|
209 }
|
|
210 }
|
|
211
|
|
212
|
|
213 $self->add_Array($array);
|
|
214
|
|
215 close($fh);
|
|
216
|
|
217 return;
|
|
218
|
|
219 }
|
|
220
|
|
221
|
|
222
|
|
223
|
|
224
|
|
225 =head2 read_probe_data
|
|
226
|
|
227 Example : $imp->read_probe_data();
|
|
228 Description: Parses and imports probes, probe sets and features for a given array design
|
|
229 Returntype : none
|
|
230 Exceptions : throws is not tiling format
|
|
231 Caller : Importer
|
|
232 Status : at risk
|
|
233
|
|
234 =cut
|
|
235
|
|
236
|
|
237 #Assumes one chip_design per experimental set.
|
|
238 sub read_probe_data{
|
|
239 my ($self, $array_file) = @_;
|
|
240
|
|
241 $self->log("Reading and importing probe data");
|
|
242
|
|
243
|
|
244 my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes);
|
|
245 my $aa = $self->db->get_AnalysisAdaptor();
|
|
246 my $manal = $aa->fetch_by_logic_name('MASCycles');
|
|
247 my $uanal = $aa->fetch_by_logic_name('UScore');
|
|
248 my $tmanal= $aa->fetch_by_logic_name('NimblegenTM');
|
|
249
|
|
250 $array_file ||= $self->array_file();
|
|
251 $self->log("Parsing ".$self->vendor()." probe data (".localtime().")");
|
|
252 throw("ArrayDesign only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED');
|
|
253
|
|
254 ### Read in
|
|
255 # eFG prb file, not chiip info yet so only one ArrayChip per design
|
|
256 # potential to have pos file here for probes built on generic slices of genome
|
|
257
|
|
258 #We need to handle different coord systems and possibly different assmemblies
|
|
259 my $slice_a = $self->db->get_SliceAdaptor();
|
|
260 my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version(
|
|
261 'chromosome',
|
|
262 $self->db->_get_schema_build($self->db->dnadb())
|
|
263 );
|
|
264
|
|
265
|
|
266 #sanity check we're only dealing with one array/chip
|
|
267 my @arrays = @{$self->arrays()};
|
|
268 if(scalar(@arrays) != 1){
|
|
269 throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'});
|
|
270 }
|
|
271
|
|
272 my @achips = @{$arrays[0]->get_ArrayChips()};
|
|
273 if(scalar(@achips) != 1){
|
|
274 throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'});
|
|
275 }
|
|
276
|
|
277 my $achip = $achips[0];
|
|
278
|
|
279 #foreach my $array(@{$self->arrays()}){
|
|
280
|
|
281
|
|
282 # foreach my $achip(@{$array->get_ArrayChips()}){
|
|
283
|
|
284 $self->log("Importing array design(".$achip->name().") from ".$array_file);
|
|
285
|
|
286
|
|
287 if($achip->has_status('IMPORTED')){
|
|
288 $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id());
|
|
289 return;
|
|
290 }elsif($self->recovery()){
|
|
291 $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id());
|
|
292 $self->db->rollback_ArrayChip([$achip]);
|
|
293 }
|
|
294
|
|
295 $self->log("Importing ArrayChip:".$achip->design_id());
|
|
296
|
|
297
|
|
298 #OPEN PROBE IN/OUT FILES
|
|
299 $fh = open_file("<", $array_file);
|
|
300 my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta") if($self->{'_dump_fasta'});
|
|
301 my ($op, $of, %pfs);
|
|
302
|
|
303 #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap
|
|
304 my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##???
|
|
305 my $strand = 0; #default for TileMap, should be config hash?
|
|
306 my $fasta = "";
|
|
307
|
|
308 while($line = <$fh>){
|
|
309 $line =~ s/\r*\n//;
|
|
310 @data = split/\t/o, $line;
|
|
311 my $loc = "";
|
|
312
|
|
313 #SEQ_ID WINDOW_START WINDOW_END POSITION LENGTH PROBE_SEQUENCE TM UNIQUENESS_SCORE MAS_CYCLES
|
|
314 #X 3000001 3000100 3000041 56 TGACATCTTCAGTTCTTTACATAGTTTTCATATTAGTCCTCTATCAGATGTGGAGT 73.09 132 15
|
|
315
|
|
316
|
|
317 if ($. == 1){
|
|
318 %hpos = %{$self->set_header_hash(\@data, $self->get_config('prb_fields'))};
|
|
319 next;
|
|
320 }
|
|
321
|
|
322
|
|
323 #This assumes tiling format with no feature/probe sets
|
|
324
|
|
325
|
|
326 if(%pfs){
|
|
327 $self->store_set_probes_features($achip->dbID(), \%pfs);
|
|
328 undef %pfs;
|
|
329 }
|
|
330
|
|
331
|
|
332 #PROBE
|
|
333 $op = Bio::EnsEMBL::Funcgen::Probe->new(
|
|
334 -NAME => $data[$hpos{'PROBE_ID'}],
|
|
335 -LENGTH => $data[$hpos{'LENGTH'}],
|
|
336 -ARRAY => $arrays[0],
|
|
337 -ARRAY_CHIP_ID => $achip->dbID(),
|
|
338 -CLASS => 'DESIGN',
|
|
339 );
|
|
340
|
|
341 $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]);
|
|
342 $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]);
|
|
343 $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]);
|
|
344
|
|
345 #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs
|
|
346 #options:
|
|
347 #associate cs dependent scores with features
|
|
348 #we are duplicating cs in probe_design table
|
|
349
|
|
350 #do we need another object? ProbeDesign
|
|
351 #-cs would be empty for all but uscore
|
|
352 #-mas_cycles analysis_id
|
|
353 #-uscore analysis_id cs_id
|
|
354 #-tm analysis_id (anal most likely wont change so highly redundant)
|
|
355 #this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other
|
|
356 #would however provide for extensible design attributes
|
|
357 #would only need one tm and mas_cycles for each probe irrespective of cs
|
|
358 #could have separate table probe_design_feature?
|
|
359 #mmm doesn't have location, just cs
|
|
360 #just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL')
|
|
361 #or just make uscore method dependent on cs.
|
|
362 #or split table?
|
|
363 #ProbeDesign::add_analysis_score
|
|
364 #ProbeDesign::add_coord_sys_analysis_score
|
|
365
|
|
366 #can we add this directly to the probe?
|
|
367 #separate retrieval in ProbeAdaptor so we're not joining everytime
|
|
368 #this would require generic get_analysis/analysis_coord_system_attribute method
|
|
369
|
|
370
|
|
371
|
|
372 %{$pfs{$data[$hpos{'PROBE_ID'}]}} = (
|
|
373 probe => $op,
|
|
374 features => [],
|
|
375 );
|
|
376
|
|
377
|
|
378 #PROBE FEATURE
|
|
379 if(! $self->cache_slice($data[$hpos{'SEQ_ID'}])){
|
|
380 warn("Skipping non-standard probe chromosome");
|
|
381 undef %pfs;
|
|
382 next;
|
|
383 }
|
|
384
|
|
385 my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]);
|
|
386
|
|
387 if ($self->{'_dump_fasta'}){
|
|
388 $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};";
|
|
389 }
|
|
390
|
|
391
|
|
392 $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new
|
|
393 (
|
|
394 -START => $data[$hpos{'POSITION'}],
|
|
395 -END => $end,
|
|
396 -STRAND => $strand,
|
|
397 -SLICE => $self->cache_slice($data[$hpos{'SEQ_ID'}]),
|
|
398 -ANALYSIS => $anal,
|
|
399 -MISMATCHCOUNT => 0,
|
|
400 -CIGAR_LINE => $data[$hpos{'LENGTH'}].'M',
|
|
401 -PROBE => undef,#Need to update this in the store method
|
|
402 );
|
|
403
|
|
404
|
|
405 push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of;
|
|
406
|
|
407 if($self->{'_dump_fasta'}){
|
|
408 #filter controls/randoms? Or would it be sensible to see where they map
|
|
409 #wrap seq here?
|
|
410 $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}].
|
|
411 "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n";
|
|
412 }
|
|
413 }
|
|
414
|
|
415 #need to store last data here
|
|
416 $self->store_set_probes_features($achip->dbID(), \%pfs);
|
|
417 $self->log(join("\n", @log));
|
|
418 $achip->adaptor->set_status("IMPORTED", $achip);
|
|
419 $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED");
|
|
420
|
|
421 if ($self->{'_dump_fasta'}){
|
|
422 print $f_out $fasta if($self->{'_dump_fasta'});
|
|
423 close($f_out);
|
|
424 }
|
|
425
|
|
426 $self->log("Finished parsing probe data");
|
|
427 #Total probe_sets:\t$psid\n".
|
|
428 # "Total probes:\t$pid\nTotal probe_features:\t$fid");
|
|
429
|
|
430 return;
|
|
431 }
|
|
432
|
|
433
|
|
434 1;
|