Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/ArrayDesign.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # | |
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign | |
3 # | |
4 | |
5 =head1 LICENSE | |
6 | |
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
8 Genome Research Limited. All rights reserved. | |
9 | |
10 This software is distributed under a modified Apache license. | |
11 For license details, please see | |
12 | |
13 http://www.ensembl.org/info/about/code_licence.html | |
14 | |
15 =head1 CONTACT | |
16 | |
17 Please email comments or questions to the public Ensembl | |
18 developers list at <ensembl-dev@ebi.ac.uk>. | |
19 | |
20 Questions may also be sent to the Ensembl help desk at | |
21 <helpdesk@ensembl.org>. | |
22 | |
23 =head1 NAME | |
24 | |
25 Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign | |
26 | |
27 =head1 SYNOPSIS | |
28 | |
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign"; | |
30 push @INC, $parser_type; | |
31 my $imp = $class->SUPER::new(@_); my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params); | |
32 | |
33 $imp->set_config(); | |
34 | |
35 | |
36 =head1 DESCRIPTION | |
37 | |
38 This is a definitions class which should not be instatiated directly, it | |
39 normally inherited from the Importer. ArrayDesign contains meta data and methods | |
40 specific to handling array designs only (i.e. no experimental data), which have | |
41 been produced from the eFG array design software. | |
42 | |
43 =cut | |
44 | |
45 package Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign; | |
46 | |
47 use Bio::EnsEMBL::Funcgen::Array; | |
48 use Bio::EnsEMBL::Funcgen::ProbeSet; | |
49 use Bio::EnsEMBL::Funcgen::Probe; | |
50 use Bio::EnsEMBL::Funcgen::ProbeFeature; | |
51 use Bio::EnsEMBL::Funcgen::FeatureType; | |
52 use Bio::EnsEMBL::Funcgen::ExperimentalChip; | |
53 use Bio::EnsEMBL::Funcgen::ArrayChip; | |
54 use Bio::EnsEMBL::Funcgen::Channel; | |
55 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate ); | |
56 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file); | |
57 use Bio::EnsEMBL::Funcgen::Utils::Helper; | |
58 use strict; | |
59 | |
60 use vars qw(@ISA); | |
61 @ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper); | |
62 | |
63 | |
64 | |
65 =head2 new | |
66 | |
67 Example : my $self = $class->SUPER::new(@_); | |
68 Description: Constructor method for ArrayDesign class | |
69 Returntype : Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign | |
70 Exceptions : throws if Experiment name not defined or if caller is not Importer | |
71 Caller : Bio::EnsEMBL::Funcgen::Importer | |
72 Status : at risk | |
73 | |
74 =cut | |
75 | |
76 | |
77 sub new{ | |
78 my $caller = shift; | |
79 | |
80 my $class = ref($caller) || $caller; | |
81 my $self = $class->SUPER::new(); | |
82 | |
83 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer")); | |
84 | |
85 | |
86 $self->{'config'} = | |
87 {( | |
88 probe_data => ["probe"], | |
89 prb_fields => ['SEQ_ID', 'POSITION', 'LENGTH', 'PROBE_SEQUENCE', 'PROBE_ID', 'UNIQUENESS_SCORE', 'TM', 'MAS_CYCLES'], | |
90 notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'], | |
91 )}; | |
92 | |
93 return $self; | |
94 } | |
95 | |
96 | |
97 | |
98 =head2 set_config | |
99 | |
100 Example : my $self->set_config; | |
101 Description: Sets attribute dependent config | |
102 Returntype : None | |
103 Exceptions : None | |
104 Caller : Bio::EnsEMBL::Funcgen::Importer | |
105 Status : at risk | |
106 | |
107 =cut | |
108 | |
109 sub set_config{ | |
110 my ($self) = @_; | |
111 | |
112 #placeholder method | |
113 #set paths | |
114 | |
115 return; | |
116 } | |
117 | |
118 | |
119 =head2 read_array_data | |
120 | |
121 Example : $imp->read_array_data(); | |
122 Description: Parses NimbleGen style DesignNotes.txt format files to create and store new Arrays | |
123 Returntype : none | |
124 Exceptions : None | |
125 Caller : general | |
126 Status : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats? | |
127 | |
128 =cut | |
129 | |
130 #this is currently OLIGO specific. | |
131 | |
132 sub read_array_data{ | |
133 my ($self, $design_notes) = @_; | |
134 | |
135 $self->log("Reading and importing array data"); | |
136 | |
137 throw('You need to pass the path to a DesignNotes.txt file') if ! defined $design_notes; | |
138 $self->{'design_notes'} = $design_notes; | |
139 | |
140 my ($line, $array, $array_chip, @data, %hpos); | |
141 my $oa_adaptor = $self->db->get_ArrayAdaptor(); | |
142 my $ac_adaptor = $self->db->get_ArrayChipAdaptor(); | |
143 my $fh = open_file("<", $self->{'design_notes'}); | |
144 | |
145 while ($line = <$fh>){ | |
146 | |
147 $line =~ s/\r*\n//;#chump | |
148 @data = split/\t/o, $line; | |
149 | |
150 | |
151 | |
152 | |
153 #We need to have a DESIGN vendor type? | |
154 #also need to be able to set file path independently of config | |
155 | |
156 if($. == 1){ | |
157 %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))}; | |
158 next; | |
159 } | |
160 | |
161 ### CREATE AND STORE Array and ArrayChips | |
162 if(! defined $array ){ | |
163 #This is treating each array chip as a separate array, unless arrayset is defined | |
164 #AT present we have no way of differentiating between different array_chips on same array???!!! | |
165 #Need to add functionality afterwards to collate array_chips into single array | |
166 | |
167 #This will use a stored array if present | |
168 | |
169 $array = Bio::EnsEMBL::Funcgen::Array->new | |
170 ( | |
171 -NAME => $self->array_name() || $data[$hpos{'DESIGN_NAME'}], | |
172 -FORMAT => uc($self->format()), | |
173 -VENDOR => uc($self->vendor()), | |
174 -TYPE => 'OLIGO', | |
175 -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here | |
176 ); | |
177 | |
178 ($array) = @{$oa_adaptor->store($array)}; | |
179 | |
180 | |
181 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( | |
182 -ARRAY_ID => $array->dbID(), | |
183 -NAME => $data[$hpos{'DESIGN_NAME'}], | |
184 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], | |
185 #add description? | |
186 ); | |
187 | |
188 #This will use a stored array_chip if present | |
189 ($array_chip) = @{$ac_adaptor->store($array_chip)}; | |
190 $array->add_ArrayChip($array_chip); | |
191 | |
192 } | |
193 elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){ | |
194 | |
195 $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n"); | |
196 | |
197 $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new( | |
198 -ARRAY_ID => $array->dbID(), | |
199 -NAME => $data[$hpos{'DESIGN_NAME'}], | |
200 -DESIGN_ID => $data[$hpos{'DESIGN_ID'}], | |
201 ); | |
202 | |
203 ($array_chip) = @{$ac_adaptor->store($array_chip)}; | |
204 $array->add_ArrayChip($array_chip); | |
205 | |
206 } | |
207 elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){ | |
208 throw("Found experiment with more than one design without -array_set"); | |
209 } | |
210 } | |
211 | |
212 | |
213 $self->add_Array($array); | |
214 | |
215 close($fh); | |
216 | |
217 return; | |
218 | |
219 } | |
220 | |
221 | |
222 | |
223 | |
224 | |
225 =head2 read_probe_data | |
226 | |
227 Example : $imp->read_probe_data(); | |
228 Description: Parses and imports probes, probe sets and features for a given array design | |
229 Returntype : none | |
230 Exceptions : throws is not tiling format | |
231 Caller : Importer | |
232 Status : at risk | |
233 | |
234 =cut | |
235 | |
236 | |
237 #Assumes one chip_design per experimental set. | |
238 sub read_probe_data{ | |
239 my ($self, $array_file) = @_; | |
240 | |
241 $self->log("Reading and importing probe data"); | |
242 | |
243 | |
244 my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes); | |
245 my $aa = $self->db->get_AnalysisAdaptor(); | |
246 my $manal = $aa->fetch_by_logic_name('MASCycles'); | |
247 my $uanal = $aa->fetch_by_logic_name('UScore'); | |
248 my $tmanal= $aa->fetch_by_logic_name('NimblegenTM'); | |
249 | |
250 $array_file ||= $self->array_file(); | |
251 $self->log("Parsing ".$self->vendor()." probe data (".localtime().")"); | |
252 throw("ArrayDesign only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED'); | |
253 | |
254 ### Read in | |
255 # eFG prb file, not chiip info yet so only one ArrayChip per design | |
256 # potential to have pos file here for probes built on generic slices of genome | |
257 | |
258 #We need to handle different coord systems and possibly different assmemblies | |
259 my $slice_a = $self->db->get_SliceAdaptor(); | |
260 my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version( | |
261 'chromosome', | |
262 $self->db->_get_schema_build($self->db->dnadb()) | |
263 ); | |
264 | |
265 | |
266 #sanity check we're only dealing with one array/chip | |
267 my @arrays = @{$self->arrays()}; | |
268 if(scalar(@arrays) != 1){ | |
269 throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'}); | |
270 } | |
271 | |
272 my @achips = @{$arrays[0]->get_ArrayChips()}; | |
273 if(scalar(@achips) != 1){ | |
274 throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'}); | |
275 } | |
276 | |
277 my $achip = $achips[0]; | |
278 | |
279 #foreach my $array(@{$self->arrays()}){ | |
280 | |
281 | |
282 # foreach my $achip(@{$array->get_ArrayChips()}){ | |
283 | |
284 $self->log("Importing array design(".$achip->name().") from ".$array_file); | |
285 | |
286 | |
287 if($achip->has_status('IMPORTED')){ | |
288 $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id()); | |
289 return; | |
290 }elsif($self->recovery()){ | |
291 $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id()); | |
292 $self->db->rollback_ArrayChip([$achip]); | |
293 } | |
294 | |
295 $self->log("Importing ArrayChip:".$achip->design_id()); | |
296 | |
297 | |
298 #OPEN PROBE IN/OUT FILES | |
299 $fh = open_file("<", $array_file); | |
300 my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta") if($self->{'_dump_fasta'}); | |
301 my ($op, $of, %pfs); | |
302 | |
303 #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap | |
304 my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##??? | |
305 my $strand = 0; #default for TileMap, should be config hash? | |
306 my $fasta = ""; | |
307 | |
308 while($line = <$fh>){ | |
309 $line =~ s/\r*\n//; | |
310 @data = split/\t/o, $line; | |
311 my $loc = ""; | |
312 | |
313 #SEQ_ID WINDOW_START WINDOW_END POSITION LENGTH PROBE_SEQUENCE TM UNIQUENESS_SCORE MAS_CYCLES | |
314 #X 3000001 3000100 3000041 56 TGACATCTTCAGTTCTTTACATAGTTTTCATATTAGTCCTCTATCAGATGTGGAGT 73.09 132 15 | |
315 | |
316 | |
317 if ($. == 1){ | |
318 %hpos = %{$self->set_header_hash(\@data, $self->get_config('prb_fields'))}; | |
319 next; | |
320 } | |
321 | |
322 | |
323 #This assumes tiling format with no feature/probe sets | |
324 | |
325 | |
326 if(%pfs){ | |
327 $self->store_set_probes_features($achip->dbID(), \%pfs); | |
328 undef %pfs; | |
329 } | |
330 | |
331 | |
332 #PROBE | |
333 $op = Bio::EnsEMBL::Funcgen::Probe->new( | |
334 -NAME => $data[$hpos{'PROBE_ID'}], | |
335 -LENGTH => $data[$hpos{'LENGTH'}], | |
336 -ARRAY => $arrays[0], | |
337 -ARRAY_CHIP_ID => $achip->dbID(), | |
338 -CLASS => 'DESIGN', | |
339 ); | |
340 | |
341 $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]); | |
342 $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]); | |
343 $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]); | |
344 | |
345 #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs | |
346 #options: | |
347 #associate cs dependent scores with features | |
348 #we are duplicating cs in probe_design table | |
349 | |
350 #do we need another object? ProbeDesign | |
351 #-cs would be empty for all but uscore | |
352 #-mas_cycles analysis_id | |
353 #-uscore analysis_id cs_id | |
354 #-tm analysis_id (anal most likely wont change so highly redundant) | |
355 #this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other | |
356 #would however provide for extensible design attributes | |
357 #would only need one tm and mas_cycles for each probe irrespective of cs | |
358 #could have separate table probe_design_feature? | |
359 #mmm doesn't have location, just cs | |
360 #just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL') | |
361 #or just make uscore method dependent on cs. | |
362 #or split table? | |
363 #ProbeDesign::add_analysis_score | |
364 #ProbeDesign::add_coord_sys_analysis_score | |
365 | |
366 #can we add this directly to the probe? | |
367 #separate retrieval in ProbeAdaptor so we're not joining everytime | |
368 #this would require generic get_analysis/analysis_coord_system_attribute method | |
369 | |
370 | |
371 | |
372 %{$pfs{$data[$hpos{'PROBE_ID'}]}} = ( | |
373 probe => $op, | |
374 features => [], | |
375 ); | |
376 | |
377 | |
378 #PROBE FEATURE | |
379 if(! $self->cache_slice($data[$hpos{'SEQ_ID'}])){ | |
380 warn("Skipping non-standard probe chromosome"); | |
381 undef %pfs; | |
382 next; | |
383 } | |
384 | |
385 my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]); | |
386 | |
387 if ($self->{'_dump_fasta'}){ | |
388 $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};"; | |
389 } | |
390 | |
391 | |
392 $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new | |
393 ( | |
394 -START => $data[$hpos{'POSITION'}], | |
395 -END => $end, | |
396 -STRAND => $strand, | |
397 -SLICE => $self->cache_slice($data[$hpos{'SEQ_ID'}]), | |
398 -ANALYSIS => $anal, | |
399 -MISMATCHCOUNT => 0, | |
400 -CIGAR_LINE => $data[$hpos{'LENGTH'}].'M', | |
401 -PROBE => undef,#Need to update this in the store method | |
402 ); | |
403 | |
404 | |
405 push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of; | |
406 | |
407 if($self->{'_dump_fasta'}){ | |
408 #filter controls/randoms? Or would it be sensible to see where they map | |
409 #wrap seq here? | |
410 $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}]. | |
411 "\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n"; | |
412 } | |
413 } | |
414 | |
415 #need to store last data here | |
416 $self->store_set_probes_features($achip->dbID(), \%pfs); | |
417 $self->log(join("\n", @log)); | |
418 $achip->adaptor->set_status("IMPORTED", $achip); | |
419 $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED"); | |
420 | |
421 if ($self->{'_dump_fasta'}){ | |
422 print $f_out $fasta if($self->{'_dump_fasta'}); | |
423 close($f_out); | |
424 } | |
425 | |
426 $self->log("Finished parsing probe data"); | |
427 #Total probe_sets:\t$psid\n". | |
428 # "Total probes:\t$pid\nTotal probe_features:\t$fid"); | |
429 | |
430 return; | |
431 } | |
432 | |
433 | |
434 1; |