Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/DataSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 # | |
2 # Ensembl module for Bio::EnsEMBL::Funcgen::DataSet | |
3 # | |
4 # You may distribute this module under the same terms as Perl itself | |
5 | |
6 | |
7 =head1 LICENSE | |
8 | |
9 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
10 Genome Research Limited. All rights reserved. | |
11 | |
12 This software is distributed under a modified Apache license. | |
13 For license details, please see | |
14 | |
15 http://www.ensembl.org/info/about/code_licence.html | |
16 | |
17 =head1 CONTACT | |
18 | |
19 Please email comments or questions to the public Ensembl | |
20 developers list at <ensembl-dev@ebi.ac.uk>. | |
21 | |
22 Questions may also be sent to the Ensembl help desk at | |
23 <helpdesk@ensembl.org>. | |
24 | |
25 | |
26 =head1 NAME | |
27 | |
28 Bio::EnsEMBL::Funcgen::DataSet - A module to represent DataSet object. | |
29 | |
30 | |
31 =head1 SYNOPSIS | |
32 | |
33 use Bio::EnsEMBL::Funcgen::DataSet; | |
34 | |
35 my $data_set = Bio::EnsEMBL::Funcgen::DataSet->new( | |
36 -DBID => $dbID, | |
37 -ADAPTOR => $self, | |
38 -SUPPORTING_SETS => [$rset], | |
39 -FEATURE_SET => $fset, | |
40 -DISPLAYABLE => 1, | |
41 -NAME => 'DATASET1', | |
42 ); | |
43 | |
44 | |
45 | |
46 =head1 DESCRIPTION | |
47 | |
48 A DataSet object provides access to either or both raw results and AnnotatedFeatures | |
49 for a given experiment within a Slice, associated with set wide experimental meta data. | |
50 This was aimed primarily at easing access to data via the web API by creating | |
51 a wrapper class with convenience methods. The focus of this class is to contain raw and | |
52 associated processed/analysed data to be displayed as a set within the browser i.e. an | |
53 experiment may have different cell lines, features or time points, these would require different DataSets. | |
54 # However a DataSet may contain mixed data types i.e. promoter & histone???? No give separate sets? | |
55 May have duplicates for raw data but only one predicted features track?? | |
56 The data in this class is kept as lightweight as possible with data being loaded dynamically. | |
57 | |
58 | |
59 =cut | |
60 | |
61 use strict; | |
62 use warnings; | |
63 | |
64 package Bio::EnsEMBL::Funcgen::DataSet; | |
65 | |
66 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
67 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate); | |
68 use Bio::EnsEMBL::Funcgen::Storable; | |
69 | |
70 use vars qw(@ISA); | |
71 @ISA = qw(Bio::EnsEMBL::Funcgen::Storable); | |
72 #Should not be a Set as is sufficiently different | |
73 | |
74 | |
75 =head2 new | |
76 | |
77 | |
78 | |
79 Example : my $dset = Bio::EnsEMBL::Funcgen::DataSet->new( | |
80 -SUPPORTING_SETS => [$fset1, $fset2], | |
81 -FEATURE_SET => $fset, | |
82 -DISPLAYABLE => 1, | |
83 -NAME => 'DATASET1', | |
84 ); | |
85 | |
86 Description: Constructor for DataSet objects. | |
87 Returntype : Bio::EnsEMBL::Funcgen::DataSet | |
88 Exceptions : Throws if a dbID is passed by any caller other than the DataSetAdaptor | |
89 Caller : General | |
90 Status : At risk | |
91 | |
92 =cut | |
93 | |
# Constructor for DataSet objects.
#
# All arguments are first handed to the parent (Storable) constructor; the
# FEATURE_SET, SUPPORTING_SETS and NAME parameters are then picked out and
# applied through this class's own setters.
#
# A DataSet carrying a dbID may only be built by the DataSetAdaptor: this
# module supports updating, so an object claiming a dbID that did not come
# from the DB could end up linking incorrect data to a pre-existing record.
#
# A DataSet may be created without any supporting sets or product
# FeatureSet; no check is made for either.
sub new {
  my ($caller, @args) = @_;

  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@args);

  my ($name, $sets, $fset)
    = rearrange(['NAME', 'SUPPORTING_SETS', 'FEATURE_SET'], @args);

  # Package from which this constructor was invoked
  my ($calling_package) = caller();

  if ($self->dbID()) {

    if ($calling_package ne "Bio::EnsEMBL::Funcgen::DBSQL::DataSetAdaptor") {
      throw('You must use the DataSetAdaptor to generate DataSets with dbID i.e. from the DB,'.
            ' as this module accomodates updating which may cause incorrect data if the object'.
            ' is not generated from the DB');
    }
  }

  # Supporting sets are cached as analysis dbID => [sets]
  $self->{'supporting_sets'} ||= {};

  if ($sets) {
    $self->add_supporting_sets($sets);
  }

  if ($fset) {
    $self->product_FeatureSet($fset);
  }

  if ($name) {
    $self->name($name);
  }

  return $self;
}
148 | |
149 | |
150 | |
151 | |
152 | |
153 | |
154 | |
155 #methods | |
156 #set wide display label(predicted_feature) + more wordy label for wiggle tracks? | |
157 #defined by experiment type i.e. time course would require timepoint in display label | |
158 #deal with this dynamically or have display_label in table | |
159 #Need call on type, or fetch all would | |
160 | |
161 #_get_ec_ids or contigsets? | |
162 #this should now be an intrinsic part of this class/adaptor | |
163 | |
164 #cell line | |
165 #feature_type | |
166 #displayable...should have one for the whole set and one for each raw and predicted? | |
167 | |
168 #have analysis as arg? Or do we get all analysis sets? | |
169 #we need to be able to set analyses for DataSets dynamically from DB | |
170 #pick up all DataSets | |
171 #displayable field in DataSets also? | |
172 | |
173 #If we have mixed types in the same experiment then we could get promoter features and histone wiggle tracks displayed togeter | |
174 #Not v.good for display purposes? We may want to separate the promoter and histone tracks, or we may want ll the experiment data together but of mixed types. | |
175 #We need to be able to pull back the experiment type for each set, therefore this needs setting on an ec level, not an experiment level. | |
176 #This is also v.reliant on putting contig set info in place, otherwise we may get mixed chip types in same set. | |
177 | |
178 #get_raw_analysis_name | |
179 #get_predicted_feature_analysis_name | |
180 #set ResultFeatures and AnnotatedFeatures in hash keyed by analysis_name? | |
181 | |
182 #Need to change to simple accessor | |
183 #or should we maintain to provide explicit method for delineating between parent and supporting FeatureSets? | |
184 #yes, and sub the feature_type/cell_type checks | |
185 | |
186 | |
187 =head2 product_FeatureSet | |
188 | |
189 Arg [1] : (optional) Bio::EnsEMBL::Funcgen::FeatureSet | |
190 Example : $data_set->product_FeatureSet($fset); | |
191 Description: Getter and setter for the main feature_set attribute for this DataSet. | |
192 Returntype : Bio::EnsEMBL::Funcgen::FeatureSet | |
193 Exceptions : Throws if not a valid FeatureSet or if main feature_set has already been set. | |
194 Caller : General | |
195 Status : At Risk - change to get_product_FeatureSet | |
196 | |
197 =cut | |
198 | |
# Getter/setter for the main (product) FeatureSet of this DataSet.
#
# Arg [1]    : (optional) Bio::EnsEMBL::Funcgen::FeatureSet
# Returntype : the stored FeatureSet, or undef if none has been set
# Exceptions : Throws if the argument is not a blessed FeatureSet
#              (including plain, unblessed references).
#              Throws if the product FeatureSet has already been set.
#
# The product FeatureSet can only be set once. Setting it also runs
# _validate_and_set_types, so its feature/cell type must agree with any
# types already recorded on the DataSet.
sub product_FeatureSet {
  my ($self, $fset) = @_;

  if ($fset) {
    # Core module, loaded locally so the file's import block is untouched
    require Scalar::Util;

    # blessed() guards the isa call: calling ->isa on an unblessed
    # reference would die with a raw Perl error rather than our exception
    if (! (Scalar::Util::blessed($fset) &&
           $fset->isa("Bio::EnsEMBL::Funcgen::FeatureSet"))) {
      throw("Need to pass a valid Bio::EnsEMBL::Funcgen::FeatureSet");
    }

    if (defined $self->{'feature_set'}) {
      throw("The main feature_set has already been set for this DataSet, maybe you want add_supporting_sets?");
    }

    $self->_validate_and_set_types($fset);
    $self->{'feature_set'} = $fset;
  }

  return $self->{'feature_set'};
}
219 | |
220 | |
221 =head2 add_supporting_sets | |
222 | |
223 Arg [1] : Arrayref of stored Bio::EnsEMBL::Funcgen::Set objects e.g. Result/FeatureSets (not DataSets) | |
224 Example : $dset->add_supporting_sets([$rset]); | |
225 Description: Adds Result/FeatureSets to the DataSet | |
226 Returntype : none | |
227 Exceptions : Throws if set not valid for supporting_set type of DataSet | |
228 Throws if supporting_sets is not an array ref | |
229 Caller : General | |
230 Status : At Risk | |
231 | |
232 =cut | |
233 | |
234 | |
# Adds supporting sets to this DataSet.
#
# Arg [1]    : Arrayref of stored Bio::EnsEMBL::Funcgen::Set objects whose
#              set_type is not 'data'.
# Returntype : none
# Exceptions : Throws if the argument is not an array reference.
#              Throws if any element is not a blessed Funcgen::Set, has
#              set_type 'data', or has no dbID.
#              Propagates any exception from _validate_and_set_types.
#
# Sets are cached in $self->{supporting_sets}, keyed by the dbID of each
# set's analysis.
sub add_supporting_sets {
  my ($self, $sets) = @_;

  if (ref($sets) ne 'ARRAY') {
    throw("Supporting sets need to be a reference to an ARRAY:\t".
          (defined $sets ? $sets : 'undef'));
  }

  # Core module, loaded locally so the file's import block is untouched
  require Scalar::Util;

  foreach my $set (@$sets) {

    # blessed() guards the method calls: an unblessed reference would
    # otherwise die with a raw Perl error instead of our exception
    if (! (Scalar::Util::blessed($set)             &&
           $set->isa('Bio::EnsEMBL::Funcgen::Set') &&
           $set->set_type ne 'data'                &&
           $set->dbID)) {
      throw("Need to pass a valid stored Bio::EnsEMBL::Funcgen::Set which is not a DataSet:\t".
            (defined $set ? $set : 'undef'));
    }

    # Only validate result type data, as compound analyses
    # e.g. RegulatoryFeatures can have various cell/feature_types
    $self->_validate_and_set_types($set) if $set->set_type() ne 'feature';

    # push autovivifies the per-analysis array on first use
    push @{$self->{'supporting_sets'}->{$set->analysis->dbID()}}, $set;
  }

  return;
}
275 | |
276 | |
277 =head2 _validate_and_set_types | |
278 | |
279 Arg [1] : Bio::EnsEMBL::Feature/ResultSet object | |
280 Example : $dset->_validate_and_set_types($rset); | |
281 Description: Validates and sets DataSet cell and feature types | |
282 Returntype : none | |
283 Exceptions : Throws if types not valid | |
284 Caller : General | |
285 Status : At Risk | |
286 | |
287 =cut | |
288 | |
289 | |
# Validates a set's feature_type and cell_type against those recorded on
# this DataSet, recording them if the DataSet has none yet.
#
# Arg [1]    : Set object; its 'feature_type' and 'cell_type' hash slots are
#              read directly (bypassing accessors).
# Returntype : none
# Exceptions : Throws if the DataSet already has a given type and the set
#              either lacks that type or has one with a different name.
#
# This restricts all validated sets to one cell type and one feature type.
# add_supporting_sets only calls this for sets that are not of type
# 'feature'.
sub _validate_and_set_types {
  my ($self, $set) = @_;

  for my $type ('feature_type', 'cell_type') {
    my $candidate = $set->{$type};

    if (! defined $self->{$type}) {
      # First set seen for this type defines it for the DataSet
      $self->{$type} = $candidate;
      next;
    }

    # Without this guard the name() call below would die with an
    # unhelpful "Can't call method on an undefined value"
    if (! defined $candidate) {
      throw(ref($set)." has no $type defined, expected DataSet $type(".
            $self->{$type}->name().")");
    }

    if ($candidate->name() ne $self->{$type}->name()) {
      throw(ref($set)." $type(".$candidate->name().
            ") does not match DataSet $type(".$self->{$type}->name().")");
    }
  }

  return;
}
320 | |
321 | |
322 | |
323 =head2 get_supporting_sets_by_Analysis | |
324 | |
325 Arg [1] : Bio::EnsEMBL::Analysis | |
326 Arg [2] : (optional) status - e.g 'DISPLAYABLE' | |
327 Example : my @anal_sets = @{$data_set->get_supporting_sets_by_Analysis($analysis)}; | |
328 Description: Getter for the SupportingSet objects of a given Analysis. | |
329 Returntype : ARRAYREF | |
330 Exceptions : Throws if arg is not a valid stored Bio::EnsEMBL::Analysis | |
331 Caller : General | |
332 Status : At Risk | |
333 | |
334 =cut | |
335 | |
# Getter for the supporting sets attached to a given Analysis.
#
# Arg [1]    : Bio::EnsEMBL::Analysis with a dbID
# Arg [2]    : (optional) status name e.g. 'DISPLAYABLE'; when defined only
#              sets for which has_status($status) is true are returned.
# Returntype : Arrayref (a new array on every call; the elements are the
#              cached set objects themselves, not copies)
# Exceptions : Throws if arg 1 is not a blessed Bio::EnsEMBL::Analysis with
#              a dbID (including undef and unblessed references).
sub get_supporting_sets_by_Analysis {
  my ($self, $analysis, $status) = @_;

  # Core module, loaded locally so the file's import block is untouched
  require Scalar::Util;

  if (! (Scalar::Util::blessed($analysis)         &&
         $analysis->isa("Bio::EnsEMBL::Analysis") &&
         $analysis->dbID())) {
    throw("Need to pass a valid stored Bio::EnsEMBL::Analysis");
  }

  # Fall back to empty containers rather than dereferencing in place, so
  # that querying an unknown analysis does not autovivify a cache entry
  my $sets_by_analysis = $self->{'supporting_sets'} || {};
  my $analysis_sets    = $sets_by_analysis->{$analysis->dbID()} || [];

  my @rsets = grep { (! defined $status) || $_->has_status($status) }
    @$analysis_sets;

  return \@rsets;
}
383 | |
384 | |
385 | |
386 =head2 get_supporting_sets | |
387 | |
388 Arg [1] : (optional) status - e.g 'DISPLAYABLE' | |
389 Example : my @status_sets = @{$data_set->get_supporting_sets($status)}; | |
390 Description: Getter for the ResultSets for this DataSet. | |
391 Returntype : Arrayref | |
392 Exceptions : None | |
393 Caller : General | |
394 Status : At Risk | |
395 | |
396 =cut | |
397 | |
# Getter for the supporting sets of this DataSet, optionally filtered.
#
# Arg [1]    : (optional) status name e.g. 'DISPLAYABLE'; when defined, sets
#              for which has_status($status) is false are skipped.
# Arg [2]    : (optional) set type: 'result', 'feature' or 'input'; when
#              defined, sets whose set_type differs are skipped.
# Returntype : Arrayref of set objects
# Exceptions : Throws if a true set type is given that is not one of the
#              three recognised types.
sub get_supporting_sets {
  my ($self, $status, $set_type) = @_;

  my %is_supported_type = map { $_ => 1 } qw(result feature input);

  if ($set_type && ! $is_supported_type{$set_type}) {
    throw("You have specified an invalid supporting set type:\t$set_type");
  }

  my @selected;

  # The cache is keyed by analysis dbID, each value an array of sets
  for my $analysis_id (keys %{$self->{'supporting_sets'}}) {

  SET:
    for my $candidate (@{$self->{'supporting_sets'}->{$analysis_id}}) {
      next SET if defined $status   && ! $candidate->has_status($status);
      next SET if defined $set_type && $candidate->set_type ne $set_type;

      push @selected, $candidate;
    }
  }

  return \@selected;
}
435 | |
436 | |
437 | |
438 | |
439 =head2 get_displayable_supporting_sets | |
440 | |
441 Example : my @displayable_ssets = @{$data_set->get_displayable_supporting_sets()}; | |
442 Description: Convenience method for web display | |
443 Returntype : Arrayref | |
444 Exceptions : None | |
445 Caller : General | |
446 Status : At Risk | |
447 | |
448 =cut | |
449 | |
# Convenience wrapper: returns the supporting sets which have the
# 'DISPLAYABLE' status.
#
# Arg [1]    : (optional) set type, passed straight through to
#              get_supporting_sets
# Returntype : whatever get_supporting_sets returns (an arrayref)
sub get_displayable_supporting_sets {
  my $self     = shift;
  my $set_type = shift;

  return $self->get_supporting_sets('DISPLAYABLE', $set_type);
}
455 | |
456 | |
457 | |
458 =head2 get_displayable_product_FeatureSet | |
459 | |
460 Example : my $fset = $data_set->get_displayable_product_FeatureSet(); | |
461 Description: Convenience method for web display | |
462 Returntype : Bio::EnsEMBL::Funcgen::FeatureSet | |
463 Exceptions : None | |
464 Caller : General | |
465 Status : At Risk | |
466 | |
467 =cut | |
468 | |
# Convenience method: returns the product FeatureSet only if it has the
# 'DISPLAYABLE' status.
#
# Returntype : the product FeatureSet, or undef if there is no product
#              FeatureSet or it is not displayable
# Exceptions : None
sub get_displayable_product_FeatureSet {
  my $self = shift;

  # A DataSet may have no product FeatureSet, so guard the method call
  # rather than dying on an undefined value
  my $fset = $self->product_FeatureSet();

  return ($fset && $fset->has_status('DISPLAYABLE')) ? $fset : undef;
}
474 | |
475 | |
476 | |
477 | |
478 | |
479 =head2 name | |
480 | |
481 Example : $dset->name('DATASET1'); | |
482 Description: Getter/Setter for the name of this DataSet. | |
483 Returntype : string | |
484 Exceptions : None | |
485 Caller : General | |
486 Status : At Risk | |
487 | |
488 =cut | |
489 | |
# Getter/setter for the name of this DataSet.
#
# Arg [1]    : (optional) new name; stored whenever an argument is passed,
#              even if that argument is undef
# Returntype : the current name
sub name {
  my ($self, @new_value) = @_;

  if (@new_value) {
    $self->{'name'} = $new_value[0];
  }

  return $self->{'name'};
}
497 | |
498 | |
499 | |
500 | |
501 #The following attributes are generated dynamically from the constituent Result/FeatureSets | |
502 | |
503 =head2 cell_type | |
504 | |
505 Example : my $dset_ctype_name = $dset->cell_type->name(); | |
506 Description: Getter for the cell_type for this DataSet. | |
507 Returntype : Bio::EnsEMBL::Funcgen::CellType | |
508 Exceptions : None | |
509 Caller : General | |
510 Status : At Risk | |
511 | |
512 =cut | |
513 | |
# Read-only accessor for the cell type recorded on this DataSet.
# The value is whatever _validate_and_set_types stored in the 'cell_type'
# slot, so it is undef until a set has been validated.
sub cell_type {
  my ($self) = @_;

  return $self->{'cell_type'};
}
519 | |
520 =head2 feature_type | |
521 | |
522 Example : my $dset_ftype_name = $dset->feature_type->name(); | |
523 Description: Getter for the feature_type for this DataSet. | |
524 Returntype : Bio::EnsEMBL::Funcgen::FeatureType | |
525 Exceptions : None | |
526 Caller : General | |
527 Status : At Risk | |
528 | |
529 =cut | |
530 | |
# Read-only accessor for the feature type recorded on this DataSet.
# The value is whatever _validate_and_set_types stored in the
# 'feature_type' slot, so it is undef until a set has been validated.
sub feature_type {
  my ($self) = @_;

  return $self->{'feature_type'};
}
536 | |
537 | |
538 | |
539 | |
540 | |
541 =head2 display_label | |
542 | |
543 Example : print $rset->display_label(); | |
544 Description: Getter for the display_label attribute for this DataSet. | |
545 This is more appropriate for the predicted_features of the set. | |
546 Use the individual display_labels for each raw result set. | |
547 Returntype : str | |
548 Exceptions : None | |
549 Caller : General | |
550 Status : At Risk | |
551 | |
552 =cut | |
553 | |
# Getter for the display label of this DataSet; built on first use and
# then cached in $self->{display_label}.
#
# If the product FeatureSet's feature type has class 'Regulatory Feature'
# the label is 'Regulatory Features'. Otherwise it is
# "<feature type name> - <cell type label> Enriched Sites", where the cell
# type label is the first true value of display_label, description, name.
#
# Returntype : string
# Exceptions : None thrown explicitly.
# NOTE(review): the non-regulatory branch still dies if the DataSet has no
# feature_type or cell_type set - confirm what label such DataSets need.
sub display_label {
  my $self = shift;

  if (! $self->{'display_label'}) {
    my $fset = $self->product_FeatureSet;
    my $label;

    if ($fset && ($fset->feature_type->class() eq 'Regulatory Feature')) {
      $label = 'Regulatory Features';
    }
    else {
      my $ctype = $self->cell_type;

      $label = join(' ',
                    $self->feature_type->name(),
                    '-',
                    ($ctype->display_label() ||
                     $ctype->description()   ||
                     $ctype->name()),
                    'Enriched Sites');
    }

    # Cache only once the label is complete, so a failure part way through
    # cannot leave a truncated label behind for later calls to return
    $self->{'display_label'} = $label;
  }

  return $self->{'display_label'};
}
580 | |
581 | |
582 #sub get_type_config{ | |
583 # my ($self) = @_; | |
584 # | |
585 # if (! defined $self->{type_config}){ | |
586 # $self->{type_config} = $self->adaptor->fetch_type_config_by_DataSet($self); | |
587 # } | |
588 # | |
589 # return $self->{type_config}; | |
590 #} | |
591 | |
592 | |
593 | |
594 1; | |
595 |