Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/InputSet.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 # | |
2 # Ensembl module for Bio::EnsEMBL::Funcgen::InputSet | |
3 # | |
4 | |
5 =head1 LICENSE | |
6 | |
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
8 Genome Research Limited. All rights reserved. | |
9 | |
10 This software is distributed under a modified Apache license. | |
11 For license details, please see | |
12 | |
13 http://www.ensembl.org/info/about/code_licence.html | |
14 | |
15 =head1 CONTACT | |
16 | |
17 Please email comments or questions to the public Ensembl | |
18 developers list at <ensembl-dev@ebi.ac.uk>. | |
19 | |
20 Questions may also be sent to the Ensembl help desk at | |
21 <helpdesk@ensembl.org>. | |
22 | |
23 =head1 NAME | |
24 | |
25 Bio::EnsEMBL::Funcgen::InputSet - A module to represent an InputSet object. | |
26 | |
27 | |
28 =head1 SYNOPSIS | |
29 | |
30 use Bio::EnsEMBL::Funcgen::InputSet; | |
31 | |
32 #Create an InputSet | |
33 | |
34 my $inp_set = Bio::EnsEMBL::Funcgen::InputSet->new | |
35 ( | |
36 -DBID => $dbID, | |
37 -ADAPTOR => $self, | |
38 -EXPERIMENT => $exp, | |
39 -FEATURE_TYPE => $ftype, | |
40 -CELL_TYPE => $ctype, | |
41 -FORMAT => 'READ_FORMAT', | |
42 -VENDOR => 'SOLEXA', | |
43 -NAME => 'ExpSet1', | |
44 -REPLICATE => 1, | |
45 ); | |
46 | |
47 # Add some InputSubsets | |
48 | |
49 $inp_set->add_new_subset($subset_name, $inp_subset); | |
50 | |
51 | |
52 | |
53 | |
54 =head1 DESCRIPTION | |
55 | |
56 An InputSet object provides a generic container for any non-array based feature import, | |
57 allowing tracking of file import via the status table and integration into Data and FeatureSets to | |
58 provide traceability to the source experiment from a given FeatureSet. | |
59 | |
60 =cut | |
61 | |
62 use strict; | |
63 use warnings; | |
64 | |
65 package Bio::EnsEMBL::Funcgen::InputSet; | |
66 | |
67 use Bio::EnsEMBL::Funcgen::InputSubset; | |
68 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
69 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate); | |
70 use Bio::EnsEMBL::Funcgen::Set; | |
71 use Bio::EnsEMBL::Analysis; | |
72 | |
73 use vars qw(@ISA); | |
74 @ISA = qw(Bio::EnsEMBL::Funcgen::Set); | |
75 | |
76 | |
77 =head2 new | |
78 | |
79 | |
80 | |
81 Example : my $eset = Bio::EnsEMBL::Funcgen::InputSet->new( | |
82 -EXPERIMENT => $exp, | |
83 -FEATURE_TYPE => $ftype, | |
84 -CELL_TYPE => $ctype, | |
85 -FORMAT => 'READ_FORMAT', | |
86 -VENDOR => 'SOLEXA', | |
87 -NAME => 'ExpSet1', | |
88 -ANALYSIS => $anal, | |
89 -FEATURE_CLASS => 'annotated', | |
90 ); | |
91 | |
92 Do we want to define subsets like this, or are we more likely to add them one by one? | |
93 | |
94 Description: Constructor for InputSet objects. | |
95 Returntype : Bio::EnsEMBL::Funcgen::InputSet | |
96 Exceptions : Throws if no Experiment defined | |
97 Throws if CellType or FeatureType are not valid or stored | |
98 Caller : General | |
99 Status : At risk | |
100 | |
101 =cut | |
102 | |
# Constructor for InputSet objects.
#
# Named arguments read here (via rearrange): -EXPERIMENT, -FORMAT, -VENDOR,
# -REPLICATE. All arguments are also forwarded unchanged to the parent
# constructor, whose feature_type, cell_type and feature_class accessors are
# then used for validation.
#
# Returns the new object.
# Throws if:
#   * -EXPERIMENT is not a stored Bio::EnsEMBL::Funcgen::Experiment,
#   * feature_type or cell_type is undefined after parent construction,
#   * feature_class is not exactly 'annotated', 'result' or 'segmentation',
#   * feature_class is 'result' and -FORMAT is not 'SEQUENCING'.
sub new {
    my $caller = shift;
    my $class  = ref($caller) || $caller;

    #Add set_type here to overwrite default ref parsing in Set::set_type
    #This need to stay like this until we patch the DB
    my $self = $class->SUPER::new(@_);

    my ($exp, $format, $vendor, $rep)
        = rearrange(['EXPERIMENT', 'FORMAT', 'VENDOR', 'REPLICATE'], @_);

    if (! (ref $exp && $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && $exp->dbID())) {
        throw('Must specify a valid stored Bio::EnsEMBL::Funcgen::Experiment');
    }

    #These are set in Set, just validate here
    throw('Must provide a FeatureType') if ! defined $self->feature_type;
    throw('Must provide a CellType')    if ! defined $self->cell_type;

    my $type = $self->feature_class;

    #Need to move these types to config
    # Compare with eq rather than interpolating $type into a regex: regex
    # metacharacters in the supplied value could otherwise match an allowed
    # name by accident, or die with a pattern compilation error.
    my $type_is_valid = $type
        && grep { $type eq $_ } ('annotated', 'result', 'segmentation');

    if (! $type_is_valid) {
        throw("You must define a valid InputSet feature_class e.g. 'annotated' or 'result'");
    }

    # Test definedness first so that a missing -FORMAT produces the intended
    # exception without an 'uninitialized value' warning.
    if (($type eq 'result') &&
        ! (defined $format && $format eq 'SEQUENCING')) {
        throw('InputSet does not yet support a result type InputSet which does not have the \'SEQUENCING\' format');
    }

    #default analysis hack for v47
    #Set directly to avoid dbID boolean check
    #This is to support supporting_set cache in data_set?
    # NOTE(review): the 'if(! defined $self->analysis)' guard is commented out
    # in the original, so this placeholder unconditionally replaces whatever
    # is already stored under the 'analysis' key. Left as is; confirm intent
    # before making it conditional.
    $self->{'analysis'} = Bio::EnsEMBL::Analysis->new
        (-logic_name => 'external',
         -id         => 0,#??someone needs to rewrite analysis
        );

    #Change to direct setting for speed
    $self->{format}     = $format;
    $self->{vendor}     = $vendor;
    $self->{replicate}  = $rep;
    $self->{experiment} = $exp;
    $self->{subsets}    = {};

    return $self;
}
157 | |
158 | |
159 =head2 add_new_subset | |
160 | |
161 Arg [1] : string - sub set name e.g. the file name (not path as we're restricted to 30 chars) | |
162 Arg [2] : Bio::EnsEMBL::Funcgen::InputSubset - optional | |
163 If not defined will create a sparse InputSubset based on the name | |
164 Example : $expset->add_new_subset($ss_name, $exp_subset); | |
165 Description: Adds input_subset | |
166 Returntype : none | |
167 Exceptions : Throws if set is already present | |
168 Throws if InputSubset is not valid or stored | |
169 Caller : General | |
170 Status : At Risk | |
171 | |
172 =cut | |
173 | |
174 #Do we still use the optional subset function? | |
175 | |
# Registers an InputSubset under the given name and returns it.
#
# Arg [1] : subset name; must be a true, plain (non-reference) scalar.
# Arg [2] : optional InputSubset object. When omitted, a new InputSubset is
#           created from the name and this InputSet.
# Throws if the name is missing/invalid, if a subset of that name is already
# registered, or if a supplied subset is not a stored
# Bio::EnsEMBL::Funcgen::InputSubset.
sub add_new_subset {
    my $self    = shift;
    my $ss_name = shift;
    my $subset  = shift;

    # ref(\$ss_name) is 'SCALAR' only for a plain string/number; passing a
    # reference as the name would yield 'REF' instead.
    throw('You must pass a InputSubset name')
        unless ($ss_name && ref(\$ss_name) eq 'SCALAR');

    throw("Subset $ss_name is already present in this InputSet, maybe you need to alter the filename?")
        if $self->get_subset_by_name($ss_name);

    if (! defined $subset) {
        # No object supplied: build one from the name alone.
        $subset = Bio::EnsEMBL::Funcgen::InputSubset->new(
            -name      => $ss_name,
            -input_set => $self,
        );
    }
    elsif (! (ref($subset) && $subset->isa('Bio::EnsEMBL::Funcgen::InputSubset') && $subset->dbID())) {
        throw('InputSubsets must be valid and stored');
    }

    $self->{subsets}{$ss_name} = $subset;

    return $subset;
}
206 | |
207 | |
208 =head2 get_Experiment | |
209 | |
210 Example : my $exp = $exp_set->get_Experiment(); | |
211 Description: Getter for the Experiment of this InputSet. | |
212 Returntype : Bio::EnsEMBL::Funcgen::Experiment | |
213 Exceptions : None | |
214 Caller : General | |
215 Status : At Risk | |
216 | |
217 =cut | |
218 | |
# Returns whatever is stored under the 'experiment' key of this object
# (set by the constructor); undef if nothing has been stored.
sub get_Experiment {
    my ($self) = @_;

    return $self->{experiment};
}
220 | |
221 | |
222 =head2 get_InputSubsets | |
223 | |
224 Example : my @subsets = @{$exp_set->get_InputSubsets()}; | |
225 Description: Getter for the InputSubsets for this InputSet. | |
226 Returntype : Arrayref | |
227 Exceptions : None | |
228 Caller : General | |
229 Status : At Risk | |
230 | |
231 =cut | |
232 | |
# Returns a freshly built array reference containing every value held in
# this object's 'subsets' hash. Order follows Perl's hash ordering and is
# therefore not guaranteed.
sub get_InputSubsets {
    my $self = shift;

    my @subsets = values %{ $self->{'subsets'} };

    return \@subsets;
}
238 | |
239 | |
240 | |
241 | |
242 =head2 get_subset_by_name | |
243 | |
244 Example : my $subsets = $exp_set->get_subset_by_name('subset1'); | |
245 Description: Getter for the subset of a given name for this InputSet. | |
246 Returntype : Bio::EnsEMBL::Funcgen::InputSubset | |
247 Exceptions : None | |
248 Caller : General | |
249 Status : At Risk | |
250 | |
251 =cut | |
252 | |
# Looks up a subset by name in this object's 'subsets' hash.
# Returns the stored value when the key exists, otherwise an explicit undef
# (so list-context callers always receive exactly one element).
sub get_subset_by_name {
    my ($self, $name) = @_;

    if (exists $self->{'subsets'}{$name}) {
        return $self->{'subsets'}{$name};
    }

    # Deliberately 'return undef' rather than a bare 'return': keeps the
    # one-element result in list context.
    return undef;
}
257 | |
258 | |
259 =head2 get_subset_names | |
260 | |
261 Example : my @subset_names = @{$exp_set->get_subset_names()}; | |
262 Description: Getter for the subset names for this InputSet. | |
263 Returntype : Arrayref | |
264 Exceptions : None | |
265 Caller : General | |
266 Status : At Risk | |
267 | |
268 =cut | |
269 | |
# Returns a freshly built array reference containing the keys of this
# object's 'subsets' hash, i.e. the registered subset names. Order follows
# Perl's hash ordering and is therefore not guaranteed.
sub get_subset_names {
    my $self = shift;

    my @names = keys %{ $self->{'subsets'} };

    return \@names;
}
274 | |
275 | |
276 | |
277 | |
278 =head2 vendor | |
279 | |
280 Arg[1] : String - vendor e.g. ILLUMINA | |
281 Example : my $iset_vendor = $iset->vendor; | |
282 Description: Getter for the vendor attribute of this InputSet. | |
283 Returntype : String | |
284 Exceptions : None | |
285 Caller : General | |
286 Status : At Risk | |
287 | |
288 =cut | |
289 | |
# Read-only accessor: returns the value stored under the 'vendor' key of
# this object. Any arguments beyond the invocant are ignored.
sub vendor {
    my ($self) = @_;

    return $self->{vendor};
}
291 | |
292 | |
293 =head2 format | |
294 | |
295 Arg[1] : string - format i.e. product type/format | |
296 Example : my $iset_format = $iset->format; | |
297 Description: Getter for the format attribute of this InputSet. | |
298 Returntype : String | |
299 Exceptions : None | |
300 Caller : General | |
301 Status : At Risk | |
302 | |
303 =cut | |
304 | |
# Read-only accessor: returns the value stored under the 'format' key of
# this object. Any arguments beyond the invocant are ignored.
sub format {
    my ($self) = @_;

    return $self->{format};
}
306 | |
307 | |
308 =head2 replicate | |
309 | |
310 Arg[1] : Integer - replicate 0 = merged or NA, >0 refers to individual replicate | |
311 Example : if($iset->replicate){ #Do something replicate specific in here } | |
312 Description: Getter for the replicate attribute of this InputSet. | |
313 Returntype : Integer | |
314 Exceptions : None | |
315 Caller : General | |
316 Status : At Risk | |
317 | |
318 =cut | |
319 | |
# Read-only accessor: returns the value stored under the 'replicate' key of
# this object. Any arguments beyond the invocant are ignored.
sub replicate {
    my ($self) = @_;

    return $self->{replicate};
}
321 | |
322 | |
323 | |
324 =head2 source_info | |
325 | |
326 Example : my $source_info = $input_set->source_info; | |
327 Description: Getter for the experiment source info i.e. [ $label, $url ] | |
328 Returntype : Listref | |
329 Exceptions : None | |
330 Caller : General | |
331 Status : At risk | |
332 | |
333 =cut | |
334 | |
335 #Currently handling redundant/absent InputSubset data | |
336 | |
# Returns a list reference of [ $label, $url ] pairs describing where the
# data of this InputSet came from. The result is computed on first call and
# cached under the 'source_info' key.
#
# For each InputSubset:
#   * a defined archive_id yields [ $archive_id, undef ], one per distinct id;
#   * otherwise, if the experiment's experimental group is a project, it
#     yields [ $project_name, $link ], one per distinct link, where $link is
#     the subset's display_url or, failing that, the project url;
#   * otherwise the subset contributes nothing.
# The order of the returned pairs follows hash ordering and is not guaranteed.
sub source_info {
    my ($self) = @_;

    # Serve the cached answer when we have one.
    return $self->{source_info} if defined $self->{source_info};

    #could have data_url as highest priority here
    #but we need to ensure removal when adding archive ids
    #so we link to the archive and not the old data url

    my $group = $self->get_Experiment->experimental_group;
    my ($project_name, $project_url);

    if ($group->is_project) {
        $project_name = $group->name;
        $project_url  = $group->url;
    }

    # Keyed by archive id or by link, so redundant InputSubsets collapse
    # into a single entry.
    my %info_for;

    for my $subset (@{ $self->get_InputSubsets }) {

        if (defined $subset->archive_id) {
            my $label = $subset->archive_id;

            # The link is undef here as archive_id overrides the display url;
            # undef links will automatically go to the SRA.
            $info_for{$label} = [$label, undef]
                unless exists $info_for{$label};

            next;
        }

        next unless defined $project_name;

        my $link = $subset->display_url || $project_url;

        $info_for{$link} = [$project_name, $link]
            unless exists $info_for{$link};
    }

    $self->{source_info} = [values %info_for];

    return $self->{source_info};
}
380 | |
381 | |
382 | |
383 1; | |
384 |