0
|
1 #
|
|
2 # Ensembl module for Bio::EnsEMBL::Funcgen::InputSet
|
|
3 #
|
|
4
|
|
5 =head1 LICENSE
|
|
6
|
|
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
8 Genome Research Limited. All rights reserved.
|
|
9
|
|
10 This software is distributed under a modified Apache license.
|
|
11 For license details, please see
|
|
12
|
|
13 http://www.ensembl.org/info/about/code_licence.html
|
|
14
|
|
15 =head1 CONTACT
|
|
16
|
|
17 Please email comments or questions to the public Ensembl
|
|
18 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
19
|
|
20 Questions may also be sent to the Ensembl help desk at
|
|
21 <helpdesk@ensembl.org>.
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Bio::EnsEMBL::Funcgen::InputSet - A module to represent an InputSet object.
|
|
26
|
|
27
|
|
28 =head1 SYNOPSIS
|
|
29
|
|
30 use Bio::EnsEMBL::Funcgen::InputSet;
|
|
31
|
|
32 #Create an InputSet
|
|
33
|
|
34 my $inp_set = Bio::EnsEMBL::Funcgen::InputSet->new
|
|
35 (
|
|
36 -DBID => $dbID,
|
|
37 -ADAPTOR => $self,
|
|
38 -EXPERIMENT => $exp,
|
|
39 -FEATURE_TYPE => $ftype,
|
|
40 -CELL_TYPE => $ctype,
|
|
41 -FORMAT => 'READ_FORMAT',
|
|
42 -VENDOR => 'SOLEXA',
|
|
43 -NAME => 'ExpSet1',
|
|
44 -REPLICATE => 1,
|
|
45 );
|
|
46
|
|
47 # Add some InputSubsets
|
|
48
|
|
49 $inp_set->add_new_subset($subset_name, $input_subset);
|
|
50
|
|
51
|
|
52
|
|
53
|
|
54 =head1 DESCRIPTION
|
|
55
|
|
56 An InputSet object provides a generic container for any non-array based feature import,
|
|
57 allowing tracking of file import via the status table and integration into Data and FeatureSets to
|
|
58 provide traceability to the source experiment from a given FeatureSet.
|
|
59
|
|
60 =cut
|
|
61
|
|
62 use strict;
|
|
63 use warnings;
|
|
64
|
|
65 package Bio::EnsEMBL::Funcgen::InputSet;
|
|
66
|
|
67 use Bio::EnsEMBL::Funcgen::InputSubset;
|
|
68 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
|
|
69 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate);
|
|
70 use Bio::EnsEMBL::Funcgen::Set;
|
|
71 use Bio::EnsEMBL::Analysis;
|
|
72
|
|
73 use vars qw(@ISA);
|
|
74 @ISA = qw(Bio::EnsEMBL::Funcgen::Set);
|
|
75
|
|
76
|
|
77 =head2 new
|
|
78
|
|
79
|
|
80
|
|
81 Example : my $eset = Bio::EnsEMBL::Funcgen::InputSet->new(
|
|
82 -EXPERIMENT => $exp,
|
|
83 -FEATURE_TYPE => $ftype,
|
|
84 -CELL_TYPE => $ctype,
|
|
85 -FORMAT => 'READ_FORMAT',
|
|
86 -VENDOR => 'SOLEXA',
|
|
87 -NAME => 'ExpSet1',
|
|
88 -ANALYSIS => $anal,
|
|
89 -FEATURE_CLASS => 'annotated',
|
|
90 );
|
|
91
|
|
92 Do we want to define subsets like this, or are we more likely to add them one by one?
|
|
93
|
|
94 Description: Constructor for InputSet objects.
|
|
95 Returntype : Bio::EnsEMBL::Funcgen::InputSet
|
|
96 Exceptions : Throws if no Experiment defined
|
|
97 Throws if CellType or FeatureType are not valid or stored
|
|
98 Caller : General
|
|
99 Status : At risk
|
|
100
|
|
101 =cut
|
|
102
|
|
# Constructor for InputSet objects.
#
# All arguments are first handed to the parent constructor (SUPER::new);
# the following named arguments are then read here:
#   -EXPERIMENT  Bio::EnsEMBL::Funcgen::Experiment with a dbID (mandatory)
#   -FORMAT      string; must be 'SEQUENCING' when feature_class is 'result'
#   -VENDOR      string
#   -REPLICATE   replicate identifier
#
# Returns the new InputSet object.
#
# Throws if the Experiment is missing, of the wrong class or has no dbID,
# if feature_type or cell_type are undefined after parent construction,
# if feature_class is not one of 'annotated', 'result' or 'segmentation',
# or if feature_class is 'result' and the format is not 'SEQUENCING'.
sub new {
  my $caller = shift;
  my $class  = ref($caller) || $caller;

  #Add set_type here to overwrite default ref parsing in Set::set_type
  #This need to stay like this until we patch the DB
  my $self = $class->SUPER::new(@_);

  my ($exp, $format, $vendor, $rep)
    = rearrange(['EXPERIMENT', 'FORMAT', 'VENDOR', 'REPLICATE'], @_);

  if (! (ref $exp && $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && $exp->dbID())) {
    throw('Must specify a valid stored Bio::EnsEMBL::Funcgen::Experiment');
  }

  #These are set in Set, just validate here
  throw('Must provide a FeatureType') if ! defined $self->feature_type;
  throw('Must provide a CellType')    if ! defined $self->cell_type;

  my $type = $self->feature_class;

  #Need to move these types to config
  # Compare with eq rather than interpolating $type into a regex: a value
  # containing regex metacharacters (e.g. '.*' or 'annotated|x') must not be
  # accepted as a valid feature_class.
  if (! ($type && grep { $type eq $_ } ('annotated', 'result', 'segmentation'))) {
    throw("You must define a valid InputSet feature_class e.g. 'annotated' or 'result'");
  }

  # The defined() guard avoids an 'uninitialized value' warning when no
  # -FORMAT was passed; the same exception is still thrown in that case.
  if (($type eq 'result') &&
      ! (defined $format && $format eq 'SEQUENCING')) {
    throw('InputSet does not yet support a result type InputSet which does not have the \'SEQUENCING\' format');
  }

  ##default analysis hack for v47
  ##Set directly to avoid dbID boolean check
  #This is to support supporting_set cache in data_set?
  # NOTE(review): this assignment is unconditional, so it replaces whatever
  # was already in $self->{'analysis'} - confirm that is still intended.
  $self->{'analysis'} = Bio::EnsEMBL::Analysis->new
    (-logic_name => 'external',
     -id         => 0,#??someone needs to rewrite analysis
    );

  #Change to direct setting for speed
  $self->{format}     = $format;
  $self->{vendor}     = $vendor;
  $self->{replicate}  = $rep;
  $self->{experiment} = $exp;
  $self->{subsets}    = {};

  return $self;
}
|
|
157
|
|
158
|
|
159 =head2 add_new_subset
|
|
160
|
|
161 Arg [1] : string - sub set name e.g. the file name (not path as we're restricted to 30 chars)
|
|
162 Arg [2] : Bio::EnsEMBL::Funcgen::InputSubset - optional
|
|
163 If not defined will create a sparse InputSubset based on the name
|
|
164 Example : $expset->add_new_subset($ss_name, $exp_subset);
|
|
165 Description: Adds input_subset
|
|
166 Returntype : Bio::EnsEMBL::Funcgen::InputSubset
|
|
167 Exceptions : Throws if set is already present
|
|
168 Throws if InputSubset is not valid or stored
|
|
169 Caller : General
|
|
170 Status : At Risk
|
|
171
|
|
172 =cut
|
|
173
|
|
174 #Do we still use the optional subset function?
|
|
175
|
|
# Registers an InputSubset on this InputSet under the given name and
# returns it.
#
# Arg [1] : subset name - must be a true, non-reference scalar
# Arg [2] : optional Bio::EnsEMBL::Funcgen::InputSubset with a dbID; when
#           omitted a new InputSubset is built from the name and this set
#
# Throws if the name is missing or is a reference, if a subset of that
# name is already registered, or if the supplied subset is not an
# InputSubset with a dbID.
sub add_new_subset {
  my ($self, $ss_name, $subset) = @_;

  # A reference passed as the name makes ref(\$ss_name) something other
  # than 'SCALAR', so it is rejected here along with empty/false names.
  my $is_plain_scalar = (ref(\$ss_name) eq 'SCALAR');

  unless ($ss_name && $is_plain_scalar) {
    throw('You must pass a InputSubset name');
  }

  throw("Subset $ss_name is already present in this InputSet, maybe you need to alter the filename?")
    if $self->get_subset_by_name($ss_name);

  if (! defined $subset) {
    # No subset supplied: create one from the name alone.
    $subset = Bio::EnsEMBL::Funcgen::InputSubset->new(
      -name      => $ss_name,
      -input_set => $self,
    );
  }
  elsif (! (ref($subset) && $subset->isa('Bio::EnsEMBL::Funcgen::InputSubset') && $subset->dbID())) {
    throw('InputSubsets must be valid and stored');
  }

  $self->{subsets}{$ss_name} = $subset;

  return $subset;
}
|
|
206
|
|
207
|
|
208 =head2 get_Experiment
|
|
209
|
|
210 Example : my $exp = $exp_set->get_Experiment();
|
|
211 Description: Getter for the Experiment of this InputSet.
|
|
212 Returntype : Bio::EnsEMBL::Funcgen::Experiment
|
|
213 Exceptions : None
|
|
214 Caller : General
|
|
215 Status : At Risk
|
|
216
|
|
217 =cut
|
|
218
|
|
# Returns the Experiment stored on this InputSet (the 'experiment' slot).
sub get_Experiment {
  my ($self) = @_;

  return $self->{experiment};
}
|
|
220
|
|
221
|
|
222 =head2 get_InputSubsets
|
|
223
|
|
224 Example : my @subsets = @{$exp_set->get_InputSubsets()};
|
|
225 Description: Getter for the InputSubsets for this InputSet.
|
|
226 Returntype : Arrayref
|
|
227 Exceptions : None
|
|
228 Caller : General
|
|
229 Status : At Risk
|
|
230
|
|
231 =cut
|
|
232
|
|
# Returns a new array reference holding every InputSubset registered on
# this InputSet, in the hash's internal (unspecified) order.
sub get_InputSubsets {
  my $self = shift;

  my @subsets = values %{ $self->{'subsets'} };

  return \@subsets;
}
|
|
238
|
|
239
|
|
240
|
|
241
|
|
242 =head2 get_subset_by_name
|
|
243
|
|
244 Example : my $subsets = $exp_set->get_subset_by_name('subset1');
|
|
245 Description: Getter for the subset of a given name for this InputSet.
|
|
246 Returntype : Bio::EnsEMBL::Funcgen::InputSubset
|
|
247 Exceptions : None
|
|
248 Caller : General
|
|
249 Status : At Risk
|
|
250
|
|
251 =cut
|
|
252
|
|
# Looks up a registered InputSubset by name.
# Returns the stored subset, or undef when no subset of that name exists.
sub get_subset_by_name {
  my ($self, $name) = @_;

  if (exists $self->{'subsets'}{$name}) {
    return $self->{'subsets'}{$name};
  }

  # Explicit undef (not a bare return) so list-context callers still
  # receive a single undef element.
  return undef;
}
|
|
257
|
|
258
|
|
259 =head2 get_subset_names
|
|
260
|
|
261 Example : my @subset_names = @{$exp_set->get_subset_names()};
|
|
262 Description: Getter for the subset names for this InputSet.
|
|
263 Returntype : Arrayref
|
|
264 Exceptions : None
|
|
265 Caller : General
|
|
266 Status : At Risk
|
|
267
|
|
268 =cut
|
|
269
|
|
# Returns a new array reference holding the names of all InputSubsets
# registered on this InputSet, in the hash's internal (unspecified) order.
sub get_subset_names {
  my $self = shift;

  my @names = keys %{ $self->{'subsets'} };

  return \@names;
}
|
|
274
|
|
275
|
|
276
|
|
277
|
|
278 =head2 vendor
|
|
279
|
|
280 Arg[1] : String - vendor e.g. ILLUMINA
|
|
281 Example : my $iset_vendor = $iset->vendor;
|
|
282 Description: Getter for the vendor attribute of this InputSet.
|
|
283 Returntype : String
|
|
284 Exceptions : None
|
|
285 Caller : General
|
|
286 Status : At Risk
|
|
287
|
|
288 =cut
|
|
289
|
|
# Read-only accessor: returns the vendor stored on this InputSet.
sub vendor {
  my ($self) = @_;

  return $self->{vendor};
}
|
|
291
|
|
292
|
|
293 =head2 format
|
|
294
|
|
295 Arg[1] : string - format i.e. product type/format
|
|
296 Example : my $iset_format = $iset->format;
|
|
297 Description: Getter for the format attribute of this InputSet.
|
|
298 Returntype : String
|
|
299 Exceptions : None
|
|
300 Caller : General
|
|
301 Status : At Risk
|
|
302
|
|
303 =cut
|
|
304
|
|
# Read-only accessor: returns the format stored on this InputSet.
sub format {
  my ($self) = @_;

  return $self->{format};
}
|
|
306
|
|
307
|
|
308 =head2 replicate
|
|
309
|
|
310 Arg[1] : Integer - replicate 0 = merged or NA, >0 refers to individual replicate
|
|
311 Example : if($iset->replicate){ #Do something replicate specific in here }
|
|
312 Description: Getter for the replicate attribute of this InputSet.
|
|
313 Returntype : Integer
|
|
314 Exceptions : None
|
|
315 Caller : General
|
|
316 Status : At Risk
|
|
317
|
|
318 =cut
|
|
319
|
|
# Read-only accessor: returns the replicate value stored on this InputSet.
sub replicate {
  my ($self) = @_;

  return $self->{replicate};
}
|
|
321
|
|
322
|
|
323
|
|
324 =head2 source_info
|
|
325
|
|
326 Example : my $source_info = $input_set->source_info;
|
|
327 Description: Getter for the experiment source info i.e. [ $label, $url ]
|
|
328 Returntype : Listref
|
|
329 Exceptions : None
|
|
330 Caller : General
|
|
331 Status : At risk
|
|
332
|
|
333 =cut
|
|
334
|
|
335 #Currently handling redundant/absent InputSubset data
|
|
336
|
|
# Returns an array reference of [ $label, $url ] pairs describing where the
# data of this InputSet came from. The result is built on first call and
# cached in $self->{source_info}.
#
# For each InputSubset:
#  - if it has an archive_id, the pair is [ archive_id, undef ], recorded
#    once per distinct archive_id;
#  - otherwise, if the experimental group is a project, the pair is
#    [ group name, link ] where link is the subset's display_url or, failing
#    that, the group's url, recorded once per distinct link;
#  - otherwise the subset contributes nothing.
sub source_info {
  my $self = shift;

  return $self->{source_info} if defined $self->{source_info};

  #could have data_url as highest priority here
  #but we need to ensure removal when adding archive ids
  #so we link to the archive and not the old data url
  my $group = $self->get_Experiment->experimental_group;
  my ($project_name, $project_url);

  if ($group->is_project) {
    $project_name = $group->name;
    $project_url  = $group->url;
  }

  # Keyed by archive id or by link, so redundant InputSubsets collapse
  # into a single entry.
  my %entry_for;

  for my $subset (@{ $self->get_InputSubsets }) {

    if (defined $subset->archive_id) {
      my $label = $subset->archive_id;

      # The link is left undef because the archive id overrides the display
      # url; per the original note, undef links resolve to the SRA.
      $entry_for{$label} = [$label, undef] unless exists $entry_for{$label};
      next;
    }

    next unless defined $project_name;

    my $link = $subset->display_url || $project_url;

    $entry_for{$link} = [$project_name, $link] unless exists $entry_for{$link};
  }

  $self->{source_info} = [values %entry_for];

  return $self->{source_info};
}
|
|
380
|
|
381
|
|
382
|
|
383 1;
|
|
384
|