0
|
1 #
|
|
2 # Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature
|
|
3 #
|
|
4
|
|
5 =head1 LICENSE
|
|
6
|
|
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
8 Genome Research Limited. All rights reserved.
|
|
9
|
|
10 This software is distributed under a modified Apache license.
|
|
11 For license details, please see
|
|
12
|
|
13 http://www.ensembl.org/info/about/code_licence.html
|
|
14
|
|
15 =head1 CONTACT
|
|
16
|
|
17 Please email comments or questions to the public Ensembl
|
|
18 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
19
|
|
20 Questions may also be sent to the Ensembl help desk at
|
|
21 <helpdesk@ensembl.org>.
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe
|
|
26 genomic mapping.
|
|
27
|
|
28 =head1 SYNOPSIS
|
|
29
|
|
30 use Bio::EnsEMBL::Funcgen::ProbeFeature;
|
|
31
|
|
32 my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
|
|
33 -PROBE => $probe,
|
|
34 -MISMATCHCOUNT => 0,
|
|
35 -SLICE => $chr_1_slice,
|
|
36 -START => 1_000_000,
|
|
37 -END => 1_000_024,
|
|
38 -STRAND => -1,
|
|
39 -ANALYSIS => $analysis,
|
|
40 -CIGAR_STRING => '1U2M426D2M1m21M',
|
|
41 );
|
|
42
|
|
43
|
|
44 =head1 DESCRIPTION
|
|
45
|
|
46 An ProbeFeature object represents the genomic placement of an Probe
|
|
47 object. The data are stored in the probe_feature table.
|
|
48
|
|
49 =cut
|
|
50
|
|
51 use strict;
|
|
52 use warnings;
|
|
53
|
|
54 package Bio::EnsEMBL::Funcgen::ProbeFeature;
|
|
55
|
|
56 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
|
|
57 use Bio::EnsEMBL::Utils::Exception qw( throw );
|
|
58 use Bio::EnsEMBL::Feature;
|
|
59 use Bio::EnsEMBL::Funcgen::Storable;
|
|
60 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median);
|
|
61
|
|
62 use vars qw(@ISA);
|
|
63 @ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable);
|
|
64
|
|
65
|
|
66 =head2 new
|
|
67
|
|
68 Arg [-PROBE] : Bio::EnsEMBL::Funcgen::Probe - probe
|
|
69 A ProbeFeature must have a probe. This probe must already be stored if
|
|
70 you plan to store the feature.
|
|
71 Arg [-MISMATCHCOUNT]: int
|
|
72 Number of mismatches over the length of the probe.
|
|
73 Arg [-SLICE] : Bio::EnsEMBL::Slice
|
|
74 The slice on which this feature is.
|
|
75 Arg [-START] : int
|
|
76 The start coordinate of this feature relative to the start of the slice
|
|
77 it is sitting on. Coordinates start at 1 and are inclusive.
|
|
78 Arg [-END] : int
|
|
79 The end coordinate of this feature relative to the start of the slice
|
|
80 it is sitting on. Coordinates start at 1 and are inclusive.
|
|
81 Arg [-STRAND] : int
|
|
82 The orientation of this feature. Valid values are 1, -1 and 0.
|
|
83 Arg [-dbID] : (optional) int
|
|
84 Internal database ID.
|
|
85 Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor
|
|
86 Database adaptor.
|
|
87 Example : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
|
|
88 -PROBE => $probe,
|
|
89 -MISMATCHCOUNT => 0,
|
|
90 -SLICE => $chr_1_slice,
|
|
91 -START => 1_000_000,
|
|
92 -END => 1_000_024,
|
|
93 -STRAND => -1,
|
|
94 -ANALYSIS => $analysis,
|
|
95 -CIGARLINE => '15M2m3d4M',
|
|
96 #Can represent transcript alignment as gapped genomic alignments
|
|
97 #D(eletions) representing introns
|
|
98 #Lowercase m's showing sequence mismatches
|
|
99 );
|
|
100 Description: Constructor for ProbeFeature objects.
|
|
101 Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
|
|
102 Exceptions : None
|
|
103 Caller : General
|
|
104 Status : At risk
|
|
105
|
|
106 =cut
|
|
107
|
|
108 sub new {
|
|
109 my $caller = shift;
|
|
110
|
|
111 my $class = ref($caller) || $caller;
|
|
112
|
|
113 my $self = $class->SUPER::new(@_);
|
|
114
|
|
115 my ($probe, $mismatchcount, $pid, $cig_line)
|
|
116 = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_);
|
|
117
|
|
118 #remove mismatch?
|
|
119 #mandatory args?
|
|
120
|
|
121 #warn "creating probe feature with $pid";
|
|
122 $self->{'probe_id'} = $pid if $pid;
|
|
123 $self->probe($probe) if $probe;
|
|
124 $self->mismatchcount($mismatchcount) if defined $mismatchcount;#do not remove until probe mapping pipeline fixed
|
|
125 $self->cigar_string($cig_line) if defined $cig_line;
|
|
126
|
|
127 #do we need to validate this against the db? Grab from slice and create new if not present? Will this be from the dnadb?
|
|
128
|
|
129 #do we need this coordsys id if we're passing a slice? We should have the method but not in here?
|
|
130
|
|
131 return $self;
|
|
132 }
|
|
133
|
|
134 =head2 new_fast
|
|
135
|
|
136 Args : Hashref with all internal attributes set
|
|
137 Example : none
|
|
138 Description: Quick and dirty version of new. Only works if the code is very
|
|
139 disciplined.
|
|
140 Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
|
|
141 Exceptions : None
|
|
142 Caller : General
|
|
143 Status : At Risk
|
|
144
|
|
145 =cut
|
|
146
|
|
147 sub new_fast {
|
|
148 bless ($_[1], $_[0]);
|
|
149 }
|
|
150
|
|
151 =head2 probeset
|
|
152
|
|
153 Arg [1] : (optional) string - probeset
|
|
154 Example : my $probeset = $feature->probeset();
|
|
155 Description: Getter and setter for the probeset for this feature. Shortcut
|
|
156 for $feature->probe->probeset(), which should be used instead.
|
|
157 Probeset is not persisted if set with this method.
|
|
158 Returntype : string
|
|
159 Exceptions : None
|
|
160 Caller : General
|
|
161 Status : Medium Risk
|
|
162 : Use $feature->probe->probeset() because this may be removed
|
|
163
|
|
164 =cut
|
|
165
|
|
166 sub probeset {
|
|
167 my $self = shift;
|
|
168
|
|
169 $self->{'probeset'} = shift if @_;
|
|
170
|
|
171 if (! $self->{'probeset'}) {
|
|
172 $self->{'probeset'} = $self->probe()->probeset();
|
|
173 }
|
|
174
|
|
175 #We could bypass this entirely and call directly using proveset_id?
|
|
176
|
|
177
|
|
178 return $self->{'probeset'};
|
|
179 }
|
|
180
|
|
181
|
|
182 #Only ever needs to be set in _objs_from_sth
|
|
183 #This is to allow linkage of probe_feature glyphs without retrieving the probeset.
|
|
184
|
|
185 sub probeset_id{
|
|
186 my $self = shift;
|
|
187
|
|
188 return $self->{'_probeset_id'};
|
|
189 }
|
|
190
|
|
191 =head2 mismatchcount
|
|
192
|
|
193 Arg [1] : int - number of mismatches
|
|
194 Example : my $mismatches = $feature->mismatchcount();
|
|
195 Description: Getter and setter for number of mismatches for this feature.
|
|
196 Returntype : int
|
|
197 Exceptions : None
|
|
198 Caller : General
|
|
199 Status : High Risk
|
|
200
|
|
201 =cut
|
|
202
|
|
203 sub mismatchcount {
|
|
204 my $self = shift;
|
|
205
|
|
206
|
|
207 #replace with dynamic check of cigarline?
|
|
208
|
|
209 $self->{'mismatchcount'} = shift if @_;
|
|
210
|
|
211 return $self->{'mismatchcount'};
|
|
212 }
|
|
213
|
|
214
|
|
215 =head2 cigar_string
|
|
216
|
|
217 Arg [1] : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment)
|
|
218 Example : my $cg = $feature->cigar_string();
|
|
219 Description: Getter and setter for number of the cigar line attribute for this feature.
|
|
220 Returntype : str
|
|
221 Exceptions : None
|
|
222 Caller : General
|
|
223 Status : High Risk
|
|
224
|
|
225 =cut
|
|
226
|
|
227 sub cigar_string {
|
|
228 my $self = shift;
|
|
229
|
|
230 $self->{'cigar_string'} = shift if @_;
|
|
231
|
|
232 return $self->{'cigar_string'};
|
|
233 }
|
|
234
|
|
235
|
|
236 =head2 probe
|
|
237
|
|
238 Arg [1] : Bio::EnsEMBL::Funcgen::Probe - probe
|
|
239 Example : my $probe = $feature->probe();
|
|
240 Description: Getter, setter and lazy loader of probe attribute for
|
|
241 ProbeFeature objects. Features are retrieved from the database
|
|
242 without attached probes, so retrieving probe information for a
|
|
243 feature will involve another query.
|
|
244 Returntype : Bio::EnsEMBL::Funcgen::Probe
|
|
245 Exceptions : None
|
|
246 Caller : General
|
|
247 Status : at risk
|
|
248
|
|
249 =cut
|
|
250
|
|
251 sub probe {
|
|
252 my $self = shift;
|
|
253 my $probe = shift;
|
|
254
|
|
255 #can we not use _probe_id here?
|
|
256 #why is probe_id not set sometimes?
|
|
257
|
|
258
|
|
259 #warn "in pf and probe is ".$self->{'probe_id'};
|
|
260
|
|
261 if ($probe) {
|
|
262
|
|
263 #warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'};
|
|
264
|
|
265 if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) {
|
|
266 throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object');
|
|
267 }
|
|
268 $self->{'probe'} = $probe;
|
|
269 }
|
|
270
|
|
271 if ( ! defined $self->{'probe'}){
|
|
272 # && $self->dbID() && $self->adaptor() ) {
|
|
273 #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self);
|
|
274 #warn "fetching probe with dbID ".$self->probe_id();
|
|
275 $self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id());
|
|
276 }
|
|
277 return $self->{'probe'};
|
|
278 }
|
|
279
|
|
280
|
|
281 =head2 probe_id
|
|
282
|
|
283 Example : my $probe_id = $pfeature->probe_id();
|
|
284 Description: Getter for the probe db id of the ProbeFeature
|
|
285 Returntype : int
|
|
286 Exceptions : None
|
|
287 Caller : General
|
|
288 Status : at risk
|
|
289
|
|
290 =cut
|
|
291
|
|
292 sub probe_id{
|
|
293 my $self = shift;
|
|
294
|
|
295 return $self->{'probe_id'} || $self->probe->dbID();
|
|
296 }
|
|
297
|
|
298 =head2 get_results_by_channel_id
|
|
299
|
|
300 Arg [1] : int - channel_id (mandatory)
|
|
301 Arg [2] : string - Analysis name e.g. RawValue, VSN (optional)
|
|
302 Example : my @results = $feature->results();
|
|
303 Description: Getter, setter and lazy loader of results attribute for
|
|
304 ProbeFeature objects.
|
|
305 Returntype : List ref to arrays containing ('score', 'Analysis logic_name');
|
|
306 Exceptions : None
|
|
307 Caller : General
|
|
308 Status : Medium Risk
|
|
309
|
|
310 =cut
|
|
311
|
|
312 sub get_results_by_channel_id {
|
|
313 my $self = shift;
|
|
314 my $channel_id = shift;
|
|
315 my $anal_name = shift;
|
|
316
|
|
317 warn("This method not fully implemented, remove/deprecate?");
|
|
318
|
|
319 #$self->{'results'} ||= {};
|
|
320 $self->{'results_complete'} ||= 0;
|
|
321
|
|
322 if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){
|
|
323 #fetch all, set complete set flag
|
|
324 $self->{'results_complete'} ||= 1 if(! $anal_name);
|
|
325
|
|
326 foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(),
|
|
327 $channel_id, $anal_name)}){
|
|
328
|
|
329 $self->{'results'}{$$results_ref[1]} = $$results_ref[0];
|
|
330 }
|
|
331 }
|
|
332
|
|
333 return $self->{'results'}
|
|
334 }
|
|
335
|
|
336
|
|
337 #The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment
|
|
338 #This may be called with no preceding Experiment specificity
|
|
339 #this would return results for all experiments
|
|
340 #do we need to set a default Experiment?
|
|
341
|
|
342
|
|
343 #THis should return both Chip and Channel based results
|
|
344 #just Chip for now
|
|
345 #maybe retrieve and hash all if not Analysis object passed? Then return what?
|
|
346
|
|
347
|
|
348 =head2 get_result_by_Analysis_ExperimentalChips
|
|
349
|
|
350 Arg [1] : Bio::EnsEMBL::Analysis
|
|
351 Arg [2] : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip
|
|
352 Example : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips);
|
|
353 Description: Getter of results attribute for a given Analysis and set of ExperimentalChips
|
|
354 Returntype : float
|
|
355 Exceptions : Throws is no Analysis or ExperimentalChips are not passed?
|
|
356 Caller : General
|
|
357 Status : High Risk
|
|
358
|
|
359 =cut
|
|
360
|
|
361
|
|
362 #make ExperimentalChips optional?
|
|
363
|
|
364 #or have ResultSetAdaptor? Do we need a ResultSet?
|
|
365 #may not have ExperimentalChip, so would need to return ec dbID aswell
|
|
366
|
|
367
|
|
368 ######This will break/return anomalous if
|
|
369 #ECs are passed from different experiments
|
|
370 #ECs are passed from different Arrays
|
|
371
|
|
372
|
|
373 sub get_result_by_Analysis_ExperimentalChips{
|
|
374 my ($self, $anal, $exp_chips) = @_;
|
|
375
|
|
376 throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0);
|
|
377 throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis");
|
|
378
|
|
379 my (%query_ids, %all_ids, %ac_ids);
|
|
380 my $anal_name = $anal->logic_name();
|
|
381
|
|
382 foreach my $ec(@$exp_chips){
|
|
383
|
|
384 throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects")
|
|
385 if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip");
|
|
386
|
|
387 #my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID();
|
|
388
|
|
389 #$array_id ||= $tmp_id;
|
|
390
|
|
391 #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id)
|
|
392
|
|
393 #if(exists $ac_ids{$ec->array_chip_id()}){
|
|
394 # throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates");
|
|
395 # }
|
|
396
|
|
397 $ac_ids{$ec->array_chip_id()} = 1;
|
|
398 $all_ids{$ec->dbID()} = 1;
|
|
399 $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()});
|
|
400
|
|
401 }
|
|
402
|
|
403
|
|
404 my @ec_ids = keys %query_ids;
|
|
405 my @all_ids = keys %all_ids;
|
|
406
|
|
407
|
|
408 #warn "ec ids @ec_ids\n";
|
|
409 #warn "all ids @all_ids\n";
|
|
410
|
|
411 #$self->{'results'} ||= {};
|
|
412 #$self->{'results_complete'} ||= 0;#do we need this now?
|
|
413
|
|
414 if((scalar(@all_ids) - scalar(@ec_ids))> 1){
|
|
415 throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids");
|
|
416 }
|
|
417 elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){
|
|
418 #fetch all, set complete set flag
|
|
419 #$self->{'results_complete'} ||= 1 if(! $anal_name);
|
|
420 #would need to look up chip and channel analyses here and call relevant fetch
|
|
421 #or pass the chip and then build the query as = or IN dependent on context of logic name
|
|
422 #if there are multiple results, last one will overwrite others
|
|
423 #could do foreach here to deal with retrieving all i.e. no logic name
|
|
424 #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips
|
|
425 #build the cache based on logic name and table_id
|
|
426 #cahce key?? should we cat the ec_ids together?
|
|
427
|
|
428 my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(),
|
|
429 $anal,
|
|
430 \@ec_ids)};
|
|
431
|
|
432 #Remove lines with no result
|
|
433 while(@result_refs && (! $result_refs[0]->[0])){
|
|
434 shift @result_refs;
|
|
435 }
|
|
436
|
|
437 my $num_results = scalar(@result_refs);
|
|
438 my ($result, $mpos);
|
|
439 #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1);
|
|
440
|
|
441 #No sort needed as we sort in the query
|
|
442
|
|
443 if($num_results == 1){
|
|
444 $result = $result_refs[0]->[0];
|
|
445 }
|
|
446 elsif($num_results == 2){#mean
|
|
447 $result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2;
|
|
448
|
|
449 }
|
|
450 elsif($num_results > 2){#median or mean of median flanks
|
|
451 $mpos = $num_results/2;
|
|
452
|
|
453 if($mpos =~ /\./){#true median
|
|
454 $mpos =~ s/\..*//;
|
|
455 $mpos ++;
|
|
456 $result = $result_refs[$mpos]->[0];
|
|
457 }else{
|
|
458 $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ;
|
|
459 }
|
|
460 }
|
|
461
|
|
462 $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result;
|
|
463 }
|
|
464
|
|
465 #do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips
|
|
466
|
|
467 #how are we going to retrieve the result for one of many possible ec id keys?
|
|
468 #options, cat ec dbids as key, and grep them to find full key, then return result
|
|
469 #this may hide the duplicate chip problem
|
|
470 #If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result
|
|
471 #We shoulld pick up duplicates before this happens
|
|
472 #If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data
|
|
473
|
|
474 my @keys;
|
|
475 foreach my $id(@all_ids){
|
|
476 my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
|
|
477 #Hacky needs sorting, quick fix for release!!
|
|
478
|
|
479 if(@tmp){
|
|
480 push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
|
|
481
|
|
482 last;
|
|
483 }
|
|
484
|
|
485 }
|
|
486
|
|
487 throw("Got more than one key for the results cache") if scalar(@keys) > 1;
|
|
488
|
|
489 return $self->{'results'}{$anal_name}{$keys[0]};
|
|
490 }
|
|
491
|
|
492
|
|
493 #Will this be too slow, can we not do one query across all tables
|
|
494
|
|
495 sub get_result_by_ResultSet{
|
|
496 my ($self, $rset) = @_;
|
|
497
|
|
498 my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset);
|
|
499
|
|
500 return median($results);
|
|
501 }
|
|
502
|
|
503
|
|
504
|
|
505
|
|
506
|
|
507 1;
|
|
508
|