comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #
2 # Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature
3 #
4
5 =head1 LICENSE
6
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
8 Genome Research Limited. All rights reserved.
9
10 This software is distributed under a modified Apache license.
11 For license details, please see
12
13 http://www.ensembl.org/info/about/code_licence.html
14
15 =head1 CONTACT
16
17 Please email comments or questions to the public Ensembl
18 developers list at <ensembl-dev@ebi.ac.uk>.
19
20 Questions may also be sent to the Ensembl help desk at
21 <helpdesk@ensembl.org>.
22
23 =head1 NAME
24
25 Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe
26 genomic mapping.
27
28 =head1 SYNOPSIS
29
30 use Bio::EnsEMBL::Funcgen::ProbeFeature;
31
32 my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
33 -PROBE => $probe,
34 -MISMATCHCOUNT => 0,
35 -SLICE => $chr_1_slice,
36 -START => 1_000_000,
37 -END => 1_000_024,
38 -STRAND => -1,
39 -ANALYSIS => $analysis,
40 -CIGAR_STRING => '1U2M426D2M1m21M',
41 );
42
43
44 =head1 DESCRIPTION
45
46 An ProbeFeature object represents the genomic placement of an Probe
47 object. The data are stored in the probe_feature table.
48
49 =cut
50
51 use strict;
52 use warnings;
53
54 package Bio::EnsEMBL::Funcgen::ProbeFeature;
55
56 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
57 use Bio::EnsEMBL::Utils::Exception qw( throw );
58 use Bio::EnsEMBL::Feature;
59 use Bio::EnsEMBL::Funcgen::Storable;
60 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median);
61
62 use vars qw(@ISA);
63 @ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable);
64
65
66 =head2 new
67
68 Arg [-PROBE] : Bio::EnsEMBL::Funcgen::Probe - probe
69 A ProbeFeature must have a probe. This probe must already be stored if
70 you plan to store the feature.
71 Arg [-MISMATCHCOUNT]: int
72 Number of mismatches over the length of the probe.
73 Arg [-SLICE] : Bio::EnsEMBL::Slice
74 The slice on which this feature is.
75 Arg [-START] : int
76 The start coordinate of this feature relative to the start of the slice
77 it is sitting on. Coordinates start at 1 and are inclusive.
78 Arg [-END] : int
79 The end coordinate of this feature relative to the start of the slice
80 it is sitting on. Coordinates start at 1 and are inclusive.
81 Arg [-STRAND] : int
82 The orientation of this feature. Valid values are 1, -1 and 0.
83 Arg [-dbID] : (optional) int
84 Internal database ID.
85 Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor
86 Database adaptor.
87 Example : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
88 -PROBE => $probe,
89 -MISMATCHCOUNT => 0,
90 -SLICE => $chr_1_slice,
91 -START => 1_000_000,
92 -END => 1_000_024,
93 -STRAND => -1,
94 -ANALYSIS => $analysis,
95 -CIGARLINE => '15M2m3d4M',
96 #Can represent transcript alignment as gapped genomic alignments
97 #D(eletions) representing introns
98 #Lowercase m's showing sequence mismatches
99 );
100 Description: Constructor for ProbeFeature objects.
101 Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
102 Exceptions : None
103 Caller : General
104 Status : At risk
105
106 =cut
107
108 sub new {
109 my $caller = shift;
110
111 my $class = ref($caller) || $caller;
112
113 my $self = $class->SUPER::new(@_);
114
115 my ($probe, $mismatchcount, $pid, $cig_line)
116 = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_);
117
118 #remove mismatch?
119 #mandatory args?
120
121 #warn "creating probe feature with $pid";
122 $self->{'probe_id'} = $pid if $pid;
123 $self->probe($probe) if $probe;
124 $self->mismatchcount($mismatchcount) if defined $mismatchcount;#do not remove until probe mapping pipeline fixed
125 $self->cigar_string($cig_line) if defined $cig_line;
126
127 #do we need to validate this against the db? Grab from slice and create new if not present? Will this be from the dnadb?
128
129 #do we need this coordsys id if we're passing a slice? We should have the method but not in here?
130
131 return $self;
132 }
133
134 =head2 new_fast
135
136 Args : Hashref with all internal attributes set
137 Example : none
138 Description: Quick and dirty version of new. Only works if the code is very
139 disciplined.
140 Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature
141 Exceptions : None
142 Caller : General
143 Status : At Risk
144
145 =cut
146
147 sub new_fast {
148 bless ($_[1], $_[0]);
149 }
150
151 =head2 probeset
152
153 Arg [1] : (optional) string - probeset
154 Example : my $probeset = $feature->probeset();
155 Description: Getter and setter for the probeset for this feature. Shortcut
156 for $feature->probe->probeset(), which should be used instead.
157 Probeset is not persisted if set with this method.
158 Returntype : string
159 Exceptions : None
160 Caller : General
161 Status : Medium Risk
162 : Use $feature->probe->probeset() because this may be removed
163
164 =cut
165
166 sub probeset {
167 my $self = shift;
168
169 $self->{'probeset'} = shift if @_;
170
171 if (! $self->{'probeset'}) {
172 $self->{'probeset'} = $self->probe()->probeset();
173 }
174
175 #We could bypass this entirely and call directly using proveset_id?
176
177
178 return $self->{'probeset'};
179 }
180
181
182 #Only ever needs to be set in _objs_from_sth
183 #This is to allow linkage of probe_feature glyphs without retrieving the probeset.
184
185 sub probeset_id{
186 my $self = shift;
187
188 return $self->{'_probeset_id'};
189 }
190
191 =head2 mismatchcount
192
193 Arg [1] : int - number of mismatches
194 Example : my $mismatches = $feature->mismatchcount();
195 Description: Getter and setter for number of mismatches for this feature.
196 Returntype : int
197 Exceptions : None
198 Caller : General
199 Status : High Risk
200
201 =cut
202
203 sub mismatchcount {
204 my $self = shift;
205
206
207 #replace with dynamic check of cigarline?
208
209 $self->{'mismatchcount'} = shift if @_;
210
211 return $self->{'mismatchcount'};
212 }
213
214
215 =head2 cigar_string
216
217 Arg [1] : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment)
218 Example : my $cg = $feature->cigar_string();
219 Description: Getter and setter for number of the cigar line attribute for this feature.
220 Returntype : str
221 Exceptions : None
222 Caller : General
223 Status : High Risk
224
225 =cut
226
227 sub cigar_string {
228 my $self = shift;
229
230 $self->{'cigar_string'} = shift if @_;
231
232 return $self->{'cigar_string'};
233 }
234
235
236 =head2 probe
237
238 Arg [1] : Bio::EnsEMBL::Funcgen::Probe - probe
239 Example : my $probe = $feature->probe();
240 Description: Getter, setter and lazy loader of probe attribute for
241 ProbeFeature objects. Features are retrieved from the database
242 without attached probes, so retrieving probe information for a
243 feature will involve another query.
244 Returntype : Bio::EnsEMBL::Funcgen::Probe
245 Exceptions : None
246 Caller : General
247 Status : at risk
248
249 =cut
250
251 sub probe {
252 my $self = shift;
253 my $probe = shift;
254
255 #can we not use _probe_id here?
256 #why is probe_id not set sometimes?
257
258
259 #warn "in pf and probe is ".$self->{'probe_id'};
260
261 if ($probe) {
262
263 #warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'};
264
265 if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) {
266 throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object');
267 }
268 $self->{'probe'} = $probe;
269 }
270
271 if ( ! defined $self->{'probe'}){
272 # && $self->dbID() && $self->adaptor() ) {
273 #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self);
274 #warn "fetching probe with dbID ".$self->probe_id();
275 $self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id());
276 }
277 return $self->{'probe'};
278 }
279
280
281 =head2 probe_id
282
283 Example : my $probe_id = $pfeature->probe_id();
284 Description: Getter for the probe db id of the ProbeFeature
285 Returntype : int
286 Exceptions : None
287 Caller : General
288 Status : at risk
289
290 =cut
291
292 sub probe_id{
293 my $self = shift;
294
295 return $self->{'probe_id'} || $self->probe->dbID();
296 }
297
298 =head2 get_results_by_channel_id
299
300 Arg [1] : int - channel_id (mandatory)
301 Arg [2] : string - Analysis name e.g. RawValue, VSN (optional)
302 Example : my @results = $feature->results();
303 Description: Getter, setter and lazy loader of results attribute for
304 ProbeFeature objects.
305 Returntype : List ref to arrays containing ('score', 'Analysis logic_name');
306 Exceptions : None
307 Caller : General
308 Status : Medium Risk
309
310 =cut
311
312 sub get_results_by_channel_id {
313 my $self = shift;
314 my $channel_id = shift;
315 my $anal_name = shift;
316
317 warn("This method not fully implemented, remove/deprecate?");
318
319 #$self->{'results'} ||= {};
320 $self->{'results_complete'} ||= 0;
321
322 if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){
323 #fetch all, set complete set flag
324 $self->{'results_complete'} ||= 1 if(! $anal_name);
325
326 foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(),
327 $channel_id, $anal_name)}){
328
329 $self->{'results'}{$$results_ref[1]} = $$results_ref[0];
330 }
331 }
332
333 return $self->{'results'}
334 }
335
336
337 #The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment
338 #This may be called with no preceding Experiment specificity
339 #this would return results for all experiments
340 #do we need to set a default Experiment?
341
342
343 #THis should return both Chip and Channel based results
344 #just Chip for now
345 #maybe retrieve and hash all if not Analysis object passed? Then return what?
346
347
348 =head2 get_result_by_Analysis_ExperimentalChips
349
350 Arg [1] : Bio::EnsEMBL::Analysis
351 Arg [2] : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip
352 Example : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips);
353 Description: Getter of results attribute for a given Analysis and set of ExperimentalChips
354 Returntype : float
355 Exceptions : Throws is no Analysis or ExperimentalChips are not passed?
356 Caller : General
357 Status : High Risk
358
359 =cut
360
361
362 #make ExperimentalChips optional?
363
364 #or have ResultSetAdaptor? Do we need a ResultSet?
365 #may not have ExperimentalChip, so would need to return ec dbID aswell
366
367
368 ######This will break/return anomalous if
369 #ECs are passed from different experiments
370 #ECs are passed from different Arrays
371
372
373 sub get_result_by_Analysis_ExperimentalChips{
374 my ($self, $anal, $exp_chips) = @_;
375
376 throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0);
377 throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis");
378
379 my (%query_ids, %all_ids, %ac_ids);
380 my $anal_name = $anal->logic_name();
381
382 foreach my $ec(@$exp_chips){
383
384 throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects")
385 if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip");
386
387 #my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID();
388
389 #$array_id ||= $tmp_id;
390
391 #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id)
392
393 #if(exists $ac_ids{$ec->array_chip_id()}){
394 # throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates");
395 # }
396
397 $ac_ids{$ec->array_chip_id()} = 1;
398 $all_ids{$ec->dbID()} = 1;
399 $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()});
400
401 }
402
403
404 my @ec_ids = keys %query_ids;
405 my @all_ids = keys %all_ids;
406
407
408 #warn "ec ids @ec_ids\n";
409 #warn "all ids @all_ids\n";
410
411 #$self->{'results'} ||= {};
412 #$self->{'results_complete'} ||= 0;#do we need this now?
413
414 if((scalar(@all_ids) - scalar(@ec_ids))> 1){
415 throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids");
416 }
417 elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){
418 #fetch all, set complete set flag
419 #$self->{'results_complete'} ||= 1 if(! $anal_name);
420 #would need to look up chip and channel analyses here and call relevant fetch
421 #or pass the chip and then build the query as = or IN dependent on context of logic name
422 #if there are multiple results, last one will overwrite others
423 #could do foreach here to deal with retrieving all i.e. no logic name
424 #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips
425 #build the cache based on logic name and table_id
426 #cahce key?? should we cat the ec_ids together?
427
428 my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(),
429 $anal,
430 \@ec_ids)};
431
432 #Remove lines with no result
433 while(@result_refs && (! $result_refs[0]->[0])){
434 shift @result_refs;
435 }
436
437 my $num_results = scalar(@result_refs);
438 my ($result, $mpos);
439 #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1);
440
441 #No sort needed as we sort in the query
442
443 if($num_results == 1){
444 $result = $result_refs[0]->[0];
445 }
446 elsif($num_results == 2){#mean
447 $result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2;
448
449 }
450 elsif($num_results > 2){#median or mean of median flanks
451 $mpos = $num_results/2;
452
453 if($mpos =~ /\./){#true median
454 $mpos =~ s/\..*//;
455 $mpos ++;
456 $result = $result_refs[$mpos]->[0];
457 }else{
458 $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ;
459 }
460 }
461
462 $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result;
463 }
464
465 #do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips
466
467 #how are we going to retrieve the result for one of many possible ec id keys?
468 #options, cat ec dbids as key, and grep them to find full key, then return result
469 #this may hide the duplicate chip problem
470 #If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result
471 #We shoulld pick up duplicates before this happens
472 #If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data
473
474 my @keys;
475 foreach my $id(@all_ids){
476 my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
477 #Hacky needs sorting, quick fix for release!!
478
479 if(@tmp){
480 push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}});
481
482 last;
483 }
484
485 }
486
487 throw("Got more than one key for the results cache") if scalar(@keys) > 1;
488
489 return $self->{'results'}{$anal_name}{$keys[0]};
490 }
491
492
493 #Will this be too slow, can we not do one query across all tables
494
495 sub get_result_by_ResultSet{
496 my ($self, $rset) = @_;
497
498 my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset);
499
500 return median($results);
501 }
502
503
504
505
506
507 1;
508