Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/ProbeFeature.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # | |
2 # Ensembl module for Bio::EnsEMBL::Funcgen::ProbeFeature | |
3 # | |
4 | |
5 =head1 LICENSE | |
6 | |
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
8 Genome Research Limited. All rights reserved. | |
9 | |
10 This software is distributed under a modified Apache license. | |
11 For license details, please see | |
12 | |
13 http://www.ensembl.org/info/about/code_licence.html | |
14 | |
15 =head1 CONTACT | |
16 | |
17 Please email comments or questions to the public Ensembl | |
18 developers list at <ensembl-dev@ebi.ac.uk>. | |
19 | |
20 Questions may also be sent to the Ensembl help desk at | |
21 <helpdesk@ensembl.org>. | |
22 | |
23 =head1 NAME | |
24 | |
25 Bio::EnsEMBL::ProbeFeature - A module to represent an nucleotide probe | |
26 genomic mapping. | |
27 | |
28 =head1 SYNOPSIS | |
29 | |
30 use Bio::EnsEMBL::Funcgen::ProbeFeature; | |
31 | |
32 my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( | |
33 -PROBE => $probe, | |
34 -MISMATCHCOUNT => 0, | |
35 -SLICE => $chr_1_slice, | |
36 -START => 1_000_000, | |
37 -END => 1_000_024, | |
38 -STRAND => -1, | |
39 -ANALYSIS => $analysis, | |
40 -CIGAR_STRING => '1U2M426D2M1m21M', | |
41 ); | |
42 | |
43 | |
44 =head1 DESCRIPTION | |
45 | |
46 An ProbeFeature object represents the genomic placement of an Probe | |
47 object. The data are stored in the probe_feature table. | |
48 | |
49 =cut | |
50 | |
51 use strict; | |
52 use warnings; | |
53 | |
54 package Bio::EnsEMBL::Funcgen::ProbeFeature; | |
55 | |
56 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
57 use Bio::EnsEMBL::Utils::Exception qw( throw ); | |
58 use Bio::EnsEMBL::Feature; | |
59 use Bio::EnsEMBL::Funcgen::Storable; | |
60 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(median); | |
61 | |
62 use vars qw(@ISA); | |
63 @ISA = qw(Bio::EnsEMBL::Feature Bio::EnsEMBL::Funcgen::Storable); | |
64 | |
65 | |
66 =head2 new | |
67 | |
68 Arg [-PROBE] : Bio::EnsEMBL::Funcgen::Probe - probe | |
69 A ProbeFeature must have a probe. This probe must already be stored if | |
70 you plan to store the feature. | |
71 Arg [-MISMATCHCOUNT]: int | |
72 Number of mismatches over the length of the probe. | |
73 Arg [-SLICE] : Bio::EnsEMBL::Slice | |
74 The slice on which this feature is. | |
75 Arg [-START] : int | |
76 The start coordinate of this feature relative to the start of the slice | |
77 it is sitting on. Coordinates start at 1 and are inclusive. | |
78 Arg [-END] : int | |
79 The end coordinate of this feature relative to the start of the slice | |
80 it is sitting on. Coordinates start at 1 and are inclusive. | |
81 Arg [-STRAND] : int | |
82 The orientation of this feature. Valid values are 1, -1 and 0. | |
83 Arg [-dbID] : (optional) int | |
84 Internal database ID. | |
85 Arg [-ADAPTOR] : (optional) Bio::EnsEMBL::DBSQL::BaseAdaptor | |
86 Database adaptor. | |
87 Example : my $feature = Bio::EnsEMBL::Funcgen::ProbeFeature->new( | |
88 -PROBE => $probe, | |
89 -MISMATCHCOUNT => 0, | |
90 -SLICE => $chr_1_slice, | |
91 -START => 1_000_000, | |
92 -END => 1_000_024, | |
93 -STRAND => -1, | |
94 -ANALYSIS => $analysis, | |
95 -CIGARLINE => '15M2m3d4M', | |
96 #Can represent transcript alignment as gapped genomic alignments | |
97 #D(eletions) representing introns | |
98 #Lowercase m's showing sequence mismatches | |
99 ); | |
100 Description: Constructor for ProbeFeature objects. | |
101 Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature | |
102 Exceptions : None | |
103 Caller : General | |
104 Status : At risk | |
105 | |
106 =cut | |
107 | |
108 sub new { | |
109 my $caller = shift; | |
110 | |
111 my $class = ref($caller) || $caller; | |
112 | |
113 my $self = $class->SUPER::new(@_); | |
114 | |
115 my ($probe, $mismatchcount, $pid, $cig_line) | |
116 = rearrange(['PROBE', 'MISMATCHCOUNT', 'PROBE_ID', 'CIGAR_STRING'], @_); | |
117 | |
118 #remove mismatch? | |
119 #mandatory args? | |
120 | |
121 #warn "creating probe feature with $pid"; | |
122 $self->{'probe_id'} = $pid if $pid; | |
123 $self->probe($probe) if $probe; | |
124 $self->mismatchcount($mismatchcount) if defined $mismatchcount;#do not remove until probe mapping pipeline fixed | |
125 $self->cigar_string($cig_line) if defined $cig_line; | |
126 | |
127 #do we need to validate this against the db? Grab from slice and create new if not present? Will this be from the dnadb? | |
128 | |
129 #do we need this coordsys id if we're passing a slice? We should have the method but not in here? | |
130 | |
131 return $self; | |
132 } | |
133 | |
134 =head2 new_fast | |
135 | |
136 Args : Hashref with all internal attributes set | |
137 Example : none | |
138 Description: Quick and dirty version of new. Only works if the code is very | |
139 disciplined. | |
140 Returntype : Bio::EnsEMBL::Funcgen::ProbeFeature | |
141 Exceptions : None | |
142 Caller : General | |
143 Status : At Risk | |
144 | |
145 =cut | |
146 | |
147 sub new_fast { | |
148 bless ($_[1], $_[0]); | |
149 } | |
150 | |
151 =head2 probeset | |
152 | |
153 Arg [1] : (optional) string - probeset | |
154 Example : my $probeset = $feature->probeset(); | |
155 Description: Getter and setter for the probeset for this feature. Shortcut | |
156 for $feature->probe->probeset(), which should be used instead. | |
157 Probeset is not persisted if set with this method. | |
158 Returntype : string | |
159 Exceptions : None | |
160 Caller : General | |
161 Status : Medium Risk | |
162 : Use $feature->probe->probeset() because this may be removed | |
163 | |
164 =cut | |
165 | |
166 sub probeset { | |
167 my $self = shift; | |
168 | |
169 $self->{'probeset'} = shift if @_; | |
170 | |
171 if (! $self->{'probeset'}) { | |
172 $self->{'probeset'} = $self->probe()->probeset(); | |
173 } | |
174 | |
175 #We could bypass this entirely and call directly using proveset_id? | |
176 | |
177 | |
178 return $self->{'probeset'}; | |
179 } | |
180 | |
181 | |
182 #Only ever needs to be set in _objs_from_sth | |
183 #This is to allow linkage of probe_feature glyphs without retrieving the probeset. | |
184 | |
185 sub probeset_id{ | |
186 my $self = shift; | |
187 | |
188 return $self->{'_probeset_id'}; | |
189 } | |
190 | |
191 =head2 mismatchcount | |
192 | |
193 Arg [1] : int - number of mismatches | |
194 Example : my $mismatches = $feature->mismatchcount(); | |
195 Description: Getter and setter for number of mismatches for this feature. | |
196 Returntype : int | |
197 Exceptions : None | |
198 Caller : General | |
199 Status : High Risk | |
200 | |
201 =cut | |
202 | |
203 sub mismatchcount { | |
204 my $self = shift; | |
205 | |
206 | |
207 #replace with dynamic check of cigarline? | |
208 | |
209 $self->{'mismatchcount'} = shift if @_; | |
210 | |
211 return $self->{'mismatchcount'}; | |
212 } | |
213 | |
214 | |
215 =head2 cigar_string | |
216 | |
217 Arg [1] : str - Cigar line alignment annotation (M = Align & Seq match, m = Align matcht & Seq mismatch, D = Deletion in ProbeFeature wrt genome, U = Unknown at time of alignment) | |
218 Example : my $cg = $feature->cigar_string(); | |
219 Description: Getter and setter for number of the cigar line attribute for this feature. | |
220 Returntype : str | |
221 Exceptions : None | |
222 Caller : General | |
223 Status : High Risk | |
224 | |
225 =cut | |
226 | |
227 sub cigar_string { | |
228 my $self = shift; | |
229 | |
230 $self->{'cigar_string'} = shift if @_; | |
231 | |
232 return $self->{'cigar_string'}; | |
233 } | |
234 | |
235 | |
236 =head2 probe | |
237 | |
238 Arg [1] : Bio::EnsEMBL::Funcgen::Probe - probe | |
239 Example : my $probe = $feature->probe(); | |
240 Description: Getter, setter and lazy loader of probe attribute for | |
241 ProbeFeature objects. Features are retrieved from the database | |
242 without attached probes, so retrieving probe information for a | |
243 feature will involve another query. | |
244 Returntype : Bio::EnsEMBL::Funcgen::Probe | |
245 Exceptions : None | |
246 Caller : General | |
247 Status : at risk | |
248 | |
249 =cut | |
250 | |
251 sub probe { | |
252 my $self = shift; | |
253 my $probe = shift; | |
254 | |
255 #can we not use _probe_id here? | |
256 #why is probe_id not set sometimes? | |
257 | |
258 | |
259 #warn "in pf and probe is ".$self->{'probe_id'}; | |
260 | |
261 if ($probe) { | |
262 | |
263 #warn "Probe defined and is ".$probe. "and probe id is".$self->{'probe_id'}; | |
264 | |
265 if ( !ref $probe || !$probe->isa('Bio::EnsEMBL::Funcgen::Probe') ) { | |
266 throw('Probe must be a Bio::EnsEMBL::Funcgen::Probe object'); | |
267 } | |
268 $self->{'probe'} = $probe; | |
269 } | |
270 | |
271 if ( ! defined $self->{'probe'}){ | |
272 # && $self->dbID() && $self->adaptor() ) { | |
273 #$self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_ProbeFeature($self); | |
274 #warn "fetching probe with dbID ".$self->probe_id(); | |
275 $self->{'probe'} = $self->adaptor()->db()->get_ProbeAdaptor()->fetch_by_dbID($self->probe_id()); | |
276 } | |
277 return $self->{'probe'}; | |
278 } | |
279 | |
280 | |
281 =head2 probe_id | |
282 | |
283 Example : my $probe_id = $pfeature->probe_id(); | |
284 Description: Getter for the probe db id of the ProbeFeature | |
285 Returntype : int | |
286 Exceptions : None | |
287 Caller : General | |
288 Status : at risk | |
289 | |
290 =cut | |
291 | |
292 sub probe_id{ | |
293 my $self = shift; | |
294 | |
295 return $self->{'probe_id'} || $self->probe->dbID(); | |
296 } | |
297 | |
298 =head2 get_results_by_channel_id | |
299 | |
300 Arg [1] : int - channel_id (mandatory) | |
301 Arg [2] : string - Analysis name e.g. RawValue, VSN (optional) | |
302 Example : my @results = $feature->results(); | |
303 Description: Getter, setter and lazy loader of results attribute for | |
304 ProbeFeature objects. | |
305 Returntype : List ref to arrays containing ('score', 'Analysis logic_name'); | |
306 Exceptions : None | |
307 Caller : General | |
308 Status : Medium Risk | |
309 | |
310 =cut | |
311 | |
312 sub get_results_by_channel_id { | |
313 my $self = shift; | |
314 my $channel_id = shift; | |
315 my $anal_name = shift; | |
316 | |
317 warn("This method not fully implemented, remove/deprecate?"); | |
318 | |
319 #$self->{'results'} ||= {}; | |
320 $self->{'results_complete'} ||= 0; | |
321 | |
322 if(! $self->{'results'} || ($anal_name && ! exists $self->{'results'}{$anal_name})){ | |
323 #fetch all, set complete set flag | |
324 $self->{'results_complete'} ||= 1 if(! $anal_name); | |
325 | |
326 foreach my $results_ref(@{$self->adaptor->fetch_results_by_channel_analysis($self->probe->dbID(), | |
327 $channel_id, $anal_name)}){ | |
328 | |
329 $self->{'results'}{$$results_ref[1]} = $$results_ref[0]; | |
330 } | |
331 } | |
332 | |
333 return $self->{'results'} | |
334 } | |
335 | |
336 | |
337 #The experiment/al chip specificity has already been done by the ofa->fetch_all_by_Slice_Experiment | |
338 #This may be called with no preceding Experiment specificity | |
339 #this would return results for all experiments | |
340 #do we need to set a default Experiment? | |
341 | |
342 | |
343 #THis should return both Chip and Channel based results | |
344 #just Chip for now | |
345 #maybe retrieve and hash all if not Analysis object passed? Then return what? | |
346 | |
347 | |
348 =head2 get_result_by_Analysis_ExperimentalChips | |
349 | |
350 Arg [1] : Bio::EnsEMBL::Analysis | |
351 Arg [2] : listref - Bio::EnsEMBL::Funcgen::ExperimentalChip | |
352 Example : my $result = $feature->get_result_by_Analysis_ExperimentalChips($anal, \@echips); | |
353 Description: Getter of results attribute for a given Analysis and set of ExperimentalChips | |
354 Returntype : float | |
355 Exceptions : Throws is no Analysis or ExperimentalChips are not passed? | |
356 Caller : General | |
357 Status : High Risk | |
358 | |
359 =cut | |
360 | |
361 | |
362 #make ExperimentalChips optional? | |
363 | |
364 #or have ResultSetAdaptor? Do we need a ResultSet? | |
365 #may not have ExperimentalChip, so would need to return ec dbID aswell | |
366 | |
367 | |
368 ######This will break/return anomalous if | |
369 #ECs are passed from different experiments | |
370 #ECs are passed from different Arrays | |
371 | |
372 | |
373 sub get_result_by_Analysis_ExperimentalChips{ | |
374 my ($self, $anal, $exp_chips) = @_; | |
375 | |
376 throw("Need to pass listref of ExperiemntalChips") if(scalar(@$exp_chips) == 0); | |
377 throw("Need to pass a valid Bio::EnsEMBL::Analysis") if ! $anal->isa("Bio::EnsEMBL::Analysis"); | |
378 | |
379 my (%query_ids, %all_ids, %ac_ids); | |
380 my $anal_name = $anal->logic_name(); | |
381 | |
382 foreach my $ec(@$exp_chips){ | |
383 | |
384 throw("Need to pass a listref of Bio::EnsEMBL::Funcgen::ExperimentalChip objects") | |
385 if ! $ec->isa("Bio::EnsEMBL::Funcgen::ExperimentalChip"); | |
386 | |
387 #my $tmp_id = $self->adaptor->db->get_ArrayAdaptor->fetch_by_array_chip_dbID($ec->array_chip_id())->dbID(); | |
388 | |
389 #$array_id ||= $tmp_id; | |
390 | |
391 #throw("You have passed ExperimentalChips from different if($array_id != $tmp_id) | |
392 | |
393 #if(exists $ac_ids{$ec->array_chip_id()}){ | |
394 # throw("Multiple chip query only works with contiguous chips within an array, rather than duplicates"); | |
395 # } | |
396 | |
397 $ac_ids{$ec->array_chip_id()} = 1; | |
398 $all_ids{$ec->dbID()} = 1; | |
399 $query_ids{$ec->dbID()} = 1 if(! exists $self->{'results'}{$anal_name}{$ec->dbID()}); | |
400 | |
401 } | |
402 | |
403 | |
404 my @ec_ids = keys %query_ids; | |
405 my @all_ids = keys %all_ids; | |
406 | |
407 | |
408 #warn "ec ids @ec_ids\n"; | |
409 #warn "all ids @all_ids\n"; | |
410 | |
411 #$self->{'results'} ||= {}; | |
412 #$self->{'results_complete'} ||= 0;#do we need this now? | |
413 | |
414 if((scalar(@all_ids) - scalar(@ec_ids))> 1){ | |
415 throw("DATA ERROR - There is more than one result stored for the following ExperimentalChip ids: @all_ids"); | |
416 } | |
417 elsif(! $self->{'results'} || (($anal_name && scalar(@ec_ids) > 0) && scalar(@all_ids) == scalar(@ec_ids))){ | |
418 #fetch all, set complete set flag | |
419 #$self->{'results_complete'} ||= 1 if(! $anal_name); | |
420 #would need to look up chip and channel analyses here and call relevant fetch | |
421 #or pass the chip and then build the query as = or IN dependent on context of logic name | |
422 #if there are multiple results, last one will overwrite others | |
423 #could do foreach here to deal with retrieving all i.e. no logic name | |
424 #Can supply mutliple chips, but probe ids "should" be unique(in the DB at least) amongst contiguous array_chips | |
425 #build the cache based on logic name and table_id | |
426 #cahce key?? should we cat the ec_ids together? | |
427 | |
428 my @result_refs = @{$self->adaptor->fetch_results_by_Probe_Analysis_experimental_chip_ids($self->probe(), | |
429 $anal, | |
430 \@ec_ids)}; | |
431 | |
432 #Remove lines with no result | |
433 while(@result_refs && (! $result_refs[0]->[0])){ | |
434 shift @result_refs; | |
435 } | |
436 | |
437 my $num_results = scalar(@result_refs); | |
438 my ($result, $mpos); | |
439 #throw("Fetched more than one result for this ProbeFeature, Analysis and ExperimentalChips") if (scalar(@result_refs) >1); | |
440 | |
441 #No sort needed as we sort in the query | |
442 | |
443 if($num_results == 1){ | |
444 $result = $result_refs[0]->[0]; | |
445 } | |
446 elsif($num_results == 2){#mean | |
447 $result = ($result_refs[0]->[0] + $result_refs[1]->[0])/2; | |
448 | |
449 } | |
450 elsif($num_results > 2){#median or mean of median flanks | |
451 $mpos = $num_results/2; | |
452 | |
453 if($mpos =~ /\./){#true median | |
454 $mpos =~ s/\..*//; | |
455 $mpos ++; | |
456 $result = $result_refs[$mpos]->[0]; | |
457 }else{ | |
458 $result = ($result_refs[$mpos]->[0] + $result_refs[($mpos+1)]->[0])/2 ; | |
459 } | |
460 } | |
461 | |
462 $self->{'results'}{$anal_name}{":".join(":", @ec_ids).":"} = $result; | |
463 } | |
464 | |
465 #do we return the ec ids here to, or do we trust that the user will know to only pass contiguous rather than duplicate chips | |
466 | |
467 #how are we going to retrieve the result for one of many possible ec id keys? | |
468 #options, cat ec dbids as key, and grep them to find full key, then return result | |
469 #this may hide the duplicate chip problem | |
470 #If a query has already been made and cached,another query with one differing ID(duplicate result) may never be queried as we already have a cahced result | |
471 #We shoulld pick up duplicates before this happens | |
472 #If we try and mix ExperimentalChips from different experiments, then this would also cause multiple results, and hence hide some data | |
473 | |
474 my @keys; | |
475 foreach my $id(@all_ids){ | |
476 my @tmp = grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); | |
477 #Hacky needs sorting, quick fix for release!! | |
478 | |
479 if(@tmp){ | |
480 push @keys, grep(/:${id}:/, keys %{$self->{'results'}{$anal_name}}); | |
481 | |
482 last; | |
483 } | |
484 | |
485 } | |
486 | |
487 throw("Got more than one key for the results cache") if scalar(@keys) > 1; | |
488 | |
489 return $self->{'results'}{$anal_name}{$keys[0]}; | |
490 } | |
491 | |
492 | |
493 #Will this be too slow, can we not do one query across all tables | |
494 | |
495 sub get_result_by_ResultSet{ | |
496 my ($self, $rset) = @_; | |
497 | |
498 my $results = $rset->adaptor->fetch_results_by_probe_id_ResultSet($self->probe_id(), $rset); | |
499 | |
500 return median($results); | |
501 } | |
502 | |
503 | |
504 | |
505 | |
506 | |
507 1; | |
508 |