Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/AssemblyProjector.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Utils::AssemblyProjector - | |
24 utility class to post-process projections from one assembly to another | |
25 | |
26 =head1 SYNOPSIS | |
27 | |
28 # connect to an old database | |
29 my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor( | |
30 -host => 'ensembldb.ensembl.org', | |
31 -port => 3306, | |
32 -user => ensro, | |
33 -dbname => 'mus_musculus_core_46_36g', | |
34 -group => 'core_old', | |
35 ); | |
36 | |
37 # connect to the new database containing the mapping between old and | |
38 # new assembly | |
39 my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor( | |
40 -host => 'ensembldb.ensembl.org', | |
41 -port => 3306, | |
42 -user => ensro, | |
43 -dbname => 'mus_musculus_core_47_37', | |
44 -group => 'core_new', | |
45 ); | |
46 | |
47 my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new( | |
48 -OLD_ASSEMBLY => 'NCBIM36', | |
49 -NEW_ASSEMBLY => 'NCBIM37', | |
50 -ADAPTOR => $dba_new, | |
51 -EXTERNAL_SOURCE => 1, | |
52 -MERGE_FRAGMENTS => 1, | |
53 -CHECK_LENGTH => 0, | |
54 ); | |
55 | |
56 # fetch a slice on the old assembly | |
57 my $slice_adaptor = $dba_old->get_SliceAdaptor; | |
58 my $slice = | |
59 $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef, | |
60 undef, 'NCBIM36' ); | |
61 | |
62 my $new_slice = $assembly_projector->old_to_new($slice); | |
63 | |
64 print $new_slice->name, " (", $assembly_projector->last_status, ")\n"; | |
65 | |
66 =head1 DESCRIPTION | |
67 | |
68 This class implements some utility functions for converting coordinates | |
69 between assemblies. A mapping between the two assemblies has to present | |
70 the database for this to work, see the 'Related Modules' section below | |
71 on how to generate the mapping. | |
72 | |
73 In addition to the "raw" projecting of features and slices, the methods | |
74 in this module also apply some sensible rules to the results of the | |
75 projection (like discarding unwanted results or merging fragmented | |
76 projections). These are the rules (depending on configuration): | |
77 | |
78 Discard the projected feature/slice if: | |
79 | |
80 1. it doesn't project at all (no segments returned) | |
81 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more | |
82 than one segment) | |
83 3. [if CHECK_LENGTH is set] the projection doesn't have the same | |
84 length as the original feature/slice | |
85 4. all segments are on same chromosome and strand | |
86 | |
87 If a projection fails any of these rules, undef is returned instead of | |
88 a projected feature/slice. You can use the last_status() method to find | |
89 out about the results of the rules tests. | |
90 | |
91 Also note that when projecting features, only a shallow projection is | |
92 performed, i.e. other features attached to your features (e.g. the | |
93 transcripts of a gene) are not projected automatically, so it will be | |
94 the responsability of the user code project all levels of features | |
95 involved. | |
96 | |
97 =head1 METHODS | |
98 | |
99 new | |
100 project | |
101 old_to_new | |
102 new_to_old | |
103 adaptor | |
104 external_source | |
105 old_assembly | |
106 new_assembly | |
107 merge_fragments | |
108 check_length | |
109 | |
110 =head1 RELATED MODULES | |
111 | |
112 The process of creating a whole genome alignment between two assemblies | |
113 (which is the basis for the use of the methods in this class) is done by | |
114 a series of scripts. Please see | |
115 | |
116 ensembl/misc-scripts/assembly/README | |
117 | |
118 for a high-level description of this process, and POD in the individual | |
119 scripts for the details. | |
120 | |
121 =cut | |
122 | |
123 package Bio::EnsEMBL::Utils::AssemblyProjector; | |
124 | |
125 use strict; | |
126 use warnings; | |
127 no warnings qw(uninitialized); | |
128 | |
129 use Bio::EnsEMBL::Utils::Exception qw(throw warning); | |
130 use Bio::EnsEMBL::Utils::Argument qw(rearrange); | |
131 use Bio::EnsEMBL::Slice; | |
132 use Scalar::Util qw(weaken); | |
133 | |
134 =head2 new | |
135 | |
136 Arg [ADAPTOR] : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor | |
137 for a database containing the assembly mapping | |
138 Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if | |
139 source is from a different database | |
140 Arg [OLD_ASSEMBLY] : name of the old assembly | |
141 Arg [OLD_ASSEMBLY] : name of the new assembly | |
142 Arg [OBJECT_TYPE] : (optional) object type ('slice' or 'feature') | |
143 Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged | |
144 to return a single object spanning all segments | |
145 (default: true) | |
146 Arg [CHECK_LENGTH] : (optional) Boolean - determines if projected objects | |
147 have to have same length as original (default: false) | |
148 Example : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new( | |
149 -DBADAPTOR => $dba, | |
150 -OLD_ASSEMBLY => NCBIM36, | |
151 -NEW_ASSEMBLY => NCBIM37, | |
152 ); | |
153 Description : Constructor. | |
154 Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object | |
155 Exceptions : thrown on missing arguments | |
156 thrown on invalid OBJECT_TYPE | |
157 Caller : general | |
158 Status : At Risk | |
159 : under development | |
160 | |
161 =cut | |
162 | |
163 sub new { | |
164 my $caller = shift; | |
165 my $class = ref($caller) || $caller; | |
166 | |
167 my ($adaptor, $external_source, $old_assembly, $new_assembly, | |
168 $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE | |
169 OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_); | |
170 | |
171 unless ($adaptor and ref($adaptor) and | |
172 $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) { | |
173 throw("You must provide a DBAdaptor to a database containing the assembly mapping."); | |
174 } | |
175 | |
176 unless ($old_assembly and $new_assembly) { | |
177 throw("You must provide an old and new assembly name."); | |
178 } | |
179 | |
180 my $self = {}; | |
181 bless ($self, $class); | |
182 | |
183 # initialise | |
184 $self->adaptor($adaptor); | |
185 $self->{'old_assembly'} = $old_assembly; | |
186 $self->{'new_assembly'} = $new_assembly; | |
187 | |
188 # by default, merge fragments | |
189 $self->{'merge_fragments'} = $merge_fragments || 1; | |
190 | |
191 # by default, do not check length | |
192 $self->{'check_length'} = $check_length || 0; | |
193 | |
194 # by default, features and slices are expected in same database as the | |
195 # assembly mapping | |
196 $self->{'external_source'} = $external_source || 0; | |
197 | |
198 return $self; | |
199 } | |
200 | |
201 | |
202 =head2 project | |
203 | |
204 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - | |
205 the object to project | |
206 Arg[2] : String $to_assembly - assembly to project to | |
207 Example : my $new_slice = $assembly_projector->project($old_slice, | |
208 'NCBIM37'); | |
209 Description : Projects a Slice or Feature to the specified assembly. | |
210 | |
211 Several tests are performed on the result to discard unwanted | |
212 results. All projection segments have to be on the same | |
213 seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be | |
214 bridged by creating a single object from first_segment_start to | |
215 last_segment_end. If -CHECK_LENGTH is set, the projected object | |
216 will have to have the same length as the original. You can use | |
217 the last_status() method to find out what the result of some of | |
218 these rule tests were. Please see the comments in the code for | |
219 more details about these rules. | |
220 | |
221 The return value of this method will always be a single object, | |
222 or undef if the projection fails any of the rules. | |
223 | |
224 Note that when projecting features, only a "shallow" projection | |
225 is performed, i.e. attached features aren't projected | |
226 automatically! (e.g. if you project a gene, its transcripts will | |
227 have to be projected manually before storing the new gene) | |
228 Return type : same a Arg 1, or undef if projection fails any of the rules | |
229 Exceptions : thrown on invalid arguments | |
230 Caller : general, $self->old_to_new, $self->new_to_old | |
231 Status : At Risk | |
232 : under development | |
233 | |
234 =cut | |
235 | |
236 sub project { | |
237 my ($self, $object, $to_assembly) = @_; | |
238 | |
239 throw("Need an assembly version to project to.") unless ($to_assembly); | |
240 throw("Need an object to project.") unless ($object and ref($object)); | |
241 | |
242 my ($slice, $object_type); | |
243 | |
244 if ($object->isa('Bio::EnsEMBL::Feature')) { | |
245 $object_type = 'feature'; | |
246 } elsif ($object->isa('Bio::EnsEMBL::Slice')) { | |
247 $object_type = 'slice'; | |
248 } else { | |
249 throw("Need a Feature or Slice to project."); | |
250 } | |
251 | |
252 # if the feature or slice is sourced from another db, we have to "transfer" | |
253 # it to the db that contains the assembly mapping. the transfer is very | |
254 # shallow but that should do for our purposes | |
255 if ($self->external_source) { | |
256 my $slice_adaptor = $self->adaptor->get_SliceAdaptor; | |
257 | |
258 if ($object_type eq 'feature') { | |
259 | |
260 # createa a new slice from the target db | |
261 my $f_slice = $object->slice; | |
262 my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name); | |
263 | |
264 # now change the feature so that it appears it's from the target db | |
265 $object->slice($target_slice); | |
266 | |
267 } else { | |
268 | |
269 # createa a new slice from the target db | |
270 $object = $slice_adaptor->fetch_by_name($object->name); | |
271 | |
272 } | |
273 } | |
274 | |
275 if ($object_type eq 'feature') { | |
276 $slice = $object->feature_Slice; | |
277 } else { | |
278 $slice = $object; | |
279 } | |
280 | |
281 # warn if trying to project to assembly version the object already is on | |
282 if ($slice->coord_system->version eq $to_assembly) { | |
283 warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.")."); | |
284 } | |
285 | |
286 # now project the slice | |
287 my $cs_name = $slice->coord_system_name; | |
288 my @segments = @{ $slice->project($cs_name, $to_assembly) }; | |
289 | |
290 # we need to reverse the projection segment list if the orignial | |
291 if ($slice->strand == -1) { | |
292 @segments = reverse(@segments); | |
293 } | |
294 | |
295 # apply rules to projection results | |
296 # | |
297 # discard the projected feature/slice if | |
298 # 1. it doesn't project at all (no segments returned) | |
299 # 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more | |
300 # than one segment) | |
301 # 3. [if CHECK_LENGTH is set] the projection doesn't have the same length | |
302 # as the original feature/slice | |
303 # 4. all segments are on same chromosome and strand | |
304 | |
305 # keep track of the status of applied rules | |
306 my @status = (); | |
307 | |
308 # test (1) | |
309 return undef unless (@segments); | |
310 #warn "DEBUG: passed test 1\n"; | |
311 | |
312 # test (2) | |
313 return undef if (!($self->merge_fragments) and scalar(@segments) > 1); | |
314 push @status, 'fragmented' if (scalar(@segments) > 1); | |
315 #warn "DEBUG: passed test 2\n"; | |
316 | |
317 # test (3) | |
318 my $first_slice = $segments[0]->to_Slice; | |
319 my $last_slice = $segments[-1]->to_Slice; | |
320 my $length_mismatch = (($last_slice->end - $first_slice->start + 1) != | |
321 $object->length); | |
322 return undef if ($self->check_length and $length_mismatch); | |
323 push @status, 'length_mismatch' if ($length_mismatch); | |
324 #warn "DEBUG: passed test 3\n"; | |
325 | |
326 # test (4) | |
327 my %sr_names = (); | |
328 my %strands = (); | |
329 foreach my $seg (@segments) { | |
330 my $sl = $seg->to_Slice; | |
331 $sr_names{$sl->seq_region_name}++; | |
332 $strands{$sl->strand}++; | |
333 } | |
334 return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1); | |
335 #warn "DEBUG: passed test 4\n"; | |
336 | |
337 # remember rule status | |
338 $self->last_status(join('|', @status)); | |
339 | |
340 # everything looks fine, so adjust the coords of your feature/slice | |
341 my $new_slice = $first_slice; | |
342 $new_slice->{'end'} = $last_slice->end; | |
343 | |
344 if ($object_type eq 'slice') { | |
345 return $new_slice; | |
346 } else { | |
347 | |
348 $object->start($new_slice->start); | |
349 $object->end($new_slice->end); | |
350 $object->strand($new_slice->strand); | |
351 $object->slice($new_slice->seq_region_Slice); | |
352 | |
353 # undef dbID and adaptor so you can store the feature in the target db | |
354 $object->dbID(undef); | |
355 $object->adaptor(undef); | |
356 | |
357 return $object; | |
358 } | |
359 | |
360 } | |
361 | |
362 | |
363 =head2 old_to_new | |
364 | |
365 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - | |
366 the object to project | |
367 Example : my $new_slice = $assembly_projector->old_to_new($old_slice); | |
368 Description : Projects a Slice or Feature from old to new assembly. | |
369 This method is just a convenience wrapper for $self->project. | |
370 Return type : same a Arg 1, or undef | |
371 Exceptions : none | |
372 Caller : general | |
373 Status : At Risk | |
374 : under development | |
375 | |
376 =cut | |
377 | |
378 sub old_to_new { | |
379 my ($self, $object) = @_; | |
380 return $self->project($object, $self->new_assembly); | |
381 } | |
382 | |
383 | |
384 =head2 new_to_old | |
385 | |
386 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object - | |
387 the object to project | |
388 Example : my $old_slice = $assembly_projector->new_to_old($new_slice, 1); | |
389 Description : Projects a Slice or Feature from new to old assembly. | |
390 This method is just a convenience wrapper for $self->project. | |
391 Return type : same a Arg 1, or undef | |
392 Exceptions : none | |
393 Caller : general | |
394 Status : At Risk | |
395 : under development | |
396 | |
397 =cut | |
398 | |
399 sub new_to_old { | |
400 my ($self, $object) = @_; | |
401 return $self->project($object, $self->old_assembly); | |
402 } | |
403 | |
404 | |
405 # | |
406 # accessors | |
407 # | |
408 sub adaptor { | |
409 my $self = shift; | |
410 weaken($self->{'adaptor'} = shift) if (@_); | |
411 return $self->{'adaptor'}; | |
412 } | |
413 | |
414 | |
415 sub external_source { | |
416 my $self = shift; | |
417 $self->{'external_source'} = shift if (@_); | |
418 return $self->{'external_source'}; | |
419 } | |
420 | |
421 | |
422 sub old_assembly { | |
423 my $self = shift; | |
424 $self->{'old_assembly'} = shift if (@_); | |
425 return $self->{'old_assembly'}; | |
426 } | |
427 | |
428 | |
429 sub new_assembly { | |
430 my $self = shift; | |
431 $self->{'new_assembly'} = shift if (@_); | |
432 return $self->{'new_assembly'}; | |
433 } | |
434 | |
435 | |
436 sub merge_fragments { | |
437 my $self = shift; | |
438 $self->{'merge_fragments'} = shift if (@_); | |
439 return $self->{'merge_fragments'}; | |
440 } | |
441 | |
442 | |
443 sub check_length { | |
444 my $self = shift; | |
445 $self->{'check_length'} = shift if (@_); | |
446 return $self->{'check_length'}; | |
447 } | |
448 | |
449 | |
450 sub last_status { | |
451 my $self = shift; | |
452 $self->{'last_status'} = shift if (@_); | |
453 return $self->{'last_status'}; | |
454 } | |
455 | |
456 | |
457 1; | |
458 | |
459 |