comparison variant_effect_predictor/Bio/EnsEMBL/Utils/AssemblyProjector.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Utils::AssemblyProjector -
24 utility class to post-process projections from one assembly to another
25
26 =head1 SYNOPSIS
27
28 # connect to an old database
29 my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor(
30 -host => 'ensembldb.ensembl.org',
31 -port => 3306,
32 -user => ensro,
33 -dbname => 'mus_musculus_core_46_36g',
34 -group => 'core_old',
35 );
36
37 # connect to the new database containing the mapping between old and
38 # new assembly
39 my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor(
40 -host => 'ensembldb.ensembl.org',
41 -port => 3306,
42 -user => ensro,
43 -dbname => 'mus_musculus_core_47_37',
44 -group => 'core_new',
45 );
46
47 my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new(
48 -OLD_ASSEMBLY => 'NCBIM36',
49 -NEW_ASSEMBLY => 'NCBIM37',
50 -ADAPTOR => $dba_new,
51 -EXTERNAL_SOURCE => 1,
52 -MERGE_FRAGMENTS => 1,
53 -CHECK_LENGTH => 0,
54 );
55
56 # fetch a slice on the old assembly
57 my $slice_adaptor = $dba_old->get_SliceAdaptor;
58 my $slice =
59 $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef,
60 undef, 'NCBIM36' );
61
62 my $new_slice = $assembly_projector->old_to_new($slice);
63
64 print $new_slice->name, " (", $assembly_projector->last_status, ")\n";
65
66 =head1 DESCRIPTION
67
68 This class implements some utility functions for converting coordinates
69 between assemblies. A mapping between the two assemblies has to present
70 the database for this to work, see the 'Related Modules' section below
71 on how to generate the mapping.
72
73 In addition to the "raw" projecting of features and slices, the methods
74 in this module also apply some sensible rules to the results of the
75 projection (like discarding unwanted results or merging fragmented
76 projections). These are the rules (depending on configuration):
77
78 Discard the projected feature/slice if:
79
80 1. it doesn't project at all (no segments returned)
81 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
82 than one segment)
83 3. [if CHECK_LENGTH is set] the projection doesn't have the same
84 length as the original feature/slice
85 4. all segments are on same chromosome and strand
86
87 If a projection fails any of these rules, undef is returned instead of
88 a projected feature/slice. You can use the last_status() method to find
89 out about the results of the rules tests.
90
91 Also note that when projecting features, only a shallow projection is
92 performed, i.e. other features attached to your features (e.g. the
93 transcripts of a gene) are not projected automatically, so it will be
94 the responsability of the user code project all levels of features
95 involved.
96
97 =head1 METHODS
98
99 new
100 project
101 old_to_new
102 new_to_old
103 adaptor
104 external_source
105 old_assembly
106 new_assembly
107 merge_fragments
108 check_length
109
110 =head1 RELATED MODULES
111
112 The process of creating a whole genome alignment between two assemblies
113 (which is the basis for the use of the methods in this class) is done by
114 a series of scripts. Please see
115
116 ensembl/misc-scripts/assembly/README
117
118 for a high-level description of this process, and POD in the individual
119 scripts for the details.
120
121 =cut
122
123 package Bio::EnsEMBL::Utils::AssemblyProjector;
124
125 use strict;
126 use warnings;
127 no warnings qw(uninitialized);
128
129 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
130 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
131 use Bio::EnsEMBL::Slice;
132 use Scalar::Util qw(weaken);
133
134 =head2 new
135
136 Arg [ADAPTOR] : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor
137 for a database containing the assembly mapping
138 Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if
139 source is from a different database
140 Arg [OLD_ASSEMBLY] : name of the old assembly
141 Arg [OLD_ASSEMBLY] : name of the new assembly
142 Arg [OBJECT_TYPE] : (optional) object type ('slice' or 'feature')
143 Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged
144 to return a single object spanning all segments
145 (default: true)
146 Arg [CHECK_LENGTH] : (optional) Boolean - determines if projected objects
147 have to have same length as original (default: false)
148 Example : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new(
149 -DBADAPTOR => $dba,
150 -OLD_ASSEMBLY => NCBIM36,
151 -NEW_ASSEMBLY => NCBIM37,
152 );
153 Description : Constructor.
154 Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object
155 Exceptions : thrown on missing arguments
156 thrown on invalid OBJECT_TYPE
157 Caller : general
158 Status : At Risk
159 : under development
160
161 =cut
162
163 sub new {
164 my $caller = shift;
165 my $class = ref($caller) || $caller;
166
167 my ($adaptor, $external_source, $old_assembly, $new_assembly,
168 $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE
169 OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_);
170
171 unless ($adaptor and ref($adaptor) and
172 $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
173 throw("You must provide a DBAdaptor to a database containing the assembly mapping.");
174 }
175
176 unless ($old_assembly and $new_assembly) {
177 throw("You must provide an old and new assembly name.");
178 }
179
180 my $self = {};
181 bless ($self, $class);
182
183 # initialise
184 $self->adaptor($adaptor);
185 $self->{'old_assembly'} = $old_assembly;
186 $self->{'new_assembly'} = $new_assembly;
187
188 # by default, merge fragments
189 $self->{'merge_fragments'} = $merge_fragments || 1;
190
191 # by default, do not check length
192 $self->{'check_length'} = $check_length || 0;
193
194 # by default, features and slices are expected in same database as the
195 # assembly mapping
196 $self->{'external_source'} = $external_source || 0;
197
198 return $self;
199 }
200
201
202 =head2 project
203
204 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
205 the object to project
206 Arg[2] : String $to_assembly - assembly to project to
207 Example : my $new_slice = $assembly_projector->project($old_slice,
208 'NCBIM37');
209 Description : Projects a Slice or Feature to the specified assembly.
210
211 Several tests are performed on the result to discard unwanted
212 results. All projection segments have to be on the same
213 seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be
214 bridged by creating a single object from first_segment_start to
215 last_segment_end. If -CHECK_LENGTH is set, the projected object
216 will have to have the same length as the original. You can use
217 the last_status() method to find out what the result of some of
218 these rule tests were. Please see the comments in the code for
219 more details about these rules.
220
221 The return value of this method will always be a single object,
222 or undef if the projection fails any of the rules.
223
224 Note that when projecting features, only a "shallow" projection
225 is performed, i.e. attached features aren't projected
226 automatically! (e.g. if you project a gene, its transcripts will
227 have to be projected manually before storing the new gene)
228 Return type : same a Arg 1, or undef if projection fails any of the rules
229 Exceptions : thrown on invalid arguments
230 Caller : general, $self->old_to_new, $self->new_to_old
231 Status : At Risk
232 : under development
233
234 =cut
235
236 sub project {
237 my ($self, $object, $to_assembly) = @_;
238
239 throw("Need an assembly version to project to.") unless ($to_assembly);
240 throw("Need an object to project.") unless ($object and ref($object));
241
242 my ($slice, $object_type);
243
244 if ($object->isa('Bio::EnsEMBL::Feature')) {
245 $object_type = 'feature';
246 } elsif ($object->isa('Bio::EnsEMBL::Slice')) {
247 $object_type = 'slice';
248 } else {
249 throw("Need a Feature or Slice to project.");
250 }
251
252 # if the feature or slice is sourced from another db, we have to "transfer"
253 # it to the db that contains the assembly mapping. the transfer is very
254 # shallow but that should do for our purposes
255 if ($self->external_source) {
256 my $slice_adaptor = $self->adaptor->get_SliceAdaptor;
257
258 if ($object_type eq 'feature') {
259
260 # createa a new slice from the target db
261 my $f_slice = $object->slice;
262 my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name);
263
264 # now change the feature so that it appears it's from the target db
265 $object->slice($target_slice);
266
267 } else {
268
269 # createa a new slice from the target db
270 $object = $slice_adaptor->fetch_by_name($object->name);
271
272 }
273 }
274
275 if ($object_type eq 'feature') {
276 $slice = $object->feature_Slice;
277 } else {
278 $slice = $object;
279 }
280
281 # warn if trying to project to assembly version the object already is on
282 if ($slice->coord_system->version eq $to_assembly) {
283 warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.").");
284 }
285
286 # now project the slice
287 my $cs_name = $slice->coord_system_name;
288 my @segments = @{ $slice->project($cs_name, $to_assembly) };
289
290 # we need to reverse the projection segment list if the orignial
291 if ($slice->strand == -1) {
292 @segments = reverse(@segments);
293 }
294
295 # apply rules to projection results
296 #
297 # discard the projected feature/slice if
298 # 1. it doesn't project at all (no segments returned)
299 # 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
300 # than one segment)
301 # 3. [if CHECK_LENGTH is set] the projection doesn't have the same length
302 # as the original feature/slice
303 # 4. all segments are on same chromosome and strand
304
305 # keep track of the status of applied rules
306 my @status = ();
307
308 # test (1)
309 return undef unless (@segments);
310 #warn "DEBUG: passed test 1\n";
311
312 # test (2)
313 return undef if (!($self->merge_fragments) and scalar(@segments) > 1);
314 push @status, 'fragmented' if (scalar(@segments) > 1);
315 #warn "DEBUG: passed test 2\n";
316
317 # test (3)
318 my $first_slice = $segments[0]->to_Slice;
319 my $last_slice = $segments[-1]->to_Slice;
320 my $length_mismatch = (($last_slice->end - $first_slice->start + 1) !=
321 $object->length);
322 return undef if ($self->check_length and $length_mismatch);
323 push @status, 'length_mismatch' if ($length_mismatch);
324 #warn "DEBUG: passed test 3\n";
325
326 # test (4)
327 my %sr_names = ();
328 my %strands = ();
329 foreach my $seg (@segments) {
330 my $sl = $seg->to_Slice;
331 $sr_names{$sl->seq_region_name}++;
332 $strands{$sl->strand}++;
333 }
334 return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1);
335 #warn "DEBUG: passed test 4\n";
336
337 # remember rule status
338 $self->last_status(join('|', @status));
339
340 # everything looks fine, so adjust the coords of your feature/slice
341 my $new_slice = $first_slice;
342 $new_slice->{'end'} = $last_slice->end;
343
344 if ($object_type eq 'slice') {
345 return $new_slice;
346 } else {
347
348 $object->start($new_slice->start);
349 $object->end($new_slice->end);
350 $object->strand($new_slice->strand);
351 $object->slice($new_slice->seq_region_Slice);
352
353 # undef dbID and adaptor so you can store the feature in the target db
354 $object->dbID(undef);
355 $object->adaptor(undef);
356
357 return $object;
358 }
359
360 }
361
362
363 =head2 old_to_new
364
365 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
366 the object to project
367 Example : my $new_slice = $assembly_projector->old_to_new($old_slice);
368 Description : Projects a Slice or Feature from old to new assembly.
369 This method is just a convenience wrapper for $self->project.
370 Return type : same a Arg 1, or undef
371 Exceptions : none
372 Caller : general
373 Status : At Risk
374 : under development
375
376 =cut
377
378 sub old_to_new {
379 my ($self, $object) = @_;
380 return $self->project($object, $self->new_assembly);
381 }
382
383
384 =head2 new_to_old
385
386 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
387 the object to project
388 Example : my $old_slice = $assembly_projector->new_to_old($new_slice, 1);
389 Description : Projects a Slice or Feature from new to old assembly.
390 This method is just a convenience wrapper for $self->project.
391 Return type : same a Arg 1, or undef
392 Exceptions : none
393 Caller : general
394 Status : At Risk
395 : under development
396
397 =cut
398
399 sub new_to_old {
400 my ($self, $object) = @_;
401 return $self->project($object, $self->old_assembly);
402 }
403
404
405 #
406 # accessors
407 #
408 sub adaptor {
409 my $self = shift;
410 weaken($self->{'adaptor'} = shift) if (@_);
411 return $self->{'adaptor'};
412 }
413
414
415 sub external_source {
416 my $self = shift;
417 $self->{'external_source'} = shift if (@_);
418 return $self->{'external_source'};
419 }
420
421
422 sub old_assembly {
423 my $self = shift;
424 $self->{'old_assembly'} = shift if (@_);
425 return $self->{'old_assembly'};
426 }
427
428
429 sub new_assembly {
430 my $self = shift;
431 $self->{'new_assembly'} = shift if (@_);
432 return $self->{'new_assembly'};
433 }
434
435
436 sub merge_fragments {
437 my $self = shift;
438 $self->{'merge_fragments'} = shift if (@_);
439 return $self->{'merge_fragments'};
440 }
441
442
443 sub check_length {
444 my $self = shift;
445 $self->{'check_length'} = shift if (@_);
446 return $self->{'check_length'};
447 }
448
449
450 sub last_status {
451 my $self = shift;
452 $self->{'last_status'} = shift if (@_);
453 return $self->{'last_status'};
454 }
455
456
457 1;
458
459