0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Utils::AssemblyProjector -
|
|
24 utility class to post-process projections from one assembly to another
|
|
25
|
|
26 =head1 SYNOPSIS
|
|
27
|
|
28 # connect to an old database
|
|
29 my $dba_old = new Bio::EnsEMBL::DBSQL::DBAdaptor(
|
|
30 -host => 'ensembldb.ensembl.org',
|
|
31 -port => 3306,
|
|
32 -user => ensro,
|
|
33 -dbname => 'mus_musculus_core_46_36g',
|
|
34 -group => 'core_old',
|
|
35 );
|
|
36
|
|
37 # connect to the new database containing the mapping between old and
|
|
38 # new assembly
|
|
39 my $dba_new = new Bio::EnsEMBL::DBSQL::DBAdaptor(
|
|
40 -host => 'ensembldb.ensembl.org',
|
|
41 -port => 3306,
|
|
42 -user => ensro,
|
|
43 -dbname => 'mus_musculus_core_47_37',
|
|
44 -group => 'core_new',
|
|
45 );
|
|
46
|
|
47 my $assembly_projector = Bio::EnsEMBL::Utils::AssemblyProjector->new(
|
|
48 -OLD_ASSEMBLY => 'NCBIM36',
|
|
49 -NEW_ASSEMBLY => 'NCBIM37',
|
|
50 -ADAPTOR => $dba_new,
|
|
51 -EXTERNAL_SOURCE => 1,
|
|
52 -MERGE_FRAGMENTS => 1,
|
|
53 -CHECK_LENGTH => 0,
|
|
54 );
|
|
55
|
|
56 # fetch a slice on the old assembly
|
|
57 my $slice_adaptor = $dba_old->get_SliceAdaptor;
|
|
58 my $slice =
|
|
59 $slice_adaptor->fetch_by_region( 'chromosome', 1, undef, undef,
|
|
60 undef, 'NCBIM36' );
|
|
61
|
|
62 my $new_slice = $assembly_projector->old_to_new($slice);
|
|
63
|
|
64 print $new_slice->name, " (", $assembly_projector->last_status, ")\n";
|
|
65
|
|
66 =head1 DESCRIPTION
|
|
67
|
|
68 This class implements some utility functions for converting coordinates
|
|
69 between assemblies. A mapping between the two assemblies has to present
|
|
70 the database for this to work, see the 'Related Modules' section below
|
|
71 on how to generate the mapping.
|
|
72
|
|
73 In addition to the "raw" projecting of features and slices, the methods
|
|
74 in this module also apply some sensible rules to the results of the
|
|
75 projection (like discarding unwanted results or merging fragmented
|
|
76 projections). These are the rules (depending on configuration):
|
|
77
|
|
78 Discard the projected feature/slice if:
|
|
79
|
|
80 1. it doesn't project at all (no segments returned)
|
|
81 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
|
|
82 than one segment)
|
|
83 3. [if CHECK_LENGTH is set] the projection doesn't have the same
|
|
84 length as the original feature/slice
|
|
85 4. all segments are on same chromosome and strand
|
|
86
|
|
87 If a projection fails any of these rules, undef is returned instead of
|
|
88 a projected feature/slice. You can use the last_status() method to find
|
|
89 out about the results of the rules tests.
|
|
90
|
|
91 Also note that when projecting features, only a shallow projection is
|
|
92 performed, i.e. other features attached to your features (e.g. the
|
|
93 transcripts of a gene) are not projected automatically, so it will be
|
|
94 the responsability of the user code project all levels of features
|
|
95 involved.
|
|
96
|
|
97 =head1 METHODS
|
|
98
|
|
99 new
|
|
100 project
|
|
101 old_to_new
|
|
102 new_to_old
|
|
103 adaptor
|
|
104 external_source
|
|
105 old_assembly
|
|
106 new_assembly
|
|
107 merge_fragments
|
|
108 check_length
|
|
109
|
|
110 =head1 RELATED MODULES
|
|
111
|
|
112 The process of creating a whole genome alignment between two assemblies
|
|
113 (which is the basis for the use of the methods in this class) is done by
|
|
114 a series of scripts. Please see
|
|
115
|
|
116 ensembl/misc-scripts/assembly/README
|
|
117
|
|
118 for a high-level description of this process, and POD in the individual
|
|
119 scripts for the details.
|
|
120
|
|
121 =cut
|
|
122
|
|
123 package Bio::EnsEMBL::Utils::AssemblyProjector;
|
|
124
|
|
125 use strict;
|
|
126 use warnings;
|
|
127 no warnings qw(uninitialized);
|
|
128
|
|
129 use Bio::EnsEMBL::Utils::Exception qw(throw warning);
|
|
130 use Bio::EnsEMBL::Utils::Argument qw(rearrange);
|
|
131 use Bio::EnsEMBL::Slice;
|
|
132 use Scalar::Util qw(weaken);
|
|
133
|
|
134 =head2 new
|
|
135
|
|
136 Arg [ADAPTOR] : Bio::EnsEMBL::DBSQL::DBAdaptor $adaptor - a db adaptor
|
|
137 for a database containing the assembly mapping
|
|
138 Arg [EXTERNAL_SOURCE] : (optional) Boolean $external_source - indicates if
|
|
139 source is from a different database
|
|
140 Arg [OLD_ASSEMBLY] : name of the old assembly
|
|
141 Arg [OLD_ASSEMBLY] : name of the new assembly
|
|
142 Arg [OBJECT_TYPE] : (optional) object type ('slice' or 'feature')
|
|
143 Arg [MERGE_FRAGMENTS] : (optional) Boolean - determines if segments are merged
|
|
144 to return a single object spanning all segments
|
|
145 (default: true)
|
|
146 Arg [CHECK_LENGTH] : (optional) Boolean - determines if projected objects
|
|
147 have to have same length as original (default: false)
|
|
148 Example : my $ap = Bio::EnsEMBL::Utils::AssemblyProjector->new(
|
|
149 -DBADAPTOR => $dba,
|
|
150 -OLD_ASSEMBLY => NCBIM36,
|
|
151 -NEW_ASSEMBLY => NCBIM37,
|
|
152 );
|
|
153 Description : Constructor.
|
|
154 Return type : a Bio::EnsEMBL::Utils::AssemblyProjector object
|
|
155 Exceptions : thrown on missing arguments
|
|
156 thrown on invalid OBJECT_TYPE
|
|
157 Caller : general
|
|
158 Status : At Risk
|
|
159 : under development
|
|
160
|
|
161 =cut
|
|
162
|
|
163 sub new {
|
|
164 my $caller = shift;
|
|
165 my $class = ref($caller) || $caller;
|
|
166
|
|
167 my ($adaptor, $external_source, $old_assembly, $new_assembly,
|
|
168 $merge_fragments, $check_length) = rearrange([qw(ADAPTOR EXTERNAL_SOURCE
|
|
169 OLD_ASSEMBLY NEW_ASSEMBLY MERGE_FRAGMENTS CHECK_LENGTH)], @_);
|
|
170
|
|
171 unless ($adaptor and ref($adaptor) and
|
|
172 $adaptor->isa('Bio::EnsEMBL::DBSQL::DBAdaptor')) {
|
|
173 throw("You must provide a DBAdaptor to a database containing the assembly mapping.");
|
|
174 }
|
|
175
|
|
176 unless ($old_assembly and $new_assembly) {
|
|
177 throw("You must provide an old and new assembly name.");
|
|
178 }
|
|
179
|
|
180 my $self = {};
|
|
181 bless ($self, $class);
|
|
182
|
|
183 # initialise
|
|
184 $self->adaptor($adaptor);
|
|
185 $self->{'old_assembly'} = $old_assembly;
|
|
186 $self->{'new_assembly'} = $new_assembly;
|
|
187
|
|
188 # by default, merge fragments
|
|
189 $self->{'merge_fragments'} = $merge_fragments || 1;
|
|
190
|
|
191 # by default, do not check length
|
|
192 $self->{'check_length'} = $check_length || 0;
|
|
193
|
|
194 # by default, features and slices are expected in same database as the
|
|
195 # assembly mapping
|
|
196 $self->{'external_source'} = $external_source || 0;
|
|
197
|
|
198 return $self;
|
|
199 }
|
|
200
|
|
201
|
|
202 =head2 project
|
|
203
|
|
204 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
|
|
205 the object to project
|
|
206 Arg[2] : String $to_assembly - assembly to project to
|
|
207 Example : my $new_slice = $assembly_projector->project($old_slice,
|
|
208 'NCBIM37');
|
|
209 Description : Projects a Slice or Feature to the specified assembly.
|
|
210
|
|
211 Several tests are performed on the result to discard unwanted
|
|
212 results. All projection segments have to be on the same
|
|
213 seq_region and strand. If -MERGE_FRAGMENTS is set, gaps will be
|
|
214 bridged by creating a single object from first_segment_start to
|
|
215 last_segment_end. If -CHECK_LENGTH is set, the projected object
|
|
216 will have to have the same length as the original. You can use
|
|
217 the last_status() method to find out what the result of some of
|
|
218 these rule tests were. Please see the comments in the code for
|
|
219 more details about these rules.
|
|
220
|
|
221 The return value of this method will always be a single object,
|
|
222 or undef if the projection fails any of the rules.
|
|
223
|
|
224 Note that when projecting features, only a "shallow" projection
|
|
225 is performed, i.e. attached features aren't projected
|
|
226 automatically! (e.g. if you project a gene, its transcripts will
|
|
227 have to be projected manually before storing the new gene)
|
|
228 Return type : same a Arg 1, or undef if projection fails any of the rules
|
|
229 Exceptions : thrown on invalid arguments
|
|
230 Caller : general, $self->old_to_new, $self->new_to_old
|
|
231 Status : At Risk
|
|
232 : under development
|
|
233
|
|
234 =cut
|
|
235
|
|
236 sub project {
|
|
237 my ($self, $object, $to_assembly) = @_;
|
|
238
|
|
239 throw("Need an assembly version to project to.") unless ($to_assembly);
|
|
240 throw("Need an object to project.") unless ($object and ref($object));
|
|
241
|
|
242 my ($slice, $object_type);
|
|
243
|
|
244 if ($object->isa('Bio::EnsEMBL::Feature')) {
|
|
245 $object_type = 'feature';
|
|
246 } elsif ($object->isa('Bio::EnsEMBL::Slice')) {
|
|
247 $object_type = 'slice';
|
|
248 } else {
|
|
249 throw("Need a Feature or Slice to project.");
|
|
250 }
|
|
251
|
|
252 # if the feature or slice is sourced from another db, we have to "transfer"
|
|
253 # it to the db that contains the assembly mapping. the transfer is very
|
|
254 # shallow but that should do for our purposes
|
|
255 if ($self->external_source) {
|
|
256 my $slice_adaptor = $self->adaptor->get_SliceAdaptor;
|
|
257
|
|
258 if ($object_type eq 'feature') {
|
|
259
|
|
260 # createa a new slice from the target db
|
|
261 my $f_slice = $object->slice;
|
|
262 my $target_slice = $slice_adaptor->fetch_by_name($f_slice->name);
|
|
263
|
|
264 # now change the feature so that it appears it's from the target db
|
|
265 $object->slice($target_slice);
|
|
266
|
|
267 } else {
|
|
268
|
|
269 # createa a new slice from the target db
|
|
270 $object = $slice_adaptor->fetch_by_name($object->name);
|
|
271
|
|
272 }
|
|
273 }
|
|
274
|
|
275 if ($object_type eq 'feature') {
|
|
276 $slice = $object->feature_Slice;
|
|
277 } else {
|
|
278 $slice = $object;
|
|
279 }
|
|
280
|
|
281 # warn if trying to project to assembly version the object already is on
|
|
282 if ($slice->coord_system->version eq $to_assembly) {
|
|
283 warning("Assembly version to project to ($to_assembly) is the same as your object's assembly (".$slice->coord_system->version.").");
|
|
284 }
|
|
285
|
|
286 # now project the slice
|
|
287 my $cs_name = $slice->coord_system_name;
|
|
288 my @segments = @{ $slice->project($cs_name, $to_assembly) };
|
|
289
|
|
290 # we need to reverse the projection segment list if the orignial
|
|
291 if ($slice->strand == -1) {
|
|
292 @segments = reverse(@segments);
|
|
293 }
|
|
294
|
|
295 # apply rules to projection results
|
|
296 #
|
|
297 # discard the projected feature/slice if
|
|
298 # 1. it doesn't project at all (no segments returned)
|
|
299 # 2. [unless MERGE_FRAGMENTS is set] the projection is fragmented (more
|
|
300 # than one segment)
|
|
301 # 3. [if CHECK_LENGTH is set] the projection doesn't have the same length
|
|
302 # as the original feature/slice
|
|
303 # 4. all segments are on same chromosome and strand
|
|
304
|
|
305 # keep track of the status of applied rules
|
|
306 my @status = ();
|
|
307
|
|
308 # test (1)
|
|
309 return undef unless (@segments);
|
|
310 #warn "DEBUG: passed test 1\n";
|
|
311
|
|
312 # test (2)
|
|
313 return undef if (!($self->merge_fragments) and scalar(@segments) > 1);
|
|
314 push @status, 'fragmented' if (scalar(@segments) > 1);
|
|
315 #warn "DEBUG: passed test 2\n";
|
|
316
|
|
317 # test (3)
|
|
318 my $first_slice = $segments[0]->to_Slice;
|
|
319 my $last_slice = $segments[-1]->to_Slice;
|
|
320 my $length_mismatch = (($last_slice->end - $first_slice->start + 1) !=
|
|
321 $object->length);
|
|
322 return undef if ($self->check_length and $length_mismatch);
|
|
323 push @status, 'length_mismatch' if ($length_mismatch);
|
|
324 #warn "DEBUG: passed test 3\n";
|
|
325
|
|
326 # test (4)
|
|
327 my %sr_names = ();
|
|
328 my %strands = ();
|
|
329 foreach my $seg (@segments) {
|
|
330 my $sl = $seg->to_Slice;
|
|
331 $sr_names{$sl->seq_region_name}++;
|
|
332 $strands{$sl->strand}++;
|
|
333 }
|
|
334 return undef if (scalar(keys %sr_names) > 1 or scalar(keys %strands) > 1);
|
|
335 #warn "DEBUG: passed test 4\n";
|
|
336
|
|
337 # remember rule status
|
|
338 $self->last_status(join('|', @status));
|
|
339
|
|
340 # everything looks fine, so adjust the coords of your feature/slice
|
|
341 my $new_slice = $first_slice;
|
|
342 $new_slice->{'end'} = $last_slice->end;
|
|
343
|
|
344 if ($object_type eq 'slice') {
|
|
345 return $new_slice;
|
|
346 } else {
|
|
347
|
|
348 $object->start($new_slice->start);
|
|
349 $object->end($new_slice->end);
|
|
350 $object->strand($new_slice->strand);
|
|
351 $object->slice($new_slice->seq_region_Slice);
|
|
352
|
|
353 # undef dbID and adaptor so you can store the feature in the target db
|
|
354 $object->dbID(undef);
|
|
355 $object->adaptor(undef);
|
|
356
|
|
357 return $object;
|
|
358 }
|
|
359
|
|
360 }
|
|
361
|
|
362
|
|
363 =head2 old_to_new
|
|
364
|
|
365 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
|
|
366 the object to project
|
|
367 Example : my $new_slice = $assembly_projector->old_to_new($old_slice);
|
|
368 Description : Projects a Slice or Feature from old to new assembly.
|
|
369 This method is just a convenience wrapper for $self->project.
|
|
370 Return type : same a Arg 1, or undef
|
|
371 Exceptions : none
|
|
372 Caller : general
|
|
373 Status : At Risk
|
|
374 : under development
|
|
375
|
|
376 =cut
|
|
377
|
|
378 sub old_to_new {
|
|
379 my ($self, $object) = @_;
|
|
380 return $self->project($object, $self->new_assembly);
|
|
381 }
|
|
382
|
|
383
|
|
384 =head2 new_to_old
|
|
385
|
|
386 Arg[1] : Bio::EnsEMBL::Slice or Bio::EnsEMBL::Feature $object -
|
|
387 the object to project
|
|
388 Example : my $old_slice = $assembly_projector->new_to_old($new_slice, 1);
|
|
389 Description : Projects a Slice or Feature from new to old assembly.
|
|
390 This method is just a convenience wrapper for $self->project.
|
|
391 Return type : same a Arg 1, or undef
|
|
392 Exceptions : none
|
|
393 Caller : general
|
|
394 Status : At Risk
|
|
395 : under development
|
|
396
|
|
397 =cut
|
|
398
|
|
399 sub new_to_old {
|
|
400 my ($self, $object) = @_;
|
|
401 return $self->project($object, $self->old_assembly);
|
|
402 }
|
|
403
|
|
404
|
|
405 #
|
|
406 # accessors
|
|
407 #
|
|
408 sub adaptor {
|
|
409 my $self = shift;
|
|
410 weaken($self->{'adaptor'} = shift) if (@_);
|
|
411 return $self->{'adaptor'};
|
|
412 }
|
|
413
|
|
414
|
|
415 sub external_source {
|
|
416 my $self = shift;
|
|
417 $self->{'external_source'} = shift if (@_);
|
|
418 return $self->{'external_source'};
|
|
419 }
|
|
420
|
|
421
|
|
422 sub old_assembly {
|
|
423 my $self = shift;
|
|
424 $self->{'old_assembly'} = shift if (@_);
|
|
425 return $self->{'old_assembly'};
|
|
426 }
|
|
427
|
|
428
|
|
429 sub new_assembly {
|
|
430 my $self = shift;
|
|
431 $self->{'new_assembly'} = shift if (@_);
|
|
432 return $self->{'new_assembly'};
|
|
433 }
|
|
434
|
|
435
|
|
436 sub merge_fragments {
|
|
437 my $self = shift;
|
|
438 $self->{'merge_fragments'} = shift if (@_);
|
|
439 return $self->{'merge_fragments'};
|
|
440 }
|
|
441
|
|
442
|
|
443 sub check_length {
|
|
444 my $self = shift;
|
|
445 $self->{'check_length'} = shift if (@_);
|
|
446 return $self->{'check_length'};
|
|
447 }
|
|
448
|
|
449
|
|
450 sub last_status {
|
|
451 my $self = shift;
|
|
452 $self->{'last_status'} = shift if (@_);
|
|
453 return $self->{'last_status'};
|
|
454 }
|
|
455
|
|
456
|
|
457 1;
|
|
458
|
|
459
|