0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21
|
|
22 package Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;
|
|
23
|
|
24 use strict;
|
|
25 use warnings;
|
|
26
|
|
27 # This module allows conversion of ensembl objects to GFF3 and GVF by inserting
|
|
28 # to_gff (and supporting _gff_hash) methods into the necessary feature classes
|
|
29
|
|
30 {
|
|
31 package Bio::EnsEMBL::Slice;
|
|
32
|
|
33 sub gff_version {
|
|
34 return "##gff-version 3\n";
|
|
35 }
|
|
36
|
|
37 sub gff_header {
|
|
38 my $self = shift;
|
|
39
|
|
40 my %args = @_;
|
|
41
|
|
42 # build up a date string in the format specified by the GFF spec
|
|
43
|
|
44 my ( $sec, $min, $hr, $mday, $mon, $year ) = localtime;
|
|
45 $year += 1900; # correct the year
|
|
46 $mon++; # correct the month
|
|
47
|
|
48 my $date = sprintf "%4d-%02d-%02d", $year, $mon, $mday;
|
|
49
|
|
50 my $region = $self->seq_region_name;
|
|
51 my $start = $self->start;
|
|
52 my $end = $self->end;
|
|
53 my $assembly = $self->coord_system->version;
|
|
54
|
|
55 my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
|
|
56 my $tax_id = $mca->get_taxonomy_id;
|
|
57
|
|
58 my $hdr =
|
|
59 "##file-date $date\n"
|
|
60 . "##genome-build ensembl $assembly\n"
|
|
61 . "##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$tax_id\n";
|
|
62
|
|
63 $hdr .= "##sequence-region $region $start $end\n" unless $args{no_sequence_region};
|
|
64
|
|
65 return $hdr;
|
|
66 }
|
|
67
|
|
68 sub gvf_version {
|
|
69 return "##gvf-version 1.06\n";
|
|
70 }
|
|
71
|
|
72 sub gvf_header {
|
|
73 my $self = shift;
|
|
74
|
|
75 my %args = @_;
|
|
76 my $hdr = $self->gff_version;
|
|
77 $hdr .= $self->gvf_version;
|
|
78 $hdr .= $self->gff_header(@_);
|
|
79
|
|
80 my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
|
|
81 my $schema_version = $mca->get_schema_version;
|
|
82 my $species_name = $mca->get_scientific_name;
|
|
83 $species_name =~ s/ /_/g;
|
|
84 my $url = 'http://e'.$schema_version.'.ensembl.org/'.$species_name;
|
|
85
|
|
86 $hdr .= "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/so.obo?revision=1.283\n";
|
|
87 $hdr .= "##data-source Source=ensembl;version=$schema_version;url=$url\n";
|
|
88 $hdr .= "##file-version $schema_version\n";
|
|
89
|
|
90 if (my $individual = $args{individual}) {
|
|
91 $hdr .= "##individual-id ".$individual->to_gvf."\n";
|
|
92 }
|
|
93
|
|
94 if (my $population = $args{population}) {
|
|
95 $hdr .= "##attribute-method ".$population->to_gvf."\n";
|
|
96 }
|
|
97
|
|
98 return $hdr;
|
|
99 }
|
|
100 }
|
|
101
|
|
102 {
|
|
103
|
|
104 package Bio::EnsEMBL::Feature;
|
|
105
|
|
106 sub to_gff {
|
|
107
|
|
108 my $self = shift;
|
|
109
|
|
110 my %args = @_;
|
|
111
|
|
112 # This parameter is assumed to be a hashref which includes extra attributes you'd
|
|
113 # like to have appended onto the gff line for the feature
|
|
114 my $extra_attrs = $args{extra_attrs};
|
|
115
|
|
116 my $gff = $self->_gff_hash(@_);
|
|
117
|
|
118 return undef unless defined $gff;
|
|
119
|
|
120 # default optional columns, and check that all required columns are present
|
|
121
|
|
122 $gff->{score} = '.' unless defined $gff->{score};
|
|
123 $gff->{strand} = '.' unless defined $gff->{strand};
|
|
124 $gff->{phase} = '.' unless defined $gff->{phase};
|
|
125
|
|
126 for my $req (qw(source type start end)) {
|
|
127 die "'$req' attribute required for GFF" unless $gff->{$req};
|
|
128 }
|
|
129
|
|
130 # order as per GFF3 spec: http://www.sequenceontology.org/gff3.shtml
|
|
131
|
|
132 my $gff_str = join( "\t",
|
|
133 $gff->{seqid}, $gff->{source}, $gff->{type}, $gff->{start},
|
|
134 $gff->{end}, $gff->{score}, $gff->{strand}, $gff->{phase},
|
|
135 );
|
|
136
|
|
137 if ($extra_attrs) {
|
|
138
|
|
139 # combine the extra attributes with any existing ones (duplicate keys will get squashed,
|
|
140 # so attributes specified in the extra_attrs hash will override existing ones)
|
|
141
|
|
142 $gff->{attributes} = {} unless defined $gff->{attributes};
|
|
143
|
|
144 @{ $gff->{attributes} }{ keys %$extra_attrs } = values %$extra_attrs;
|
|
145 }
|
|
146
|
|
147 if ( $gff->{attributes} ) {
|
|
148
|
|
149 my @attrs;
|
|
150
|
|
151 for my $key (keys %{ $gff->{attributes} }) {
|
|
152
|
|
153 my $val = $gff->{attributes}->{$key};
|
|
154
|
|
155 if (ref $val eq 'ARRAY') {
|
|
156 push @attrs, map { $key . '='. $_ } @$val;
|
|
157 }
|
|
158 else {
|
|
159 push @attrs, $key . '=' . $val;
|
|
160 }
|
|
161 }
|
|
162
|
|
163 $gff_str .= "\t" . join( ';', @attrs );
|
|
164 }
|
|
165
|
|
166 return $gff_str;
|
|
167 }
|
|
168
|
|
169 sub _gff_hash {
|
|
170 my $self = shift;
|
|
171
|
|
172 my %args = @_;
|
|
173
|
|
174 my $rebase = $args{rebase}; # use absolute or slice-relative coordinates
|
|
175
|
|
176 my $gff_seqid = $args{gff_seqid} || $self->slice->seq_region_name;
|
|
177 my $gff_source = $args{gff_source} || $self->_gff_source;
|
|
178
|
|
179 my $seqid = $rebase ? $gff_seqid.'_'.$self->slice->start.'-'.$self->slice->end : $gff_seqid;
|
|
180 my $start = $rebase ? $self->start : $self->seq_region_start;
|
|
181 my $end = $rebase ? $self->end : $self->seq_region_end;
|
|
182
|
|
183 # GFF3 does not allow start > end, and mandates that for zero-length features (e.g. insertions)
|
|
184 # start = end and the implied insertion site is to the right of the specified base, so we use the
|
|
185 # smaller of the two values
|
|
186
|
|
187 if ($start > $end) {
|
|
188 $start = $end;
|
|
189 }
|
|
190
|
|
191 my $gff = {
|
|
192 seqid => $gff_seqid,
|
|
193 source => $gff_source,
|
|
194 type => $self->_gff_type,
|
|
195 start => $start,
|
|
196 end => $end,
|
|
197 strand => (
|
|
198 $self->strand == 1 ? '+' : ( $self->strand == -1 ? '-' : '.' )
|
|
199 )
|
|
200 };
|
|
201
|
|
202 return $gff;
|
|
203 }
|
|
204
|
|
205 sub _gff_source {
|
|
206 my $self = shift;
|
|
207
|
|
208 if ($self->analysis) {
|
|
209 return
|
|
210 $self->analysis->gff_source
|
|
211 || $self->analysis->logic_name;
|
|
212 }
|
|
213 else {
|
|
214 return ref($self);
|
|
215 }
|
|
216 }
|
|
217
|
|
218 sub _gff_type {
|
|
219 my $self = shift;
|
|
220
|
|
221 return
|
|
222 ( $self->analysis && $self->analysis->gff_feature )
|
|
223 || 'misc_feature';
|
|
224 }
|
|
225 }
|
|
226
|
|
227 {
|
|
228 package Bio::EnsEMBL::Variation::VariationFeature;
|
|
229
|
|
230 use Bio::EnsEMBL::Utils::Sequence qw(expand);
|
|
231
|
|
232 my $REFERENCE_ALLELE_IDENTIFIER = '@';
|
|
233
|
|
234 sub to_gvf {
|
|
235 my $self = shift;
|
|
236 return $self->to_gff(@_);
|
|
237 }
|
|
238
|
|
239 sub _gff_hash {
|
|
240
|
|
241 my $self = shift;
|
|
242
|
|
243 my $gff = $self->SUPER::_gff_hash(@_);
|
|
244
|
|
245 my %args = @_;
|
|
246
|
|
247 my $include_consequences = $args{include_consequences};
|
|
248 my $include_coding_details = $args{include_coding_details};
|
|
249 my $include_global_maf = $args{include_global_maf};
|
|
250
|
|
251 $gff->{source} = $self->source;
|
|
252
|
|
253 $gff->{type} = $self->class_SO_term;
|
|
254
|
|
255 my $source = $self->source;
|
|
256
|
|
257 $source .= '_'.$self->source_version if defined $self->source_version;
|
|
258
|
|
259 $gff->{attributes}->{Dbxref} = "$source:".$self->variation_name;
|
|
260
|
|
261 $gff->{attributes}->{ID} = $self->dbID;
|
|
262
|
|
263 # the Variant_seq attribute requires a comma separated list of alleles
|
|
264
|
|
265 my @alleles = split '/', $self->allele_string;
|
|
266 my $ref_seq = shift @alleles unless @alleles == 1; # shift off the reference allele
|
|
267
|
|
268 $gff->{attributes}->{Variant_seq} = join ',', @alleles;
|
|
269
|
|
270 my $index = 0;
|
|
271
|
|
272 # expand tandem repeat alleles, because TranscriptVariationAlleles use the expanded sequence
|
|
273
|
|
274 map { expand(\$_) } @alleles;
|
|
275
|
|
276 # if you expand e.g. (T)0 you get an empty string, which we treat as a deletion, so default to '-'
|
|
277
|
|
278 my %allele_index = map { ($_ || '-') => $index++ } @alleles;
|
|
279
|
|
280 if ($include_global_maf) {
|
|
281
|
|
282 my $var = $self->variation;
|
|
283
|
|
284 if (defined $var->minor_allele_frequency) {
|
|
285
|
|
286 my $allele_idx;
|
|
287
|
|
288 if ($var->minor_allele eq $ref_seq) {
|
|
289 $allele_idx = $REFERENCE_ALLELE_IDENTIFIER;
|
|
290 }
|
|
291 else {
|
|
292 $allele_idx = $allele_index{$var->minor_allele};
|
|
293 }
|
|
294
|
|
295 if (defined $allele_idx) {
|
|
296 $gff->{attributes}->{global_minor_allele_frequency} =
|
|
297 join (' ',
|
|
298 $allele_idx,
|
|
299 $var->minor_allele_frequency,
|
|
300 $var->minor_allele_count
|
|
301 );
|
|
302 }
|
|
303 }
|
|
304 }
|
|
305
|
|
306 # the reference sequence should be set to '~' if the sequence is longer than 50 nucleotides
|
|
307
|
|
308 $ref_seq = '~' if (not $ref_seq) || (CORE::length($ref_seq) > 50);
|
|
309 $gff->{attributes}->{Reference_seq} = $ref_seq;
|
|
310
|
|
311 # Hack for HGMD mutations
|
|
312
|
|
313 if ($self->allele_string eq 'HGMD_MUTATION') {
|
|
314 $gff->{attributes}->{Reference_seq} = '~';
|
|
315 $gff->{attributes}->{Variant_seq} = '~';
|
|
316 $allele_index{$self->allele_string} = 0;
|
|
317 }
|
|
318
|
|
319 if ($include_consequences || $include_coding_details) {
|
|
320
|
|
321 for my $tv (@{ $self->get_all_TranscriptVariations }) {
|
|
322
|
|
323 unless ($tv->get_all_alternate_TranscriptVariationAlleles) {
|
|
324 warn $self->variation_name." has no alternate alleles?";
|
|
325 next;
|
|
326 }
|
|
327
|
|
328 if ($include_coding_details) {
|
|
329 my $ref_tva = $tv->get_reference_TranscriptVariationAllele;
|
|
330
|
|
331 if (my $pep = $ref_tva->peptide) {
|
|
332 $gff->{attributes}->{reference_peptide} = $pep;
|
|
333 }
|
|
334 }
|
|
335
|
|
336 for my $tva (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) {
|
|
337
|
|
338 my $allele_idx = $allele_index{$tva->variation_feature_seq};
|
|
339
|
|
340 if (defined $allele_idx) {
|
|
341
|
|
342 if ($include_consequences) {
|
|
343 for my $oc (@{ $tva->get_all_OverlapConsequences }) {
|
|
344
|
|
345 push @{ $gff->{attributes}->{Variant_effect} ||= [] },
|
|
346 join(' ',
|
|
347 $oc->SO_term,
|
|
348 $allele_idx,
|
|
349 $oc->feature_SO_term,
|
|
350 $tv->transcript_stable_id,
|
|
351 );
|
|
352 }
|
|
353 }
|
|
354
|
|
355 if ($include_coding_details) {
|
|
356 if ($tva->pep_allele_string) {
|
|
357
|
|
358 push @{ $gff->{attributes}->{variant_peptide} ||= [] },
|
|
359 join(' ',
|
|
360 $allele_idx,
|
|
361 $tva->peptide,
|
|
362 $tv->transcript_stable_id,
|
|
363 );
|
|
364
|
|
365 for my $tool (qw(sift polyphen)) {
|
|
366 my $pred_meth = $tool.'_prediction';
|
|
367 my $score_meth = $tool.'_score';
|
|
368 if (my $pred = $tva->$pred_meth) {
|
|
369 $pred =~ s/\s/_/g;
|
|
370 push @{ $gff->{attributes}->{polyphen_prediction} ||= [] },
|
|
371 join(' ',
|
|
372 $allele_idx,
|
|
373 $pred,
|
|
374 $tva->$score_meth,
|
|
375 $tv->transcript_stable_id
|
|
376 );
|
|
377 }
|
|
378 }
|
|
379 }
|
|
380 }
|
|
381 }
|
|
382 else {
|
|
383 warn "No allele_index entry for allele: ".$tva->variation_feature_seq.
|
|
384 " of ".$self->variation_name."? Is reference " . $tva->is_reference . " ref seq " . $ref_seq . "\n";
|
|
385 }
|
|
386 }
|
|
387 }
|
|
388 }
|
|
389
|
|
390 return $gff;
|
|
391 }
|
|
392 }
|
|
393
|
|
394 {
|
|
395 package Bio::EnsEMBL::Variation::StructuralVariationFeature;
|
|
396
|
|
397 sub _gff_hash {
|
|
398
|
|
399 my $self = shift;
|
|
400
|
|
401 my $gff = $self->SUPER::_gff_hash(@_);
|
|
402
|
|
403 $gff->{attributes}->{ID} = $self->variation_name;
|
|
404
|
|
405 $gff->{source} = $self->source;
|
|
406
|
|
407 my $sv = $self->structural_variation;
|
|
408
|
|
409 $gff->{attributes}->{Dbxref} = $self->source . ':' . $self->variation_name;
|
|
410
|
|
411 $gff->{attributes}->{study_accession} = $sv->study->name if $sv->study->name;
|
|
412
|
|
413 $gff->{type} = $self->class_SO_term;
|
|
414
|
|
415 #$gff->{attributes}->{Reference_seq} = $self->end > $self->start+50 ? '~' : $self->get_reference_sequence;
|
|
416
|
|
417 if ( (defined $self->inner_start) && (defined $self->outer_start) && ($self->inner_start != $self->outer_start) ) {
|
|
418 $gff->{attributes}->{Start_range} = join ',', $self->outer_start, $self->inner_start;
|
|
419 }
|
|
420
|
|
421 if ( (defined $self->inner_end) && (defined $self->outer_end) && ($self->inner_end != $self->outer_end) ) {
|
|
422 $gff->{attributes}->{End_range} = join ',', $self->inner_end, $self->outer_end;
|
|
423 }
|
|
424
|
|
425 if (my $sv = $self->structural_variation) {
|
|
426 if (ref $sv eq 'Bio::EnsEMBL::Variation::SupportingStructuralVariation') {
|
|
427 if (my $parents = $sv->get_all_StructuralVariations) {
|
|
428 $gff->{attributes}->{Parent} = join ',', map { $_->variation_name } @$parents;
|
|
429 }
|
|
430 }
|
|
431 }
|
|
432
|
|
433 return $gff;
|
|
434 }
|
|
435
|
|
436 sub to_gvf {
|
|
437 my $self = shift;
|
|
438 return $self->to_gff(@_);
|
|
439 }
|
|
440 }
|
|
441
|
|
442 {
|
|
443 package Bio::EnsEMBL::Variation::Individual;
|
|
444
|
|
445 sub _gff_hash {
|
|
446
|
|
447 my $self = shift;
|
|
448
|
|
449 my $gff;
|
|
450
|
|
451 $gff->{Gender} = $self->gender;
|
|
452
|
|
453 $gff->{Display_name} = $self->name;
|
|
454
|
|
455 $gff->{ensembl_description} = $self->description;
|
|
456
|
|
457 $gff->{Type} = $self->type_description;
|
|
458
|
|
459 $gff->{Population} = join ',', map { $_->name } @{ $self->get_all_Populations };
|
|
460
|
|
461 return $gff;
|
|
462 }
|
|
463
|
|
464 sub to_gvf {
|
|
465 my $self = shift;
|
|
466
|
|
467 my $attrs = $self->_gff_hash(@_);
|
|
468
|
|
469 # get rid of any empty attributes
|
|
470 map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
|
|
471
|
|
472 return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs;
|
|
473 }
|
|
474
|
|
475 }
|
|
476
|
|
477 {
|
|
478 package Bio::EnsEMBL::Variation::Population;
|
|
479
|
|
480 sub _gff_hash {
|
|
481
|
|
482 my $self = shift;
|
|
483
|
|
484 my $gff;
|
|
485
|
|
486 $gff->{Attribute} = 'Variant_freq';
|
|
487
|
|
488 $gff->{population} = $self->name;
|
|
489
|
|
490 $gff->{population_size} = $self->size;
|
|
491
|
|
492 $gff->{Comment} = $self->description;
|
|
493
|
|
494 return $gff;
|
|
495 }
|
|
496
|
|
497 sub to_gvf {
|
|
498 my $self = shift;
|
|
499
|
|
500 my $attrs = $self->_gff_hash(@_);
|
|
501
|
|
502 # get rid of any empty attributes
|
|
503 map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
|
|
504
|
|
505 return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs;
|
|
506 }
|
|
507
|
|
508 }
|
|
509
|
|
510 1;
|
|
511
|