comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21
22 package Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;
23
24 use strict;
25 use warnings;
26
27 # This module allows conversion of ensembl objects to GFF3 and GVF by inserting
28 # to_gff (and supporting _gff_hash) methods into the necessary feature classes
29
30 {
31 package Bio::EnsEMBL::Slice;
32
33 sub gff_version {
34 return "##gff-version 3\n";
35 }
36
37 sub gff_header {
38 my $self = shift;
39
40 my %args = @_;
41
42 # build up a date string in the format specified by the GFF spec
43
44 my ( $sec, $min, $hr, $mday, $mon, $year ) = localtime;
45 $year += 1900; # correct the year
46 $mon++; # correct the month
47
48 my $date = sprintf "%4d-%02d-%02d", $year, $mon, $mday;
49
50 my $region = $self->seq_region_name;
51 my $start = $self->start;
52 my $end = $self->end;
53 my $assembly = $self->coord_system->version;
54
55 my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
56 my $tax_id = $mca->get_taxonomy_id;
57
58 my $hdr =
59 "##file-date $date\n"
60 . "##genome-build ensembl $assembly\n"
61 . "##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$tax_id\n";
62
63 $hdr .= "##sequence-region $region $start $end\n" unless $args{no_sequence_region};
64
65 return $hdr;
66 }
67
68 sub gvf_version {
69 return "##gvf-version 1.06\n";
70 }
71
72 sub gvf_header {
73 my $self = shift;
74
75 my %args = @_;
76 my $hdr = $self->gff_version;
77 $hdr .= $self->gvf_version;
78 $hdr .= $self->gff_header(@_);
79
80 my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
81 my $schema_version = $mca->get_schema_version;
82 my $species_name = $mca->get_scientific_name;
83 $species_name =~ s/ /_/g;
84 my $url = 'http://e'.$schema_version.'.ensembl.org/'.$species_name;
85
86 $hdr .= "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/so.obo?revision=1.283\n";
87 $hdr .= "##data-source Source=ensembl;version=$schema_version;url=$url\n";
88 $hdr .= "##file-version $schema_version\n";
89
90 if (my $individual = $args{individual}) {
91 $hdr .= "##individual-id ".$individual->to_gvf."\n";
92 }
93
94 if (my $population = $args{population}) {
95 $hdr .= "##attribute-method ".$population->to_gvf."\n";
96 }
97
98 return $hdr;
99 }
100 }
101
102 {
103
104 package Bio::EnsEMBL::Feature;
105
106 sub to_gff {
107
108 my $self = shift;
109
110 my %args = @_;
111
112 # This parameter is assumed to be a hashref which includes extra attributes you'd
113 # like to have appended onto the gff line for the feature
114 my $extra_attrs = $args{extra_attrs};
115
116 my $gff = $self->_gff_hash(@_);
117
118 return undef unless defined $gff;
119
120 # default optional columns, and check that all required columns are present
121
122 $gff->{score} = '.' unless defined $gff->{score};
123 $gff->{strand} = '.' unless defined $gff->{strand};
124 $gff->{phase} = '.' unless defined $gff->{phase};
125
126 for my $req (qw(source type start end)) {
127 die "'$req' attribute required for GFF" unless $gff->{$req};
128 }
129
130 # order as per GFF3 spec: http://www.sequenceontology.org/gff3.shtml
131
132 my $gff_str = join( "\t",
133 $gff->{seqid}, $gff->{source}, $gff->{type}, $gff->{start},
134 $gff->{end}, $gff->{score}, $gff->{strand}, $gff->{phase},
135 );
136
137 if ($extra_attrs) {
138
139 # combine the extra attributes with any existing ones (duplicate keys will get squashed,
140 # so attributes specified in the extra_attrs hash will override existing ones)
141
142 $gff->{attributes} = {} unless defined $gff->{attributes};
143
144 @{ $gff->{attributes} }{ keys %$extra_attrs } = values %$extra_attrs;
145 }
146
147 if ( $gff->{attributes} ) {
148
149 my @attrs;
150
151 for my $key (keys %{ $gff->{attributes} }) {
152
153 my $val = $gff->{attributes}->{$key};
154
155 if (ref $val eq 'ARRAY') {
156 push @attrs, map { $key . '='. $_ } @$val;
157 }
158 else {
159 push @attrs, $key . '=' . $val;
160 }
161 }
162
163 $gff_str .= "\t" . join( ';', @attrs );
164 }
165
166 return $gff_str;
167 }
168
169 sub _gff_hash {
170 my $self = shift;
171
172 my %args = @_;
173
174 my $rebase = $args{rebase}; # use absolute or slice-relative coordinates
175
176 my $gff_seqid = $args{gff_seqid} || $self->slice->seq_region_name;
177 my $gff_source = $args{gff_source} || $self->_gff_source;
178
179 my $seqid = $rebase ? $gff_seqid.'_'.$self->slice->start.'-'.$self->slice->end : $gff_seqid;
180 my $start = $rebase ? $self->start : $self->seq_region_start;
181 my $end = $rebase ? $self->end : $self->seq_region_end;
182
183 # GFF3 does not allow start > end, and mandates that for zero-length features (e.g. insertions)
184 # start = end and the implied insertion site is to the right of the specified base, so we use the
185 # smaller of the two values
186
187 if ($start > $end) {
188 $start = $end;
189 }
190
191 my $gff = {
192 seqid => $gff_seqid,
193 source => $gff_source,
194 type => $self->_gff_type,
195 start => $start,
196 end => $end,
197 strand => (
198 $self->strand == 1 ? '+' : ( $self->strand == -1 ? '-' : '.' )
199 )
200 };
201
202 return $gff;
203 }
204
205 sub _gff_source {
206 my $self = shift;
207
208 if ($self->analysis) {
209 return
210 $self->analysis->gff_source
211 || $self->analysis->logic_name;
212 }
213 else {
214 return ref($self);
215 }
216 }
217
218 sub _gff_type {
219 my $self = shift;
220
221 return
222 ( $self->analysis && $self->analysis->gff_feature )
223 || 'misc_feature';
224 }
225 }
226
227 {
228 package Bio::EnsEMBL::Variation::VariationFeature;
229
230 use Bio::EnsEMBL::Utils::Sequence qw(expand);
231
232 my $REFERENCE_ALLELE_IDENTIFIER = '@';
233
234 sub to_gvf {
235 my $self = shift;
236 return $self->to_gff(@_);
237 }
238
239 sub _gff_hash {
240
241 my $self = shift;
242
243 my $gff = $self->SUPER::_gff_hash(@_);
244
245 my %args = @_;
246
247 my $include_consequences = $args{include_consequences};
248 my $include_coding_details = $args{include_coding_details};
249 my $include_global_maf = $args{include_global_maf};
250
251 $gff->{source} = $self->source;
252
253 $gff->{type} = $self->class_SO_term;
254
255 my $source = $self->source;
256
257 $source .= '_'.$self->source_version if defined $self->source_version;
258
259 $gff->{attributes}->{Dbxref} = "$source:".$self->variation_name;
260
261 $gff->{attributes}->{ID} = $self->dbID;
262
263 # the Variant_seq attribute requires a comma separated list of alleles
264
265 my @alleles = split '/', $self->allele_string;
266 my $ref_seq = shift @alleles unless @alleles == 1; # shift off the reference allele
267
268 $gff->{attributes}->{Variant_seq} = join ',', @alleles;
269
270 my $index = 0;
271
272 # expand tandem repeat alleles, because TranscriptVariationAlleles use the expanded sequence
273
274 map { expand(\$_) } @alleles;
275
276 # if you expand e.g. (T)0 you get an empty string, which we treat as a deletion, so default to '-'
277
278 my %allele_index = map { ($_ || '-') => $index++ } @alleles;
279
280 if ($include_global_maf) {
281
282 my $var = $self->variation;
283
284 if (defined $var->minor_allele_frequency) {
285
286 my $allele_idx;
287
288 if ($var->minor_allele eq $ref_seq) {
289 $allele_idx = $REFERENCE_ALLELE_IDENTIFIER;
290 }
291 else {
292 $allele_idx = $allele_index{$var->minor_allele};
293 }
294
295 if (defined $allele_idx) {
296 $gff->{attributes}->{global_minor_allele_frequency} =
297 join (' ',
298 $allele_idx,
299 $var->minor_allele_frequency,
300 $var->minor_allele_count
301 );
302 }
303 }
304 }
305
306 # the reference sequence should be set to '~' if the sequence is longer than 50 nucleotides
307
308 $ref_seq = '~' if (not $ref_seq) || (CORE::length($ref_seq) > 50);
309 $gff->{attributes}->{Reference_seq} = $ref_seq;
310
311 # Hack for HGMD mutations
312
313 if ($self->allele_string eq 'HGMD_MUTATION') {
314 $gff->{attributes}->{Reference_seq} = '~';
315 $gff->{attributes}->{Variant_seq} = '~';
316 $allele_index{$self->allele_string} = 0;
317 }
318
319 if ($include_consequences || $include_coding_details) {
320
321 for my $tv (@{ $self->get_all_TranscriptVariations }) {
322
323 unless ($tv->get_all_alternate_TranscriptVariationAlleles) {
324 warn $self->variation_name." has no alternate alleles?";
325 next;
326 }
327
328 if ($include_coding_details) {
329 my $ref_tva = $tv->get_reference_TranscriptVariationAllele;
330
331 if (my $pep = $ref_tva->peptide) {
332 $gff->{attributes}->{reference_peptide} = $pep;
333 }
334 }
335
336 for my $tva (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) {
337
338 my $allele_idx = $allele_index{$tva->variation_feature_seq};
339
340 if (defined $allele_idx) {
341
342 if ($include_consequences) {
343 for my $oc (@{ $tva->get_all_OverlapConsequences }) {
344
345 push @{ $gff->{attributes}->{Variant_effect} ||= [] },
346 join(' ',
347 $oc->SO_term,
348 $allele_idx,
349 $oc->feature_SO_term,
350 $tv->transcript_stable_id,
351 );
352 }
353 }
354
355 if ($include_coding_details) {
356 if ($tva->pep_allele_string) {
357
358 push @{ $gff->{attributes}->{variant_peptide} ||= [] },
359 join(' ',
360 $allele_idx,
361 $tva->peptide,
362 $tv->transcript_stable_id,
363 );
364
365 for my $tool (qw(sift polyphen)) {
366 my $pred_meth = $tool.'_prediction';
367 my $score_meth = $tool.'_score';
368 if (my $pred = $tva->$pred_meth) {
369 $pred =~ s/\s/_/g;
370 push @{ $gff->{attributes}->{polyphen_prediction} ||= [] },
371 join(' ',
372 $allele_idx,
373 $pred,
374 $tva->$score_meth,
375 $tv->transcript_stable_id
376 );
377 }
378 }
379 }
380 }
381 }
382 else {
383 warn "No allele_index entry for allele: ".$tva->variation_feature_seq.
384 " of ".$self->variation_name."? Is reference " . $tva->is_reference . " ref seq " . $ref_seq . "\n";
385 }
386 }
387 }
388 }
389
390 return $gff;
391 }
392 }
393
394 {
395 package Bio::EnsEMBL::Variation::StructuralVariationFeature;
396
397 sub _gff_hash {
398
399 my $self = shift;
400
401 my $gff = $self->SUPER::_gff_hash(@_);
402
403 $gff->{attributes}->{ID} = $self->variation_name;
404
405 $gff->{source} = $self->source;
406
407 my $sv = $self->structural_variation;
408
409 $gff->{attributes}->{Dbxref} = $self->source . ':' . $self->variation_name;
410
411 $gff->{attributes}->{study_accession} = $sv->study->name if $sv->study->name;
412
413 $gff->{type} = $self->class_SO_term;
414
415 #$gff->{attributes}->{Reference_seq} = $self->end > $self->start+50 ? '~' : $self->get_reference_sequence;
416
417 if ( (defined $self->inner_start) && (defined $self->outer_start) && ($self->inner_start != $self->outer_start) ) {
418 $gff->{attributes}->{Start_range} = join ',', $self->outer_start, $self->inner_start;
419 }
420
421 if ( (defined $self->inner_end) && (defined $self->outer_end) && ($self->inner_end != $self->outer_end) ) {
422 $gff->{attributes}->{End_range} = join ',', $self->inner_end, $self->outer_end;
423 }
424
425 if (my $sv = $self->structural_variation) {
426 if (ref $sv eq 'Bio::EnsEMBL::Variation::SupportingStructuralVariation') {
427 if (my $parents = $sv->get_all_StructuralVariations) {
428 $gff->{attributes}->{Parent} = join ',', map { $_->variation_name } @$parents;
429 }
430 }
431 }
432
433 return $gff;
434 }
435
436 sub to_gvf {
437 my $self = shift;
438 return $self->to_gff(@_);
439 }
440 }
441
442 {
443 package Bio::EnsEMBL::Variation::Individual;
444
445 sub _gff_hash {
446
447 my $self = shift;
448
449 my $gff;
450
451 $gff->{Gender} = $self->gender;
452
453 $gff->{Display_name} = $self->name;
454
455 $gff->{ensembl_description} = $self->description;
456
457 $gff->{Type} = $self->type_description;
458
459 $gff->{Population} = join ',', map { $_->name } @{ $self->get_all_Populations };
460
461 return $gff;
462 }
463
464 sub to_gvf {
465 my $self = shift;
466
467 my $attrs = $self->_gff_hash(@_);
468
469 # get rid of any empty attributes
470 map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
471
472 return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs;
473 }
474
475 }
476
477 {
478 package Bio::EnsEMBL::Variation::Population;
479
480 sub _gff_hash {
481
482 my $self = shift;
483
484 my $gff;
485
486 $gff->{Attribute} = 'Variant_freq';
487
488 $gff->{population} = $self->name;
489
490 $gff->{population_size} = $self->size;
491
492 $gff->{Comment} = $self->description;
493
494 return $gff;
495 }
496
497 sub to_gvf {
498 my $self = shift;
499
500 my $attrs = $self->_gff_hash(@_);
501
502 # get rid of any empty attributes
503 map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
504
505 return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs;
506 }
507
508 }
509
510 1;
511