Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 | |
22 package Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3; | |
23 | |
24 use strict; | |
25 use warnings; | |
26 | |
27 # This module allows conversion of ensembl objects to GFF3 and GVF by inserting | |
28 # to_gff (and supporting _gff_hash) methods into the necessary feature classes | |
29 | |
30 { | |
31 package Bio::EnsEMBL::Slice; | |
32 | |
33 sub gff_version { | |
34 return "##gff-version 3\n"; | |
35 } | |
36 | |
37 sub gff_header { | |
38 my $self = shift; | |
39 | |
40 my %args = @_; | |
41 | |
42 # build up a date string in the format specified by the GFF spec | |
43 | |
44 my ( $sec, $min, $hr, $mday, $mon, $year ) = localtime; | |
45 $year += 1900; # correct the year | |
46 $mon++; # correct the month | |
47 | |
48 my $date = sprintf "%4d-%02d-%02d", $year, $mon, $mday; | |
49 | |
50 my $region = $self->seq_region_name; | |
51 my $start = $self->start; | |
52 my $end = $self->end; | |
53 my $assembly = $self->coord_system->version; | |
54 | |
55 my $mca = $self->adaptor->db->get_MetaContainerAdaptor; | |
56 my $tax_id = $mca->get_taxonomy_id; | |
57 | |
58 my $hdr = | |
59 "##file-date $date\n" | |
60 . "##genome-build ensembl $assembly\n" | |
61 . "##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$tax_id\n"; | |
62 | |
63 $hdr .= "##sequence-region $region $start $end\n" unless $args{no_sequence_region}; | |
64 | |
65 return $hdr; | |
66 } | |
67 | |
68 sub gvf_version { | |
69 return "##gvf-version 1.06\n"; | |
70 } | |
71 | |
72 sub gvf_header { | |
73 my $self = shift; | |
74 | |
75 my %args = @_; | |
76 my $hdr = $self->gff_version; | |
77 $hdr .= $self->gvf_version; | |
78 $hdr .= $self->gff_header(@_); | |
79 | |
80 my $mca = $self->adaptor->db->get_MetaContainerAdaptor; | |
81 my $schema_version = $mca->get_schema_version; | |
82 my $species_name = $mca->get_scientific_name; | |
83 $species_name =~ s/ /_/g; | |
84 my $url = 'http://e'.$schema_version.'.ensembl.org/'.$species_name; | |
85 | |
86 $hdr .= "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/so.obo?revision=1.283\n"; | |
87 $hdr .= "##data-source Source=ensembl;version=$schema_version;url=$url\n"; | |
88 $hdr .= "##file-version $schema_version\n"; | |
89 | |
90 if (my $individual = $args{individual}) { | |
91 $hdr .= "##individual-id ".$individual->to_gvf."\n"; | |
92 } | |
93 | |
94 if (my $population = $args{population}) { | |
95 $hdr .= "##attribute-method ".$population->to_gvf."\n"; | |
96 } | |
97 | |
98 return $hdr; | |
99 } | |
100 } | |
101 | |
102 { | |
103 | |
104 package Bio::EnsEMBL::Feature; | |
105 | |
106 sub to_gff { | |
107 | |
108 my $self = shift; | |
109 | |
110 my %args = @_; | |
111 | |
112 # This parameter is assumed to be a hashref which includes extra attributes you'd | |
113 # like to have appended onto the gff line for the feature | |
114 my $extra_attrs = $args{extra_attrs}; | |
115 | |
116 my $gff = $self->_gff_hash(@_); | |
117 | |
118 return undef unless defined $gff; | |
119 | |
120 # default optional columns, and check that all required columns are present | |
121 | |
122 $gff->{score} = '.' unless defined $gff->{score}; | |
123 $gff->{strand} = '.' unless defined $gff->{strand}; | |
124 $gff->{phase} = '.' unless defined $gff->{phase}; | |
125 | |
126 for my $req (qw(source type start end)) { | |
127 die "'$req' attribute required for GFF" unless $gff->{$req}; | |
128 } | |
129 | |
130 # order as per GFF3 spec: http://www.sequenceontology.org/gff3.shtml | |
131 | |
132 my $gff_str = join( "\t", | |
133 $gff->{seqid}, $gff->{source}, $gff->{type}, $gff->{start}, | |
134 $gff->{end}, $gff->{score}, $gff->{strand}, $gff->{phase}, | |
135 ); | |
136 | |
137 if ($extra_attrs) { | |
138 | |
139 # combine the extra attributes with any existing ones (duplicate keys will get squashed, | |
140 # so attributes specified in the extra_attrs hash will override existing ones) | |
141 | |
142 $gff->{attributes} = {} unless defined $gff->{attributes}; | |
143 | |
144 @{ $gff->{attributes} }{ keys %$extra_attrs } = values %$extra_attrs; | |
145 } | |
146 | |
147 if ( $gff->{attributes} ) { | |
148 | |
149 my @attrs; | |
150 | |
151 for my $key (keys %{ $gff->{attributes} }) { | |
152 | |
153 my $val = $gff->{attributes}->{$key}; | |
154 | |
155 if (ref $val eq 'ARRAY') { | |
156 push @attrs, map { $key . '='. $_ } @$val; | |
157 } | |
158 else { | |
159 push @attrs, $key . '=' . $val; | |
160 } | |
161 } | |
162 | |
163 $gff_str .= "\t" . join( ';', @attrs ); | |
164 } | |
165 | |
166 return $gff_str; | |
167 } | |
168 | |
169 sub _gff_hash { | |
170 my $self = shift; | |
171 | |
172 my %args = @_; | |
173 | |
174 my $rebase = $args{rebase}; # use absolute or slice-relative coordinates | |
175 | |
176 my $gff_seqid = $args{gff_seqid} || $self->slice->seq_region_name; | |
177 my $gff_source = $args{gff_source} || $self->_gff_source; | |
178 | |
179 my $seqid = $rebase ? $gff_seqid.'_'.$self->slice->start.'-'.$self->slice->end : $gff_seqid; | |
180 my $start = $rebase ? $self->start : $self->seq_region_start; | |
181 my $end = $rebase ? $self->end : $self->seq_region_end; | |
182 | |
183 # GFF3 does not allow start > end, and mandates that for zero-length features (e.g. insertions) | |
184 # start = end and the implied insertion site is to the right of the specified base, so we use the | |
185 # smaller of the two values | |
186 | |
187 if ($start > $end) { | |
188 $start = $end; | |
189 } | |
190 | |
191 my $gff = { | |
192 seqid => $gff_seqid, | |
193 source => $gff_source, | |
194 type => $self->_gff_type, | |
195 start => $start, | |
196 end => $end, | |
197 strand => ( | |
198 $self->strand == 1 ? '+' : ( $self->strand == -1 ? '-' : '.' ) | |
199 ) | |
200 }; | |
201 | |
202 return $gff; | |
203 } | |
204 | |
205 sub _gff_source { | |
206 my $self = shift; | |
207 | |
208 if ($self->analysis) { | |
209 return | |
210 $self->analysis->gff_source | |
211 || $self->analysis->logic_name; | |
212 } | |
213 else { | |
214 return ref($self); | |
215 } | |
216 } | |
217 | |
218 sub _gff_type { | |
219 my $self = shift; | |
220 | |
221 return | |
222 ( $self->analysis && $self->analysis->gff_feature ) | |
223 || 'misc_feature'; | |
224 } | |
225 } | |
226 | |
227 { | |
228 package Bio::EnsEMBL::Variation::VariationFeature; | |
229 | |
230 use Bio::EnsEMBL::Utils::Sequence qw(expand); | |
231 | |
232 my $REFERENCE_ALLELE_IDENTIFIER = '@'; | |
233 | |
234 sub to_gvf { | |
235 my $self = shift; | |
236 return $self->to_gff(@_); | |
237 } | |
238 | |
239 sub _gff_hash { | |
240 | |
241 my $self = shift; | |
242 | |
243 my $gff = $self->SUPER::_gff_hash(@_); | |
244 | |
245 my %args = @_; | |
246 | |
247 my $include_consequences = $args{include_consequences}; | |
248 my $include_coding_details = $args{include_coding_details}; | |
249 my $include_global_maf = $args{include_global_maf}; | |
250 | |
251 $gff->{source} = $self->source; | |
252 | |
253 $gff->{type} = $self->class_SO_term; | |
254 | |
255 my $source = $self->source; | |
256 | |
257 $source .= '_'.$self->source_version if defined $self->source_version; | |
258 | |
259 $gff->{attributes}->{Dbxref} = "$source:".$self->variation_name; | |
260 | |
261 $gff->{attributes}->{ID} = $self->dbID; | |
262 | |
263 # the Variant_seq attribute requires a comma separated list of alleles | |
264 | |
265 my @alleles = split '/', $self->allele_string; | |
266 my $ref_seq = shift @alleles unless @alleles == 1; # shift off the reference allele | |
267 | |
268 $gff->{attributes}->{Variant_seq} = join ',', @alleles; | |
269 | |
270 my $index = 0; | |
271 | |
272 # expand tandem repeat alleles, because TranscriptVariationAlleles use the expanded sequence | |
273 | |
274 map { expand(\$_) } @alleles; | |
275 | |
276 # if you expand e.g. (T)0 you get an empty string, which we treat as a deletion, so default to '-' | |
277 | |
278 my %allele_index = map { ($_ || '-') => $index++ } @alleles; | |
279 | |
280 if ($include_global_maf) { | |
281 | |
282 my $var = $self->variation; | |
283 | |
284 if (defined $var->minor_allele_frequency) { | |
285 | |
286 my $allele_idx; | |
287 | |
288 if ($var->minor_allele eq $ref_seq) { | |
289 $allele_idx = $REFERENCE_ALLELE_IDENTIFIER; | |
290 } | |
291 else { | |
292 $allele_idx = $allele_index{$var->minor_allele}; | |
293 } | |
294 | |
295 if (defined $allele_idx) { | |
296 $gff->{attributes}->{global_minor_allele_frequency} = | |
297 join (' ', | |
298 $allele_idx, | |
299 $var->minor_allele_frequency, | |
300 $var->minor_allele_count | |
301 ); | |
302 } | |
303 } | |
304 } | |
305 | |
306 # the reference sequence should be set to '~' if the sequence is longer than 50 nucleotides | |
307 | |
308 $ref_seq = '~' if (not $ref_seq) || (CORE::length($ref_seq) > 50); | |
309 $gff->{attributes}->{Reference_seq} = $ref_seq; | |
310 | |
311 # Hack for HGMD mutations | |
312 | |
313 if ($self->allele_string eq 'HGMD_MUTATION') { | |
314 $gff->{attributes}->{Reference_seq} = '~'; | |
315 $gff->{attributes}->{Variant_seq} = '~'; | |
316 $allele_index{$self->allele_string} = 0; | |
317 } | |
318 | |
319 if ($include_consequences || $include_coding_details) { | |
320 | |
321 for my $tv (@{ $self->get_all_TranscriptVariations }) { | |
322 | |
323 unless ($tv->get_all_alternate_TranscriptVariationAlleles) { | |
324 warn $self->variation_name." has no alternate alleles?"; | |
325 next; | |
326 } | |
327 | |
328 if ($include_coding_details) { | |
329 my $ref_tva = $tv->get_reference_TranscriptVariationAllele; | |
330 | |
331 if (my $pep = $ref_tva->peptide) { | |
332 $gff->{attributes}->{reference_peptide} = $pep; | |
333 } | |
334 } | |
335 | |
336 for my $tva (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) { | |
337 | |
338 my $allele_idx = $allele_index{$tva->variation_feature_seq}; | |
339 | |
340 if (defined $allele_idx) { | |
341 | |
342 if ($include_consequences) { | |
343 for my $oc (@{ $tva->get_all_OverlapConsequences }) { | |
344 | |
345 push @{ $gff->{attributes}->{Variant_effect} ||= [] }, | |
346 join(' ', | |
347 $oc->SO_term, | |
348 $allele_idx, | |
349 $oc->feature_SO_term, | |
350 $tv->transcript_stable_id, | |
351 ); | |
352 } | |
353 } | |
354 | |
355 if ($include_coding_details) { | |
356 if ($tva->pep_allele_string) { | |
357 | |
358 push @{ $gff->{attributes}->{variant_peptide} ||= [] }, | |
359 join(' ', | |
360 $allele_idx, | |
361 $tva->peptide, | |
362 $tv->transcript_stable_id, | |
363 ); | |
364 | |
365 for my $tool (qw(sift polyphen)) { | |
366 my $pred_meth = $tool.'_prediction'; | |
367 my $score_meth = $tool.'_score'; | |
368 if (my $pred = $tva->$pred_meth) { | |
369 $pred =~ s/\s/_/g; | |
370 push @{ $gff->{attributes}->{polyphen_prediction} ||= [] }, | |
371 join(' ', | |
372 $allele_idx, | |
373 $pred, | |
374 $tva->$score_meth, | |
375 $tv->transcript_stable_id | |
376 ); | |
377 } | |
378 } | |
379 } | |
380 } | |
381 } | |
382 else { | |
383 warn "No allele_index entry for allele: ".$tva->variation_feature_seq. | |
384 " of ".$self->variation_name."? Is reference " . $tva->is_reference . " ref seq " . $ref_seq . "\n"; | |
385 } | |
386 } | |
387 } | |
388 } | |
389 | |
390 return $gff; | |
391 } | |
392 } | |
393 | |
394 { | |
395 package Bio::EnsEMBL::Variation::StructuralVariationFeature; | |
396 | |
397 sub _gff_hash { | |
398 | |
399 my $self = shift; | |
400 | |
401 my $gff = $self->SUPER::_gff_hash(@_); | |
402 | |
403 $gff->{attributes}->{ID} = $self->variation_name; | |
404 | |
405 $gff->{source} = $self->source; | |
406 | |
407 my $sv = $self->structural_variation; | |
408 | |
409 $gff->{attributes}->{Dbxref} = $self->source . ':' . $self->variation_name; | |
410 | |
411 $gff->{attributes}->{study_accession} = $sv->study->name if $sv->study->name; | |
412 | |
413 $gff->{type} = $self->class_SO_term; | |
414 | |
415 #$gff->{attributes}->{Reference_seq} = $self->end > $self->start+50 ? '~' : $self->get_reference_sequence; | |
416 | |
417 if ( (defined $self->inner_start) && (defined $self->outer_start) && ($self->inner_start != $self->outer_start) ) { | |
418 $gff->{attributes}->{Start_range} = join ',', $self->outer_start, $self->inner_start; | |
419 } | |
420 | |
421 if ( (defined $self->inner_end) && (defined $self->outer_end) && ($self->inner_end != $self->outer_end) ) { | |
422 $gff->{attributes}->{End_range} = join ',', $self->inner_end, $self->outer_end; | |
423 } | |
424 | |
425 if (my $sv = $self->structural_variation) { | |
426 if (ref $sv eq 'Bio::EnsEMBL::Variation::SupportingStructuralVariation') { | |
427 if (my $parents = $sv->get_all_StructuralVariations) { | |
428 $gff->{attributes}->{Parent} = join ',', map { $_->variation_name } @$parents; | |
429 } | |
430 } | |
431 } | |
432 | |
433 return $gff; | |
434 } | |
435 | |
436 sub to_gvf { | |
437 my $self = shift; | |
438 return $self->to_gff(@_); | |
439 } | |
440 } | |
441 | |
442 { | |
443 package Bio::EnsEMBL::Variation::Individual; | |
444 | |
445 sub _gff_hash { | |
446 | |
447 my $self = shift; | |
448 | |
449 my $gff; | |
450 | |
451 $gff->{Gender} = $self->gender; | |
452 | |
453 $gff->{Display_name} = $self->name; | |
454 | |
455 $gff->{ensembl_description} = $self->description; | |
456 | |
457 $gff->{Type} = $self->type_description; | |
458 | |
459 $gff->{Population} = join ',', map { $_->name } @{ $self->get_all_Populations }; | |
460 | |
461 return $gff; | |
462 } | |
463 | |
464 sub to_gvf { | |
465 my $self = shift; | |
466 | |
467 my $attrs = $self->_gff_hash(@_); | |
468 | |
469 # get rid of any empty attributes | |
470 map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs; | |
471 | |
472 return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; | |
473 } | |
474 | |
475 } | |
476 | |
477 { | |
478 package Bio::EnsEMBL::Variation::Population; | |
479 | |
480 sub _gff_hash { | |
481 | |
482 my $self = shift; | |
483 | |
484 my $gff; | |
485 | |
486 $gff->{Attribute} = 'Variant_freq'; | |
487 | |
488 $gff->{population} = $self->name; | |
489 | |
490 $gff->{population_size} = $self->size; | |
491 | |
492 $gff->{Comment} = $self->description; | |
493 | |
494 return $gff; | |
495 } | |
496 | |
497 sub to_gvf { | |
498 my $self = shift; | |
499 | |
500 my $attrs = $self->_gff_hash(@_); | |
501 | |
502 # get rid of any empty attributes | |
503 map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs; | |
504 | |
505 return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; | |
506 } | |
507 | |
508 } | |
509 | |
510 1; | |
511 |