Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,511 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + + +package Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3; + +use strict; +use warnings; + +# This module allows conversion of ensembl objects to GFF3 and GVF by inserting +# to_gff (and supporting _gff_hash) methods into the necessary feature classes + +{ + package Bio::EnsEMBL::Slice; + + sub gff_version { + return "##gff-version 3\n"; + } + + sub gff_header { + my $self = shift; + + my %args = @_; + + # build up a date string in the format specified by the GFF spec + + my ( $sec, $min, $hr, $mday, $mon, $year ) = localtime; + $year += 1900; # correct the year + $mon++; # correct the month + + my $date = sprintf "%4d-%02d-%02d", $year, $mon, $mday; + + my $region = $self->seq_region_name; + my $start = $self->start; + my $end = $self->end; + my $assembly = $self->coord_system->version; + + my $mca = $self->adaptor->db->get_MetaContainerAdaptor; + my $tax_id = $mca->get_taxonomy_id; + + my $hdr = + "##file-date $date\n" + . "##genome-build ensembl $assembly\n" + . "##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$tax_id\n"; + + $hdr .= "##sequence-region $region $start $end\n" unless $args{no_sequence_region}; + + return $hdr; + } + + sub gvf_version { + return "##gvf-version 1.06\n"; + } + + sub gvf_header { + my $self = shift; + + my %args = @_; + my $hdr = $self->gff_version; + $hdr .= $self->gvf_version; + $hdr .= $self->gff_header(@_); + + my $mca = $self->adaptor->db->get_MetaContainerAdaptor; + my $schema_version = $mca->get_schema_version; + my $species_name = $mca->get_scientific_name; + $species_name =~ s/ /_/g; + my $url = 'http://e'.$schema_version.'.ensembl.org/'.$species_name; + + $hdr .= "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/so.obo?revision=1.283\n"; + $hdr .= "##data-source Source=ensembl;version=$schema_version;url=$url\n"; + $hdr .= "##file-version $schema_version\n"; + + if (my $individual = $args{individual}) { + $hdr .= "##individual-id ".$individual->to_gvf."\n"; + } + + if (my $population = $args{population}) { + $hdr .= "##attribute-method ".$population->to_gvf."\n"; + } + + return $hdr; + } +} + +{ + + package Bio::EnsEMBL::Feature; + + sub to_gff { + + my $self = shift; + + my %args = @_; + + # This parameter is assumed to be a hashref which includes extra attributes you'd + # like to have appended onto the gff line for the feature + my $extra_attrs = $args{extra_attrs}; + + my $gff = $self->_gff_hash(@_); + + return undef unless defined $gff; + + # default optional columns, and check that all required columns are present + + $gff->{score} = '.' unless defined $gff->{score}; + $gff->{strand} = '.' unless defined $gff->{strand}; + $gff->{phase} = '.' unless defined $gff->{phase}; + + for my $req (qw(source type start end)) { + die "'$req' attribute required for GFF" unless $gff->{$req}; + } + + # order as per GFF3 spec: http://www.sequenceontology.org/gff3.shtml + + my $gff_str = join( "\t", + $gff->{seqid}, $gff->{source}, $gff->{type}, $gff->{start}, + $gff->{end}, $gff->{score}, $gff->{strand}, $gff->{phase}, + ); + + if ($extra_attrs) { + + # combine the extra attributes with any existing ones (duplicate keys will get squashed, + # so attributes specified in the extra_attrs hash will override existing ones) + + $gff->{attributes} = {} unless defined $gff->{attributes}; + + @{ $gff->{attributes} }{ keys %$extra_attrs } = values %$extra_attrs; + } + + if ( $gff->{attributes} ) { + + my @attrs; + + for my $key (keys %{ $gff->{attributes} }) { + + my $val = $gff->{attributes}->{$key}; + + if (ref $val eq 'ARRAY') { + push @attrs, map { $key . '='. $_ } @$val; + } + else { + push @attrs, $key . '=' . $val; + } + } + + $gff_str .= "\t" . join( ';', @attrs ); + } + + return $gff_str; + } + + sub _gff_hash { + my $self = shift; + + my %args = @_; + + my $rebase = $args{rebase}; # use absolute or slice-relative coordinates + + my $gff_seqid = $args{gff_seqid} || $self->slice->seq_region_name; + my $gff_source = $args{gff_source} || $self->_gff_source; + + my $seqid = $rebase ? $gff_seqid.'_'.$self->slice->start.'-'.$self->slice->end : $gff_seqid; + my $start = $rebase ? $self->start : $self->seq_region_start; + my $end = $rebase ? $self->end : $self->seq_region_end; + + # GFF3 does not allow start > end, and mandates that for zero-length features (e.g. insertions) + # start = end and the implied insertion site is to the right of the specified base, so we use the + # smaller of the two values + + if ($start > $end) { + $start = $end; + } + + my $gff = { + seqid => $gff_seqid, + source => $gff_source, + type => $self->_gff_type, + start => $start, + end => $end, + strand => ( + $self->strand == 1 ? '+' : ( $self->strand == -1 ? '-' : '.' ) + ) + }; + + return $gff; + } + + sub _gff_source { + my $self = shift; + + if ($self->analysis) { + return + $self->analysis->gff_source + || $self->analysis->logic_name; + } + else { + return ref($self); + } + } + + sub _gff_type { + my $self = shift; + + return + ( $self->analysis && $self->analysis->gff_feature ) + || 'misc_feature'; + } +} + +{ + package Bio::EnsEMBL::Variation::VariationFeature; + + use Bio::EnsEMBL::Utils::Sequence qw(expand); + + my $REFERENCE_ALLELE_IDENTIFIER = '@'; + + sub to_gvf { + my $self = shift; + return $self->to_gff(@_); + } + + sub _gff_hash { + + my $self = shift; + + my $gff = $self->SUPER::_gff_hash(@_); + + my %args = @_; + + my $include_consequences = $args{include_consequences}; + my $include_coding_details = $args{include_coding_details}; + my $include_global_maf = $args{include_global_maf}; + + $gff->{source} = $self->source; + + $gff->{type} = $self->class_SO_term; + + my $source = $self->source; + + $source .= '_'.$self->source_version if defined $self->source_version; + + $gff->{attributes}->{Dbxref} = "$source:".$self->variation_name; + + $gff->{attributes}->{ID} = $self->dbID; + + # the Variant_seq attribute requires a comma separated list of alleles + + my @alleles = split '/', $self->allele_string; + my $ref_seq = shift @alleles unless @alleles == 1; # shift off the reference allele + + $gff->{attributes}->{Variant_seq} = join ',', @alleles; + + my $index = 0; + + # expand tandem repeat alleles, because TranscriptVariationAlleles use the expanded sequence + + map { expand(\$_) } @alleles; + + # if you expand e.g. (T)0 you get an empty string, which we treat as a deletion, so default to '-' + + my %allele_index = map { ($_ || '-') => $index++ } @alleles; + + if ($include_global_maf) { + + my $var = $self->variation; + + if (defined $var->minor_allele_frequency) { + + my $allele_idx; + + if ($var->minor_allele eq $ref_seq) { + $allele_idx = $REFERENCE_ALLELE_IDENTIFIER; + } + else { + $allele_idx = $allele_index{$var->minor_allele}; + } + + if (defined $allele_idx) { + $gff->{attributes}->{global_minor_allele_frequency} = + join (' ', + $allele_idx, + $var->minor_allele_frequency, + $var->minor_allele_count + ); + } + } + } + + # the reference sequence should be set to '~' if the sequence is longer than 50 nucleotides + + $ref_seq = '~' if (not $ref_seq) || (CORE::length($ref_seq) > 50); + $gff->{attributes}->{Reference_seq} = $ref_seq; + + # Hack for HGMD mutations + + if ($self->allele_string eq 'HGMD_MUTATION') { + $gff->{attributes}->{Reference_seq} = '~'; + $gff->{attributes}->{Variant_seq} = '~'; + $allele_index{$self->allele_string} = 0; + } + + if ($include_consequences || $include_coding_details) { + + for my $tv (@{ $self->get_all_TranscriptVariations }) { + + unless ($tv->get_all_alternate_TranscriptVariationAlleles) { + warn $self->variation_name." has no alternate alleles?"; + next; + } + + if ($include_coding_details) { + my $ref_tva = $tv->get_reference_TranscriptVariationAllele; + + if (my $pep = $ref_tva->peptide) { + $gff->{attributes}->{reference_peptide} = $pep; + } + } + + for my $tva (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) { + + my $allele_idx = $allele_index{$tva->variation_feature_seq}; + + if (defined $allele_idx) { + + if ($include_consequences) { + for my $oc (@{ $tva->get_all_OverlapConsequences }) { + + push @{ $gff->{attributes}->{Variant_effect} ||= [] }, + join(' ', + $oc->SO_term, + $allele_idx, + $oc->feature_SO_term, + $tv->transcript_stable_id, + ); + } + } + + if ($include_coding_details) { + if ($tva->pep_allele_string) { + + push @{ $gff->{attributes}->{variant_peptide} ||= [] }, + join(' ', + $allele_idx, + $tva->peptide, + $tv->transcript_stable_id, + ); + + for my $tool (qw(sift polyphen)) { + my $pred_meth = $tool.'_prediction'; + my $score_meth = $tool.'_score'; + if (my $pred = $tva->$pred_meth) { + $pred =~ s/\s/_/g; + push @{ $gff->{attributes}->{polyphen_prediction} ||= [] }, + join(' ', + $allele_idx, + $pred, + $tva->$score_meth, + $tv->transcript_stable_id + ); + } + } + } + } + } + else { + warn "No allele_index entry for allele: ".$tva->variation_feature_seq. + " of ".$self->variation_name."? Is reference " . $tva->is_reference . " ref seq " . $ref_seq . "\n"; + } + } + } + } + + return $gff; + } +} + +{ + package Bio::EnsEMBL::Variation::StructuralVariationFeature; + + sub _gff_hash { + + my $self = shift; + + my $gff = $self->SUPER::_gff_hash(@_); + + $gff->{attributes}->{ID} = $self->variation_name; + + $gff->{source} = $self->source; + + my $sv = $self->structural_variation; + + $gff->{attributes}->{Dbxref} = $self->source . ':' . $self->variation_name; + + $gff->{attributes}->{study_accession} = $sv->study->name if $sv->study->name; + + $gff->{type} = $self->class_SO_term; + + #$gff->{attributes}->{Reference_seq} = $self->end > $self->start+50 ? '~' : $self->get_reference_sequence; + + if ( (defined $self->inner_start) && (defined $self->outer_start) && ($self->inner_start != $self->outer_start) ) { + $gff->{attributes}->{Start_range} = join ',', $self->outer_start, $self->inner_start; + } + + if ( (defined $self->inner_end) && (defined $self->outer_end) && ($self->inner_end != $self->outer_end) ) { + $gff->{attributes}->{End_range} = join ',', $self->inner_end, $self->outer_end; + } + + if (my $sv = $self->structural_variation) { + if (ref $sv eq 'Bio::EnsEMBL::Variation::SupportingStructuralVariation') { + if (my $parents = $sv->get_all_StructuralVariations) { + $gff->{attributes}->{Parent} = join ',', map { $_->variation_name } @$parents; + } + } + } + + return $gff; + } + + sub to_gvf { + my $self = shift; + return $self->to_gff(@_); + } +} + +{ + package Bio::EnsEMBL::Variation::Individual; + + sub _gff_hash { + + my $self = shift; + + my $gff; + + $gff->{Gender} = $self->gender; + + $gff->{Display_name} = $self->name; + + $gff->{ensembl_description} = $self->description; + + $gff->{Type} = $self->type_description; + + $gff->{Population} = join ',', map { $_->name } @{ $self->get_all_Populations }; + + return $gff; + } + + sub to_gvf { + my $self = shift; + + my $attrs = $self->_gff_hash(@_); + + # get rid of any empty attributes + map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs; + + return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; + } + +} + +{ + package Bio::EnsEMBL::Variation::Population; + + sub _gff_hash { + + my $self = shift; + + my $gff; + + $gff->{Attribute} = 'Variant_freq'; + + $gff->{population} = $self->name; + + $gff->{population_size} = $self->size; + + $gff->{Comment} = $self->description; + + return $gff; + } + + sub to_gvf { + my $self = shift; + + my $attrs = $self->_gff_hash(@_); + + # get rid of any empty attributes + map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs; + + return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; + } + +} + +1; +