diff variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,511 @@
+=head1 LICENSE
+
+ Copyright (c) 1999-2012 The European Bioinformatics Institute and
+ Genome Research Limited.  All rights reserved.
+
+ This software is distributed under a modified Apache license.
+ For license details, please see
+
+   http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+ Please email comments or questions to the public Ensembl
+ developers list at <dev@ensembl.org>.
+
+ Questions may also be sent to the Ensembl help desk at
+ <helpdesk@ensembl.org>.
+
+=cut
+
+
+package Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;
+
+use strict;
+use warnings;
+
+# This module allows conversion of ensembl objects to GFF3 and GVF by inserting
+# to_gff (and supporting _gff_hash) methods into the necessary feature classes
+
+{
+    package Bio::EnsEMBL::Slice;
+   
+	sub gff_version {
+		return "##gff-version 3\n";
+	}
+	 
+    sub gff_header {
+        my $self = shift;
+        
+        my %args = @_;
+        
+        # build up a date string in the format specified by the GFF spec
+    
+        my ( $sec, $min, $hr, $mday, $mon, $year ) = localtime;
+        $year += 1900;    # correct the year
+        $mon++;           # correct the month
+        
+        my $date = sprintf "%4d-%02d-%02d", $year, $mon, $mday;
+        
+        my $region      = $self->seq_region_name;
+        my $start       = $self->start;
+        my $end         = $self->end;
+        my $assembly    = $self->coord_system->version;
+        
+        my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
+        my $tax_id = $mca->get_taxonomy_id;
+        
+        my $hdr =
+			"##file-date $date\n"
+          . "##genome-build ensembl $assembly\n"
+          . "##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$tax_id\n";
+        
+        $hdr .= "##sequence-region $region $start $end\n" unless $args{no_sequence_region};
+        
+        return $hdr;
+    }
+   
+	sub gvf_version {
+		return "##gvf-version 1.06\n";
+	}
+ 
+    sub gvf_header {
+        my $self = shift;
+       
+        my %args = @_;
+		my $hdr = $self->gff_version;
+		$hdr .= $self->gvf_version;
+        $hdr .= $self->gff_header(@_);
+        
+        my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
+        my $schema_version = $mca->get_schema_version;
+        my $species_name = $mca->get_scientific_name;
+        $species_name =~ s/ /_/g;
+        my $url = 'http://e'.$schema_version.'.ensembl.org/'.$species_name;
+        
+        $hdr .= "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/so.obo?revision=1.283\n";
+        $hdr .= "##data-source Source=ensembl;version=$schema_version;url=$url\n";
+        $hdr .= "##file-version $schema_version\n";
+        
+        if (my $individual = $args{individual}) {
+            $hdr .= "##individual-id ".$individual->to_gvf."\n";
+        }
+
+        if (my $population = $args{population}) {
+            $hdr .= "##attribute-method ".$population->to_gvf."\n";
+        }
+
+        return $hdr;
+    }
+}
+
+{
+
+    package Bio::EnsEMBL::Feature;
+
+    sub to_gff {
+
+        my $self = shift;
+        
+        my %args = @_;
+        
+        # This parameter is assumed to be a hashref which includes extra attributes you'd
+        # like to have appended onto the gff line for the feature
+        my $extra_attrs = $args{extra_attrs};
+
+        my $gff = $self->_gff_hash(@_);
+
+        return undef unless defined $gff;
+
+        # default optional columns, and check that all required columns are present
+
+        $gff->{score}  = '.' unless defined $gff->{score};
+        $gff->{strand} = '.' unless defined $gff->{strand};
+        $gff->{phase}  = '.' unless defined $gff->{phase};
+
+        for my $req (qw(source type start end)) {
+            die "'$req' attribute required for GFF" unless $gff->{$req};
+        }
+
+        # order as per GFF3 spec: http://www.sequenceontology.org/gff3.shtml 
+ 
+        my $gff_str = join( "\t",
+            $gff->{seqid}, $gff->{source}, $gff->{type}, $gff->{start},
+            $gff->{end}, $gff->{score}, $gff->{strand}, $gff->{phase},
+        );
+
+        if ($extra_attrs) {
+
+            # combine the extra attributes with any existing ones (duplicate keys will get squashed,
+            # so attributes specified in the extra_attrs hash will override existing ones)
+
+            $gff->{attributes} = {} unless defined $gff->{attributes};
+            
+            @{ $gff->{attributes} }{ keys %$extra_attrs } = values %$extra_attrs;
+        }
+
+        if ( $gff->{attributes} ) {
+
+            my @attrs;
+
+            for my $key (keys %{ $gff->{attributes} }) {
+
+                my $val = $gff->{attributes}->{$key};
+
+                if (ref $val eq 'ARRAY') {
+                    push @attrs, map { $key . '='. $_ } @$val;
+                }
+                else {
+                    push @attrs, $key . '=' . $val;
+                }
+            }
+                
+            $gff_str .= "\t" . join( ';', @attrs );
+        }
+
+        return $gff_str;
+    }
+
+    sub _gff_hash {
+        my $self = shift;
+        
+        my %args = @_;
+        
+        my $rebase      = $args{rebase}; # use absolute or slice-relative coordinates
+        
+        my $gff_seqid   = $args{gff_seqid} || $self->slice->seq_region_name;
+        my $gff_source  = $args{gff_source} || $self->_gff_source;
+
+        my $seqid = $rebase ? $gff_seqid.'_'.$self->slice->start.'-'.$self->slice->end : $gff_seqid;
+        my $start = $rebase ? $self->start : $self->seq_region_start;
+        my $end = $rebase ? $self->end : $self->seq_region_end;
+        
+        # GFF3 does not allow start > end, and mandates that for zero-length features (e.g. insertions) 
+        # start = end and the implied insertion site is to the right of the specified base, so we use the
+        # smaller of the two values
+        
+        if ($start > $end) {
+            $start = $end;
+        }
+        
+        my $gff = {
+            seqid   => $gff_seqid,
+            source  => $gff_source,
+            type    => $self->_gff_type,
+            start   => $start,
+            end     => $end,
+            strand  => (
+                $self->strand == 1 ? '+' : ( $self->strand == -1 ? '-' : '.' )
+            )
+        };
+
+        return $gff;
+    }
+
+    sub _gff_source {
+        my $self = shift;
+        
+        if ($self->analysis) {
+            return
+                $self->analysis->gff_source
+                || $self->analysis->logic_name;
+        }
+        else {
+            return ref($self);
+        }
+    }
+
+    sub _gff_type {
+        my $self = shift;
+
+        return
+            ( $self->analysis && $self->analysis->gff_feature )
+            || 'misc_feature';
+    }
+}
+
+{
+    package Bio::EnsEMBL::Variation::VariationFeature;
+    
+    use Bio::EnsEMBL::Utils::Sequence qw(expand);
+
+    my $REFERENCE_ALLELE_IDENTIFIER = '@';
+
+    sub to_gvf {
+        my $self = shift;
+        return $self->to_gff(@_);
+    }
+    
+    sub _gff_hash {
+   
+        my $self = shift;
+        
+        my $gff = $self->SUPER::_gff_hash(@_);
+        
+        my %args = @_;
+        
+        my $include_consequences    = $args{include_consequences};
+        my $include_coding_details  = $args{include_coding_details};
+        my $include_global_maf      = $args{include_global_maf};
+
+        $gff->{source} = $self->source;
+
+        $gff->{type} = $self->class_SO_term;
+
+        my $source = $self->source;
+
+        $source .= '_'.$self->source_version if defined $self->source_version;
+
+        $gff->{attributes}->{Dbxref} = "$source:".$self->variation_name;
+
+        $gff->{attributes}->{ID} = $self->dbID;
+
+        # the Variant_seq attribute requires a comma separated list of alleles
+
+        my @alleles = split '/', $self->allele_string;
+        my $ref_seq = shift @alleles unless @alleles == 1; # shift off the reference allele
+       
+        $gff->{attributes}->{Variant_seq} = join ',', @alleles;
+
+        my $index = 0;
+
+        # expand tandem repeat alleles, because TranscriptVariationAlleles use the expanded sequence
+        
+        map { expand(\$_) } @alleles; 
+
+        # if you expand e.g. (T)0 you get an empty string, which we treat as a deletion, so default to '-'
+
+        my %allele_index = map { ($_ || '-') => $index++ } @alleles;
+
+        if ($include_global_maf) {
+
+            my $var = $self->variation;
+            
+            if (defined $var->minor_allele_frequency) {
+                
+                my $allele_idx;
+
+                if ($var->minor_allele eq $ref_seq) {
+                    $allele_idx = $REFERENCE_ALLELE_IDENTIFIER;
+                }
+                else {
+                    $allele_idx = $allele_index{$var->minor_allele};
+                }
+
+                if (defined $allele_idx) {
+                    $gff->{attributes}->{global_minor_allele_frequency} = 
+                        join (' ', 
+                            $allele_idx,
+                            $var->minor_allele_frequency,
+                            $var->minor_allele_count
+                        );
+                }
+            }
+        }
+        
+        # the reference sequence should be set to '~' if the sequence is longer than 50 nucleotides
+
+        $ref_seq = '~' if (not $ref_seq) || (CORE::length($ref_seq) > 50);
+        $gff->{attributes}->{Reference_seq} = $ref_seq;
+        
+        # Hack for HGMD mutations
+       
+        if ($self->allele_string eq 'HGMD_MUTATION') {
+            $gff->{attributes}->{Reference_seq} = '~';
+            $gff->{attributes}->{Variant_seq}   = '~';
+            $allele_index{$self->allele_string} = 0;
+        }
+        
+        if ($include_consequences || $include_coding_details) {
+            
+            for my $tv (@{ $self->get_all_TranscriptVariations }) {
+
+                unless ($tv->get_all_alternate_TranscriptVariationAlleles) {
+                    warn $self->variation_name." has no alternate alleles?";
+                    next;
+                }
+
+                if ($include_coding_details) {
+                    my $ref_tva = $tv->get_reference_TranscriptVariationAllele;
+
+                    if (my $pep = $ref_tva->peptide) {
+                        $gff->{attributes}->{reference_peptide} = $pep;
+                    }
+                }
+
+                for my $tva (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) {
+
+                    my $allele_idx = $allele_index{$tva->variation_feature_seq};
+                    
+                    if (defined $allele_idx) {
+
+                        if ($include_consequences) {
+                            for my $oc (@{ $tva->get_all_OverlapConsequences }) {
+
+                                push @{ $gff->{attributes}->{Variant_effect} ||= [] },
+                                    join(' ',
+                                        $oc->SO_term, 
+                                        $allele_idx,
+                                        $oc->feature_SO_term,
+                                        $tv->transcript_stable_id,
+                                    );
+                            }
+                        }
+                        
+                        if ($include_coding_details) {
+                            if ($tva->pep_allele_string) {
+
+                                push @{ $gff->{attributes}->{variant_peptide} ||= [] },
+                                    join(' ',
+                                        $allele_idx,
+                                        $tva->peptide,
+                                        $tv->transcript_stable_id,
+                                    );
+
+                                for my $tool (qw(sift polyphen)) {
+                                    my $pred_meth = $tool.'_prediction';
+                                    my $score_meth = $tool.'_score';
+                                    if (my $pred = $tva->$pred_meth) {
+                                        $pred =~ s/\s/_/g;
+                                        push @{ $gff->{attributes}->{polyphen_prediction} ||= [] }, 
+                                            join(' ', 
+                                                $allele_idx, 
+                                                $pred, 
+                                                $tva->$score_meth,
+                                                $tv->transcript_stable_id
+                                            );
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    else {
+                        warn "No allele_index entry for allele: ".$tva->variation_feature_seq.
+                            " of ".$self->variation_name."? Is reference " . $tva->is_reference . " ref seq " . $ref_seq . "\n";
+                    }
+                }
+            }
+        }
+    
+        return $gff;
+    }
+}
+
+{
+    package Bio::EnsEMBL::Variation::StructuralVariationFeature;
+    
+    sub _gff_hash {
+        
+        my $self = shift;
+        
+        my $gff = $self->SUPER::_gff_hash(@_);
+        
+        $gff->{attributes}->{ID} = $self->variation_name;
+        
+        $gff->{source} = $self->source;
+        
+        my $sv = $self->structural_variation;
+        
+        $gff->{attributes}->{Dbxref} = $self->source . ':' . $self->variation_name;
+        
+        $gff->{attributes}->{study_accession} = $sv->study->name if $sv->study->name;
+
+        $gff->{type} = $self->class_SO_term;
+        
+        #$gff->{attributes}->{Reference_seq} = $self->end > $self->start+50 ? '~' : $self->get_reference_sequence;
+     
+        if ( (defined $self->inner_start) && (defined $self->outer_start) && ($self->inner_start != $self->outer_start) ) {
+            $gff->{attributes}->{Start_range} = join ',', $self->outer_start, $self->inner_start;
+        }
+
+        if ( (defined $self->inner_end) && (defined $self->outer_end) && ($self->inner_end != $self->outer_end) ) {
+            $gff->{attributes}->{End_range} = join ',', $self->inner_end, $self->outer_end;
+        }
+        
+        if (my $sv = $self->structural_variation) {
+            if (ref $sv eq 'Bio::EnsEMBL::Variation::SupportingStructuralVariation') {
+                if (my $parents = $sv->get_all_StructuralVariations) {
+                    $gff->{attributes}->{Parent} = join ',', map { $_->variation_name } @$parents;
+                }
+            }
+        }
+
+        return $gff;
+    }
+    
+    sub to_gvf {
+        my $self = shift;
+        return $self->to_gff(@_);
+    }
+}
+
+{
+    package Bio::EnsEMBL::Variation::Individual;
+        
+    sub _gff_hash {
+        
+        my $self = shift;
+
+        my $gff;
+
+        $gff->{Gender} = $self->gender;
+
+        $gff->{Display_name} = $self->name;
+
+        $gff->{ensembl_description} = $self->description;
+
+        $gff->{Type} = $self->type_description;
+
+        $gff->{Population} = join ',', map { $_->name } @{ $self->get_all_Populations };
+       
+        return $gff;
+    }
+
+    sub to_gvf {
+        my $self = shift;
+       
+        my $attrs = $self->_gff_hash(@_);
+
+        # get rid of any empty attributes 
+        map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
+        
+        return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; 
+    }
+    
+}
+
+{
+    package Bio::EnsEMBL::Variation::Population;
+        
+    sub _gff_hash {
+        
+        my $self = shift;
+
+        my $gff;
+
+        $gff->{Attribute} = 'Variant_freq';
+
+        $gff->{population} = $self->name;
+        
+        $gff->{population_size} = $self->size;
+
+        $gff->{Comment} = $self->description;
+      
+        return $gff;
+    }
+
+    sub to_gvf {
+        my $self = shift;
+       
+        my $attrs = $self->_gff_hash(@_);
+
+        # get rid of any empty attributes 
+        map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
+        
+        return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; 
+    }
+    
+}
+
+1;
+