view variant_effect_predictor/Bio/EnsEMBL/Variation/Utils/EnsEMBL2GFF3.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line source

=head1 LICENSE

 Copyright (c) 1999-2012 The European Bioinformatics Institute and
 Genome Research Limited.  All rights reserved.

 This software is distributed under a modified Apache license.
 For license details, please see

   http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

 Please email comments or questions to the public Ensembl
 developers list at <dev@ensembl.org>.

 Questions may also be sent to the Ensembl help desk at
 <helpdesk@ensembl.org>.

=cut


package Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;

use strict;
use warnings;

# This module allows conversion of ensembl objects to GFF3 and GVF by inserting
# to_gff (and supporting _gff_hash) methods into the necessary feature classes

{
    package Bio::EnsEMBL::Slice;
   
	sub gff_version {
		return "##gff-version 3\n";
	}
	 
    sub gff_header {
        my $self = shift;
        
        my %args = @_;
        
        # build up a date string in the format specified by the GFF spec
    
        my ( $sec, $min, $hr, $mday, $mon, $year ) = localtime;
        $year += 1900;    # correct the year
        $mon++;           # correct the month
        
        my $date = sprintf "%4d-%02d-%02d", $year, $mon, $mday;
        
        my $region      = $self->seq_region_name;
        my $start       = $self->start;
        my $end         = $self->end;
        my $assembly    = $self->coord_system->version;
        
        my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
        my $tax_id = $mca->get_taxonomy_id;
        
        my $hdr =
			"##file-date $date\n"
          . "##genome-build ensembl $assembly\n"
          . "##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$tax_id\n";
        
        $hdr .= "##sequence-region $region $start $end\n" unless $args{no_sequence_region};
        
        return $hdr;
    }
   
	sub gvf_version {
		return "##gvf-version 1.06\n";
	}
 
    sub gvf_header {
        my $self = shift;
       
        my %args = @_;
		my $hdr = $self->gff_version;
		$hdr .= $self->gvf_version;
        $hdr .= $self->gff_header(@_);
        
        my $mca = $self->adaptor->db->get_MetaContainerAdaptor;
        my $schema_version = $mca->get_schema_version;
        my $species_name = $mca->get_scientific_name;
        $species_name =~ s/ /_/g;
        my $url = 'http://e'.$schema_version.'.ensembl.org/'.$species_name;
        
        $hdr .= "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/so.obo?revision=1.283\n";
        $hdr .= "##data-source Source=ensembl;version=$schema_version;url=$url\n";
        $hdr .= "##file-version $schema_version\n";
        
        if (my $individual = $args{individual}) {
            $hdr .= "##individual-id ".$individual->to_gvf."\n";
        }

        if (my $population = $args{population}) {
            $hdr .= "##attribute-method ".$population->to_gvf."\n";
        }

        return $hdr;
    }
}

{

    package Bio::EnsEMBL::Feature;

    sub to_gff {

        my $self = shift;
        
        my %args = @_;
        
        # This parameter is assumed to be a hashref which includes extra attributes you'd
        # like to have appended onto the gff line for the feature
        my $extra_attrs = $args{extra_attrs};

        my $gff = $self->_gff_hash(@_);

        return undef unless defined $gff;

        # default optional columns, and check that all required columns are present

        $gff->{score}  = '.' unless defined $gff->{score};
        $gff->{strand} = '.' unless defined $gff->{strand};
        $gff->{phase}  = '.' unless defined $gff->{phase};

        for my $req (qw(source type start end)) {
            die "'$req' attribute required for GFF" unless $gff->{$req};
        }

        # order as per GFF3 spec: http://www.sequenceontology.org/gff3.shtml 
 
        my $gff_str = join( "\t",
            $gff->{seqid}, $gff->{source}, $gff->{type}, $gff->{start},
            $gff->{end}, $gff->{score}, $gff->{strand}, $gff->{phase},
        );

        if ($extra_attrs) {

            # combine the extra attributes with any existing ones (duplicate keys will get squashed,
            # so attributes specified in the extra_attrs hash will override existing ones)

            $gff->{attributes} = {} unless defined $gff->{attributes};
            
            @{ $gff->{attributes} }{ keys %$extra_attrs } = values %$extra_attrs;
        }

        if ( $gff->{attributes} ) {

            my @attrs;

            for my $key (keys %{ $gff->{attributes} }) {

                my $val = $gff->{attributes}->{$key};

                if (ref $val eq 'ARRAY') {
                    push @attrs, map { $key . '='. $_ } @$val;
                }
                else {
                    push @attrs, $key . '=' . $val;
                }
            }
                
            $gff_str .= "\t" . join( ';', @attrs );
        }

        return $gff_str;
    }

    sub _gff_hash {
        my $self = shift;
        
        my %args = @_;
        
        my $rebase      = $args{rebase}; # use absolute or slice-relative coordinates
        
        my $gff_seqid   = $args{gff_seqid} || $self->slice->seq_region_name;
        my $gff_source  = $args{gff_source} || $self->_gff_source;

        my $seqid = $rebase ? $gff_seqid.'_'.$self->slice->start.'-'.$self->slice->end : $gff_seqid;
        my $start = $rebase ? $self->start : $self->seq_region_start;
        my $end = $rebase ? $self->end : $self->seq_region_end;
        
        # GFF3 does not allow start > end, and mandates that for zero-length features (e.g. insertions) 
        # start = end and the implied insertion site is to the right of the specified base, so we use the
        # smaller of the two values
        
        if ($start > $end) {
            $start = $end;
        }
        
        my $gff = {
            seqid   => $gff_seqid,
            source  => $gff_source,
            type    => $self->_gff_type,
            start   => $start,
            end     => $end,
            strand  => (
                $self->strand == 1 ? '+' : ( $self->strand == -1 ? '-' : '.' )
            )
        };

        return $gff;
    }

    sub _gff_source {
        my $self = shift;
        
        if ($self->analysis) {
            return
                $self->analysis->gff_source
                || $self->analysis->logic_name;
        }
        else {
            return ref($self);
        }
    }

    sub _gff_type {
        my $self = shift;

        return
            ( $self->analysis && $self->analysis->gff_feature )
            || 'misc_feature';
    }
}

{
    package Bio::EnsEMBL::Variation::VariationFeature;
    
    use Bio::EnsEMBL::Utils::Sequence qw(expand);

    my $REFERENCE_ALLELE_IDENTIFIER = '@';

    sub to_gvf {
        my $self = shift;
        return $self->to_gff(@_);
    }
    
    sub _gff_hash {
   
        my $self = shift;
        
        my $gff = $self->SUPER::_gff_hash(@_);
        
        my %args = @_;
        
        my $include_consequences    = $args{include_consequences};
        my $include_coding_details  = $args{include_coding_details};
        my $include_global_maf      = $args{include_global_maf};

        $gff->{source} = $self->source;

        $gff->{type} = $self->class_SO_term;

        my $source = $self->source;

        $source .= '_'.$self->source_version if defined $self->source_version;

        $gff->{attributes}->{Dbxref} = "$source:".$self->variation_name;

        $gff->{attributes}->{ID} = $self->dbID;

        # the Variant_seq attribute requires a comma separated list of alleles

        my @alleles = split '/', $self->allele_string;
        my $ref_seq = shift @alleles unless @alleles == 1; # shift off the reference allele
       
        $gff->{attributes}->{Variant_seq} = join ',', @alleles;

        my $index = 0;

        # expand tandem repeat alleles, because TranscriptVariationAlleles use the expanded sequence
        
        map { expand(\$_) } @alleles; 

        # if you expand e.g. (T)0 you get an empty string, which we treat as a deletion, so default to '-'

        my %allele_index = map { ($_ || '-') => $index++ } @alleles;

        if ($include_global_maf) {

            my $var = $self->variation;
            
            if (defined $var->minor_allele_frequency) {
                
                my $allele_idx;

                if ($var->minor_allele eq $ref_seq) {
                    $allele_idx = $REFERENCE_ALLELE_IDENTIFIER;
                }
                else {
                    $allele_idx = $allele_index{$var->minor_allele};
                }

                if (defined $allele_idx) {
                    $gff->{attributes}->{global_minor_allele_frequency} = 
                        join (' ', 
                            $allele_idx,
                            $var->minor_allele_frequency,
                            $var->minor_allele_count
                        );
                }
            }
        }
        
        # the reference sequence should be set to '~' if the sequence is longer than 50 nucleotides

        $ref_seq = '~' if (not $ref_seq) || (CORE::length($ref_seq) > 50);
        $gff->{attributes}->{Reference_seq} = $ref_seq;
        
        # Hack for HGMD mutations
       
        if ($self->allele_string eq 'HGMD_MUTATION') {
            $gff->{attributes}->{Reference_seq} = '~';
            $gff->{attributes}->{Variant_seq}   = '~';
            $allele_index{$self->allele_string} = 0;
        }
        
        if ($include_consequences || $include_coding_details) {
            
            for my $tv (@{ $self->get_all_TranscriptVariations }) {

                unless ($tv->get_all_alternate_TranscriptVariationAlleles) {
                    warn $self->variation_name." has no alternate alleles?";
                    next;
                }

                if ($include_coding_details) {
                    my $ref_tva = $tv->get_reference_TranscriptVariationAllele;

                    if (my $pep = $ref_tva->peptide) {
                        $gff->{attributes}->{reference_peptide} = $pep;
                    }
                }

                for my $tva (@{ $tv->get_all_alternate_TranscriptVariationAlleles }) {

                    my $allele_idx = $allele_index{$tva->variation_feature_seq};
                    
                    if (defined $allele_idx) {

                        if ($include_consequences) {
                            for my $oc (@{ $tva->get_all_OverlapConsequences }) {

                                push @{ $gff->{attributes}->{Variant_effect} ||= [] },
                                    join(' ',
                                        $oc->SO_term, 
                                        $allele_idx,
                                        $oc->feature_SO_term,
                                        $tv->transcript_stable_id,
                                    );
                            }
                        }
                        
                        if ($include_coding_details) {
                            if ($tva->pep_allele_string) {

                                push @{ $gff->{attributes}->{variant_peptide} ||= [] },
                                    join(' ',
                                        $allele_idx,
                                        $tva->peptide,
                                        $tv->transcript_stable_id,
                                    );

                                for my $tool (qw(sift polyphen)) {
                                    my $pred_meth = $tool.'_prediction';
                                    my $score_meth = $tool.'_score';
                                    if (my $pred = $tva->$pred_meth) {
                                        $pred =~ s/\s/_/g;
                                        push @{ $gff->{attributes}->{polyphen_prediction} ||= [] }, 
                                            join(' ', 
                                                $allele_idx, 
                                                $pred, 
                                                $tva->$score_meth,
                                                $tv->transcript_stable_id
                                            );
                                    }
                                }
                            }
                        }
                    }
                    else {
                        warn "No allele_index entry for allele: ".$tva->variation_feature_seq.
                            " of ".$self->variation_name."? Is reference " . $tva->is_reference . " ref seq " . $ref_seq . "\n";
                    }
                }
            }
        }
    
        return $gff;
    }
}

{
    package Bio::EnsEMBL::Variation::StructuralVariationFeature;
    
    sub _gff_hash {
        
        my $self = shift;
        
        my $gff = $self->SUPER::_gff_hash(@_);
        
        $gff->{attributes}->{ID} = $self->variation_name;
        
        $gff->{source} = $self->source;
        
        my $sv = $self->structural_variation;
        
        $gff->{attributes}->{Dbxref} = $self->source . ':' . $self->variation_name;
        
        $gff->{attributes}->{study_accession} = $sv->study->name if $sv->study->name;

        $gff->{type} = $self->class_SO_term;
        
        #$gff->{attributes}->{Reference_seq} = $self->end > $self->start+50 ? '~' : $self->get_reference_sequence;
     
        if ( (defined $self->inner_start) && (defined $self->outer_start) && ($self->inner_start != $self->outer_start) ) {
            $gff->{attributes}->{Start_range} = join ',', $self->outer_start, $self->inner_start;
        }

        if ( (defined $self->inner_end) && (defined $self->outer_end) && ($self->inner_end != $self->outer_end) ) {
            $gff->{attributes}->{End_range} = join ',', $self->inner_end, $self->outer_end;
        }
        
        if (my $sv = $self->structural_variation) {
            if (ref $sv eq 'Bio::EnsEMBL::Variation::SupportingStructuralVariation') {
                if (my $parents = $sv->get_all_StructuralVariations) {
                    $gff->{attributes}->{Parent} = join ',', map { $_->variation_name } @$parents;
                }
            }
        }

        return $gff;
    }
    
    sub to_gvf {
        my $self = shift;
        return $self->to_gff(@_);
    }
}

{
    package Bio::EnsEMBL::Variation::Individual;
        
    sub _gff_hash {
        
        my $self = shift;

        my $gff;

        $gff->{Gender} = $self->gender;

        $gff->{Display_name} = $self->name;

        $gff->{ensembl_description} = $self->description;

        $gff->{Type} = $self->type_description;

        $gff->{Population} = join ',', map { $_->name } @{ $self->get_all_Populations };
       
        return $gff;
    }

    sub to_gvf {
        my $self = shift;
       
        my $attrs = $self->_gff_hash(@_);

        # get rid of any empty attributes 
        map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
        
        return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; 
    }
    
}

{
    package Bio::EnsEMBL::Variation::Population;
        
    sub _gff_hash {
        
        my $self = shift;

        my $gff;

        $gff->{Attribute} = 'Variant_freq';

        $gff->{population} = $self->name;
        
        $gff->{population_size} = $self->size;

        $gff->{Comment} = $self->description;
      
        return $gff;
    }

    sub to_gvf {
        my $self = shift;
       
        my $attrs = $self->_gff_hash(@_);

        # get rid of any empty attributes 
        map { delete $attrs->{$_} unless $attrs->{$_} } keys %$attrs;
        
        return join ';', map { $_.'='.$attrs->{$_} } keys %$attrs; 
    }
    
}

1;