Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,217 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 NAME + +GFFSerializer - Feature to GFF converter + +=head1 AUTHOR + +Kieron Taylor, 2011 - ktaylor@ebi.ac.uk + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Utils::IO::GFFSerializer; +use Bio::EnsEMBL::Utils::BiotypeMapper; + +my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' ); +my $biotype_mapper = new BiotypeMapper($ontology_adaptor); +my $serializer = new GFFSerializer($biotype_mapper,$output_fh); + +my $variation_feature_adaptor = $registry->get_adaptor( $config{'species'}, 'variation', 'variationfeature' ); +$serializer->print_metadata("Variation Features:"); +my $iterator = $variation_feature_adaptor->fetch_Iterator_by_Slice($slice,undef,60000); +$serializer->print_feature_Iterator($iterator); + +=head1 DESCRIPTION + +Subclass of Serializer that can turn a feature into a line for the GFF3 format. Requires +a BiotypeMapper in order to translate biotypes to SO terms. + +=cut + +package Bio::EnsEMBL::Utils::IO::GFFSerializer; +use strict; +use warnings; +use Bio::EnsEMBL::Utils::Exception; +use Bio::EnsEMBL::Utils::BiotypeMapper; +use URI::Escape; +use Bio::EnsEMBL::Utils::IO::FeatureSerializer; + +use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer); + +my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-'); + +=head2 new + + Constructor + Arg [1] : Ontology Adaptor + Arg [2] : Optional File handle + + Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer + +=cut + +sub new { + my $class = shift; + my $self = { + ontology_adaptor => shift, + filehandle => shift, + }; + bless $self, $class; + if ( ref($self->{'ontology_adaptor'}) ne "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" ) { + throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function. See also Bio::EnsEMBL::Utils::BiotypeMapper"); + } + $self->{'mapper'} = new Bio::EnsEMBL::Utils::BiotypeMapper($self->{'ontology_adaptor'}); + + if (!defined ($self->{'filehandle'})) { + # no file handle, let the handle point to a copy of STDOUT instead + open $self->{'filehandle'}, ">&STDOUT"; + $self->{'stdout'} = 1; + } + return $self; +} + +=head2 print_feature + + Arg [1] : Bio::EnsEMBL::Feature, subclass or related pseudo-feature + Example : $reporter->print_feature($feature,$slice_start_coordinate,"X") + Description: Asks a feature for its summary, and generates a GFF3 + compliant entry to hand back again + Additional attributes are handed through to column 9 of the + output using exact spelling and capitalisation of the + feature-supplied hash. + Returntype : none +=cut + +sub print_feature { + my $self = shift; + my $feature = shift; + my $biotype_mapper = $self->{'mapper'}; + + my $text_buffer = ""; + if ($feature->can('summary_as_hash') ) { + my %summary = %{$feature->summary_as_hash}; + my $row = ""; +# Column 1 - seqname, the name of the sequence/chromosome the feature is on. Landmark for start below + if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";} + $row .= $summary{'seq_region_name'}."\t"; + +# Column 2 - source, complicated with Ensembl not being the originator of all data + $row .= "EnsEMBL\t"; + +# Column 3 - feature, the ontology term for the kind of feature this row is + my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature); + $row .= $so_term."\t"; + +# Column 4 - start, the start coordinate of the feature, here shifted to chromosomal coordinates +# Start and end must be in ascending order for GFF. Circular genomes require the length of +# the circuit to be added on. + if ($summary{'start'} > $summary{'end'}) { + #assumes this is not a Compara circular sequence and can treat is as a Feature + if ($feature->slice() && $feature->slice()->is_circular() ) { + $summary{'end'} = $summary{'end'} + $feature->seq_region_length; + } + # non-circular, but end still before start + else {$summary{'end'} = $summary{'start'};} + } + $row .= $summary{'start'} . "\t"; + +# Column 5 - end, coordinates (absolute) for the end of this feature + $row .= $summary{'end'} . "\t"; + +# Column 6 - score, for variations only. + if (exists($summary{'score'})) { + $row .= $summary{'score'}."\t"; + } + else { + $row .= ".\t"; + } + +# Column 7 - strand, up or down + if (exists($summary{'strand'})) { + $row .= $strand_conversion{$summary{'strand'}}."\t"; + } + else { + $row .= ".\t"; + } + +# Column 8 - reading frame, necessary only for Exons + $row .= ".\t"; + +# Column 9 - the 'other' section for all GFF and GVF compliant attributes +# We include Stable ID and biotype where possible to supplement the information in the other columns + delete $summary{'seq_region_start'}; + delete $summary{'seq_region_name'}; + delete $summary{'start'}; + delete $summary{'end'}; + delete $summary{'strand'}; + delete $summary{'score'}; +# Slice the hash for specific keys in GFF-friendly order + my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular); + my @ordered_values = @summary{@ordered_keys}; + while (my $key = shift @ordered_keys) { + my $value = shift @ordered_values; + if ($value) { + $row .= $key."=".uri_escape($value,'\t\n\r;=%&,').";"; + } + delete $summary{$key}; + } +# Catch the remaining keys, containing whatever else the Feature provided + foreach my $attribute ( keys(%summary)) { + if (ref $summary{$attribute} eq "ARRAY") { + $row .= $attribute."=".join (',',@{$summary{$attribute}}) . ";" + } + else { + if ($summary{$attribute}) { $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,') . ";"; } + } + } +# trim off any trailing commas left by the ordered keys stage above: + $text_buffer .= $row."\n"; + } + else { + warning("Feature failed to self-summarise"); + } + #filehandle is inherited + my $fh = $self->{'filehandle'}; + print $fh $text_buffer; +} + +=head2 print_main_header + + Arg [1] : Arrayref of slices going into the file. + Description: Printing the header text or metadata required for GFF, + using a list of slices to be written + Returntype : None +=cut + +sub print_main_header { + my $self = shift; + my $arrayref_of_slices = shift; + my $fh = $self->{'filehandle'}; + + print $fh "##gff-version 3\n"; + foreach my $slice (@{$arrayref_of_slices}) { + if (not defined($slice)) { warning("Slice not defined.\n"); return;} + print $fh "##sequence-region ",$slice->seq_region_name," ",$slice->start," ",$slice->end,"\n"; + } +} + +sub print_metadata { + my $self = shift; + my $text = shift; + my $fh = $self->{'filehandle'}; + print $fh "\n#".$text."\n"; +} + + +1;