Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm @ 3:d30fa12e4cc5 default tip
Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author | devteam <devteam@galaxyproject.org> |
---|---|
date | Mon, 13 Jan 2014 10:38:30 -0500 |
parents | 1f6dce3d34e0 |
children |
line wrap: on
line source
=pod =head1 LICENSE Copyright (c) 1999-2012 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 NAME GFFSerializer - Feature to GFF converter =head1 AUTHOR Kieron Taylor, 2011 - ktaylor@ebi.ac.uk =head1 SYNOPSIS use Bio::EnsEMBL::Utils::IO::GFFSerializer; use Bio::EnsEMBL::Utils::BiotypeMapper; my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' ); my $biotype_mapper = new BiotypeMapper($ontology_adaptor); my $serializer = new GFFSerializer($biotype_mapper,$output_fh); my $variation_feature_adaptor = $registry->get_adaptor( $config{'species'}, 'variation', 'variationfeature' ); $serializer->print_metadata("Variation Features:"); my $iterator = $variation_feature_adaptor->fetch_Iterator_by_Slice($slice,undef,60000); $serializer->print_feature_Iterator($iterator); =head1 DESCRIPTION Subclass of Serializer that can turn a feature into a line for the GFF3 format. Requires a BiotypeMapper in order to translate biotypes to SO terms. =cut package Bio::EnsEMBL::Utils::IO::GFFSerializer; use strict; use warnings; use Bio::EnsEMBL::Utils::Exception; use Bio::EnsEMBL::Utils::BiotypeMapper; use URI::Escape; use Bio::EnsEMBL::Utils::IO::FeatureSerializer; use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer); my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-'); =head2 new Constructor Arg [1] : Ontology Adaptor Arg [2] : Optional File handle Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer =cut sub new { my $class = shift; my $self = { ontology_adaptor => shift, filehandle => shift, }; bless $self, $class; if ( ref($self->{'ontology_adaptor'}) ne "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" ) { throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function. See also Bio::EnsEMBL::Utils::BiotypeMapper"); } $self->{'mapper'} = new Bio::EnsEMBL::Utils::BiotypeMapper($self->{'ontology_adaptor'}); if (!defined ($self->{'filehandle'})) { # no file handle, let the handle point to a copy of STDOUT instead open $self->{'filehandle'}, ">&STDOUT"; $self->{'stdout'} = 1; } return $self; } =head2 print_feature Arg [1] : Bio::EnsEMBL::Feature, subclass or related pseudo-feature Example : $reporter->print_feature($feature,$slice_start_coordinate,"X") Description: Asks a feature for its summary, and generates a GFF3 compliant entry to hand back again Additional attributes are handed through to column 9 of the output using exact spelling and capitalisation of the feature-supplied hash. Returntype : none =cut sub print_feature { my $self = shift; my $feature = shift; my $biotype_mapper = $self->{'mapper'}; my $text_buffer = ""; if ($feature->can('summary_as_hash') ) { my %summary = %{$feature->summary_as_hash}; my $row = ""; # Column 1 - seqname, the name of the sequence/chromosome the feature is on. Landmark for start below if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";} $row .= $summary{'seq_region_name'}."\t"; # Column 2 - source, complicated with Ensembl not being the originator of all data $row .= "EnsEMBL\t"; # Column 3 - feature, the ontology term for the kind of feature this row is my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature); $row .= $so_term."\t"; # Column 4 - start, the start coordinate of the feature, here shifted to chromosomal coordinates # Start and end must be in ascending order for GFF. Circular genomes require the length of # the circuit to be added on. if ($summary{'start'} > $summary{'end'}) { #assumes this is not a Compara circular sequence and can treat is as a Feature if ($feature->slice() && $feature->slice()->is_circular() ) { $summary{'end'} = $summary{'end'} + $feature->seq_region_length; } # non-circular, but end still before start else {$summary{'end'} = $summary{'start'};} } $row .= $summary{'start'} . "\t"; # Column 5 - end, coordinates (absolute) for the end of this feature $row .= $summary{'end'} . "\t"; # Column 6 - score, for variations only. if (exists($summary{'score'})) { $row .= $summary{'score'}."\t"; } else { $row .= ".\t"; } # Column 7 - strand, up or down if (exists($summary{'strand'})) { $row .= $strand_conversion{$summary{'strand'}}."\t"; } else { $row .= ".\t"; } # Column 8 - reading frame, necessary only for Exons $row .= ".\t"; # Column 9 - the 'other' section for all GFF and GVF compliant attributes # We include Stable ID and biotype where possible to supplement the information in the other columns delete $summary{'seq_region_start'}; delete $summary{'seq_region_name'}; delete $summary{'start'}; delete $summary{'end'}; delete $summary{'strand'}; delete $summary{'score'}; # Slice the hash for specific keys in GFF-friendly order my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular); my @ordered_values = @summary{@ordered_keys}; while (my $key = shift @ordered_keys) { my $value = shift @ordered_values; if ($value) { $row .= $key."=".uri_escape($value,'\t\n\r;=%&,').";"; } delete $summary{$key}; } # Catch the remaining keys, containing whatever else the Feature provided foreach my $attribute ( keys(%summary)) { if (ref $summary{$attribute} eq "ARRAY") { $row .= $attribute."=".join (',',@{$summary{$attribute}}) . ";" } else { if ($summary{$attribute}) { $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,') . ";"; } } } # trim off any trailing commas left by the ordered keys stage above: $text_buffer .= $row."\n"; } else { warning("Feature failed to self-summarise"); } #filehandle is inherited my $fh = $self->{'filehandle'}; print $fh $text_buffer; } =head2 print_main_header Arg [1] : Arrayref of slices going into the file. Description: Printing the header text or metadata required for GFF, using a list of slices to be written Returntype : None =cut sub print_main_header { my $self = shift; my $arrayref_of_slices = shift; my $fh = $self->{'filehandle'}; print $fh "##gff-version 3\n"; foreach my $slice (@{$arrayref_of_slices}) { if (not defined($slice)) { warning("Slice not defined.\n"); return;} print $fh "##sequence-region ",$slice->seq_region_name," ",$slice->start," ",$slice->end,"\n"; } } sub print_metadata { my $self = shift; my $text = shift; my $fh = $self->{'filehandle'}; print $fh "\n#".$text."\n"; } 1;