comparison variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 NAME
14
15 GFFSerializer - Feature to GFF converter
16
17 =head1 AUTHOR
18
19 Kieron Taylor, 2011 - ktaylor@ebi.ac.uk
20
21 =head1 SYNOPSIS
22
23 use Bio::EnsEMBL::Utils::IO::GFFSerializer;
24 use Bio::EnsEMBL::Utils::BiotypeMapper;
25
26 my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' );
27 my $biotype_mapper = new BiotypeMapper($ontology_adaptor);
28 my $serializer = new GFFSerializer($biotype_mapper,$output_fh);
29
30 my $variation_feature_adaptor = $registry->get_adaptor( $config{'species'}, 'variation', 'variationfeature' );
31 $serializer->print_metadata("Variation Features:");
32 my $iterator = $variation_feature_adaptor->fetch_Iterator_by_Slice($slice,undef,60000);
33 $serializer->print_feature_Iterator($iterator);
34
35 =head1 DESCRIPTION
36
37 Subclass of Serializer that can turn a feature into a line for the GFF3 format. Requires
38 a BiotypeMapper in order to translate biotypes to SO terms.
39
40 =cut
41
42 package Bio::EnsEMBL::Utils::IO::GFFSerializer;
43 use strict;
44 use warnings;
45 use Bio::EnsEMBL::Utils::Exception;
46 use Bio::EnsEMBL::Utils::BiotypeMapper;
47 use URI::Escape;
48 use Bio::EnsEMBL::Utils::IO::FeatureSerializer;
49
50 use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer);
51
52 my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-');
53
54 =head2 new
55
56 Constructor
57 Arg [1] : Ontology Adaptor
58 Arg [2] : Optional File handle
59
60 Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer
61
62 =cut
63
64 sub new {
65 my $class = shift;
66 my $self = {
67 ontology_adaptor => shift,
68 filehandle => shift,
69 };
70 bless $self, $class;
71 if ( ref($self->{'ontology_adaptor'}) ne "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" ) {
72 throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function. See also Bio::EnsEMBL::Utils::BiotypeMapper");
73 }
74 $self->{'mapper'} = new Bio::EnsEMBL::Utils::BiotypeMapper($self->{'ontology_adaptor'});
75
76 if (!defined ($self->{'filehandle'})) {
77 # no file handle, let the handle point to a copy of STDOUT instead
78 open $self->{'filehandle'}, ">&STDOUT";
79 $self->{'stdout'} = 1;
80 }
81 return $self;
82 }
83
84 =head2 print_feature
85
86 Arg [1] : Bio::EnsEMBL::Feature, subclass or related pseudo-feature
87 Example : $reporter->print_feature($feature,$slice_start_coordinate,"X")
88 Description: Asks a feature for its summary, and generates a GFF3
89 compliant entry to hand back again
90 Additional attributes are handed through to column 9 of the
91 output using exact spelling and capitalisation of the
92 feature-supplied hash.
93 Returntype : none
94 =cut
95
96 sub print_feature {
97 my $self = shift;
98 my $feature = shift;
99 my $biotype_mapper = $self->{'mapper'};
100
101 my $text_buffer = "";
102 if ($feature->can('summary_as_hash') ) {
103 my %summary = %{$feature->summary_as_hash};
104 my $row = "";
105 # Column 1 - seqname, the name of the sequence/chromosome the feature is on. Landmark for start below
106 if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";}
107 $row .= $summary{'seq_region_name'}."\t";
108
109 # Column 2 - source, complicated with Ensembl not being the originator of all data
110 $row .= "EnsEMBL\t";
111
112 # Column 3 - feature, the ontology term for the kind of feature this row is
113 my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature);
114 $row .= $so_term."\t";
115
116 # Column 4 - start, the start coordinate of the feature, here shifted to chromosomal coordinates
117 # Start and end must be in ascending order for GFF. Circular genomes require the length of
118 # the circuit to be added on.
119 if ($summary{'start'} > $summary{'end'}) {
120 #assumes this is not a Compara circular sequence and can treat is as a Feature
121 if ($feature->slice() && $feature->slice()->is_circular() ) {
122 $summary{'end'} = $summary{'end'} + $feature->seq_region_length;
123 }
124 # non-circular, but end still before start
125 else {$summary{'end'} = $summary{'start'};}
126 }
127 $row .= $summary{'start'} . "\t";
128
129 # Column 5 - end, coordinates (absolute) for the end of this feature
130 $row .= $summary{'end'} . "\t";
131
132 # Column 6 - score, for variations only.
133 if (exists($summary{'score'})) {
134 $row .= $summary{'score'}."\t";
135 }
136 else {
137 $row .= ".\t";
138 }
139
140 # Column 7 - strand, up or down
141 if (exists($summary{'strand'})) {
142 $row .= $strand_conversion{$summary{'strand'}}."\t";
143 }
144 else {
145 $row .= ".\t";
146 }
147
148 # Column 8 - reading frame, necessary only for Exons
149 $row .= ".\t";
150
151 # Column 9 - the 'other' section for all GFF and GVF compliant attributes
152 # We include Stable ID and biotype where possible to supplement the information in the other columns
153 delete $summary{'seq_region_start'};
154 delete $summary{'seq_region_name'};
155 delete $summary{'start'};
156 delete $summary{'end'};
157 delete $summary{'strand'};
158 delete $summary{'score'};
159 # Slice the hash for specific keys in GFF-friendly order
160 my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular);
161 my @ordered_values = @summary{@ordered_keys};
162 while (my $key = shift @ordered_keys) {
163 my $value = shift @ordered_values;
164 if ($value) {
165 $row .= $key."=".uri_escape($value,'\t\n\r;=%&,').";";
166 }
167 delete $summary{$key};
168 }
169 # Catch the remaining keys, containing whatever else the Feature provided
170 foreach my $attribute ( keys(%summary)) {
171 if (ref $summary{$attribute} eq "ARRAY") {
172 $row .= $attribute."=".join (',',@{$summary{$attribute}}) . ";"
173 }
174 else {
175 if ($summary{$attribute}) { $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,') . ";"; }
176 }
177 }
178 # trim off any trailing commas left by the ordered keys stage above:
179 $text_buffer .= $row."\n";
180 }
181 else {
182 warning("Feature failed to self-summarise");
183 }
184 #filehandle is inherited
185 my $fh = $self->{'filehandle'};
186 print $fh $text_buffer;
187 }
188
189 =head2 print_main_header
190
191 Arg [1] : Arrayref of slices going into the file.
192 Description: Printing the header text or metadata required for GFF,
193 using a list of slices to be written
194 Returntype : None
195 =cut
196
197 sub print_main_header {
198 my $self = shift;
199 my $arrayref_of_slices = shift;
200 my $fh = $self->{'filehandle'};
201
202 print $fh "##gff-version 3\n";
203 foreach my $slice (@{$arrayref_of_slices}) {
204 if (not defined($slice)) { warning("Slice not defined.\n"); return;}
205 print $fh "##sequence-region ",$slice->seq_region_name," ",$slice->start," ",$slice->end,"\n";
206 }
207 }
208
209 sub print_metadata {
210 my $self = shift;
211 my $text = shift;
212 my $fh = $self->{'filehandle'};
213 print $fh "\n#".$text."\n";
214 }
215
216
217 1;