0
|
1 =pod
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 GFFSerializer - Feature to GFF converter
|
|
16
|
|
17 =head1 AUTHOR
|
|
18
|
|
19 Kieron Taylor, 2011 - ktaylor@ebi.ac.uk
|
|
20
|
|
21 =head1 SYNOPSIS
|
|
22
|
|
23 use Bio::EnsEMBL::Utils::IO::GFFSerializer;
|
|
24 use Bio::EnsEMBL::Utils::BiotypeMapper;
|
|
25
|
|
26 my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' );
|
|
27 my $biotype_mapper = new BiotypeMapper($ontology_adaptor);
|
|
28 my $serializer = new GFFSerializer($biotype_mapper,$output_fh);
|
|
29
|
|
30 my $variation_feature_adaptor = $registry->get_adaptor( $config{'species'}, 'variation', 'variationfeature' );
|
|
31 $serializer->print_metadata("Variation Features:");
|
|
32 my $iterator = $variation_feature_adaptor->fetch_Iterator_by_Slice($slice,undef,60000);
|
|
33 $serializer->print_feature_Iterator($iterator);
|
|
34
|
|
35 =head1 DESCRIPTION
|
|
36
|
|
37 Subclass of Serializer that can turn a feature into a line for the GFF3 format. Requires
|
|
38 a BiotypeMapper in order to translate biotypes to SO terms.
|
|
39
|
|
40 =cut
|
|
41
|
|
42 package Bio::EnsEMBL::Utils::IO::GFFSerializer;
|
|
43 use strict;
|
|
44 use warnings;
|
|
45 use Bio::EnsEMBL::Utils::Exception;
|
|
46 use Bio::EnsEMBL::Utils::BiotypeMapper;
|
|
47 use URI::Escape;
|
|
48 use Bio::EnsEMBL::Utils::IO::FeatureSerializer;
|
|
49
|
|
50 use base qw(Bio::EnsEMBL::Utils::IO::FeatureSerializer);
|
|
51
|
|
52 my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-');
|
|
53
|
|
54 =head2 new
|
|
55
|
|
56 Constructor
|
|
57 Arg [1] : Ontology Adaptor
|
|
58 Arg [2] : Optional File handle
|
|
59
|
|
60 Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer
|
|
61
|
|
62 =cut
|
|
63
|
|
64 sub new {
|
|
65 my $class = shift;
|
|
66 my $self = {
|
|
67 ontology_adaptor => shift,
|
|
68 filehandle => shift,
|
|
69 };
|
|
70 bless $self, $class;
|
|
71 if ( ref($self->{'ontology_adaptor'}) ne "Bio::EnsEMBL::DBSQL::OntologyTermAdaptor" ) {
|
|
72 throw("GFF format requires an instance of Bio::EnsEMBL::DBSQL::OntologyTermAdaptor to function. See also Bio::EnsEMBL::Utils::BiotypeMapper");
|
|
73 }
|
|
74 $self->{'mapper'} = new Bio::EnsEMBL::Utils::BiotypeMapper($self->{'ontology_adaptor'});
|
|
75
|
|
76 if (!defined ($self->{'filehandle'})) {
|
|
77 # no file handle, let the handle point to a copy of STDOUT instead
|
|
78 open $self->{'filehandle'}, ">&STDOUT";
|
|
79 $self->{'stdout'} = 1;
|
|
80 }
|
|
81 return $self;
|
|
82 }
|
|
83
|
|
84 =head2 print_feature
|
|
85
|
|
86 Arg [1] : Bio::EnsEMBL::Feature, subclass or related pseudo-feature
|
|
87 Example : $reporter->print_feature($feature,$slice_start_coordinate,"X")
|
|
88 Description: Asks a feature for its summary, and generates a GFF3
|
|
89 compliant entry to hand back again
|
|
90 Additional attributes are handed through to column 9 of the
|
|
91 output using exact spelling and capitalisation of the
|
|
92 feature-supplied hash.
|
|
93 Returntype : none
|
|
94 =cut
|
|
95
|
|
96 sub print_feature {
|
|
97 my $self = shift;
|
|
98 my $feature = shift;
|
|
99 my $biotype_mapper = $self->{'mapper'};
|
|
100
|
|
101 my $text_buffer = "";
|
|
102 if ($feature->can('summary_as_hash') ) {
|
|
103 my %summary = %{$feature->summary_as_hash};
|
|
104 my $row = "";
|
|
105 # Column 1 - seqname, the name of the sequence/chromosome the feature is on. Landmark for start below
|
|
106 if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";}
|
|
107 $row .= $summary{'seq_region_name'}."\t";
|
|
108
|
|
109 # Column 2 - source, complicated with Ensembl not being the originator of all data
|
|
110 $row .= "EnsEMBL\t";
|
|
111
|
|
112 # Column 3 - feature, the ontology term for the kind of feature this row is
|
|
113 my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature);
|
|
114 $row .= $so_term."\t";
|
|
115
|
|
116 # Column 4 - start, the start coordinate of the feature, here shifted to chromosomal coordinates
|
|
117 # Start and end must be in ascending order for GFF. Circular genomes require the length of
|
|
118 # the circuit to be added on.
|
|
119 if ($summary{'start'} > $summary{'end'}) {
|
|
120 #assumes this is not a Compara circular sequence and can treat is as a Feature
|
|
121 if ($feature->slice() && $feature->slice()->is_circular() ) {
|
|
122 $summary{'end'} = $summary{'end'} + $feature->seq_region_length;
|
|
123 }
|
|
124 # non-circular, but end still before start
|
|
125 else {$summary{'end'} = $summary{'start'};}
|
|
126 }
|
|
127 $row .= $summary{'start'} . "\t";
|
|
128
|
|
129 # Column 5 - end, coordinates (absolute) for the end of this feature
|
|
130 $row .= $summary{'end'} . "\t";
|
|
131
|
|
132 # Column 6 - score, for variations only.
|
|
133 if (exists($summary{'score'})) {
|
|
134 $row .= $summary{'score'}."\t";
|
|
135 }
|
|
136 else {
|
|
137 $row .= ".\t";
|
|
138 }
|
|
139
|
|
140 # Column 7 - strand, up or down
|
|
141 if (exists($summary{'strand'})) {
|
|
142 $row .= $strand_conversion{$summary{'strand'}}."\t";
|
|
143 }
|
|
144 else {
|
|
145 $row .= ".\t";
|
|
146 }
|
|
147
|
|
148 # Column 8 - reading frame, necessary only for Exons
|
|
149 $row .= ".\t";
|
|
150
|
|
151 # Column 9 - the 'other' section for all GFF and GVF compliant attributes
|
|
152 # We include Stable ID and biotype where possible to supplement the information in the other columns
|
|
153 delete $summary{'seq_region_start'};
|
|
154 delete $summary{'seq_region_name'};
|
|
155 delete $summary{'start'};
|
|
156 delete $summary{'end'};
|
|
157 delete $summary{'strand'};
|
|
158 delete $summary{'score'};
|
|
159 # Slice the hash for specific keys in GFF-friendly order
|
|
160 my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular);
|
|
161 my @ordered_values = @summary{@ordered_keys};
|
|
162 while (my $key = shift @ordered_keys) {
|
|
163 my $value = shift @ordered_values;
|
|
164 if ($value) {
|
|
165 $row .= $key."=".uri_escape($value,'\t\n\r;=%&,').";";
|
|
166 }
|
|
167 delete $summary{$key};
|
|
168 }
|
|
169 # Catch the remaining keys, containing whatever else the Feature provided
|
|
170 foreach my $attribute ( keys(%summary)) {
|
|
171 if (ref $summary{$attribute} eq "ARRAY") {
|
|
172 $row .= $attribute."=".join (',',@{$summary{$attribute}}) . ";"
|
|
173 }
|
|
174 else {
|
|
175 if ($summary{$attribute}) { $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,') . ";"; }
|
|
176 }
|
|
177 }
|
|
178 # trim off any trailing commas left by the ordered keys stage above:
|
|
179 $text_buffer .= $row."\n";
|
|
180 }
|
|
181 else {
|
|
182 warning("Feature failed to self-summarise");
|
|
183 }
|
|
184 #filehandle is inherited
|
|
185 my $fh = $self->{'filehandle'};
|
|
186 print $fh $text_buffer;
|
|
187 }
|
|
188
|
|
189 =head2 print_main_header
|
|
190
|
|
191 Arg [1] : Arrayref of slices going into the file.
|
|
192 Description: Printing the header text or metadata required for GFF,
|
|
193 using a list of slices to be written
|
|
194 Returntype : None
|
|
195 =cut
|
|
196
|
|
197 sub print_main_header {
|
|
198 my $self = shift;
|
|
199 my $arrayref_of_slices = shift;
|
|
200 my $fh = $self->{'filehandle'};
|
|
201
|
|
202 print $fh "##gff-version 3\n";
|
|
203 foreach my $slice (@{$arrayref_of_slices}) {
|
|
204 if (not defined($slice)) { warning("Slice not defined.\n"); return;}
|
|
205 print $fh "##sequence-region ",$slice->seq_region_name," ",$slice->start," ",$slice->end,"\n";
|
|
206 }
|
|
207 }
|
|
208
|
|
209 sub print_metadata {
|
|
210 my $self = shift;
|
|
211 my $text = shift;
|
|
212 my $fh = $self->{'filehandle'};
|
|
213 print $fh "\n#".$text."\n";
|
|
214 }
|
|
215
|
|
216
|
|
217 1;
|