0
|
1 =pod
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 GFFParser - simple gff3 parser.
|
|
16
|
|
17
|
|
18 =head1 AUTHOR
|
|
19
|
|
20 Monika Komorowska, 2012 - monika@ebi.ac.uk
|
|
21
|
|
22 =head1 SYNOPSIS
|
|
23
|
|
24 use strict;
|
|
25 use Bio::EnsEMBL::Utils::IO::GFFParser;
|
|
26 use IO::File;
|
|
27
|
|
28 my $file_name = "features.gff";
|
|
29 my $fh = IO::File->new($file_name, 'r');
|
|
30 my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);
|
|
31
|
|
32 my @header_lines = @{$parser->parse_header()};
|
|
33 #do something with the header lines array, e.g. print array elements
|
|
34
|
|
35 foreach my $header_line (@header_lines) {
|
|
36 print $header_line . "\n";
|
|
37 }
|
|
38 print "\n\n";
|
|
39 my $feature = $parser->parse_next_feature();
|
|
40
|
|
41 while (defined($feature) ) {
|
|
42
|
|
43 my %feature = %{$feature};
|
|
44
|
|
45 #do something with the feature, e.g. print hash keys and values
|
|
46 foreach my $key (keys %feature) {
|
|
47 if ($key ne 'attribute') {
|
|
48 print $key . " " . $feature{$key} ."\n";
|
|
49 } else {
|
|
50 print $key . "\n";
|
|
51 my %attribs = %{$feature{$key}};
|
|
52 foreach my $attrib_key (keys %attribs) {
|
|
53 printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($attribs{$attrib_key})}));
|
|
54
|
|
55 }
|
|
56 }
|
|
57 }
|
|
58 print "\n\n";
|
|
59 $feature = $parser->parse_next_feature();
|
|
60 }
|
|
61
|
|
62 my $sequence = $parser->parse_next_sequence();
|
|
63
|
|
64 while (defined($sequence)) {
|
|
65 my %sequence = %{$sequence};
|
|
66
|
|
67 foreach my $key (keys %sequence) {
|
|
68 print $key . " " . $sequence{$key} ."\n";
|
|
69 }
|
|
70 print "\n\n";
|
|
71
|
|
72 $sequence = $parser->parse_next_sequence();
|
|
73 }
|
|
74
|
|
75 $parser->close();
|
|
76
|
|
77 $fh->close();
|
|
78
|
|
79
|
|
80
|
|
81 =head1 DESCRIPTION
|
|
82
|
|
83 GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml
|
|
84
|
|
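85 Use the parse_header method to parse the GFF3 file header, parse_next_feature to parse the next feature line in the file, and parse_next_sequence to parse the next FASTA record once the ##FASTA directive has been reached.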
|
|
86
|
|
87 This class can be extended to convert a feature hash into a feature object reversing
|
|
88 the processing done by GFFSerializer.
|
|
89
|
|
90 =cut
|
|
91
|
|
92 package Bio::EnsEMBL::Utils::IO::GFFParser;
|
|
93 use strict;
|
|
94 use warnings;
|
|
95 use Bio::EnsEMBL::Utils::Exception;
|
|
96 use IO::File;
|
|
97 use URI::Escape;
|
|
98 use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
|
|
99
|
|
100
|
|
# Translates the GFF3 strand column (column 7) into the numeric strand
# values 1, 0 and -1 stored in the parsed feature hash.
# GFF3 allows four symbols: '+', '-', '.' (feature is not stranded) and
# '?' (strand is relevant but unknown). Both '.' and '?' map to 0; without
# the '.' entry unstranded features would end up with an undefined strand.
my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1', '.' => '0' );
|
|
102
|
|
=head2 new

    Constructor
    Arg [1]    : File handle to read GFF3 formatted data from
    Description: Creates a parser object that keeps the supplied file handle
                 under the 'filehandle' key.
    Exceptions : Throws if no file handle (undef) is supplied
    Returntype : Bio::EnsEMBL::Utils::IO::GFFParser

=cut

sub new {
    my ($class, $filehandle) = @_;

    my $self = bless { filehandle => $filehandle }, $class;

    # Without something to read from the parser is useless, so fail early.
    throw("GFFParser requires a valid filehandle to a GFF3 formatted file")
        unless defined $self->{'filehandle'};

    return $self;
}
|
|
124
|
|
=head2 parse_header

    Arg [1]    : none
    Description: Reads lines for as long as they start with '#' or whitespace
                 and returns an arrayref holding every line that starts with
                 '##', one line per array element. Reading stops at the
                 ##FASTA directive or at the first line that is not part of
                 the header; that line is remembered on the object so the
                 next read hands it out again. A warning is issued if a
                 gff-version directive names a version other than 3.
    Returntype : Arrayref of GFF3 file header lines

=cut

sub parse_header {

    my $self = shift;

    my $next_line;
    my @header_lines;

    # The header section is made of lines starting with '#' or whitespace.
    # Note: the class is [\#\s]; a '|' inside a character class is a literal
    # pipe, not alternation, and would wrongly swallow lines starting with '|'.
    while (defined($next_line = $self->_read_line()) && ($next_line =~ /^[\#\s]/)) {

        #stop parsing the header if ##FASTA directive encountered
        last if ($next_line =~ /\#\#FASTA/ );

        #header lines start with ## (except for the ##FASTA directive indicating sequence section)
        if ($next_line =~ /^[\#]{2}/ ) {
            push @header_lines, $next_line;
            if ($next_line =~ /gff-version\s+(\d+)/) {
                if ($1 != 3) {
                    warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");
                }
            }
        }
    }

    # The line that ended the loop has already been consumed from the file
    # handle; keep it so that _read_line returns it on the next call.
    if (defined($next_line)) {
        $self->{'first_non_header_line'} = $next_line;
    }

    return \@header_lines;
}
|
|
162
|
|
=head2 parse_next_feature

    Arg [1]    : none
    Description: Reads on to the next feature line, skipping comment lines,
                 blank lines and lines starting with '//', and returns it as
                 a hashref in the format -
                 {
                    seqid     => scalar,
                    source    => scalar,
                    type      => scalar,
                    start     => scalar,
                    end       => scalar,
                    score     => scalar,
                    strand    => scalar,
                    phase     => scalar,
                    attribute => hashref,
                 }
                 seqid, source, type and the attribute names and values are
                 URI-unescaped. strand is translated through
                 %strand_conversion and is undef for a symbol that table does
                 not contain. Each attribute value is a scalar, or an
                 arrayref when the value holds several comma separated
                 entries; an attribute written without '=' has the value
                 undef. The attribute key is only present when column 9
                 holds something other than the '.' placeholder.
                 Returns undef when the ##FASTA directive or the end of the
                 input is reached.
    Returntype : Hashref of a GFF3 feature line

=cut

sub parse_next_feature {

    my $self = shift;

    my $feature_line;

    while (my $next_line = $self->_read_line()) {

        #stop parsing features if ##FASTA directive
        last if ($next_line =~ /\#\#FASTA/);

        next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
                 $next_line =~ /^\/\//);

        $feature_line = $next_line;
        last;
    }

    return undef unless $feature_line;

    #strip off trailing comments
    $feature_line =~ s/\#.*//;

    my @chunks = split(/\t/, $feature_line);

    # A truncated line may lack the strand column; avoid looking up an
    # undefined hash key in that case.
    my $strand = defined($chunks[6]) ? $strand_conversion{$chunks[6]} : undef;

    my %feature = (
        'seqid'  => uri_unescape($chunks[0]),
        'source' => uri_unescape($chunks[1]),
        'type'   => uri_unescape($chunks[2]),
        'start'  => $chunks[3],
        'end'    => $chunks[4],
        'score'  => $chunks[5],
        'strand' => $strand,
        'phase'  => $chunks[7]
    );

    # '.' is the GFF3 placeholder for an empty column, not an attribute
    if ($chunks[8] && $chunks[8] ne '.') {
        my %attributes;
        foreach my $attribute (split( /;/, $chunks[8] )) {

            # doubled or leading ';' produce empty entries - ignore them
            next if ($attribute =~ /^\s*$/);

            # limit of 2 keeps a value that itself contains '=' in one piece
            my ( $name, $value ) = split( /=/, $attribute, 2 );
            $name = uri_unescape($name);

            # a flag attribute without '=' has no value to split
            my @split_values;
            if (defined($value)) {
                @split_values = map { uri_unescape($_) } split(/,/, $value);
            }

            if (scalar(@split_values) > 1) {
                $attributes{$name} = \@split_values;
            }
            else {
                $attributes{$name} = $split_values[0];
            }
        }
        $feature{'attribute'} = \%attributes;
    }

    return \%feature;
}
|
|
244
|
|
=head2 parse_next_sequence

    Arg [1]    : none
    Description: Reads the next FASTA record and returns it as a hashref in
                 the format -
                 {
                    header   => scalar,
                    sequence => scalar,
                 }
                 header is the line starting with '>' and sequence is the
                 concatenation of the lines that follow it. Lines starting
                 with '#' or '//' and blank lines are skipped. When the
                 header of the following record is met it is remembered on
                 the object so that the next call starts with it.
                 Returns undef when neither a header nor any sequence could
                 be read.
    Returntype : Hashref of a GFF3 sequence line

=cut

sub parse_next_sequence {
    my ($self) = @_;

    my ($fasta_header, $residues);

    LINE:
    while (my $line = $self->_read_line()) {

        # comments, blank lines and '//' separators carry no sequence data
        next LINE if ($line =~ /^\#/ || $line =~ /^\s*$/ ||
                      $line =~ /^\/\//);

        # anything that is not a FASTA header is part of the sequence
        if ($line !~ /^>/) {
            $residues .= $line;
            next LINE;
        }

        # a second header marks the start of the next record: push it back
        if ($fasta_header) {
            $self->{'next_fasta_header'} = $line;
            last LINE;
        }

        $fasta_header = $line;
    }

    if (!$residues && !$fasta_header) {
        return undef;
    }

    return { header => $fasta_header, sequence => $residues };
}
|
|
291
|
|
292
|
|
# Internal line reader shared by the parse_* methods.
#
# Returns the next non-empty line with its line ending removed, or undef
# once the file handle is exhausted. A line previously pushed back under
# 'first_non_header_line' (checked first) or 'next_fasta_header' is handed
# out, and its slot cleared, before anything new is read from the handle.
#
# Empty lines are skipped with a loop rather than by recursion, so a long
# run of blank lines does not build up a deep call stack. A trailing
# carriage return is removed as well, so files with CRLF line endings do
# not leak "\r" into the last column.
sub _read_line {

    my $self = shift;
    my $fh = $self->{'filehandle'};

    foreach my $stash_key ('first_non_header_line', 'next_fasta_header') {
        if (defined($self->{$stash_key})) {
            my $stashed = $self->{$stash_key};
            $self->{$stash_key} = undef;
            return $stashed;
        }
    }

    while (defined(my $line = <$fh>)) {
        chomp $line;
        $line =~ s/\r\z//;

        # Deliberately a truth test: callers loop while the returned line is
        # true, so an empty line (and a line that is just "0") is skipped
        # here instead of being returned and ending their loop early.
        return $line if $line;
    }

    return undef;
}
|
|
320
|
|
=head2 close

    Arg [1]    : none
    Description: Drops the parser's reference to the file handle by setting
                 the 'filehandle' entry to undef. No close is called on the
                 handle explicitly.
    Returntype : undef

=cut

sub close {
    my ($self) = @_;

    return $self->{'filehandle'} = undef;
}
|
|
327
|
|
328 1;
|