comparison variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFParser.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 LICENSE
4
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
6 Genome Research Limited. All rights reserved.
7
8 This software is distributed under a modified Apache license.
9 For license details, please see
10
11 http://www.ensembl.org/info/about/code_licence.html
12
13 =head1 NAME
14
15 GFFParser - simple gff3 parser.
16
17
18 =head1 AUTHOR
19
20 Monika Komorowska, 2012 - monika@ebi.ac.uk
21
22 =head1 SYNOPSIS
23
24 use strict;
25 use Bio::EnsEMBL::Utils::IO::GFFParser;
26 use IO::File;
27
28 my $file_name = "features.gff";
29 my $fh = IO::File->new($file_name, 'r');
30 my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);
31
32 my @header_lines = @{$parser->parse_header()};
33 #do something with the header lines array, e.g. print array elements
34
35 foreach my $header_line (@header_lines) {
36 print $header_line . "\n";
37 }
38 print "\n\n";
39 my $feature = $parser->parse_next_feature();
40
41 while (defined($feature) ) {
42
43 my %feature = %{$feature};
44
45 #do something with the feature, e.g. print hash keys and values
46 foreach my $key (keys %feature) {
47 if ($key ne 'attribute') {
48 print $key . " " . $feature{$key} ."\n";
49 } else {
50 print $key . "\n";
51 my %attribs = %{$feature{$key}};
52 foreach my $attrib_key (keys %attribs) {
53 printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
54
55 }
56 }
57 }
58 print "\n\n";
59 $feature = $parser->parse_next_feature();
60 }
61
62 my $sequence = $parser->parse_next_sequence();
63
64 while (defined($sequence)) {
65 my %sequence = %{$sequence};
66
67 foreach my $key (keys %sequence) {
68 print $key . " " . $sequence{$key} ."\n";
69 }
70 print "\n\n";
71
72 $sequence = $parser->parse_next_sequence();
73 }
74
75 $parser->close();
76
77 $fh->close();
78
79
80
81 =head1 DESCRIPTION
82
83 GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml
84
85 Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file.
86
87 This class can be extended to convert a feature hash into a feature object reversing
88 the processing done by GFFSerializer.
89
90 =cut
91
92 package Bio::EnsEMBL::Utils::IO::GFFParser;
93 use strict;
94 use warnings;
95 use Bio::EnsEMBL::Utils::Exception;
96 use IO::File;
97 use URI::Escape;
98 use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
99
100
101 my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1');
102
103 =head2 new
104
105 Constructor
106 Arg [1] : File handle
107
108 Returntype : Bio::EnsEMBL::Utils::IO::GFFParser
109
110 =cut
111
112 sub new {
113 my $class = shift;
114 my $self = {
115 filehandle => shift,
116 };
117 bless $self, $class;
118 if (!defined($self->{'filehandle'})) {
119 throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
120 }
121 return $self;
122
123 }
124
125 =head2 parse_header
126
127 Arg [1] : File handle
128 Description: Returns a arrayref with each header line stored in array element
129 Returntype : Arrayref of GFF3 file header lines
130
131 =cut
132
133 sub parse_header {
134
135 my $self = shift;
136
137 my $next_line;
138 my @header_lines;
139
140 while (($next_line = $self->_read_line()) && ($next_line =~ /^[\#|\s]/) ) {
141
142 #stop parsing features if ##FASTA directive encountered
143 last if ($next_line =~ /\#\#FASTA/ );
144
145 #header lines start with ## (except for the ##FASTA directive indicating sequence section)
146 if ($next_line =~ /^[\#]{2}/ ) {
147 push @header_lines, $next_line;
148 if ($next_line =~ /gff-version\s+(\d+)/) {
149 if ($1 != 3) {
150 warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");
151 }
152 }
153 }
154 }
155
156 if (defined($next_line)) {
157 $self->{'first_non_header_line'} = $next_line;
158 }
159 return \@header_lines;
160
161 }
162
163 =head2 parse_next_feature
164
165 Arg [1] : File handle
166 Description: Returns a hashref in the format -
167 {
168 seqid => scalar,
169 source => scalar,
170 type => scalar,
171 start => scalar,
172 end => scalar,
173 score => scalar,
174 strand => scalar,
175 phase => scalar,
176 attribute => hashref,
177
178 }
179 Returntype : Hashref of a GFF3 feature line
180
181 =cut
182
183 sub parse_next_feature {
184
185 my $self = shift;
186
187 my $next_line;
188 my $feature_line;
189
190 while (($next_line = $self->_read_line() ) && defined($next_line) ) {
191
192 #stop parsing features if ##FASTA directive
193 last if ($next_line =~ /\#\#FASTA/);
194
195
196 next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
197 $next_line =~ /^\/\//);
198
199 $feature_line = $next_line;
200 last;
201 }
202
203 return undef unless $feature_line;
204
205 my %feature;
206 my %attribute;
207
208
209 #strip off trailing comments
210 $feature_line =~ s/\#.*//;
211
212 my @chunks = split(/\t/, $feature_line);
213
214 %feature = (
215 'seqid' => uri_unescape($chunks[0]),
216 'source' => uri_unescape($chunks[1]),
217 'type' => uri_unescape($chunks[2]),
218 'start' => $chunks[3],
219 'end' => $chunks[4],
220 'score' => $chunks[5],
221 'strand' => $strand_conversion{$chunks[6]},
222 'phase' => $chunks[7]
223 );
224
225 if ($chunks[8]) {
226 my @attributes = split( /;/, $chunks[8] );
227 my %attributes;
228 foreach my $attribute (@attributes) {
229 my ( $name, $value ) = split( /=/, $attribute );
230 $name = uri_unescape($name);
231 my @split_values = map { uri_unescape($_) } split(/,/, $value);
232 if(scalar(@split_values) > 1) {
233 $attributes{$name} = \@split_values;
234 }
235 else {
236 $attributes{$name} = $split_values[0];
237 }
238 }
239 $feature{'attribute'} = \%attributes;
240 }
241
242 return \%feature;
243 }
244
245 =head2 parse_next_sequence
246
247 Arg [1] : File handle
248 Description: Returns a hashref in the format -
249 {
250 header => scalar,
251 sequence => scalar,
252
253 }
254 Returntype : Hashref of a GFF3 sequence line
255
256 =cut
257
258 sub parse_next_sequence {
259
260 my $self = shift;
261
262 my $next_line;
263 my $sequence;
264 my $header;
265
266 while (($next_line = $self->_read_line() ) && defined($next_line) ) {
267
268 next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
269 $next_line =~ /^\/\//);
270
271 if ($next_line =~ /^>/) {
272 if ($header) {
273 #next fasta header encountered
274 $self->{'next_fasta_header'} = $next_line;
275 last;
276
277 } else {
278 $header = $next_line;
279 }
280 } else {
281 $sequence .= $next_line;
282 }
283 }
284
285 return undef unless ($sequence || $header);
286
287 my %sequence = (header => $header , sequence => $sequence );
288
289 return \%sequence;
290 }
291
292
293 sub _read_line {
294
295 my $self = shift;
296 my $fh = $self->{'filehandle'};
297
298 my $line;
299
300 if (defined($self->{'first_non_header_line'})) {
301 $line = $self->{'first_non_header_line'};
302 $self->{'first_non_header_line'} = undef;
303 } elsif ( defined($self->{'next_fasta_header'} )) {
304 $line = $self->{'next_fasta_header'};
305 $self->{'next_fasta_header'} = undef;
306 }
307 else {
308 $line = <$fh>;
309 if (defined($line)) {
310 chomp $line;
311 if (!$line) {
312 #parse next line if current line is empty
313 $line = $self->_read_line();
314 }
315 }
316 }
317
318 return $line;
319 }
320
321 sub close {
322
323 my $self = shift;
324 $self->{"filehandle"} = undef;
325
326 }
327
328 1;