Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFParser.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =pod | |
| 2 | |
| 3 =head1 LICENSE | |
| 4 | |
| 5 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
| 6 Genome Research Limited. All rights reserved. | |
| 7 | |
| 8 This software is distributed under a modified Apache license. | |
| 9 For license details, please see | |
| 10 | |
| 11 http://www.ensembl.org/info/about/code_licence.html | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 GFFParser - simple gff3 parser. | |
| 16 | |
| 17 | |
| 18 =head1 AUTHOR | |
| 19 | |
| 20 Monika Komorowska, 2012 - monika@ebi.ac.uk | |
| 21 | |
| 22 =head1 SYNOPSIS | |
| 23 | |
| 24 use strict; | |
| 25 use Bio::EnsEMBL::Utils::IO::GFFParser; | |
| 26 use IO::File; | |
| 27 | |
| 28 my $file_name = "features.gff"; | |
| 29 my $fh = IO::File->new($file_name, 'r'); | |
| 30 my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh); | |
| 31 | |
| 32 my @header_lines = @{$parser->parse_header()}; | |
| 33 #do something with the header lines array, e.g. print array elements | |
| 34 | |
| 35 foreach my $header_line (@header_lines) { | |
| 36 print $header_line . "\n"; | |
| 37 } | |
| 38 print "\n\n"; | |
| 39 my $feature = $parser->parse_next_feature(); | |
| 40 | |
| 41 while (defined($feature) ) { | |
| 42 | |
| 43 my %feature = %{$feature}; | |
| 44 | |
| 45 #do something with the feature, e.g. print hash keys and values | |
| 46 foreach my $key (keys %feature) { | |
| 47 if ($key ne 'attribute') { | |
| 48 print $key . " " . $feature{$key} ."\n"; | |
| 49 } else { | |
| 50 print $key . "\n"; | |
| 51 my %attribs = %{$feature{$key}}; | |
| 52 foreach my $attrib_key (keys %attribs) { | |
| 53 printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)})); | |
| 54 | |
| 55 } | |
| 56 } | |
| 57 } | |
| 58 print "\n\n"; | |
| 59 $feature = $parser->parse_next_feature(); | |
| 60 } | |
| 61 | |
| 62 my $sequence = $parser->parse_next_sequence(); | |
| 63 | |
| 64 while (defined($sequence)) { | |
| 65 my %sequence = %{$sequence}; | |
| 66 | |
| 67 foreach my $key (keys %sequence) { | |
| 68 print $key . " " . $sequence{$key} ."\n"; | |
| 69 } | |
| 70 print "\n\n"; | |
| 71 | |
| 72 $sequence = $parser->parse_next_sequence(); | |
| 73 } | |
| 74 | |
| 75 $parser->close(); | |
| 76 | |
| 77 $fh->close(); | |
| 78 | |
| 79 | |
| 80 | |
| 81 =head1 DESCRIPTION | |
| 82 | |
| 83 GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml | |
| 84 | |
| 85 Use parse_header method to parse a GFF3 file header, and parse_next_feature to parse the next feature line in the file. | |
| 86 | |
| 87 This class can be extended to convert a feature hash into a feature object reversing | |
| 88 the processing done by GFFSerializer. | |
| 89 | |
| 90 =cut | |
| 91 | |
| 92 package Bio::EnsEMBL::Utils::IO::GFFParser; | |
| 93 use strict; | |
| 94 use warnings; | |
| 95 use Bio::EnsEMBL::Utils::Exception; | |
| 96 use IO::File; | |
| 97 use URI::Escape; | |
| 98 use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/; | |
| 99 | |
| 100 | |
| 101 my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1'); | |
| 102 | |
| 103 =head2 new | |
| 104 | |
| 105 Constructor | |
| 106 Arg [1] : File handle | |
| 107 | |
| 108 Returntype : Bio::EnsEMBL::Utils::IO::GFFParser | |
| 109 | |
| 110 =cut | |
| 111 | |
| 112 sub new { | |
| 113 my $class = shift; | |
| 114 my $self = { | |
| 115 filehandle => shift, | |
| 116 }; | |
| 117 bless $self, $class; | |
| 118 if (!defined($self->{'filehandle'})) { | |
| 119 throw("GFFParser requires a valid filehandle to a GFF3 formatted file"); | |
| 120 } | |
| 121 return $self; | |
| 122 | |
| 123 } | |
| 124 | |
| 125 =head2 parse_header | |
| 126 | |
| 127 Arg [1] : File handle | |
| 128 Description: Returns a arrayref with each header line stored in array element | |
| 129 Returntype : Arrayref of GFF3 file header lines | |
| 130 | |
| 131 =cut | |
| 132 | |
| 133 sub parse_header { | |
| 134 | |
| 135 my $self = shift; | |
| 136 | |
| 137 my $next_line; | |
| 138 my @header_lines; | |
| 139 | |
| 140 while (($next_line = $self->_read_line()) && ($next_line =~ /^[\#|\s]/) ) { | |
| 141 | |
| 142 #stop parsing features if ##FASTA directive encountered | |
| 143 last if ($next_line =~ /\#\#FASTA/ ); | |
| 144 | |
| 145 #header lines start with ## (except for the ##FASTA directive indicating sequence section) | |
| 146 if ($next_line =~ /^[\#]{2}/ ) { | |
| 147 push @header_lines, $next_line; | |
| 148 if ($next_line =~ /gff-version\s+(\d+)/) { | |
| 149 if ($1 != 3) { | |
| 150 warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files."); | |
| 151 } | |
| 152 } | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 if (defined($next_line)) { | |
| 157 $self->{'first_non_header_line'} = $next_line; | |
| 158 } | |
| 159 return \@header_lines; | |
| 160 | |
| 161 } | |
| 162 | |
| 163 =head2 parse_next_feature | |
| 164 | |
| 165 Arg [1] : File handle | |
| 166 Description: Returns a hashref in the format - | |
| 167 { | |
| 168 seqid => scalar, | |
| 169 source => scalar, | |
| 170 type => scalar, | |
| 171 start => scalar, | |
| 172 end => scalar, | |
| 173 score => scalar, | |
| 174 strand => scalar, | |
| 175 phase => scalar, | |
| 176 attribute => hashref, | |
| 177 | |
| 178 } | |
| 179 Returntype : Hashref of a GFF3 feature line | |
| 180 | |
| 181 =cut | |
| 182 | |
| 183 sub parse_next_feature { | |
| 184 | |
| 185 my $self = shift; | |
| 186 | |
| 187 my $next_line; | |
| 188 my $feature_line; | |
| 189 | |
| 190 while (($next_line = $self->_read_line() ) && defined($next_line) ) { | |
| 191 | |
| 192 #stop parsing features if ##FASTA directive | |
| 193 last if ($next_line =~ /\#\#FASTA/); | |
| 194 | |
| 195 | |
| 196 next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ || | |
| 197 $next_line =~ /^\/\//); | |
| 198 | |
| 199 $feature_line = $next_line; | |
| 200 last; | |
| 201 } | |
| 202 | |
| 203 return undef unless $feature_line; | |
| 204 | |
| 205 my %feature; | |
| 206 my %attribute; | |
| 207 | |
| 208 | |
| 209 #strip off trailing comments | |
| 210 $feature_line =~ s/\#.*//; | |
| 211 | |
| 212 my @chunks = split(/\t/, $feature_line); | |
| 213 | |
| 214 %feature = ( | |
| 215 'seqid' => uri_unescape($chunks[0]), | |
| 216 'source' => uri_unescape($chunks[1]), | |
| 217 'type' => uri_unescape($chunks[2]), | |
| 218 'start' => $chunks[3], | |
| 219 'end' => $chunks[4], | |
| 220 'score' => $chunks[5], | |
| 221 'strand' => $strand_conversion{$chunks[6]}, | |
| 222 'phase' => $chunks[7] | |
| 223 ); | |
| 224 | |
| 225 if ($chunks[8]) { | |
| 226 my @attributes = split( /;/, $chunks[8] ); | |
| 227 my %attributes; | |
| 228 foreach my $attribute (@attributes) { | |
| 229 my ( $name, $value ) = split( /=/, $attribute ); | |
| 230 $name = uri_unescape($name); | |
| 231 my @split_values = map { uri_unescape($_) } split(/,/, $value); | |
| 232 if(scalar(@split_values) > 1) { | |
| 233 $attributes{$name} = \@split_values; | |
| 234 } | |
| 235 else { | |
| 236 $attributes{$name} = $split_values[0]; | |
| 237 } | |
| 238 } | |
| 239 $feature{'attribute'} = \%attributes; | |
| 240 } | |
| 241 | |
| 242 return \%feature; | |
| 243 } | |
| 244 | |
| 245 =head2 parse_next_sequence | |
| 246 | |
| 247 Arg [1] : File handle | |
| 248 Description: Returns a hashref in the format - | |
| 249 { | |
| 250 header => scalar, | |
| 251 sequence => scalar, | |
| 252 | |
| 253 } | |
| 254 Returntype : Hashref of a GFF3 sequence line | |
| 255 | |
| 256 =cut | |
| 257 | |
| 258 sub parse_next_sequence { | |
| 259 | |
| 260 my $self = shift; | |
| 261 | |
| 262 my $next_line; | |
| 263 my $sequence; | |
| 264 my $header; | |
| 265 | |
| 266 while (($next_line = $self->_read_line() ) && defined($next_line) ) { | |
| 267 | |
| 268 next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ || | |
| 269 $next_line =~ /^\/\//); | |
| 270 | |
| 271 if ($next_line =~ /^>/) { | |
| 272 if ($header) { | |
| 273 #next fasta header encountered | |
| 274 $self->{'next_fasta_header'} = $next_line; | |
| 275 last; | |
| 276 | |
| 277 } else { | |
| 278 $header = $next_line; | |
| 279 } | |
| 280 } else { | |
| 281 $sequence .= $next_line; | |
| 282 } | |
| 283 } | |
| 284 | |
| 285 return undef unless ($sequence || $header); | |
| 286 | |
| 287 my %sequence = (header => $header , sequence => $sequence ); | |
| 288 | |
| 289 return \%sequence; | |
| 290 } | |
| 291 | |
| 292 | |
| 293 sub _read_line { | |
| 294 | |
| 295 my $self = shift; | |
| 296 my $fh = $self->{'filehandle'}; | |
| 297 | |
| 298 my $line; | |
| 299 | |
| 300 if (defined($self->{'first_non_header_line'})) { | |
| 301 $line = $self->{'first_non_header_line'}; | |
| 302 $self->{'first_non_header_line'} = undef; | |
| 303 } elsif ( defined($self->{'next_fasta_header'} )) { | |
| 304 $line = $self->{'next_fasta_header'}; | |
| 305 $self->{'next_fasta_header'} = undef; | |
| 306 } | |
| 307 else { | |
| 308 $line = <$fh>; | |
| 309 if (defined($line)) { | |
| 310 chomp $line; | |
| 311 if (!$line) { | |
| 312 #parse next line if current line is empty | |
| 313 $line = $self->_read_line(); | |
| 314 } | |
| 315 } | |
| 316 } | |
| 317 | |
| 318 return $line; | |
| 319 } | |
| 320 | |
| 321 sub close { | |
| 322 | |
| 323 my $self = shift; | |
| 324 $self->{"filehandle"} = undef; | |
| 325 | |
| 326 } | |
| 327 | |
| 328 1; |
