comparison variant_effect_predictor/Bio/Tools/GFF.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: GFF.pm,v 1.26 2002/11/24 21:35:40 jason Exp $
2 #
3 # BioPerl module for Bio::Tools::GFF
4 #
5 # Cared for by the Bioperl core team
6 #
7 # Copyright Matthew Pocock
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser
16
17 =head1 SYNOPSIS
18
19 use Bio::Tools::GFF;
20
21 # specify input via -fh or -file
22 my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2);
23 my $feature;
24 # loop over the input stream
25 while($feature = $gffio->next_feature()) {
26 # do something with feature
27 }
28 $gffio->close();
29
30 # you can also obtain a GFF parser as a SeqAnalasisParserI in
31 # HT analysis pipelines (see Bio::SeqAnalysisParserI and
32 # Bio::Factory::SeqAnalysisParserFactory)
33 my $factory = Bio::Factory::SeqAnalysisParserFactory->new();
34 my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff");
35 while($feature = $parser->next_feature()) {
36 # do something with feature
37 }
38
39 =head1 DESCRIPTION
40
41 This class provides a simple GFF parser and writer. In the sense of a
42 SeqAnalysisParser, it parses an input file or stream into SeqFeatureI
43 objects, but is not in any way specific to a particular analysis
44 program and the output that program produces.
45
46 That is, if you can get your analysis program spit out GFF, here is
47 your result parser.
48
49 =head1 FEEDBACK
50
51 =head2 Mailing Lists
52
53 User feedback is an integral part of the evolution of this and other
54 Bioperl modules. Send your comments and suggestions preferably to one
55 of the Bioperl mailing lists. Your participation is much appreciated.
56
57 bioperl-l@bioperl.org - General discussion
58 http://bio.perl.org/MailList.html - About the mailing lists
59
60 =head2 Reporting Bugs
61
62 Report bugs to the Bioperl bug tracking system to help us keep track
63 the bugs and their resolution. Bug reports can be submitted via email
64 or the web:
65
66 bioperl-bugs@bio.perl.org
67 http://bugzilla.bioperl.org/
68
69 =head1 AUTHOR - Matthew Pocock
70
71 Email mrp@sanger.ac.uk
72
73 =head1 APPENDIX
74
75 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
76
77 =cut
78
79 # Let the code begin...
80
81 package Bio::Tools::GFF;
82
83 use vars qw(@ISA);
84 use strict;
85
86 use Bio::Root::IO;
87 use Bio::SeqAnalysisParserI;
88 use Bio::SeqFeature::Generic;
89
90 @ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
91
92 =head2 new
93
94 Title : new
95 Usage :
96 Function: Creates a new instance. Recognized named parameters are -file, -fh,
97 and -gff_version.
98
99 Returns : a new object
100 Args : names parameters
101
102
103 =cut
104
105 sub new {
106 my ($class, @args) = @_;
107 my $self = $class->SUPER::new(@args);
108
109 my ($gff_version) = $self->_rearrange([qw(GFF_VERSION)],@args);
110
111 # initialize IO
112 $self->_initialize_io(@args);
113
114 $gff_version ||= 2;
115 if(($gff_version != 1) && ($gff_version != 2)) {
116 $self->throw("Can't build a GFF object with the unknown version ".
117 $gff_version);
118 }
119 $self->gff_version($gff_version);
120 return $self;
121 }
122
123 =head2 next_feature
124
125 Title : next_feature
126 Usage : $seqfeature = $gffio->next_feature();
127 Function: Returns the next feature available in the input file or stream, or
128 undef if there are no more features.
129 Example :
130 Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
131 more features.
132 Args : none
133
134 =cut
135
136 sub next_feature {
137 my ($self) = @_;
138
139 my $gff_string;
140
141 # be graceful about empty lines or comments, and make sure we return undef
142 # if the input's consumed
143 while(($gff_string = $self->_readline()) && defined($gff_string)) {
144 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ ||
145 $gff_string =~ /^\/\//);
146 last;
147 }
148 return undef unless $gff_string;
149
150 my $feat = Bio::SeqFeature::Generic->new();
151 $self->from_gff_string($feat, $gff_string);
152
153 return $feat;
154 }
155
156 =head2 from_gff_string
157
158 Title : from_gff_string
159 Usage : $gff->from_gff_string($feature, $gff_string);
160 Function: Sets properties of a SeqFeatureI object from a GFF-formatted
161 string. Interpretation of the string depends on the version
162 that has been specified at initialization.
163
164 This method is used by next_feature(). It actually dispatches to
165 one of the version-specific (private) methods.
166 Example :
167 Returns : void
168 Args : A Bio::SeqFeatureI implementing object to be initialized
169 The GFF-formatted string to initialize it from
170
171 =cut
172
173 sub from_gff_string {
174 my ($self, $feat, $gff_string) = @_;
175
176 if($self->gff_version() == 1) {
177 $self->_from_gff1_string($feat, $gff_string);
178 } else {
179 $self->_from_gff2_string($feat, $gff_string);
180 }
181 }
182
183 =head2 _from_gff1_string
184
185 Title : _from_gff1_string
186 Usage :
187 Function:
188 Example :
189 Returns : void
190 Args : A Bio::SeqFeatureI implementing object to be initialized
191 The GFF-formatted string to initialize it from
192
193 =cut
194
195 sub _from_gff1_string {
196 my ($gff, $feat, $string) = @_;
197 chomp $string;
198 my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @group) = split(/\t/, $string);
199
200 if ( !defined $frame ) {
201 $feat->throw("[$string] does not look like GFF to me");
202 }
203 $frame = 0 unless( $frame =~ /^\d+$/);
204 $feat->seq_id($seqname);
205 $feat->source_tag($source);
206 $feat->primary_tag($primary);
207 $feat->start($start);
208 $feat->end($end);
209 $feat->frame($frame);
210 if ( $score eq '.' ) {
211 #$feat->score(undef);
212 } else {
213 $feat->score($score);
214 }
215 if ( $strand eq '-' ) { $feat->strand(-1); }
216 if ( $strand eq '+' ) { $feat->strand(1); }
217 if ( $strand eq '.' ) { $feat->strand(0); }
218 foreach my $g ( @group ) {
219 if ( $g =~ /(\S+)=(\S+)/ ) {
220 my $tag = $1;
221 my $value = $2;
222 $feat->add_tag_value($1, $2);
223 } else {
224 $feat->add_tag_value('group', $g);
225 }
226 }
227 }
228
229 =head2 _from_gff2_string
230
231 Title : _from_gff2_string
232 Usage :
233 Function:
234 Example :
235 Returns : void
236 Args : A Bio::SeqFeatureI implementing object to be initialized
237 The GFF2-formatted string to initialize it from
238
239
240 =cut
241
242 sub _from_gff2_string {
243 my ($gff, $feat, $string) = @_;
244 chomp($string);
245 # according to the Sanger website, GFF2 should be single-tab separated elements, and the
246 # free-text at the end should contain text-translated tab symbols but no "real" tabs,
247 # so splitting on \t is safe, and $attribs gets the entire attributes field to be parsed later
248 my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string);
249 my $attribs = join '', @attribs; # just in case the rule against tab characters has been broken
250 if ( !defined $frame ) {
251 $feat->throw("[$string] does not look like GFF2 to me");
252 }
253 $feat->seq_id($seqname);
254 $feat->source_tag($source);
255 $feat->primary_tag($primary);
256 $feat->start($start);
257 $feat->end($end);
258 $feat->frame($frame);
259 if ( $score eq '.' ) {
260 #$feat->score(undef);
261 } else {
262 $feat->score($score);
263 }
264 if ( $strand eq '-' ) { $feat->strand(-1); }
265 if ( $strand eq '+' ) { $feat->strand(1); }
266 if ( $strand eq '.' ) { $feat->strand(0); }
267
268
269 # <Begin Inefficient Code from Mark Wilkinson>
270 # this routine is necessay to allow the presence of semicolons in
271 # quoted text Semicolons are the delimiting character for new
272 # tag/value attributes. it is more or less a "state" machine, with
273 # the "quoted" flag going up and down as we pass thorugh quotes to
274 # distinguish free-text semicolon and hash symbols from GFF control
275 # characters
276
277
278 my $flag = 0; # this could be changed to a bit and just be twiddled
279 my @parsed;
280
281 # run through each character one at a time and check it
282 # NOTE: changed to foreach loop which is more efficient in perl
283 # --jasons
284
285 foreach my $a ( split //, $attribs ) {
286 # flag up on entering quoted text, down on leaving it
287 if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 }
288 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"}
289 elsif( $a eq '#' && ! $flag ) { last }
290 push @parsed, $a;
291 }
292 $attribs = join "", @parsed; # rejoin into a single string
293
294 # <End Inefficient Code>
295 # Please feel free to fix this and make it more "perlish"
296
297 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited
298
299 foreach my $pair ( @key_vals ) {
300 # replace semicolons that were removed from free-text above.
301 $pair =~ s/INSERT_SEMICOLON_HERE/;/g;
302
303 # separate the key from the value
304 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair;
305
306
307 if( defined $values ) {
308 my @values;
309 # free text is quoted, so match each free-text block
310 # and remove it from the $values string
311 while ($values =~ s/"(.*?)"//){
312 # and push it on to the list of values (tags may have
313 # more than one value... and the value may be undef)
314 push @values, $1;
315 }
316
317 # and what is left over should be space-separated
318 # non-free-text values
319
320 my @othervals = split /\s+/, $values;
321 foreach my $othervalue(@othervals){
322 # get rid of any empty strings which might
323 # result from the split
324 if (CORE::length($othervalue) > 0) {push @values, $othervalue}
325 }
326
327 foreach my $value(@values){
328 $feat->add_tag_value($key, $value);
329 }
330 }
331 }
332 }
333
334 =head2 write_feature
335
336 Title : write_feature
337 Usage : $gffio->write_feature($feature);
338 Function: Writes the specified SeqFeatureI object in GFF format to the stream
339 associated with this instance.
340 Returns : none
341 Args : An array of Bio::SeqFeatureI implementing objects to be serialized
342
343 =cut
344
345 sub write_feature {
346 my ($self, @features) = @_;
347 foreach my $feature ( @features ) {
348 $self->_print($self->gff_string($feature)."\n");
349 }
350 }
351
352 =head2 gff_string
353
354 Title : gff_string
355 Usage : $gffstr = $gffio->gff_string($feature);
356 Function: Obtain the GFF-formatted representation of a SeqFeatureI object.
357 The formatting depends on the version specified at initialization.
358
359 This method is used by write_feature(). It actually dispatches to
360 one of the version-specific (private) methods.
361 Example :
362 Returns : A GFF-formatted string representation of the SeqFeature
363 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
364
365 =cut
366
367 sub gff_string{
368 my ($self, $feature) = @_;
369
370 if($self->gff_version() == 1) {
371 return $self->_gff1_string($feature);
372 } else {
373 return $self->_gff2_string($feature);
374 }
375 }
376
377 =head2 _gff1_string
378
379 Title : _gff1_string
380 Usage : $gffstr = $gffio->_gff1_string
381 Function:
382 Example :
383 Returns : A GFF1-formatted string representation of the SeqFeature
384 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified
385
386 =cut
387
388 sub _gff1_string{
389 my ($gff, $feat) = @_;
390 my ($str,$score,$frame,$name,$strand);
391
392 if( $feat->can('score') ) {
393 $score = $feat->score();
394 }
395 $score = '.' unless defined $score;
396
397 if( $feat->can('frame') ) {
398 $frame = $feat->frame();
399 }
400 $frame = '.' unless defined $frame;
401
402 $strand = $feat->strand();
403 if(! $strand) {
404 $strand = ".";
405 } elsif( $strand == 1 ) {
406 $strand = '+';
407 } elsif ( $feat->strand == -1 ) {
408 $strand = '-';
409 }
410
411 if( $feat->can('seqname') ) {
412 $name = $feat->seq_id();
413 $name ||= 'SEQ';
414 } else {
415 $name = 'SEQ';
416 }
417
418
419 $str = join("\t",
420 $name,
421 $feat->source_tag(),
422 $feat->primary_tag(),
423 $feat->start(),
424 $feat->end(),
425 $score,
426 $strand,
427 $frame);
428
429 foreach my $tag ( $feat->all_tags ) {
430 foreach my $value ( $feat->each_tag_value($tag) ) {
431 $str .= " $tag=$value";
432 }
433 }
434
435
436 return $str;
437 }
438
439 =head2 _gff2_string
440
441 Title : _gff2_string
442 Usage : $gffstr = $gffio->_gff2_string
443 Function:
444 Example :
445 Returns : A GFF2-formatted string representation of the SeqFeature
446 Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified
447
448 =cut
449
450 sub _gff2_string{
451 my ($gff, $feat) = @_;
452 my ($str,$score,$frame,$name,$strand);
453
454 if( $feat->can('score') ) {
455 $score = $feat->score();
456 }
457 $score = '.' unless defined $score;
458
459 if( $feat->can('frame') ) {
460 $frame = $feat->frame();
461 }
462 $frame = '.' unless defined $frame;
463
464 $strand = $feat->strand();
465 if(! $strand) {
466 $strand = ".";
467 } elsif( $strand == 1 ) {
468 $strand = '+';
469 } elsif ( $feat->strand == -1 ) {
470 $strand = '-';
471 }
472
473 if( $feat->can('seqname') ) {
474 $name = $feat->seq_id();
475 $name ||= 'SEQ';
476 } else {
477 $name = 'SEQ';
478 }
479 $str = join("\t",
480 $name,
481 $feat->source_tag(),
482 $feat->primary_tag(),
483 $feat->start(),
484 $feat->end(),
485 $score,
486 $strand,
487 $frame);
488
489 # the routine below is the only modification I made to the original
490 # ->gff_string routine (above) as on November 17th, 2000, the
491 # Sanger webpage describing GFF2 format reads: "From version 2
492 # onwards, the attribute field must have a tag value structure
493 # following the syntax used within objects in a .ace file,
494 # flattened onto one line by semicolon separators. Tags must be
495 # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values
496 # must be quoted with double quotes".
497
498 # MW
499
500 my $valuestr;
501 my @all_tags = $feat->all_tags;
502 if (@all_tags) { # only play this game if it is worth playing...
503 $str .= "\t"; # my interpretation of the GFF2
504 # specification suggests the need
505 # for this additional TAB character...??
506 foreach my $tag ( @all_tags ) {
507 my $valuestr; # a string which will hold one or more values
508 # for this tag, with quoted free text and
509 # space-separated individual values.
510 foreach my $value ( $feat->each_tag_value($tag) ) {
511 if ($value =~ /[^A-Za-z0-9_]/){
512 $value =~ s/\t/\\t/g; # substitute tab and newline
513 # characters
514 $value =~ s/\n/\\n/g; # to their UNIX equivalents
515 $value = '"' . $value . '" '} # if the value contains
516 # anything other than valid
517 # tag/value characters, then
518 # quote it
519 $value = "\"\"" unless defined $value;
520 # if it is completely empty,
521 # then just make empty double
522 # quotes
523 $valuestr .= $value . " "; # with a trailing space in case
524 # there are multiple values
525 # for this tag (allowed in GFF2 and .ace format)
526 }
527 $str .= "$tag $valuestr ; "; # semicolon delimited with no '=' sign
528 }
529 chop $str; chop $str # remove the trailing semicolon and space
530 }
531 return $str;
532 }
533
534 =head2 gff_version
535
536 Title : _gff_version
537 Usage : $gffversion = $gffio->gff_version
538 Function:
539 Example :
540 Returns : The GFF version this parser will accept and emit.
541 Args : none
542
543 =cut
544
545 sub gff_version {
546 my ($self, $value) = @_;
547 if(defined $value && (($value == 1) || ($value == 2))) {
548 $self->{'GFF_VERSION'} = $value;
549 }
550 return $self->{'GFF_VERSION'};
551 }
552
553 # Make filehandles
554
555 =head2 newFh
556
557 Title : newFh
558 Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
559 Function: does a new() followed by an fh()
560 Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format')
561 $feature = <$fh>; # read a feature object
562 print $fh $feature ; # write a feature object
563 Returns : filehandle tied to the Bio::Tools::GFF class
564 Args :
565
566 =cut
567
568 sub newFh {
569 my $class = shift;
570 return unless my $self = $class->new(@_);
571 return $self->fh;
572 }
573
574 =head2 fh
575
576 Title : fh
577 Usage : $obj->fh
578 Function:
579 Example : $fh = $obj->fh; # make a tied filehandle
580 $feature = <$fh>; # read a feature object
581 print $fh $feature; # write a feature object
582 Returns : filehandle tied to Bio::Tools::GFF class
583 Args : none
584
585 =cut
586
587
588 sub fh {
589 my $self = shift;
590 my $class = ref($self) || $self;
591 my $s = Symbol::gensym;
592 tie $$s,$class,$self;
593 return $s;
594 }
595
596 sub DESTROY {
597 my $self = shift;
598
599 $self->close();
600 }
601
602 sub TIEHANDLE {
603 my ($class,$val) = @_;
604 return bless {'gffio' => $val}, $class;
605 }
606
607 sub READLINE {
608 my $self = shift;
609 return $self->{'gffio'}->next_feature() unless wantarray;
610 my (@list, $obj);
611 push @list, $obj while $obj = $self->{'gffio'}->next_feature();
612 return @list;
613 }
614
615 sub PRINT {
616 my $self = shift;
617 $self->{'gffio'}->write_feature(@_);
618 }
619
620 1;
621