Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/Tools/GFF.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: GFF.pm,v 1.26 2002/11/24 21:35:40 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::Tools::GFF | |
4 # | |
5 # Cared for by the Bioperl core team | |
6 # | |
7 # Copyright Matthew Pocock | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 use Bio::Tools::GFF; | |
20 | |
21 # specify input via -fh or -file | |
22 my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2); | |
23 my $feature; | |
24 # loop over the input stream | |
25 while($feature = $gffio->next_feature()) { | |
26 # do something with feature | |
27 } | |
28 $gffio->close(); | |
29 | |
30 # you can also obtain a GFF parser as a SeqAnalasisParserI in | |
31 # HT analysis pipelines (see Bio::SeqAnalysisParserI and | |
32 # Bio::Factory::SeqAnalysisParserFactory) | |
33 my $factory = Bio::Factory::SeqAnalysisParserFactory->new(); | |
34 my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff"); | |
35 while($feature = $parser->next_feature()) { | |
36 # do something with feature | |
37 } | |
38 | |
39 =head1 DESCRIPTION | |
40 | |
41 This class provides a simple GFF parser and writer. In the sense of a | |
42 SeqAnalysisParser, it parses an input file or stream into SeqFeatureI | |
43 objects, but is not in any way specific to a particular analysis | |
44 program and the output that program produces. | |
45 | |
46 That is, if you can get your analysis program spit out GFF, here is | |
47 your result parser. | |
48 | |
49 =head1 FEEDBACK | |
50 | |
51 =head2 Mailing Lists | |
52 | |
53 User feedback is an integral part of the evolution of this and other | |
54 Bioperl modules. Send your comments and suggestions preferably to one | |
55 of the Bioperl mailing lists. Your participation is much appreciated. | |
56 | |
57 bioperl-l@bioperl.org - General discussion | |
58 http://bio.perl.org/MailList.html - About the mailing lists | |
59 | |
60 =head2 Reporting Bugs | |
61 | |
62 Report bugs to the Bioperl bug tracking system to help us keep track | |
63 the bugs and their resolution. Bug reports can be submitted via email | |
64 or the web: | |
65 | |
66 bioperl-bugs@bio.perl.org | |
67 http://bugzilla.bioperl.org/ | |
68 | |
69 =head1 AUTHOR - Matthew Pocock | |
70 | |
71 Email mrp@sanger.ac.uk | |
72 | |
73 =head1 APPENDIX | |
74 | |
75 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ | |
76 | |
77 =cut | |
78 | |
79 # Let the code begin... | |
80 | |
81 package Bio::Tools::GFF; | |
82 | |
83 use vars qw(@ISA); | |
84 use strict; | |
85 | |
86 use Bio::Root::IO; | |
87 use Bio::SeqAnalysisParserI; | |
88 use Bio::SeqFeature::Generic; | |
89 | |
90 @ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); | |
91 | |
92 =head2 new | |
93 | |
94 Title : new | |
95 Usage : | |
96 Function: Creates a new instance. Recognized named parameters are -file, -fh, | |
97 and -gff_version. | |
98 | |
99 Returns : a new object | |
100 Args : names parameters | |
101 | |
102 | |
103 =cut | |
104 | |
105 sub new { | |
106 my ($class, @args) = @_; | |
107 my $self = $class->SUPER::new(@args); | |
108 | |
109 my ($gff_version) = $self->_rearrange([qw(GFF_VERSION)],@args); | |
110 | |
111 # initialize IO | |
112 $self->_initialize_io(@args); | |
113 | |
114 $gff_version ||= 2; | |
115 if(($gff_version != 1) && ($gff_version != 2)) { | |
116 $self->throw("Can't build a GFF object with the unknown version ". | |
117 $gff_version); | |
118 } | |
119 $self->gff_version($gff_version); | |
120 return $self; | |
121 } | |
122 | |
123 =head2 next_feature | |
124 | |
125 Title : next_feature | |
126 Usage : $seqfeature = $gffio->next_feature(); | |
127 Function: Returns the next feature available in the input file or stream, or | |
128 undef if there are no more features. | |
129 Example : | |
130 Returns : A Bio::SeqFeatureI implementing object, or undef if there are no | |
131 more features. | |
132 Args : none | |
133 | |
134 =cut | |
135 | |
136 sub next_feature { | |
137 my ($self) = @_; | |
138 | |
139 my $gff_string; | |
140 | |
141 # be graceful about empty lines or comments, and make sure we return undef | |
142 # if the input's consumed | |
143 while(($gff_string = $self->_readline()) && defined($gff_string)) { | |
144 next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ || | |
145 $gff_string =~ /^\/\//); | |
146 last; | |
147 } | |
148 return undef unless $gff_string; | |
149 | |
150 my $feat = Bio::SeqFeature::Generic->new(); | |
151 $self->from_gff_string($feat, $gff_string); | |
152 | |
153 return $feat; | |
154 } | |
155 | |
156 =head2 from_gff_string | |
157 | |
158 Title : from_gff_string | |
159 Usage : $gff->from_gff_string($feature, $gff_string); | |
160 Function: Sets properties of a SeqFeatureI object from a GFF-formatted | |
161 string. Interpretation of the string depends on the version | |
162 that has been specified at initialization. | |
163 | |
164 This method is used by next_feature(). It actually dispatches to | |
165 one of the version-specific (private) methods. | |
166 Example : | |
167 Returns : void | |
168 Args : A Bio::SeqFeatureI implementing object to be initialized | |
169 The GFF-formatted string to initialize it from | |
170 | |
171 =cut | |
172 | |
173 sub from_gff_string { | |
174 my ($self, $feat, $gff_string) = @_; | |
175 | |
176 if($self->gff_version() == 1) { | |
177 $self->_from_gff1_string($feat, $gff_string); | |
178 } else { | |
179 $self->_from_gff2_string($feat, $gff_string); | |
180 } | |
181 } | |
182 | |
183 =head2 _from_gff1_string | |
184 | |
185 Title : _from_gff1_string | |
186 Usage : | |
187 Function: | |
188 Example : | |
189 Returns : void | |
190 Args : A Bio::SeqFeatureI implementing object to be initialized | |
191 The GFF-formatted string to initialize it from | |
192 | |
193 =cut | |
194 | |
195 sub _from_gff1_string { | |
196 my ($gff, $feat, $string) = @_; | |
197 chomp $string; | |
198 my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @group) = split(/\t/, $string); | |
199 | |
200 if ( !defined $frame ) { | |
201 $feat->throw("[$string] does not look like GFF to me"); | |
202 } | |
203 $frame = 0 unless( $frame =~ /^\d+$/); | |
204 $feat->seq_id($seqname); | |
205 $feat->source_tag($source); | |
206 $feat->primary_tag($primary); | |
207 $feat->start($start); | |
208 $feat->end($end); | |
209 $feat->frame($frame); | |
210 if ( $score eq '.' ) { | |
211 #$feat->score(undef); | |
212 } else { | |
213 $feat->score($score); | |
214 } | |
215 if ( $strand eq '-' ) { $feat->strand(-1); } | |
216 if ( $strand eq '+' ) { $feat->strand(1); } | |
217 if ( $strand eq '.' ) { $feat->strand(0); } | |
218 foreach my $g ( @group ) { | |
219 if ( $g =~ /(\S+)=(\S+)/ ) { | |
220 my $tag = $1; | |
221 my $value = $2; | |
222 $feat->add_tag_value($1, $2); | |
223 } else { | |
224 $feat->add_tag_value('group', $g); | |
225 } | |
226 } | |
227 } | |
228 | |
229 =head2 _from_gff2_string | |
230 | |
231 Title : _from_gff2_string | |
232 Usage : | |
233 Function: | |
234 Example : | |
235 Returns : void | |
236 Args : A Bio::SeqFeatureI implementing object to be initialized | |
237 The GFF2-formatted string to initialize it from | |
238 | |
239 | |
240 =cut | |
241 | |
242 sub _from_gff2_string { | |
243 my ($gff, $feat, $string) = @_; | |
244 chomp($string); | |
245 # according to the Sanger website, GFF2 should be single-tab separated elements, and the | |
246 # free-text at the end should contain text-translated tab symbols but no "real" tabs, | |
247 # so splitting on \t is safe, and $attribs gets the entire attributes field to be parsed later | |
248 my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string); | |
249 my $attribs = join '', @attribs; # just in case the rule against tab characters has been broken | |
250 if ( !defined $frame ) { | |
251 $feat->throw("[$string] does not look like GFF2 to me"); | |
252 } | |
253 $feat->seq_id($seqname); | |
254 $feat->source_tag($source); | |
255 $feat->primary_tag($primary); | |
256 $feat->start($start); | |
257 $feat->end($end); | |
258 $feat->frame($frame); | |
259 if ( $score eq '.' ) { | |
260 #$feat->score(undef); | |
261 } else { | |
262 $feat->score($score); | |
263 } | |
264 if ( $strand eq '-' ) { $feat->strand(-1); } | |
265 if ( $strand eq '+' ) { $feat->strand(1); } | |
266 if ( $strand eq '.' ) { $feat->strand(0); } | |
267 | |
268 | |
269 # <Begin Inefficient Code from Mark Wilkinson> | |
270 # this routine is necessay to allow the presence of semicolons in | |
271 # quoted text Semicolons are the delimiting character for new | |
272 # tag/value attributes. it is more or less a "state" machine, with | |
273 # the "quoted" flag going up and down as we pass thorugh quotes to | |
274 # distinguish free-text semicolon and hash symbols from GFF control | |
275 # characters | |
276 | |
277 | |
278 my $flag = 0; # this could be changed to a bit and just be twiddled | |
279 my @parsed; | |
280 | |
281 # run through each character one at a time and check it | |
282 # NOTE: changed to foreach loop which is more efficient in perl | |
283 # --jasons | |
284 | |
285 foreach my $a ( split //, $attribs ) { | |
286 # flag up on entering quoted text, down on leaving it | |
287 if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 } | |
288 elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"} | |
289 elsif( $a eq '#' && ! $flag ) { last } | |
290 push @parsed, $a; | |
291 } | |
292 $attribs = join "", @parsed; # rejoin into a single string | |
293 | |
294 # <End Inefficient Code> | |
295 # Please feel free to fix this and make it more "perlish" | |
296 | |
297 my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited | |
298 | |
299 foreach my $pair ( @key_vals ) { | |
300 # replace semicolons that were removed from free-text above. | |
301 $pair =~ s/INSERT_SEMICOLON_HERE/;/g; | |
302 | |
303 # separate the key from the value | |
304 my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair; | |
305 | |
306 | |
307 if( defined $values ) { | |
308 my @values; | |
309 # free text is quoted, so match each free-text block | |
310 # and remove it from the $values string | |
311 while ($values =~ s/"(.*?)"//){ | |
312 # and push it on to the list of values (tags may have | |
313 # more than one value... and the value may be undef) | |
314 push @values, $1; | |
315 } | |
316 | |
317 # and what is left over should be space-separated | |
318 # non-free-text values | |
319 | |
320 my @othervals = split /\s+/, $values; | |
321 foreach my $othervalue(@othervals){ | |
322 # get rid of any empty strings which might | |
323 # result from the split | |
324 if (CORE::length($othervalue) > 0) {push @values, $othervalue} | |
325 } | |
326 | |
327 foreach my $value(@values){ | |
328 $feat->add_tag_value($key, $value); | |
329 } | |
330 } | |
331 } | |
332 } | |
333 | |
334 =head2 write_feature | |
335 | |
336 Title : write_feature | |
337 Usage : $gffio->write_feature($feature); | |
338 Function: Writes the specified SeqFeatureI object in GFF format to the stream | |
339 associated with this instance. | |
340 Returns : none | |
341 Args : An array of Bio::SeqFeatureI implementing objects to be serialized | |
342 | |
343 =cut | |
344 | |
345 sub write_feature { | |
346 my ($self, @features) = @_; | |
347 foreach my $feature ( @features ) { | |
348 $self->_print($self->gff_string($feature)."\n"); | |
349 } | |
350 } | |
351 | |
352 =head2 gff_string | |
353 | |
354 Title : gff_string | |
355 Usage : $gffstr = $gffio->gff_string($feature); | |
356 Function: Obtain the GFF-formatted representation of a SeqFeatureI object. | |
357 The formatting depends on the version specified at initialization. | |
358 | |
359 This method is used by write_feature(). It actually dispatches to | |
360 one of the version-specific (private) methods. | |
361 Example : | |
362 Returns : A GFF-formatted string representation of the SeqFeature | |
363 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified | |
364 | |
365 =cut | |
366 | |
367 sub gff_string{ | |
368 my ($self, $feature) = @_; | |
369 | |
370 if($self->gff_version() == 1) { | |
371 return $self->_gff1_string($feature); | |
372 } else { | |
373 return $self->_gff2_string($feature); | |
374 } | |
375 } | |
376 | |
377 =head2 _gff1_string | |
378 | |
379 Title : _gff1_string | |
380 Usage : $gffstr = $gffio->_gff1_string | |
381 Function: | |
382 Example : | |
383 Returns : A GFF1-formatted string representation of the SeqFeature | |
384 Args : A Bio::SeqFeatureI implementing object to be GFF-stringified | |
385 | |
386 =cut | |
387 | |
388 sub _gff1_string{ | |
389 my ($gff, $feat) = @_; | |
390 my ($str,$score,$frame,$name,$strand); | |
391 | |
392 if( $feat->can('score') ) { | |
393 $score = $feat->score(); | |
394 } | |
395 $score = '.' unless defined $score; | |
396 | |
397 if( $feat->can('frame') ) { | |
398 $frame = $feat->frame(); | |
399 } | |
400 $frame = '.' unless defined $frame; | |
401 | |
402 $strand = $feat->strand(); | |
403 if(! $strand) { | |
404 $strand = "."; | |
405 } elsif( $strand == 1 ) { | |
406 $strand = '+'; | |
407 } elsif ( $feat->strand == -1 ) { | |
408 $strand = '-'; | |
409 } | |
410 | |
411 if( $feat->can('seqname') ) { | |
412 $name = $feat->seq_id(); | |
413 $name ||= 'SEQ'; | |
414 } else { | |
415 $name = 'SEQ'; | |
416 } | |
417 | |
418 | |
419 $str = join("\t", | |
420 $name, | |
421 $feat->source_tag(), | |
422 $feat->primary_tag(), | |
423 $feat->start(), | |
424 $feat->end(), | |
425 $score, | |
426 $strand, | |
427 $frame); | |
428 | |
429 foreach my $tag ( $feat->all_tags ) { | |
430 foreach my $value ( $feat->each_tag_value($tag) ) { | |
431 $str .= " $tag=$value"; | |
432 } | |
433 } | |
434 | |
435 | |
436 return $str; | |
437 } | |
438 | |
439 =head2 _gff2_string | |
440 | |
441 Title : _gff2_string | |
442 Usage : $gffstr = $gffio->_gff2_string | |
443 Function: | |
444 Example : | |
445 Returns : A GFF2-formatted string representation of the SeqFeature | |
446 Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified | |
447 | |
448 =cut | |
449 | |
450 sub _gff2_string{ | |
451 my ($gff, $feat) = @_; | |
452 my ($str,$score,$frame,$name,$strand); | |
453 | |
454 if( $feat->can('score') ) { | |
455 $score = $feat->score(); | |
456 } | |
457 $score = '.' unless defined $score; | |
458 | |
459 if( $feat->can('frame') ) { | |
460 $frame = $feat->frame(); | |
461 } | |
462 $frame = '.' unless defined $frame; | |
463 | |
464 $strand = $feat->strand(); | |
465 if(! $strand) { | |
466 $strand = "."; | |
467 } elsif( $strand == 1 ) { | |
468 $strand = '+'; | |
469 } elsif ( $feat->strand == -1 ) { | |
470 $strand = '-'; | |
471 } | |
472 | |
473 if( $feat->can('seqname') ) { | |
474 $name = $feat->seq_id(); | |
475 $name ||= 'SEQ'; | |
476 } else { | |
477 $name = 'SEQ'; | |
478 } | |
479 $str = join("\t", | |
480 $name, | |
481 $feat->source_tag(), | |
482 $feat->primary_tag(), | |
483 $feat->start(), | |
484 $feat->end(), | |
485 $score, | |
486 $strand, | |
487 $frame); | |
488 | |
489 # the routine below is the only modification I made to the original | |
490 # ->gff_string routine (above) as on November 17th, 2000, the | |
491 # Sanger webpage describing GFF2 format reads: "From version 2 | |
492 # onwards, the attribute field must have a tag value structure | |
493 # following the syntax used within objects in a .ace file, | |
494 # flattened onto one line by semicolon separators. Tags must be | |
495 # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values | |
496 # must be quoted with double quotes". | |
497 | |
498 # MW | |
499 | |
500 my $valuestr; | |
501 my @all_tags = $feat->all_tags; | |
502 if (@all_tags) { # only play this game if it is worth playing... | |
503 $str .= "\t"; # my interpretation of the GFF2 | |
504 # specification suggests the need | |
505 # for this additional TAB character...?? | |
506 foreach my $tag ( @all_tags ) { | |
507 my $valuestr; # a string which will hold one or more values | |
508 # for this tag, with quoted free text and | |
509 # space-separated individual values. | |
510 foreach my $value ( $feat->each_tag_value($tag) ) { | |
511 if ($value =~ /[^A-Za-z0-9_]/){ | |
512 $value =~ s/\t/\\t/g; # substitute tab and newline | |
513 # characters | |
514 $value =~ s/\n/\\n/g; # to their UNIX equivalents | |
515 $value = '"' . $value . '" '} # if the value contains | |
516 # anything other than valid | |
517 # tag/value characters, then | |
518 # quote it | |
519 $value = "\"\"" unless defined $value; | |
520 # if it is completely empty, | |
521 # then just make empty double | |
522 # quotes | |
523 $valuestr .= $value . " "; # with a trailing space in case | |
524 # there are multiple values | |
525 # for this tag (allowed in GFF2 and .ace format) | |
526 } | |
527 $str .= "$tag $valuestr ; "; # semicolon delimited with no '=' sign | |
528 } | |
529 chop $str; chop $str # remove the trailing semicolon and space | |
530 } | |
531 return $str; | |
532 } | |
533 | |
534 =head2 gff_version | |
535 | |
536 Title : _gff_version | |
537 Usage : $gffversion = $gffio->gff_version | |
538 Function: | |
539 Example : | |
540 Returns : The GFF version this parser will accept and emit. | |
541 Args : none | |
542 | |
543 =cut | |
544 | |
545 sub gff_version { | |
546 my ($self, $value) = @_; | |
547 if(defined $value && (($value == 1) || ($value == 2))) { | |
548 $self->{'GFF_VERSION'} = $value; | |
549 } | |
550 return $self->{'GFF_VERSION'}; | |
551 } | |
552 | |
553 # Make filehandles | |
554 | |
555 =head2 newFh | |
556 | |
557 Title : newFh | |
558 Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') | |
559 Function: does a new() followed by an fh() | |
560 Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') | |
561 $feature = <$fh>; # read a feature object | |
562 print $fh $feature ; # write a feature object | |
563 Returns : filehandle tied to the Bio::Tools::GFF class | |
564 Args : | |
565 | |
566 =cut | |
567 | |
568 sub newFh { | |
569 my $class = shift; | |
570 return unless my $self = $class->new(@_); | |
571 return $self->fh; | |
572 } | |
573 | |
574 =head2 fh | |
575 | |
576 Title : fh | |
577 Usage : $obj->fh | |
578 Function: | |
579 Example : $fh = $obj->fh; # make a tied filehandle | |
580 $feature = <$fh>; # read a feature object | |
581 print $fh $feature; # write a feature object | |
582 Returns : filehandle tied to Bio::Tools::GFF class | |
583 Args : none | |
584 | |
585 =cut | |
586 | |
587 | |
588 sub fh { | |
589 my $self = shift; | |
590 my $class = ref($self) || $self; | |
591 my $s = Symbol::gensym; | |
592 tie $$s,$class,$self; | |
593 return $s; | |
594 } | |
595 | |
596 sub DESTROY { | |
597 my $self = shift; | |
598 | |
599 $self->close(); | |
600 } | |
601 | |
602 sub TIEHANDLE { | |
603 my ($class,$val) = @_; | |
604 return bless {'gffio' => $val}, $class; | |
605 } | |
606 | |
607 sub READLINE { | |
608 my $self = shift; | |
609 return $self->{'gffio'}->next_feature() unless wantarray; | |
610 my (@list, $obj); | |
611 push @list, $obj while $obj = $self->{'gffio'}->next_feature(); | |
612 return @list; | |
613 } | |
614 | |
615 sub PRINT { | |
616 my $self = shift; | |
617 $self->{'gffio'}->write_feature(@_); | |
618 } | |
619 | |
620 1; | |
621 |