comparison variant_effect_predictor/Bio/OntologyIO/dagflat.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: dagflat.pm,v 1.2.2.6 2003/06/30 05:04:06 lapp Exp $
2 #
3 # BioPerl module for Bio::OntologyIO::dagflat
4 #
5 # Cared for by Hilmar Lapp, hlapp at gmx.net
6 #
7 # (c) Christian M. Zmasek, czmasek@gnf.org, 2002.
8 # (c) Hilmar Lapp, hlapp at gmx.net, 2003.
9 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
10 #
11 # You may distribute this module under the same terms as perl itself.
12 # Refer to the Perl Artistic License (see the license accompanying this
13 # software package, or see http://www.perl.com/language/misc/Artistic.html)
14 # for the terms under which you may use, modify, and redistribute this module.
15 #
16 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
17 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
18 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
19 #
20 # You may distribute this module under the same terms as perl itself
21
22 # POD documentation - main docs before the code
23
24 =head1 NAME
25
26 dagflat - a base class parser for GO flat-file type formats
27
28 =head1 SYNOPSIS
29
30 use Bio::OntologyIO;
31
32 # do not use directly -- use via Bio::OntologyIO
33 # e.g., the GO parser is a simple extension of this class
34 my $parser = Bio::OntologyIO->new
35 ( -format => "go",
36 -defs_file => "/home/czmasek/GO/GO.defs",
37 -files => ["/home/czmasek/GO/component.ontology",
38 "/home/czmasek/GO/function.ontology",
39 "/home/czmasek/GO/process.ontology"] );
40
41 my $go_ontology = $parser->next_ontology();
42
43 my $IS_A = Bio::Ontology::RelationshipType->get_instance( "IS_A" );
44 my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" );
45
46 =head1 DESCRIPTION
47
48 Needs Graph.pm from CPAN.
49
50 =head1 FEEDBACK
51
52 =head2 Mailing Lists
53
54 User feedback is an integral part of the evolution of this and other
55 Bioperl modules. Send your comments and suggestions preferably to the
56 Bioperl mailing lists Your participation is much appreciated.
57
58 bioperl-l@bioperl.org - General discussion
59 http://bio.perl.org/MailList.html - About the mailing lists
60
61 =head2 Reporting Bugs
62
63 report bugs to the Bioperl bug tracking system to help us keep track
64 the bugs and their resolution. Bug reports can be submitted via
65 email or the web:
66
67 bioperl-bugs@bio.perl.org
68 http://bugzilla.bioperl.org/
69
70 =head1 AUTHOR
71
72 Christian M. Zmasek
73
74 Email: czmasek@gnf.org or cmzmasek@yahoo.com
75
76 WWW: http://www.genetics.wustl.edu/eddy/people/zmasek/
77
78 Address:
79
80 Genomics Institute of the Novartis Research Foundation
81 10675 John Jay Hopkins Drive
82 San Diego, CA 92121
83
84 =head2 CONTRIBUTOR
85
86 Hilmar Lapp, hlapp at gmx.net
87
88 =head1 APPENDIX
89
90 The rest of the documentation details each of the object
91 methods. Internal methods are usually preceded with a _
92
93 =cut
94
95
96 # Let the code begin...
97
98
99 package Bio::OntologyIO::dagflat;
100
101 use vars qw( @ISA );
102 use strict;
103
104 use Bio::Root::IO;
105 use Bio::Ontology::SimpleGOEngine;
106 use Bio::Ontology::Ontology;
107 use Bio::Ontology::TermFactory;
108 use Bio::OntologyIO;
109
110 use constant TRUE => 1;
111 use constant FALSE => 0;
112
113
114 @ISA = qw( Bio::OntologyIO );
115
116
117 =head2 new
118
119 Title : new
120 Usage : $parser = Bio::OntologyIO->new(
121 -format => "go",
122 -defs_file => "/path/to/GO.defs",
123 -files => ["/path/to/component.ontology",
124 "/path/to/function.ontology",
125 "/path/to/process.ontology"] );
126 Function: Creates a new dagflat parser.
127 Returns : A new dagflat parser object, implementing L<Bio::OntologyIO>.
128 Args : -defs_file => the name of the file holding the term
129 definitions
130 -files => a single ontology flat file holding the
131 term relationships, or an array ref holding
132 the file names (for GO, there will usually be
133 3 files: component.ontology, function.ontology,
134 process.ontology)
135 -file => if there is only a single flat file, it may
136 also be specified via the -file parameter
137 -ontology_name => the name of the ontology; if not specified the
138 parser will auto-discover it by using the term
139 that starts with a '$', and converting underscores
140 to spaces
141 -engine => the Bio::Ontology::OntologyEngineI object
142 to be reused (will be created otherwise); note
143 that every Bio::Ontology::OntologyI will
144 qualify as well since that one inherits from the
145 former.
146
147 =cut
148
149 # in reality, we let OntologyIO::new do the instantiation, and override
150 # _initialize for all initialization work
151 sub _initialize {
152 my ($self, @args) = @_;
153
154 $self->SUPER::_initialize( @args );
155
156 my ( $defs_file_name,$files,$name,$eng ) =
157 $self->_rearrange([qw( DEFS_FILE
158 FILES
159 ONTOLOGY_NAME
160 ENGINE)
161 ],
162 @args );
163
164 $self->_done( FALSE );
165 $self->_not_first_record( FALSE );
166 $self->_term( "" );
167 delete $self->{'_ontologies'};
168
169 # ontology engine (and possibly name if it's an OntologyI)
170 $eng = Bio::Ontology::SimpleGOEngine->new() unless $eng;
171 if($eng->isa("Bio::Ontology::OntologyI")) {
172 $self->ontology_name($eng->name());
173 $eng = $eng->engine() if $eng->can('engine');
174 }
175 $self->_ont_engine($eng);
176
177 # flat files to parse
178 $self->defs_file( $defs_file_name );
179 $self->{_flat_files} = $files ? ref($files) ? $files : [$files] : [];
180
181 # ontology name (overrides implicit one through OntologyI engine)
182 $self->ontology_name($name) if $name;
183
184 } # _initialize
185
186 =head2 ontology_name
187
188 Title : ontology_name
189 Usage : $obj->ontology_name($newval)
190 Function: Get/set the name of the ontology parsed by this module.
191 Example :
192 Returns : value of ontology_name (a scalar)
193 Args : on set, new value (a scalar or undef, optional)
194
195
196 =cut
197
198 sub ontology_name{
199 my $self = shift;
200
201 return $self->{'ontology_name'} = shift if @_;
202 return $self->{'ontology_name'};
203 }
204
205
206 =head2 parse
207
208 Title : parse()
209 Usage : $parser->parse();
210 Function: Parses the files set with "new" or with methods
211 defs_file and _flat_files.
212
213 Normally you should not need to call this method as it will
214 be called automatically upon the first call to
215 next_ontology().
216
217 Returns : [Bio::Ontology::OntologyEngineI]
218 Args :
219
220 =cut
221
222 sub parse {
223 my $self = shift;
224
225 # setup the default term factory if not done by anyone yet
226 $self->term_factory(Bio::Ontology::TermFactory->new(
227 -type => "Bio::Ontology::Term"))
228 unless $self->term_factory();
229
230 # create the ontology object itself
231 my $ont = Bio::Ontology::Ontology->new(-name => $self->ontology_name(),
232 -engine => $self->_ont_engine());
233
234 # parse definitions
235 while( my $term = $self->_next_term() ) {
236 $self->_add_term( $term, $ont );
237 }
238
239 # set up the ontology of the relationship types
240 foreach ($self->_part_of_relationship(), $self->_is_a_relationship()) {
241 $_->ontology($ont);
242 }
243
244 # pre-seed the IO system with the first flat file if -file wasn't provided
245 if(! $self->_fh) {
246 $self->_initialize_io(-file => shift(@{$self->_flat_files()}));
247 }
248
249 while($self->_fh) {
250 $self->_parse_flat_file($ont);
251 # advance to next flat file if more are available
252 if(@{$self->_flat_files()}) {
253 $self->close();
254 $self->_initialize_io(-file => shift(@{$self->_flat_files()}));
255 } else {
256 last; # nothing else to parse so terminate the loop
257 }
258 }
259 $self->_add_ontology($ont);
260
261 # not needed anywhere, only because of backward compatibility
262 return $self->_ont_engine();
263
264 } # parse
265
266 =head2 next_ontology
267
268 Title : next_ontology
269 Usage :
270 Function: Get the next available ontology from the parser. This is the
271 method prescribed by Bio::OntologyIO.
272 Example :
273 Returns : An object implementing Bio::Ontology::OntologyI, and undef if
274 there is no more ontology in the input.
275 Args :
276
277
278 =cut
279
280 sub next_ontology{
281 my $self = shift;
282
283 # parse if not done already
284 $self->parse() unless exists($self->{'_ontologies'});
285 # return next available ontology
286 return shift(@{$self->{'_ontologies'}}) if exists($self->{'_ontologies'});
287 return undef;
288 }
289
290
291 =head2 defs_file
292
293 Title : defs_file
294 Usage : $parser->defs_file( "GO.defs" );
295 Function: Set/get for the term definitions filename.
296 Returns : The term definitions file name [string].
297 Args : On set, the term definitions file name [string] (optional).
298
299 =cut
300
301 sub defs_file {
302 my $self = shift;
303
304 if ( @_ ) {
305 my $f = shift;
306 $self->{ "_defs_file_name" } = $f;
307 $self->_defs_io->close() if $self->_defs_io();
308 if(defined($f)) {
309 $self->_defs_io( Bio::Root::IO->new( -input => $f ) );
310 }
311 }
312 return $self->{ "_defs_file_name" };
313 } # defs_file
314
315
316 =head2 _flat_files
317
318 Title : _flat_files
319 Usage : $files_to_parse = $parser->_flat_files();
320 Function: Get the array of ontology flat files that need to be parsed.
321
322 Note that this array will decrease in elements over the
323 parsing process. Therefore, it's value outside of this
324 module will be limited. Also, be careful not to alter the
325 array unless you know what you are doing.
326
327 Returns : a reference to an array of zero or more strings
328 Args : none
329
330 =cut
331
332 sub _flat_files {
333 my $self = shift;
334
335 $self->{_flat_files} = [] unless exists($self->{_flat_files});
336 return $self->{_flat_files};
337 }
338
339
340 # INTERNAL METHODS
341 # ----------------
342
343 =head2 _defs_io
344
345 Title : _defs_io
346 Usage : $obj->_defs_io($newval)
347 Function: Get/set the Bio::Root::IO instance representing the
348 definition file, if provided (see defs_file()).
349 Example :
350 Returns : value of _defs_io (a Bio::Root::IO object)
351 Args : on set, new value (a Bio::Root::IO object or undef, optional)
352
353
354 =cut
355
356 sub _defs_io{
357 my $self = shift;
358
359 return $self->{'_defs_io'} = shift if @_;
360 return $self->{'_defs_io'};
361 }
362
363 sub _add_ontology {
364 my $self = shift;
365 $self->{'_ontologies'} = [] unless exists($self->{'_ontologies'});
366 foreach my $ont (@_) {
367 $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI")
368 unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
369 # the ontology name may have been auto-discovered while parsing
370 # the file
371 $ont->name($self->ontology_name) unless $ont->name();
372 push(@{$self->{'_ontologies'}}, $ont);
373 }
374 }
375
376 # This simply delegates. See SimpleGOEngine.
377 sub _add_term {
378 my ( $self, $term, $ont ) = @_;
379
380 $term->ontology($ont) if $ont && (! $term->ontology);
381 $self->_ont_engine()->add_term( $term );
382 } # _add_term
383
384
385
386 # This simply delegates. See SimpleGOEngine
387 sub _part_of_relationship {
388 my $self = shift;
389
390 return $self->_ont_engine()->part_of_relationship(@_);
391 } # _part_of_relationship
392
393
394
395 # This simply delegates. See SimpleGOEngine
396 sub _is_a_relationship {
397 my $self = shift;
398
399 return $self->_ont_engine()->is_a_relationship(@_);
400 } # _is_a_relationship
401
402
403
404 # This simply delegates. See SimpleGOEngine
405 sub _add_relationship {
406 my ( $self, $parent, $child, $type, $ont ) = @_;
407
408 # note the triple terminology (subject,predicate,object) corresponds to
409 # (child,type,parent)
410 $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
411
412
413 } # _add_relationship
414
415
416 # This simply delegates. See SimpleGOEngine
417 sub _has_term {
418 my $self = shift;
419
420 return $self->_ont_engine()->has_term( @_ );
421 } # _add_term
422
423
424
425 # This parses the relationships files
426 sub _parse_flat_file {
427 my $self = shift;
428 my $ont = shift;
429
430 my @stack = ();
431 my $prev_spaces = -1;
432 my $prev_term = "";
433
434 while( my $line = $self->_readline() ) {
435
436 if ( $line =~ /^!/ ) {
437 next;
438 }
439
440 my $current_term = $self->_get_first_termid( $line );
441 my @isa_parents = $self->_get_isa_termids( $line );
442 my @partof_parents = $self->_get_partof_termids( $line );
443 my @syns = $self->_get_synonyms( $line );
444 my @sec_go_ids = $self->_get_secondary_termids( $line );
445 my @cross_refs = $self->_get_db_cross_refs( $line );
446
447
448 if ( ! $self->_has_term( $current_term ) ) {
449 my $term =$self->_create_ont_entry($self->_get_name($line,
450 $current_term),
451 $current_term );
452 $self->_add_term( $term, $ont );
453 }
454
455 my $current_term_object = $self->_ont_engine()->get_terms( $current_term );
456
457 $current_term_object->add_dblink( @cross_refs );
458 $current_term_object->add_secondary_id( @sec_go_ids );
459 $current_term_object->add_synonym( @syns );
460 unless ( $line =~ /^\$/ ) {
461 $current_term_object->ontology( $ont );
462 }
463 foreach my $parent ( @isa_parents ) {
464 if ( ! $self->_has_term( $parent ) ) {
465 my $term = $self->_create_ont_entry($self->_get_name($line,
466 $parent),
467 $parent );
468 $self->_add_term( $term, $ont );
469 }
470
471 $self->_add_relationship( $parent,
472 $current_term,
473 $self->_is_a_relationship(),
474 $ont);
475
476 }
477 foreach my $parent ( @partof_parents ) {
478 if ( ! $self->_has_term( $parent ) ) {
479 my $term = $self->_create_ont_entry($self->_get_name($line,
480 $parent),
481 $parent );
482 $self->_add_term( $term, $ont );
483 }
484
485 $self->_add_relationship( $parent,
486 $current_term,
487 $self->_part_of_relationship(),
488 $ont);
489 }
490
491 my $current_spaces = $self->_count_spaces( $line );
492
493 if ( $current_spaces != $prev_spaces ) {
494
495 if ( $current_spaces == $prev_spaces + 1 ) {
496 push( @stack, $prev_term );
497 }
498 elsif ( $current_spaces < $prev_spaces ) {
499 my $n = $prev_spaces - $current_spaces;
500 for ( my $i = 0; $i < $n; ++$i ) {
501 pop( @stack );
502 }
503 }
504 else {
505 $self->throw( "format error (file ".$self->file.")" );
506 }
507 }
508
509 my $parent = $stack[ @stack - 1 ];
510
511 # add a relationship if the line isn't the one with the root term
512 # of the ontology (which is also the name of the ontology)
513 if ( index($line,'$') != 0 ) {
514 if ( $line !~ /^\s*[<%]/ ) {
515 $self->throw( "format error (file ".$self->file.")" );
516 }
517 my $reltype = ($line =~ /^\s*</) ?
518 $self->_part_of_relationship() :
519 $self->_is_a_relationship();
520 $self->_add_relationship( $parent, $current_term, $reltype,
521 $ont);
522 }
523
524 $prev_spaces = $current_spaces;
525
526 $prev_term = $current_term;
527
528 }
529 return $ont;
530 } # _parse_relationships_file
531
532
533
534 # Parses the 1st term id number out of line.
535 sub _get_first_termid {
536 my ( $self, $line ) = @_;
537
538 if ( $line =~ /;\s*([A-Z]{1,8}:\d{3,})/ ) {
539 return $1;
540 }
541 else {
542 $self->throw( "format error: no term id in line \"$line\"" );
543 }
544
545 } # _get_first_termid
546
547
548
549 # Parses the name out of line.
550 sub _get_name {
551 my ( $self, $line, $termid ) = @_;
552
553 if ( $line =~ /([^;<%]+);\s*$termid/ ) {
554 my $name = $1;
555 # remove trailing and leading whitespace
556 $name =~ s/\s+$//;
557 $name =~ s/^\s+//;
558 # remove leading dollar character; also we default the name of the
559 # ontology to this name if preset to something else
560 if(index($name,'$') == 0) {
561 $name = substr($name,1);
562 # replace underscores by spaces for setting the ontology name
563 $self->ontology_name(join(" ",split(/_/,$name)))
564 unless $self->ontology_name();
565 }
566 return $name;
567 }
568 else {
569 return undef;
570 }
571 } # _get_name
572
573
574 # Parses the synonyms out of line.
575 sub _get_synonyms {
576 my ( $self, $line ) = @_;
577
578 my @synonyms = ();
579
580 while ( $line =~ /synonym\s*:\s*([^;^<^%]+)/g ) {
581 my $syn = $1;
582 $syn =~ s/\s+$//;
583 $syn =~ s/^\s+//;
584 push( @synonyms, $syn );
585 }
586 return @synonyms;
587
588 } # _get_synonyms
589
590
591
592 # Parses the db cross refs out of line.
593 sub _get_db_cross_refs {
594 my ( $self, $line ) = @_;
595
596 my @refs = ();
597
598 while ( $line =~ /;([^;^<^%^:]+:[^;^<^%^:]+)/g ) {
599 my $ref = $1;
600 if ( $ref =~ /synonym/ || $ref =~ /[A-Z]{1,8}:\d{3,}/ ) {
601 next;
602 }
603 $ref =~ s/\s+$//;
604 $ref =~ s/^\s+//;
605 push( @refs, $ref );
606 }
607 return @refs;
608
609 }
610
611
612 # Parses the secondary go ids out of a line
613 sub _get_secondary_termids {
614 my ( $self, $line ) = @_;
615 my @secs = ();
616
617 while ( $line =~ /,\s*([A-Z]{1,8}:\d{3,})/g ) {
618 my $sec = $1;
619 push( @secs, $sec );
620 }
621 return @secs;
622
623 } # _get_secondary_termids
624
625
626
627 # Parses the is a ids out of a line
628 sub _get_isa_termids {
629 my ( $self, $line ) = @_;
630
631 my @ids = ();
632
633 $line =~ s/[A-Z]{1,8}:\d{3,}//;
634
635 while ( $line =~ /%[^<^,]*?([A-Z]{1,8}:\d{3,})/g ) {
636 push( @ids, $1 );
637 }
638 return @ids;
639 } # _get_isa_termids
640
641
642
643 # Parses the part of ids out of a line
644 sub _get_partof_termids {
645 my ( $self, $line ) = @_;
646
647 my @ids = ();
648
649 $line =~ s/[A-Z]{1,8}:\d{3,}//;
650
651 while ( $line =~ /<[^%^,]*?([A-Z]{1,8}:\d{3,})/g ) {
652 push( @ids, $1 );
653 }
654 return @ids;
655
656
657 } # _get_partof_termids
658
659
660
661
662 # Counts the spaces at the beginning of a line in the relationships files
663 sub _count_spaces {
664 my ( $self, $line ) = @_;
665
666 if ( $line =~ /^(\s+)/ ) {
667 return length( $1 );
668 }
669 else {
670 return 0;
671 }
672 } # _count_spaces
673
674
675
676
677 # "next" method for parsing the defintions file
678 sub _next_term {
679 my ( $self ) = @_;
680
681 if ( ($self->_done() == TRUE) || (! $self->_defs_io())) {
682 return undef;
683 }
684
685 my $line = "";
686 my $termid = "";
687 my $next_term = $self->_term();
688 my $def = "";
689 my $comment = "";
690 my @def_refs = ();
691 my $isobsolete;
692
693 while( $line = ( $self->_defs_io->_readline() ) ) {
694
695 if ( $line !~ /\S/
696 || $line =~ /^\s*!/ ) {
697 next;
698 }
699 elsif ( $line =~ /^\s*term:\s*(.+)/ ) {
700 $self->_term( $1 );
701 last if $self->_not_first_record();
702 $next_term = $1;
703 $self->_not_first_record( TRUE );
704 }
705 elsif ( $line =~ /^\s*[a-z]{1,8}id:\s*(.+)/ ) {
706 $termid = $1;
707 }
708 elsif ( $line =~ /^\s*definition:\s*(.+)/ ) {
709 $def = $1;
710 $isobsolete = 1 if index($def,"OBSOLETE") == 0;
711 }
712 elsif ( $line =~ /^\s*definition_reference:\s*(.+)/ ) {
713 push( @def_refs, $1 );
714 }
715 elsif ( $line =~ /^\s*comment:\s*(.+)/ ) {
716 $comment = $1;
717 }
718 }
719 $self->_done( TRUE ) unless $line; # we'll come back until done
720
721 return $self->_create_ont_entry( $next_term, $termid, $def,
722 $comment, \@def_refs, $isobsolete);
723 } # _next_term
724
725
726
727
728
729 # Holds the GO engine to be parsed into
730 sub _ont_engine {
731 my ( $self, $value ) = @_;
732
733 if ( defined $value ) {
734 $self->{ "_ont_engine" } = $value;
735 }
736
737 return $self->{ "_ont_engine" };
738 } # _ont_engine
739
740
741
742
743 # Used to create ontology terms.
744 # Arguments: name, id
745 sub _create_ont_entry {
746 my ( $self, $name, $termid, $def, $cmt, $dbxrefs, $obsolete ) = @_;
747
748 if((!defined($obsolete)) && (index(lc($name),"obsolete") == 0)) {
749 $obsolete = 1;
750 }
751 my $term = $self->term_factory->create_object(-name => $name,
752 -identifier => $termid,
753 -definition => $def,
754 -comment => $cmt,
755 -dblinks => $dbxrefs,
756 -is_obsolete => $obsolete);
757
758 return $term;
759 } # _create_ont_entry
760
761
762
763 # Holds whether first record or not
764 sub _not_first_record {
765 my ( $self, $value ) = @_;
766
767 if ( defined $value ) {
768 $self->{ "_not_first_record" } = $value;
769 }
770
771 return $self->{ "_not_first_record" };
772 } # _not_first_record
773
774
775
776 # Holds whether done or not
777 sub _done {
778 my ( $self, $value ) = @_;
779
780 if ( defined $value ) {
781 $self->{ "_done" } = $value;
782 }
783
784 return $self->{ "_done" };
785 } # _done
786
787
788 # Holds a term.
789 sub _term {
790 my ( $self, $value ) = @_;
791
792 if ( defined $value ) {
793 $self->{ "_term" } = $value;
794 }
795
796 return $self->{ "_term" };
797 } # _term
798
799