Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/OntologyIO/dagflat.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/OntologyIO/dagflat.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,799 @@
+# $Id: dagflat.pm,v 1.2.2.6 2003/06/30 05:04:06 lapp Exp $
+#
+# BioPerl module for Bio::OntologyIO::dagflat
+#
+# Cared for by Hilmar Lapp, hlapp at gmx.net
+#
+# (c) Christian M. Zmasek, czmasek@gnf.org, 2002.
+# (c) Hilmar Lapp, hlapp at gmx.net, 2003.
+# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
+#
+# You may distribute this module under the same terms as perl itself.
+# Refer to the Perl Artistic License (see the license accompanying this
+# software package, or see http://www.perl.com/language/misc/Artistic.html)
+# for the terms under which you may use, modify, and redistribute this module.
+#
+# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+dagflat - a base class parser for GO flat-file type formats 
+
+=head1 SYNOPSIS
+
+  use Bio::OntologyIO;
+
+  # do not use directly -- use via Bio::OntologyIO
+  # e.g., the GO parser is a simple extension of this class
+  my $parser = Bio::OntologyIO->new
+	( -format       => "go",
+          -defs_file    => "/home/czmasek/GO/GO.defs",
+	  -files        => ["/home/czmasek/GO/component.ontology",
+	                    "/home/czmasek/GO/function.ontology",
+	                    "/home/czmasek/GO/process.ontology"] );
+
+  my $go_ontology = $parser->next_ontology();
+
+  my $IS_A    = Bio::Ontology::RelationshipType->get_instance( "IS_A" );
+  my $PART_OF = Bio::Ontology::RelationshipType->get_instance( "PART_OF" );
+
+=head1 DESCRIPTION
+
+Needs Graph.pm from CPAN.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to the 
+Bioperl mailing lists  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                         - General discussion
+  http://bio.perl.org/MailList.html             - About the mailing lists
+
+=head2 Reporting Bugs
+
+report bugs to the Bioperl bug tracking system to help us keep track
+ the bugs and their resolution.  Bug reports can be submitted via
+ email or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR
+
+Christian M. Zmasek
+
+Email: czmasek@gnf.org  or  cmzmasek@yahoo.com
+
+WWW:   http://www.genetics.wustl.edu/eddy/people/zmasek/
+
+Address: 
+
+  Genomics Institute of the Novartis Research Foundation
+  10675 John Jay Hopkins Drive
+  San Diego, CA 92121
+
+=head2 CONTRIBUTOR
+
+ Hilmar Lapp, hlapp at gmx.net
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package  Bio::OntologyIO::dagflat;
+
+use vars qw( @ISA );
+use strict;
+
+use Bio::Root::IO;
+use Bio::Ontology::SimpleGOEngine;
+use Bio::Ontology::Ontology;
+use Bio::Ontology::TermFactory;
+use Bio::OntologyIO;
+
+use constant TRUE         => 1;
+use constant FALSE        => 0;
+
+
+@ISA = qw( Bio::OntologyIO );
+
+
+=head2 new
+
+ Title   : new
+ Usage   : $parser = Bio::OntologyIO->new(
+                             -format => "go",
+                             -defs_file => "/path/to/GO.defs",
+                             -files => ["/path/to/component.ontology",
+                                        "/path/to/function.ontology",
+                                        "/path/to/process.ontology"] );
+ Function: Creates a new dagflat parser.
+ Returns : A new dagflat parser object, implementing L<Bio::OntologyIO>.
+ Args    : -defs_file  => the name of the file holding the term
+                          definitions
+           -files      => a single ontology flat file holding the
+                          term relationships, or an array ref holding
+                          the file names (for GO, there will usually be
+                          3 files: component.ontology, function.ontology,
+                          process.ontology)
+           -file       => if there is only a single flat file, it may
+                          also be specified via the -file parameter
+           -ontology_name => the name of the ontology; if not specified the
+                          parser will auto-discover it by using the term
+                          that starts with a '$', and converting underscores
+                          to spaces
+           -engine     => the Bio::Ontology::OntologyEngineI object
+                          to be reused (will be created otherwise); note
+                          that every Bio::Ontology::OntologyI will
+                          qualify as well since that one inherits from the
+                          former.
+
+=cut
+
+# in reality, we let OntologyIO::new do the instantiation, and override
+# _initialize for all initialization work
+sub _initialize {
+    my ($self, @args) = @_;
+    
+    $self->SUPER::_initialize( @args );
+
+    my ( $defs_file_name,$files,$name,$eng ) =
+	$self->_rearrange([qw( DEFS_FILE
+			       FILES
+			       ONTOLOGY_NAME
+			       ENGINE)
+			   ],
+			  @args );
+    
+    $self->_done( FALSE );
+    $self->_not_first_record( FALSE );
+    $self->_term( "" );
+    delete $self->{'_ontologies'};
+
+    # ontology engine (and possibly name if it's an OntologyI)
+    $eng = Bio::Ontology::SimpleGOEngine->new() unless $eng;
+    if($eng->isa("Bio::Ontology::OntologyI")) {
+	$self->ontology_name($eng->name());
+	$eng = $eng->engine() if $eng->can('engine');
+    }
+    $self->_ont_engine($eng);
+    
+    # flat files to parse
+    $self->defs_file( $defs_file_name );
+    $self->{_flat_files} = $files ? ref($files) ? $files : [$files] : [];
+
+    # ontology name (overrides implicit one through OntologyI engine)
+    $self->ontology_name($name) if $name;
+
+} # _initialize
+
+=head2 ontology_name
+
+ Title   : ontology_name
+ Usage   : $obj->ontology_name($newval)
+ Function: Get/set the name of the ontology parsed by this module. 
+ Example : 
+ Returns : value of ontology_name (a scalar)
+ Args    : on set, new value (a scalar or undef, optional)
+
+
+=cut
+
+sub ontology_name{
+    my $self = shift;
+
+    return $self->{'ontology_name'} = shift if @_;
+    return $self->{'ontology_name'};
+}
+
+
+=head2 parse
+
+ Title   : parse()
+ Usage   : $parser->parse();   
+ Function: Parses the files set with "new" or with methods
+           defs_file and _flat_files.
+
+           Normally you should not need to call this method as it will
+           be called automatically upon the first call to
+           next_ontology().
+
+ Returns : [Bio::Ontology::OntologyEngineI]
+ Args    :
+
+=cut
+
+sub parse {
+    my $self = shift;
+    
+    # setup the default term factory if not done by anyone yet
+    $self->term_factory(Bio::Ontology::TermFactory->new(
+					     -type => "Bio::Ontology::Term"))
+	unless $self->term_factory();
+
+    # create the ontology object itself
+    my $ont = Bio::Ontology::Ontology->new(-name => $self->ontology_name(),
+					   -engine => $self->_ont_engine());
+
+    # parse definitions
+    while( my $term = $self->_next_term() ) {
+        $self->_add_term( $term, $ont );
+    }
+
+    # set up the ontology of the relationship types
+    foreach ($self->_part_of_relationship(), $self->_is_a_relationship()) {
+	$_->ontology($ont);
+    }
+
+    # pre-seed the IO system with the first flat file if -file wasn't provided
+    if(! $self->_fh) {
+	$self->_initialize_io(-file => shift(@{$self->_flat_files()}));
+    }
+
+    while($self->_fh) {
+	$self->_parse_flat_file($ont);
+	# advance to next flat file if more are available
+	if(@{$self->_flat_files()}) {
+	    $self->close();
+	    $self->_initialize_io(-file => shift(@{$self->_flat_files()}));
+	} else {
+	    last; # nothing else to parse so terminate the loop
+	}
+    }
+    $self->_add_ontology($ont);
+    
+    # not needed anywhere, only because of backward compatibility
+    return $self->_ont_engine();
+    
+} # parse
+
+=head2 next_ontology
+
+ Title   : next_ontology
+ Usage   :
+ Function: Get the next available ontology from the parser. This is the
+           method prescribed by Bio::OntologyIO.
+ Example :
+ Returns : An object implementing Bio::Ontology::OntologyI, and undef if
+           there is no more ontology in the input.
+ Args    :
+
+
+=cut
+
+sub next_ontology{
+    my $self = shift;
+
+    # parse if not done already
+    $self->parse() unless exists($self->{'_ontologies'});
+    # return next available ontology
+    return shift(@{$self->{'_ontologies'}}) if exists($self->{'_ontologies'});
+    return undef;
+}
+
+
+=head2 defs_file
+
+ Title   : defs_file
+ Usage   : $parser->defs_file( "GO.defs" );
+ Function: Set/get for the term definitions filename.
+ Returns : The term definitions file name [string].
+ Args    : On set, the term definitions file name [string] (optional).
+
+=cut
+
+sub defs_file {
+    my $self = shift;
+
+    if ( @_ ) {
+	my $f = shift;
+        $self->{ "_defs_file_name" } = $f;
+	$self->_defs_io->close() if $self->_defs_io();
+	if(defined($f)) {
+            $self->_defs_io( Bio::Root::IO->new( -input => $f ) );
+        }
+    }
+    return $self->{ "_defs_file_name" };
+} # defs_file
+
+
+=head2 _flat_files
+
+ Title   : _flat_files
+ Usage   : $files_to_parse = $parser->_flat_files();
+ Function: Get the array of ontology flat files that need to be parsed.
+
+           Note that this array will decrease in elements over the
+           parsing process. Therefore, it's value outside of this
+           module will be limited. Also, be careful not to alter the
+           array unless you know what you are doing.
+
+ Returns : a reference to an array of zero or more strings
+ Args    : none
+
+=cut
+
+sub _flat_files {
+    my $self = shift;
+
+    $self->{_flat_files} = [] unless exists($self->{_flat_files});
+    return $self->{_flat_files};
+}
+
+
+# INTERNAL METHODS
+# ----------------
+
+=head2 _defs_io
+
+ Title   : _defs_io
+ Usage   : $obj->_defs_io($newval)
+ Function: Get/set the Bio::Root::IO instance representing the
+           definition file, if provided (see defs_file()).
+ Example : 
+ Returns : value of _defs_io (a Bio::Root::IO object)
+ Args    : on set, new value (a Bio::Root::IO object or undef, optional)
+
+
+=cut
+
+sub _defs_io{
+    my $self = shift;
+
+    return $self->{'_defs_io'} = shift if @_;
+    return $self->{'_defs_io'};
+}
+
+sub _add_ontology {
+    my $self = shift;
+    $self->{'_ontologies'} = [] unless exists($self->{'_ontologies'});
+    foreach my $ont (@_) {
+	$self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI")
+	    unless ref($ont) && $ont->isa("Bio::Ontology::OntologyI");
+	# the ontology name may have been auto-discovered while parsing
+	# the file
+	$ont->name($self->ontology_name) unless $ont->name();
+	push(@{$self->{'_ontologies'}}, $ont);
+    }
+}
+
+# This simply delegates. See SimpleGOEngine.
+sub _add_term {
+    my ( $self, $term, $ont ) = @_;
+
+    $term->ontology($ont) if $ont && (! $term->ontology);
+    $self->_ont_engine()->add_term( $term );
+} # _add_term 
+
+
+
+# This simply delegates. See SimpleGOEngine
+sub _part_of_relationship {
+    my $self = shift;
+
+    return $self->_ont_engine()->part_of_relationship(@_);
+} # _part_of_relationship 
+
+
+
+# This simply delegates. See SimpleGOEngine
+sub _is_a_relationship {
+    my $self = shift;
+
+    return $self->_ont_engine()->is_a_relationship(@_);
+} # _is_a_relationship 
+
+
+
+# This simply delegates. See SimpleGOEngine
+sub _add_relationship {
+    my ( $self, $parent, $child, $type, $ont ) = @_;
+
+    # note the triple terminology (subject,predicate,object) corresponds to
+    # (child,type,parent)
+    $self->_ont_engine()->add_relationship( $child, $type, $parent, $ont );
+
+
+} # _add_relationship
+
+
+# This simply delegates. See SimpleGOEngine
+sub _has_term {
+    my $self = shift;
+
+    return $self->_ont_engine()->has_term( @_ );
+} # _add_term
+
+
+
+# This parses the relationships files
+sub _parse_flat_file {
+    my $self = shift;
+    my $ont  = shift;
+    
+    my @stack       = ();
+    my $prev_spaces = -1;
+    my $prev_term   = "";
+    
+    while( my $line = $self->_readline() ) {
+        
+        if ( $line =~ /^!/ ) {
+            next;
+        }
+        
+        my $current_term   = $self->_get_first_termid( $line );
+        my @isa_parents    = $self->_get_isa_termids( $line );
+        my @partof_parents = $self->_get_partof_termids( $line );
+        my @syns           = $self->_get_synonyms( $line );
+        my @sec_go_ids     = $self->_get_secondary_termids( $line );
+        my @cross_refs     = $self->_get_db_cross_refs( $line );
+        
+        
+        if ( ! $self->_has_term( $current_term ) ) {
+            my $term =$self->_create_ont_entry($self->_get_name($line,
+								$current_term),
+					       $current_term );
+            $self->_add_term( $term, $ont );
+        }
+        
+        my $current_term_object = $self->_ont_engine()->get_terms( $current_term );
+        
+        $current_term_object->add_dblink( @cross_refs );
+        $current_term_object->add_secondary_id( @sec_go_ids );
+        $current_term_object->add_synonym( @syns );
+        unless ( $line =~ /^\$/ ) {
+            $current_term_object->ontology( $ont );
+        }
+        foreach my $parent ( @isa_parents ) {
+            if ( ! $self->_has_term( $parent ) ) {
+                my $term = $self->_create_ont_entry($self->_get_name($line,
+								     $parent),
+						    $parent );
+                $self->_add_term( $term, $ont );
+            }
+            
+            $self->_add_relationship( $parent,
+                                      $current_term,
+                                      $self->_is_a_relationship(),
+				      $ont);
+             
+        }
+        foreach my $parent ( @partof_parents ) {
+            if ( ! $self->_has_term( $parent ) ) {
+                my $term = $self->_create_ont_entry($self->_get_name($line,
+								     $parent),
+						    $parent );
+                $self->_add_term( $term, $ont );
+            }
+           
+            $self->_add_relationship( $parent,
+                                      $current_term,
+                                      $self->_part_of_relationship(),
+				      $ont);
+        }
+        
+        my $current_spaces = $self->_count_spaces( $line );
+        
+        if ( $current_spaces != $prev_spaces  ) {
+       
+            if ( $current_spaces == $prev_spaces + 1 ) {
+                push( @stack, $prev_term ); 
+            }
+            elsif ( $current_spaces < $prev_spaces ) {
+                my $n = $prev_spaces -  $current_spaces;
+                for ( my $i = 0; $i < $n; ++$i ) {
+                    pop( @stack );
+                }
+            }
+            else {
+		$self->throw( "format error (file ".$self->file.")" );
+            } 
+        }
+        
+        my $parent = $stack[ @stack - 1 ];
+        
+        # add a relationship if the line isn't the one with the root term
+	# of the ontology (which is also the name of the ontology)
+        if ( index($line,'$') != 0 ) {
+	    if ( $line !~ /^\s*[<%]/ ) {
+		$self->throw( "format error (file ".$self->file.")" );
+	    }
+	    my $reltype = ($line =~ /^\s*</) ?
+		$self->_part_of_relationship() :
+		$self->_is_a_relationship();
+            $self->_add_relationship( $parent, $current_term, $reltype,
+				      $ont);
+        }
+        
+        $prev_spaces = $current_spaces;
+        
+        $prev_term = $current_term;
+        
+    } 
+    return $ont;
+} # _parse_relationships_file
+
+
+
+# Parses the 1st term id number out of line.
+sub _get_first_termid {
+    my ( $self, $line ) = @_;
+    
+    if ( $line =~ /;\s*([A-Z]{1,8}:\d{3,})/ ) {
+        return $1;
+    }
+    else {
+        $self->throw( "format error: no term id in line \"$line\"" );
+    }
+    
+} # _get_first_termid
+
+
+
+# Parses the name out of line.
+sub _get_name {
+    my ( $self, $line, $termid ) = @_;
+    
+    if ( $line =~ /([^;<%]+);\s*$termid/ ) {
+        my $name = $1;
+	# remove trailing and leading whitespace
+        $name =~ s/\s+$//;
+        $name =~ s/^\s+//;
+	# remove leading dollar character; also we default the name of the
+	# ontology to this name if preset to something else
+	if(index($name,'$') == 0) {
+	    $name = substr($name,1);
+	    # replace underscores by spaces for setting the ontology name
+	    $self->ontology_name(join(" ",split(/_/,$name))) 
+		unless $self->ontology_name();
+	}
+        return $name;
+    }
+    else {
+        return undef;
+    }
+} # _get_name   
+
+
+# Parses the synonyms out of line.
+sub _get_synonyms {
+    my ( $self, $line ) = @_;
+   
+    my @synonyms = ();
+   
+    while ( $line =~ /synonym\s*:\s*([^;^<^%]+)/g ) {
+        my $syn = $1;
+        $syn =~ s/\s+$//;
+        $syn =~ s/^\s+//;
+        push( @synonyms, $syn );
+    }
+    return @synonyms;
+    
+} # _get_synonyms
+
+
+
+# Parses the db cross refs out of line.
+sub _get_db_cross_refs {
+    my ( $self, $line ) = @_;
+   
+    my @refs = ();
+   
+    while ( $line =~ /;([^;^<^%^:]+:[^;^<^%^:]+)/g ) {
+        my $ref = $1;
+        if ( $ref =~ /synonym/ || $ref =~ /[A-Z]{1,8}:\d{3,}/ ) {
+            next;
+        }
+        $ref =~ s/\s+$//;
+        $ref =~ s/^\s+//;
+        push( @refs, $ref );
+    }
+    return @refs;
+    
+} 
+
+
+# Parses the secondary go ids out of a line
+sub _get_secondary_termids {
+    my ( $self, $line ) = @_;
+    my @secs = ();
+   
+    while ( $line =~ /,\s*([A-Z]{1,8}:\d{3,})/g ) {
+        my $sec = $1;
+        push( @secs, $sec );
+    }
+    return @secs;
+    
+} # _get_secondary_termids 
+
+
+
+# Parses the is a ids out of a line
+sub _get_isa_termids {
+    my ( $self, $line ) = @_;
+    
+    my @ids = ();
+    
+    $line =~ s/[A-Z]{1,8}:\d{3,}//;
+    
+    while ( $line =~ /%[^<^,]*?([A-Z]{1,8}:\d{3,})/g ) {
+        push( @ids, $1 );
+    }
+    return @ids; 
+} # _get_isa_termids
+
+
+
+# Parses the part of ids out of a line
+sub _get_partof_termids {
+    my ( $self, $line ) = @_;
+    
+    my @ids = ();
+    
+    $line =~ s/[A-Z]{1,8}:\d{3,}//;
+    
+    while ( $line =~ /<[^%^,]*?([A-Z]{1,8}:\d{3,})/g ) {
+        push( @ids, $1 );
+    }
+    return @ids; 
+    
+    
+} # _get_partof_termids
+
+
+
+
+# Counts the spaces at the beginning of a line in the relationships files
+sub _count_spaces {
+    my ( $self, $line ) = @_;
+     
+    if ( $line =~ /^(\s+)/ ) {
+         return length( $1 );
+    }
+    else {
+         return 0;
+    }
+} # _count_spaces
+
+
+
+
+# "next" method for parsing the defintions file
+sub _next_term {
+    my ( $self ) = @_;
+
+    if ( ($self->_done() == TRUE) || (! $self->_defs_io())) {
+        return undef;
+    }
+    
+    my $line      = "";
+    my $termid    = "";
+    my $next_term = $self->_term();
+    my $def       = "";
+    my $comment   = "";
+    my @def_refs  = ();
+    my $isobsolete;
+    
+    while( $line = ( $self->_defs_io->_readline() ) ) {
+    
+        if ( $line !~ /\S/ 
+        ||   $line =~ /^\s*!/ ) {
+            next;
+        }
+        elsif ( $line =~ /^\s*term:\s*(.+)/ ) {
+	    $self->_term( $1 );
+            last if $self->_not_first_record();
+            $next_term = $1;
+	    $self->_not_first_record( TRUE );
+        }
+        elsif ( $line =~ /^\s*[a-z]{1,8}id:\s*(.+)/ ) {
+            $termid = $1;
+        }
+        elsif ( $line =~ /^\s*definition:\s*(.+)/ ) {
+            $def = $1;   
+	    $isobsolete = 1 if index($def,"OBSOLETE") == 0;
+        }
+        elsif ( $line =~ /^\s*definition_reference:\s*(.+)/ ) {
+            push( @def_refs, $1 );  
+        }
+        elsif ( $line =~ /^\s*comment:\s*(.+)/ ) {
+            $comment = $1;  
+        }
+    }
+    $self->_done( TRUE ) unless $line; # we'll come back until done
+    
+    return $self->_create_ont_entry( $next_term, $termid, $def,
+				     $comment, \@def_refs, $isobsolete);
+} # _next_term
+
+
+
+
+
+# Holds the GO engine to be parsed into
+sub _ont_engine {
+    my ( $self, $value ) = @_;
+
+    if ( defined $value ) {
+        $self->{ "_ont_engine" } = $value;
+    }
+    
+    return $self->{ "_ont_engine" };
+} # _ont_engine
+
+
+
+
+# Used to create ontology terms.
+# Arguments: name, id
+sub _create_ont_entry {
+    my ( $self, $name, $termid, $def, $cmt, $dbxrefs, $obsolete ) = @_;
+
+    if((!defined($obsolete)) && (index(lc($name),"obsolete") == 0)) {
+	$obsolete = 1;
+    }
+    my $term = $self->term_factory->create_object(-name => $name,
+						  -identifier => $termid,
+						  -definition => $def,
+						  -comment => $cmt,
+						  -dblinks => $dbxrefs,
+						  -is_obsolete => $obsolete);
+
+    return $term;
+} # _create_ont_entry
+
+
+
+# Holds whether first record or not
+sub _not_first_record {
+    my ( $self, $value ) = @_;
+
+    if ( defined $value ) {
+        $self->{ "_not_first_record" } = $value;
+    }
+    
+    return $self->{ "_not_first_record" };
+} # _not_first_record
+
+
+
+# Holds whether done or not
+sub _done {
+    my ( $self, $value ) = @_;
+
+    if ( defined $value ) {
+        $self->{ "_done" } = $value;
+    }
+    
+    return $self->{ "_done" };
+} # _done
+
+
+# Holds a term.  
+sub _term {
+    my ( $self, $value ) = @_;
+
+    if ( defined $value ) {
+        $self->{ "_term" } = $value;
+    }
+    
+    return $self->{ "_term" };
+} # _term  
+  
+
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children