Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/OntologyIO/Handlers/InterProHandler.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/OntologyIO/Handlers/InterProHandler.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,718 @@ +# $Id: InterProHandler.pm,v 1.7.2.2 2003/03/27 10:07:57 lapp Exp $ +# +# BioPerl module for InterProHandler +# +# Cared for by Peter Dimitrov <dimitrov@gnf.org> +# +# Copyright Peter Dimitrov +# (c) Peter Dimitrov, dimitrov@gnf.org, 2003. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2003. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# +# POD documentation - main docs before the code + +=head1 NAME + +InterProHandler - XML handler class for InterProParser + +=head1 SYNOPSIS + + # do not use directly - used and instantiated by InterProParser + +=head1 DESCRIPTION + +Handles xml events generated by InterProParser when parsing InterPro +xml files. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + +bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + +bioperl-bugs@bioperl.org +http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Dimitrov + +Email dimitrov@gnf.org + +=head1 CONTRIBUTORS + +Additional contributors names and emails here + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::OntologyIO::Handlers::InterProHandler; +use vars qw(@ISA); +use strict; +use Carp; +use Bio::Root::Root; +use Bio::Ontology::Ontology; +use Bio::Ontology::RelationshipType; +use Bio::Ontology::SimpleOntologyEngine; +use Bio::Annotation::Reference; +use Data::Dumper; + +@ISA = qw(Bio::Root::Root); + +my ($record_count, $processed_count, $is_a_rel, $contains_rel, $found_in_rel); + +=head2 new + + Title : new + Usage : $h = Bio::OntologyIO::Handlers::InterProHandler->new; + Function: Initializes global variables + Example : + Returns : an InterProHandler object + Args : + + +=cut + +sub new{ + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($eng,$ont,$name,$fact) = + $self->_rearrange([qw(ENGINE + ONTOLOGY + ONTOLOGY_NAME + TERM_FACTORY) + ],@args); + + if(defined($ont)) { + $self->ontology($ont); + } else { + $name = "InterPro" unless $name; + $self->ontology(Bio::Ontology::Ontology->new(-name => $name)); + } + $self->ontology_engine($eng) if $eng; + + $self->term_factory($fact) if $fact; + + $is_a_rel = Bio::Ontology::RelationshipType->get_instance( "IS_A" ); + $contains_rel = Bio::Ontology::RelationshipType->get_instance( "CONTAINS" ); + $found_in_rel = Bio::Ontology::RelationshipType->get_instance( "FOUND_IN" ); + $self->_cite_skip(0); + $self->secondary_accessions_map( {} ); + + return $self; +} + +=head2 ontology_engine + + Title : ontology_engine + Usage : $obj->ontology_engine($newval) + Function: Get/set ontology engine. Can be initialized only once. + Example : + Returns : value of ontology_engine (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub ontology_engine{ + my ($self, $value) = @_; + + if( defined $value) { + if ( defined $self->{'ontology_engine'}) { + $self->throw("ontology_engine already defined"); + } else { + $self->throw(ref($value)." does not implement ". + "Bio::Ontology::OntologyEngineI. Bummer.") + unless $value->isa("Bio::Ontology::OntologyEngineI"); + $self->{'ontology_engine'} = $value; + + # don't forget to set this as the engine of the ontology, otherwise + # those two might not point to the same object + my $ont = $self->ontology(); + if($ont && $ont->can("engine") && (!$ont->engine())) { + $ont->engine($value); + } + + $self->debug(ref($self) . + "::ontology_engine: registering ontology engine (". + ref($value)."):\n". + $value->to_string."\n"); + } + } + + return $self->{'ontology_engine'}; +} + +=head2 ontology + + Title : ontology + Usage : + Function: Get the ontology to add the InterPro terms to. + + The value is determined automatically once ontology_engine + has been set and if it hasn't been set before. + + Example : + Returns : A L<Bio::Ontology::OntologyI> implementing object. + Args : On set, a L<Bio::Ontology::OntologyI> implementing object. + +=cut + +sub ontology{ + my ($self,$ont) = @_; + + if(defined($ont)) { + $self->throw(ref($ont)." does not implement Bio::Ontology::OntologyI". + ". Bummer.") + unless $ont->isa("Bio::Ontology::OntologyI"); + $self->{'_ontology'} = $ont; + } + return $self->{'_ontology'}; +} + +=head2 term_factory + + Title : term_factory + Usage : $obj->term_factory($newval) + Function: Get/set the ontology term object factory + Example : + Returns : value of term_factory (a Bio::Factory::ObjectFactory instance) + Args : on set, new value (a Bio::Factory::ObjectFactory instance + or undef, optional) + + +=cut + +sub term_factory{ + my $self = shift; + + return $self->{'term_factory'} = shift if @_; + return $self->{'term_factory'}; +} + +=head2 _cite_skip + + Title : _cite_skip + Usage : $obj->_cite_skip($newval) + Function: + Example : + Returns : value of _cite_skip (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _cite_skip{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_cite_skip'} = $value; + } + + return $self->{'_cite_skip'}; +} + +=head2 _hash + + Title : _hash + Usage : $obj->_hash($newval) + Function: + Example : + Returns : value of _hash (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _hash{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_hash'} = $value; + } + + return $self->{'_hash'}; +} + +=head2 _stack + + Title : _stack + Usage : $obj->_stack($newval) + Function: + Example : + Returns : value of _stack (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _stack{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_stack'} = $value; + } + return $self->{'_stack'}; +} + +=head2 _top + + Title : _top + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _top{ + my ($self, $_stack) = @_; + my @stack = @{$_stack}; + + return (@stack >= 1) ? $stack[@stack - 1] : undef; +} + +=head2 _term + + Title : _term + Usage : $obj->_term($newval) + Function: Get/set method for the term currently processed. + Example : + Returns : value of term (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _term{ + my ($self, $value) = @_; + + if(defined $value) { + $self->{'_term'} = $value; + } + + return $self->{'_term'}; +} + +=head2 _clear_term + + Title : _clear_term + Usage : + Function: Removes the current term from the handler + Example : + Returns : + Args : + + +=cut + +sub _clear_term{ + my ($self) = @_; + + delete $self->{'_term'}; +} + +=head2 _names + + Title : _names + Usage : $obj->_names($newval) + Function: + Example : + Returns : value of _names (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _names{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_names'} = $value; + } + + return $self->{'_names'}; +} + +=head2 _create_relationship + + Title : _create_relationship + Usage : + Function: Helper function. Adds relationships to one of the relationship stores. + Example : + Returns : + Args : + + +=cut + +sub _create_relationship{ + my ($self, $ref_id, $rel_type_term) = @_; + my $ont = $self->ontology(); + my $fact = $self->term_factory(); + my $term_temp = ($ont->engine->get_term_by_identifier($ref_id))[0]; + + my $rel = Bio::Ontology::Relationship->new( -predicate_term => $rel_type_term ); + + if (!defined $term_temp) { + $term_temp = $ont->engine->add_term( $fact->create_object( -InterPro_id => $ref_id ) ); + $ont->engine->mark_uninstantiated($term_temp); + } + my $rel_type_name = $self->_top($self->_names); + + if ($rel_type_name eq 'parent_list' || $rel_type_name eq 'found_in') { + $rel->object_term( $term_temp ); + $rel->subject_term( $self->_term ); + } else { + $rel->object_term( $self->_term ); + $rel->subject_term( $term_temp ); + } + $rel->ontology($ont); + $ont->add_relationship($rel); +} + +=head2 start_element + + Title : start_element + Usage : + Function: This is a method that is derived from XML::SAX::Base and + has to be overridden for processing start of xml element + events. Used internally only. + + Example : + Returns : + Args : + + +=cut + +sub start_element { + my ($self, $element) = @_; + my $ont = $self->ontology(); + my $fact = $self->term_factory(); + + if ($element->{Name} eq 'interprodb') { + $ont->add_term($fact->create_object(-identifier => "Family", + -name => "Family") ); + $ont->add_term($fact->create_object(-identifier => "Domain", + -name => "Domain") ); + $ont->add_term($fact->create_object(-identifier => "Repeat", + -name => "Repeat") ); + $ont->add_term($fact->create_object(-identifier => "PTM", + -name => "post-translational modification")); + } elsif ($element->{Name} eq 'interpro') { + my %record_args = %{$element->{Attributes}}; + my $id = $record_args{"id"}; + my $term_temp = ($ont->engine->get_term_by_identifier($id))[0]; + + $self->_term( + (!defined $term_temp) + ? $ont->add_term( $fact->create_object(-InterPro_id => $id) ) + : $term_temp + ); + + $self->_term->ontology( $ont ); + $self->_term->short_name( $record_args{"short_name"} ); + $self->_term->protein_count( $record_args{"protein_count"} ); + $self->_increment_record_count(); + $self->_stack([{ interpro => undef }]); + $self->_names(["interpro"]); + + ## Adding a relationship between the newly created InterPro term + ## and the term describing its type + + my $rel = Bio::Ontology::Relationship->new( -predicate_term => $is_a_rel ); + $rel->object_term( ($ont->engine->get_term_by_identifier($record_args{"type"}))[0] ); + $rel->subject_term( $self->_term ); + $rel->ontology($ont); + $ont->add_relationship($rel); + } + elsif (defined $self->_stack) { + my %hash = (); + + if (keys %{$element->{Attributes}} > 0) { + foreach my $key (keys %{$element->{Attributes}}) { + $hash{$key} = $element->{Attributes}->{$key}; + } + } + push @{$self->_stack}, \%hash; + if ($element->{Name} eq 'rel_ref') { + my $ref_id = $element->{Attributes}->{"ipr_ref"}; + my $parent = $self->_top($self->_names); + + if ($parent eq 'parent_list' || $parent eq 'child_list') { + $self->_create_relationship($ref_id, $is_a_rel); + } + if ($parent eq 'contains' ) { + $self->_create_relationship($ref_id, $contains_rel); + } + if ($parent eq 'found_in' ) { + $self->_create_relationship($ref_id, $found_in_rel); + } + } + elsif ($element->{Name} eq 'abstract') { + $self->_cite_skip(1); + } + push @{$self->_names}, $element->{Name}; + } + +} + +=head2 _char_storage + + Title : _char_storage + Usage : $obj->_char_storage($newval) + Function: + Example : + Returns : value of _char_storage (a scalar) + Args : new value (a scalar, optional) + + +=cut + +sub _char_storage{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'_char_storage'} = $value; + } + + return $self->{'_char_storage'}; +} + +=head2 characters + + Title : characters + Usage : + Function: This is a method that is derived from XML::SAX::Base and has to be overridden for processing xml characters events. Used internally only. + Example : + Returns : + Args : + + +=cut + +sub characters { + my ($self, $characters) = @_; + my $text = $characters->{Data}; + + chomp $text; + $text =~ s/^(\s+)//; + $self->{_char_storage} .= $text; + +} + +=head2 end_element + + Title : end_element + Usage : + Function: This is a method that is derived from XML::SAX::Base and has to be overridden for processing end of xml element events. Used internally only. + Example : + Returns : + Args : + + +=cut + +sub end_element { + my ($self, $element) = @_; + + if ($element->{Name} eq 'interprodb') { + $self->debug("Interpro DB Parser Finished: $record_count read, $processed_count processed\n"); + } + elsif ($element->{Name} eq 'interpro') { + $self->_clear_term; + $self->_increment_processed_count(); + } + elsif ($element->{Name} ne 'cite') { + $self->{_char_storage} =~ s/<\/?p>//g; + if ((defined $self->_stack)) { + my $current_hash = pop @{$self->_stack}; + my $parent_hash = $self->_top($self->_stack); + my $current_hash_key = pop @{$self->_names}; + + if (keys %{$current_hash} > 0 && $self->_char_storage ne "") { + $current_hash->{comment} = $self->_char_storage; + push @{ $parent_hash->{$current_hash_key} }, $current_hash + } + elsif ($self->_char_storage ne ""){ + push @{ $parent_hash->{$current_hash_key} }, { 'accumulated_text_12345' => $self->_char_storage }; + } + elsif (keys %{$current_hash} > 0) { + push @{ $parent_hash->{$current_hash_key} }, $current_hash; + } + if ($element->{Name} eq 'pub_list') { + my @refs = (); + + foreach my $pub_record ( @{ $current_hash->{publication} } ) { + my $ref = Bio::Annotation::Reference->new; + my $loc = $pub_record->{location}->[0]; + + $ref->location( $pub_record->{journal}->[0]->{accumulated_text_12345}.", ".$loc->{firstpage}."-".$loc->{lastpage}.", ".$loc->{volume}.", ".$pub_record->{year}->[0]->{accumulated_text_12345}); + $ref->title( $pub_record->{title}->[0]->{accumulated_text_12345} ); + my $ttt = $pub_record->{author_list}->[0]; + + $ref->authors( $ttt->{accumulated_text_12345} ); + $ref->medline( scalar($ttt->{dbkey}) ) + if exists($ttt->{db}) && $ttt->{db} eq "MEDLINE"; + push @refs, $ref; + } + $self->_term->add_reference(@refs); + } + elsif ($element->{Name} eq 'name') { + $self->_term->name( $self->_char_storage ); + } + elsif ($element->{Name} eq 'abstract') { + $self->_term->definition( $self->_char_storage ); + $self->_cite_skip(0); + } + elsif ($element->{Name} eq 'member_list') { + my @refs = (); + + foreach my $db_xref ( @{ $current_hash->{db_xref} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $db_xref->{db}, + -primary_id => $db_xref->{dbkey} + ); + } + $self->_term->add_member(@refs); + } + elsif ($element->{Name} eq 'sec_list') { + my @refs = (); + + foreach my $sec_ac ( @{ $current_hash->{sec_ac} } ) { + push @refs, $sec_ac->{sec_ac}; + } + $self->_term->add_secondary_id(@refs); + $self->secondary_accessions_map->{$self->_term->identifier} = \@refs; + } + elsif ($element->{Name} eq 'example_list') { + my @refs = (); + + foreach my $example ( @{ $current_hash->{example} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $example->{db_xref}->[0]->{db}, + -primary_id => $example->{db_xref}->[0]->{dbkey}, + -comment => $example->{comment} + ); + } + $self->_term->add_example(@refs); + } + elsif ($element->{Name} eq 'external_doc_list') { + my @refs = (); + + foreach my $db_xref ( @{ $current_hash->{db_xref} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $db_xref->{db}, + -primary_id => $db_xref->{dbkey} + ); + } + $self->_term->add_external_document(@refs); + } + elsif ($element->{Name} eq 'class_list') { + my @refs = (); + + foreach my $classification ( @{ $current_hash->{classification} } ) { + push @refs, Bio::Annotation::DBLink->new( -database => $classification->{class_type}, + -primary_id => $classification->{id} + ); + } + $self->_term->class_list(\@refs); + } + elsif ($element->{Name} eq 'deleted_entries') { + my @refs = (); + + foreach my $del_ref ( @{ $current_hash->{del_ref} } ) { + my $term = ($self->ontology_engine->get_term_by_identifier( $del_ref->{id} ))[0]; + + $term->is_obsolete(1) if defined $term; + } + } + } + $self->_char_storage( '' ) if !$self->_cite_skip; + } +} + +=head2 secondary_accessions_map + + Title : secondary_accessions_map + Usage : $obj->secondary_accessions_map($newval) + Function: + Example : $map = $interpro_handler->secondary_accessions_map(); + Returns : Reference to a hash that maps InterPro identifier to an + array reference of secondary accessions following the InterPro + xml schema. + Args : Empty hash reference + + +=cut + +sub secondary_accessions_map{ + my ($self, $value) = @_; + + if( defined $value) { + $self->{'secondary_accessions_map'} = $value; + } + + return $self->{'secondary_accessions_map'}; +} + +=head2 _increment_record_count + + Title : _increment_record_count + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _increment_record_count{ + $record_count++; +} + +=head2 _increment_processed_count + + Title : _increment_processed_count + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _increment_processed_count{ + $processed_count++; + print $processed_count."\n" if $processed_count % 100 == 0; +} + +1;