Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Biblio/IO/pubmedxml.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/IO/pubmedxml.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,309 @@ +# $Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $ +# +# BioPerl module Bio::Biblio::IO::pubmedxml.pm +# +# Cared for by Martin Senger <senger@ebi.ac.uk> +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::IO::pubmedxml - A converter of XML files with PUBMED citations + +=head1 SYNOPSIS + +Do not use this object directly, it is recommended to access it and use +it through the I<Bio::Biblio::IO> module: + + use Bio::Biblio::IO; + my $io = new Bio::Biblio::IO (-format => 'pubmedxml'); + +=head1 DESCRIPTION + +This object reads bibliographic citations in XML/MEDLINE format and +converts them into I<Bio::Biblio::RefI> objects. It is an +implementation of methods defined in I<Bio::Biblio::IO>. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The main documentation details are to be found in +L<Bio::Biblio::IO>. 
=pod

Here are the remaining object methods. Internal methods are preceded
with an underscore _.

=cut


# Let the code begin...


package Bio::Biblio::IO::pubmedxml;
use vars qw(@ISA $VERSION $Revision);
use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_AND_ADD_DATA_ELEMENT);

use strict;

use Bio::Biblio::IO::medlinexml;

@ISA = qw(Bio::Biblio::IO::medlinexml);

BEGIN {
    # set the version for version checking
    $VERSION = do { my @r = (q$Revision: 1.4 $ =~ /\d+/g); sprintf "%d.%-02d", @r };
    $Revision = q$Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $;
}

#
# _initialize (@args)
#
# Internal initializer. It copies the given named arguments into the
# object, selects the converter for parsed citations, makes sure that
# an XML parser wired to the PubMed event handlers exists and - if a
# callback was given - starts parsing immediately.
#
# Arguments (keys are case-insensitive, a leading '-' becomes '_'):
#   -result   => name of the converter module below Bio::Biblio::IO
#                (default 'pubmed2ref'); 'raw' means no conversion
#   -callback => if true, it is remembered for the event handlers and
#                parsing starts at once
#
sub _initialize {
    my ($self, @args) = @_;

    # make a hashtable from @args
    my %param = @args;
    @param { map { lc $_ } keys %param } = values %param; # lowercase keys

    # copy all @args into this object (overwriting what may already be
    # there) - changing '-key' into '_key', and making keys lowercase
    foreach my $key (keys %param) {
        (my $new_key = $key) =~ s/^-/_/;
        $self->{ lc $new_key } = $param{$key};
    }

    # find the format for output - and put it into a global $Convert
    # because it will be used by the event handler who knows nothing
    # about this object
    my $result = lc ($self->{'_result'} || 'pubmed2ref');

    # a special case is 'raw' when no converting module is loaded
    # and citations will be returned as a hashtable (the one which
    # is created during parsing XML file/stream)
    unless ($result eq 'raw') {

        # load module with output converter - as defined in $result;
        # the parentheses make it explicit that the loader is *called*
        # and that its return value is what gets tested
        if (defined (&Bio::Biblio::IO::_load_format_module ($result))) {
            $Bio::Biblio::IO::medlinexml::Convert = "Bio::Biblio::IO::$result"->new (@args);
        }
    }

    # create an instance of the XML parser
    # (unless it is already there...)
    # NOTE(review): XML::Parser is not loaded by this file; presumably
    # the parent module loads it - confirm.
    $self->{'_xml_parser'} ||= XML::Parser->new
        (Handlers => {Init  => \&Bio::Biblio::IO::medlinexml::handle_doc_start,
                      Start => \&handle_start,
                      End   => \&handle_end,
                      Char  => \&Bio::Biblio::IO::medlinexml::handle_char,
                      Final => \&Bio::Biblio::IO::medlinexml::handle_doc_end});

    # if there is an argument '-callback' then start parsing at once -
    # the registered event handlers will use 'callback' to report
    # back after each citation
    #
    # we need to remember this situation also in a global variable
    # because the event handler subroutines know nothing about this
    # object (unfortunately)
    #
    # The variable must be named with the full package name of the
    # parent: 'SUPER::' has a meaning only in method calls, so
    # '$SUPER::Callback' would be an unrelated variable in a package
    # literally called 'SUPER'. The other state shared with the parent's
    # handlers ($Convert, @ObjectStack, @PCDataStack) is addressed in the
    # same package; presumably that is where the parent reads the
    # callback as well - confirm against medlinexml.pm.
    if ($Bio::Biblio::IO::medlinexml::Callback = $self->{'_callback'}) {
        $self->_parse;
    }
}

# ---------------------------------------------------------------------
#
# Here are the event handlers (they do the real job!)
#
# Note that these methods do not know anything about the object they
# are part of - they are called as subroutines, not as methods.
# It also means that they need to use global variables to store and
# exchange intermediate results.
#
# ---------------------------------------------------------------------

#
# This is a list of #PCDATA elements.
#
# (PubMed-specific #PCDATA elements: for each of them handle_start
# pushes an empty string onto the PCDataStack.)
#
%PCDATA_NAMES = map { $_ => 1 } qw(
    PublicationStatus
    ProviderId
    ArticleId
    URL
);

# Elements for which handle_start pushes a fresh, empty hash onto the
# ObjectStack.
%SIMPLE_TREATMENT = map { $_ => 1 } qw(
    History
    PubMedArticle
    PubmedArticle
    PubmedData
);

# Elements whose end tag hands the element name (first letter in lower
# case) over to _data2obj.
%POP_DATA_AND_PEEK_OBJ = map { $_ => 1 } qw(
    Year
    Month
    Day
    Hour
    Minute
    Second
    ProviderId
    PublicationStatus
);

# Elements whose end tag pops the ObjectStack and passes the popped
# object to _add_element under the name given here.
%POP_AND_ADD_DATA_ELEMENT = (
    'PubMedPubDate' => 'pubDates',
    'History'       => 'histories',
);


=head2 VERSION and Revision

 Usage   : print $Bio::Biblio::IO::pubmedxml::VERSION;
           print $Bio::Biblio::IO::pubmedxml::Revision;

=cut


#
# handle_start ($expat, $element, %attributes)
#
# Event handler for an opening tag. It is called as a plain subroutine
# (not as a method) and keeps its state on the two global stacks of
# Bio::Biblio::IO::medlinexml. Elements not known here are passed on to
# Bio::Biblio::IO::medlinexml::handle_start.
#
sub handle_start {
    my ($expat, $element, %attributes) = @_;
#    &Bio::Biblio::IO::medlinexml::_debug_object_stack ("START", $element);

    my $object_stack = \@Bio::Biblio::IO::medlinexml::ObjectStack;
    my $pcdata_stack = \@Bio::Biblio::IO::medlinexml::PCDataStack;

    # #PCDATA elements with an attribute list come first: they get an
    # entry on the object stack here _and_ (further below) an entry on
    # the PCDATA stack.
    if ($element eq 'ArticleId') {
        my $id_type = defined $attributes{'IdType'} ? $attributes{'IdType'} : 'pubmed';
        push @$object_stack, { 'idType' => $id_type };
    }
    elsif ($element eq 'URL') {
        my %url;
        foreach my $name (qw(type lang)) {
            $url{$name} = $attributes{$name} if $attributes{$name};
        }
        push @$object_stack, \%url;
    }

    # every #PCDATA element (with or without attributes) starts with an
    # empty string on the PCDATA stack
    if (exists $PCDATA_NAMES{$element}) {
        push @$pcdata_stack, '';
        return;
    }

    # all non-PCDATA elements go to the object stack
    if (exists $SIMPLE_TREATMENT{$element}) {
        push @$object_stack, {};
        return;
    }

    # nothing is stacked for the list wrapper itself
    return if $element eq 'ArticleIdList';

    if ($element eq 'PubMedPubDate') {
        my %pub_date;
        $pub_date{'pubStatus'} = $attributes{'PubStatus'} if $attributes{'PubStatus'};
        push @$object_stack, \%pub_date;
        return;
    }

    # everything else is left to the MEDLINE handler
    &Bio::Biblio::IO::medlinexml::handle_start ($expat, $element, %attributes);
    return;
}

#
# handle_end ($expat, $element)
#
# Event handler for a closing tag; the counterpart of handle_start.
# Elements not known here are passed on to
# Bio::Biblio::IO::medlinexml::handle_end.
#
sub handle_end {
    my ($expat, $element) = @_;

    # Elements which are both PCDATA (and therefore on the PCDATA stack)
    # and have an attribute list (and therefore also have their own
    # object on the object stack):
    #   element => [ key for the data, name of the list collecting the objects ]
    my %attributed_pcdata = (
        'ArticleId' => [ 'id',  'pubmedArticleIds' ],
        'URL'       => [ 'URL', 'pubmedURLs' ],
    );
    if (my $names = $attributed_pcdata{$element}) {
        my ($data_key, $list_name) = @$names;
        &Bio::Biblio::IO::medlinexml::_data2obj ($data_key);
        &Bio::Biblio::IO::medlinexml::_add_element ($list_name, pop @Bio::Biblio::IO::medlinexml::ObjectStack);
#        &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $element);
        return;
    }

    # elements of both the object and the PCDATA stack are mixed here
    if (exists $POP_DATA_AND_PEEK_OBJ{$element}) {
        &Bio::Biblio::IO::medlinexml::_data2obj (lcfirst $element);
    }
    elsif (exists $POP_AND_ADD_DATA_ELEMENT{$element}) {
        my $list_name = $POP_AND_ADD_DATA_ELEMENT{$element};
        &Bio::Biblio::IO::medlinexml::_add_element ($list_name, pop @Bio::Biblio::IO::medlinexml::ObjectStack);
    }
    elsif ($element =~ /\A(?:MedlineCitation|NCBIArticle)\z/) {
        &Bio::Biblio::IO::medlinexml::_obj2obj ('Citation');
    }
    elsif ($element eq 'PubmedData') {
        &Bio::Biblio::IO::medlinexml::_obj2obj ('PubmedData');
    }
    elsif ($element =~ /\APub[Mm]edArticle\z/) {
        # here we finally have the whole citation ready
        &Bio::Biblio::IO::medlinexml::_process_citation (pop @Bio::Biblio::IO::medlinexml::ObjectStack);
    }
    else {
        &Bio::Biblio::IO::medlinexml::handle_end ($expat, $element);
    }

#    &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $element);
    return;
}

1;
__END__