0
|
1 # $Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module Bio::Biblio::IO::pubmedxml.pm
|
|
4 #
|
|
5 # Cared for by Martin Senger <senger@ebi.ac.uk>
|
|
6 # For copyright and disclaimer see below.
|
|
7
|
|
8 # POD documentation - main docs before the code
|
|
9
|
|
10 =head1 NAME
|
|
11
|
|
12 Bio::Biblio::IO::pubmedxml - A converter of XML files with PUBMED citations
|
|
13
|
|
14 =head1 SYNOPSIS
|
|
15
|
|
16 Do not use this object directly, it is recommended to access it and use
|
|
17 it through the I<Bio::Biblio::IO> module:
|
|
18
|
|
19 use Bio::Biblio::IO;
|
|
20 my $io = new Bio::Biblio::IO (-format => 'pubmedxml');
|
|
21
|
|
22 =head1 DESCRIPTION
|
|
23
|
|
24 This object reads bibliographic citations in XML/MEDLINE format and
|
|
25 converts them into I<Bio::Biblio::RefI> objects. It is an
|
|
26 implementation of methods defined in I<Bio::Biblio::IO>.
|
|
27
|
|
28 =head1 FEEDBACK
|
|
29
|
|
30 =head2 Mailing Lists
|
|
31
|
|
32 User feedback is an integral part of the evolution of this and other
|
|
33 Bioperl modules. Send your comments and suggestions preferably to
|
|
34 the Bioperl mailing list. Your participation is much appreciated.
|
|
35
|
|
36 bioperl-l@bioperl.org - General discussion
|
|
37 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
38
|
|
39 =head2 Reporting Bugs
|
|
40
|
|
41 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
42 of the bugs and their resolution. Bug reports can be submitted via
|
|
43 email or the web:
|
|
44
|
|
45 bioperl-bugs@bioperl.org
|
|
46 http://bugzilla.bioperl.org/
|
|
47
|
|
48 =head1 AUTHOR
|
|
49
|
|
50 Martin Senger (senger@ebi.ac.uk)
|
|
51
|
|
52 =head1 COPYRIGHT
|
|
53
|
|
54 Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved.
|
|
55
|
|
56 This module is free software; you can redistribute it and/or modify
|
|
57 it under the same terms as Perl itself.
|
|
58
|
|
59 =head1 DISCLAIMER
|
|
60
|
|
61 This software is provided "as is" without warranty of any kind.
|
|
62
|
|
63 =head1 APPENDIX
|
|
64
|
|
65 The main documentation details are to be found in
|
|
66 L<Bio::Biblio::IO>.
|
|
67
|
|
68 Here is the rest of the object methods. Internal methods are preceded
|
|
69 with an underscore _.
|
|
70
|
|
71 =cut
|
|
72
|
|
73
|
|
74 # Let the code begin...
|
|
75
|
|
76
|
|
77 package Bio::Biblio::IO::pubmedxml;
|
|
78 use vars qw(@ISA $VERSION $Revision);
|
|
79 use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_AND_ADD_DATA_ELEMENT);
|
|
80
|
|
81 use strict;
|
|
82
|
|
83 use Bio::Biblio::IO::medlinexml;
|
|
84
|
|
85 @ISA = qw(Bio::Biblio::IO::medlinexml);
|
|
86
|
|
BEGIN {
    # Derive the module version from the RCS revision keyword: collect
    # the numeric fields of the revision string and format them as
    # "major.minor".
    my @revision_fields = q$Revision: 1.4 $ =~ /\d+/g;
    $VERSION = sprintf "%d.%-02d", @revision_fields;

    # Keep the complete RCS identification string available as well.
    $Revision = q$Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $;
}
|
|
92
|
|
#
# _initialize ($self, @args)
#
# Set up this reader from the constructor arguments:
#
#   1. Every argument is copied into the object; an argument named
#      '-Key' is stored under the lower-cased key '_key' (overwriting
#      whatever is already there).
#   2. Unless the requested result type is 'raw', the converter module
#      named by '-result' (default 'pubmed2ref') is loaded and an
#      instance of it is stored in the global
#      $Bio::Biblio::IO::medlinexml::Convert.
#   3. An XML::Parser wired to the event handlers is created, unless
#      the object already has one in '_xml_parser'.
#   4. If a '-callback' was given, it is published in the global
#      $Bio::Biblio::IO::medlinexml::Callback and parsing starts at once.
#
# Globals are used because the event handlers are called as plain
# subroutines and know nothing about this object.
#
sub _initialize {
    my ($self, @args) = @_;

    # Copy all @args into this object, changing '-key' into '_key' and
    # making the keys lower case.
    my %param = @args;
    while (my ($key, $value) = each %param) {
        (my $slot = lc $key) =~ s/^-/_/;
        $self->{$slot} = $value;
    }

    # The output format; capitalization is normalized to lower case.
    my $result = lc ($self->{'_result'} || 'pubmed2ref');

    # A special case is 'raw': no converting module is loaded and the
    # citations are returned as the hashtables created during parsing.
    # NOTE(review): in the 'raw' case a converter left in the global by an
    # earlier instance is not cleared here - confirm whether that matters.
    unless ($result eq 'raw') {

        # The parentheses make this a real call of the loader (not a mere
        # "is this sub defined" check); its return value is what is tested.
        if (defined &Bio::Biblio::IO::_load_format_module ($result)) {
            $Bio::Biblio::IO::medlinexml::Convert =
                "Bio::Biblio::IO::$result"->new (@args);
        }
    }

    # Create an instance of the XML parser (unless it is already there).
    # Start/End are the PubMed-aware handlers of this package, the other
    # events are served directly by the medlinexml handlers.
    unless ($self->{'_xml_parser'}) {
        $self->{'_xml_parser'} = XML::Parser->new
            (Handlers => {
                Init  => \&Bio::Biblio::IO::medlinexml::handle_doc_start,
                Start => \&handle_start,
                End   => \&handle_end,
                Char  => \&Bio::Biblio::IO::medlinexml::handle_char,
                Final => \&Bio::Biblio::IO::medlinexml::handle_doc_end,
            });
    }

    # If there is an argument '-callback' then start parsing at once - the
    # registered event handlers will use it to report back after each
    # citation.
    #
    # The callback must live in the parent package's global, spelled out in
    # full: 'SUPER::' is only meaningful for method calls, so a variable
    # written as $SUPER::Callback would belong to an unrelated package named
    # 'SUPER' and the medlinexml handlers would never see it.
    $Bio::Biblio::IO::medlinexml::Callback = $self->{'_callback'};
    if ($Bio::Biblio::IO::medlinexml::Callback) {
        $self->_parse;
    }
}
|
|
145
|
|
146 # ---------------------------------------------------------------------
|
|
147 #
|
|
148 # Here are the event handlers (they do the real job!)
|
|
149 #
|
|
150 # Note that these methods do not know anything about the object they
|
|
151 # are part of - they are called as subroutines, not as methods.
|
|
152 # It also means that they need to use global variables to store and
|
|
153 # exchange intermediate results.
|
|
154 #
|
|
155 # ---------------------------------------------------------------------
|
|
156
|
|
#
# Lookup tables consulted by handle_start and handle_end. For the first
# three tables only the presence of a key matters.
#

# Elements holding #PCDATA: handle_start opens a new (empty) entry on
# the PCDATA stack for each of them.
%PCDATA_NAMES = map { $_ => 1 } qw(
    PublicationStatus
    ProviderId
    ArticleId
    URL
);

# Elements for which handle_start simply pushes a fresh, empty hash
# on the object stack.
%SIMPLE_TREATMENT = map { $_ => 1 } qw(
    History
    PubMedArticle
    PubmedArticle
    PubmedData
);

# Elements for which handle_end calls _data2obj with the element name
# (first letter lower-cased) as the key.
%POP_DATA_AND_PEEK_OBJ = map { $_ => 1 } qw(
    Year
    Month
    Day
    Hour
    Minute
    Second
    ProviderId
    PublicationStatus
);

# Elements for which handle_end pops the object stack and hands the
# popped object to _add_element under the name given here.
%POP_AND_ADD_DATA_ELEMENT = (
    PubMedPubDate => 'pubDates',
    History       => 'histories',
);
|
|
193
|
|
194
|
|
195 =head2 VERSION and Revision
|
|
196
|
|
197 Usage : print $Bio::Biblio::IO::pubmedxml::VERSION;
|
|
198 print $Bio::Biblio::IO::pubmedxml::Revision;
|
|
199
|
|
200 =cut
|
|
201
|
|
202
|
|
#
# handle_start ($expat, $e, %attrs)
#
# Start-tag event handler. It is called as a plain subroutine, so all
# state lives in the stacks of the Bio::Biblio::IO::medlinexml package.
#
#   $expat - first handler argument; only passed on when delegating
#   $e     - name of the element being opened
#   %attrs - attributes of that element
#
# Elements known to this package are handled here, everything else is
# delegated to Bio::Biblio::IO::medlinexml::handle_start. The return
# value is not meaningful.
#
sub handle_start {
    my ($expat, $e, %attrs) = @_;
    # &Bio::Biblio::IO::medlinexml::_debug_object_stack ("START", $e);

    #
    # #PCDATA elements carrying attributes come first: they get an
    # entry on the object stack here _and_ (being listed in
    # %PCDATA_NAMES) an entry on the PCDATA stack further below.
    #
    if ($e eq 'ArticleId') {
        my $id_type = defined $attrs{'IdType'} ? $attrs{'IdType'} : 'pubmed';
        push @Bio::Biblio::IO::medlinexml::ObjectStack, { 'idType' => $id_type };
    }

    if ($e eq 'URL') {
        my %url = ();
        foreach my $name ('type', 'lang') {
            # only attributes with a true value are remembered
            $url{$name} = $attrs{$name} if $attrs{$name};
        }
        push @Bio::Biblio::IO::medlinexml::ObjectStack, \%url;
    }

    #
    # Every #PCDATA element starts with an empty entry on the PCDATA stack.
    #
    if (exists $PCDATA_NAMES{$e}) {
        push @Bio::Biblio::IO::medlinexml::PCDataStack, '';
        return;
    }

    #
    # Non-PCDATA elements go to the object stack...
    #
    if (exists $SIMPLE_TREATMENT{$e}) {
        push @Bio::Biblio::IO::medlinexml::ObjectStack, {};
        return;
    }

    # ...except the list wrapper, for which nothing is done.
    return if $e eq 'ArticleIdList';

    if ($e eq 'PubMedPubDate') {
        my %date = ();
        $date{'pubStatus'} = $attrs{'PubStatus'} if $attrs{'PubStatus'};
        push @Bio::Biblio::IO::medlinexml::ObjectStack, \%date;
        return;
    }

    #
    # Anything else is handled by the medlinexml handler.
    #
    &Bio::Biblio::IO::medlinexml::handle_start ($expat, $e, %attrs);
}
|
|
250
|
|
#
# handle_end ($expat, $e)
#
# End-tag event handler. It is called as a plain subroutine and works
# on the stacks of the Bio::Biblio::IO::medlinexml package through that
# package's helper subroutines.
#
#   $expat - first handler argument; only passed on when delegating
#   $e     - name of the element being closed
#
# Elements known to this package are handled here, everything else is
# delegated to Bio::Biblio::IO::medlinexml::handle_end. The return
# value is not meaningful.
#
sub handle_end {
    my ($expat, $e) = @_;

    #
    # First come the elements which are both PCDATA (and therefore on
    # the PCDATA stack) and have an attribute list (and therefore also
    # own a separate object on the object stack). For each: the key
    # handed to _data2obj and the name under which the popped object is
    # added.
    #
    my %with_attributes =
        (
         'ArticleId' => [ 'id',  'pubmedArticleIds' ],
         'URL'       => [ 'URL', 'pubmedURLs' ],
         );

    if (my $spec = $with_attributes{$e}) {
        my ($data_key, $list_name) = @$spec;
        &Bio::Biblio::IO::medlinexml::_data2obj ($data_key);
        &Bio::Biblio::IO::medlinexml::_add_element
            ($list_name, pop @Bio::Biblio::IO::medlinexml::ObjectStack);
        # &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e);
        return;
    }

    #
    # From here on elements of both stacks are mixed together.
    #

    # the key is the element name with its first letter in lower case
    if (exists $POP_DATA_AND_PEEK_OBJ{$e}) {
        &Bio::Biblio::IO::medlinexml::_data2obj (lcfirst $e);
        return;
    }

    if (exists $POP_AND_ADD_DATA_ELEMENT{$e}) {
        &Bio::Biblio::IO::medlinexml::_add_element
            ($POP_AND_ADD_DATA_ELEMENT{$e}, pop @Bio::Biblio::IO::medlinexml::ObjectStack);
        return;
    }

    if ($e eq 'MedlineCitation' or $e eq 'NCBIArticle') {
        &Bio::Biblio::IO::medlinexml::_obj2obj ('Citation');
        return;
    }

    if ($e eq 'PubmedData') {
        &Bio::Biblio::IO::medlinexml::_obj2obj ('PubmedData');
        return;
    }

    if ($e eq 'PubMedArticle' or $e eq 'PubmedArticle') {
        #
        # Here we finally have the whole citation ready.
        #
        &Bio::Biblio::IO::medlinexml::_process_citation
            (pop @Bio::Biblio::IO::medlinexml::ObjectStack);
        return;
    }

    #
    # Anything else is handled by the medlinexml handler.
    #
    &Bio::Biblio::IO::medlinexml::handle_end ($expat, $e);

    # &Bio::Biblio::IO::medlinexml::_debug_object_stack ("END", $e);
}
|
|
307
|
|
308 1;
|
|
309 __END__
|