comparison variant_effect_predictor/Bio/Biblio/IO/pubmedxml.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:21066c0abaf5
1 # $Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $
2 #
3 # BioPerl module Bio::Biblio::IO::pubmedxml.pm
4 #
5 # Cared for by Martin Senger <senger@ebi.ac.uk>
6 # For copyright and disclaimer see below.
7
8 # POD documentation - main docs before the code
9
10 =head1 NAME
11
12 Bio::Biblio::IO::pubmedxml - A converter of XML files with PUBMED citations
13
14 =head1 SYNOPSIS
15
16 Do not use this object directly, it is recommended to access it and use
17 it through the I<Bio::Biblio::IO> module:
18
19 use Bio::Biblio::IO;
20 my $io = new Bio::Biblio::IO (-format => 'pubmedxml');
21
22 =head1 DESCRIPTION
23
24 This object reads bibliographic citations in XML/MEDLINE format and
25 converts them into I<Bio::Biblio::RefI> objects. It is an
26 implementation of methods defined in I<Bio::Biblio::IO>.
27
28 =head1 FEEDBACK
29
30 =head2 Mailing Lists
31
32 User feedback is an integral part of the evolution of this and other
33 Bioperl modules. Send your comments and suggestions preferably to
34 the Bioperl mailing list. Your participation is much appreciated.
35
36 bioperl-l@bioperl.org - General discussion
37 http://bioperl.org/MailList.shtml - About the mailing lists
38
39 =head2 Reporting Bugs
40
41 Report bugs to the Bioperl bug tracking system to help us keep track
42 of the bugs and their resolution. Bug reports can be submitted via
43 email or the web:
44
45 bioperl-bugs@bioperl.org
46 http://bugzilla.bioperl.org/
47
48 =head1 AUTHOR
49
50 Martin Senger (senger@ebi.ac.uk)
51
52 =head1 COPYRIGHT
53
54 Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved.
55
56 This module is free software; you can redistribute it and/or modify
57 it under the same terms as Perl itself.
58
59 =head1 DISCLAIMER
60
61 This software is provided "as is" without warranty of any kind.
62
63 =head1 APPENDIX
64
65 The main documentation details are to be found in
66 L<Bio::Biblio::IO>.
67
68 Here is the rest of the object methods. Internal methods are preceded
69 with an underscore _.
70
71 =cut
72
73
74 # Let the code begin...
75
76
77 package Bio::Biblio::IO::pubmedxml;
78 use vars qw(@ISA $VERSION $Revision);
79 use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_AND_ADD_DATA_ELEMENT);
80
81 use strict;
82
83 use Bio::Biblio::IO::medlinexml;
84
85 @ISA = qw(Bio::Biblio::IO::medlinexml);
86
BEGIN {
    # Both values are derived from RCS keyword strings at compile time:
    # $VERSION is formatted from the numbers found in the Revision
    # keyword, $Revision keeps the complete Id keyword.
    my @revision_numbers = q$Revision: 1.4 $ =~ /\d+/g;
    $VERSION  = sprintf "%d.%-02d", @revision_numbers;
    $Revision = q$Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $;
}
92
# Set up this reader from the named arguments given to the constructor.
#
# Args    : ($self, @args) where @args is a list of name/value pairs.
#           Every pair is copied into $self under the lowercased name
#           with a leading '-' replaced by '_' (existing entries are
#           overwritten). Arguments acted upon here:
#             -result     name of the converter module below
#                         Bio::Biblio::IO (default 'pubmed2ref'), or
#                         'raw' for no conversion at all
#             -xml_parser an already created parser to be reused
#             -callback   if true, parsing starts immediately and the
#                         event handlers report each citation through it
# Side effects: sets the package globals
#           $Bio::Biblio::IO::medlinexml::Convert and
#           $Bio::Biblio::IO::medlinexml::Callback, because the event
#           handlers are plain subroutines that cannot see $self.
sub _initialize {
    my ($self, @args) = @_;

    # copy all @args into this object - changing '-key' into '_key',
    # and making the keys lowercase
    my %param = @args;
    foreach my $key (keys %param) {
        (my $slot = lc $key) =~ s/^-/_/;
        $self->{$slot} = $param{$key};
    }

    # find the format for output (case-insensitive)
    my $result = lc ($self->{'_result'} || 'pubmed2ref');

    # The converter lives in a global shared with the event handlers.
    # Clear it first, so that a converter left there by a previously
    # created reader cannot leak into this one when 'raw' is requested.
    $Bio::Biblio::IO::medlinexml::Convert = undef;

    # a special case is 'raw' when no converting module is loaded
    # and citations will be returned as a hashtable (the one which
    # is created during parsing XML file/stream)
    if ($result ne 'raw') {

        # load module with output converter - as defined in $result
        my $loaded = Bio::Biblio::IO::_load_format_module ($result);
        if (defined $loaded) {
            $Bio::Biblio::IO::medlinexml::Convert =
                "Bio::Biblio::IO::$result"->new (@args);
        }
    }

    # create an instance of the XML parser
    # (unless it is already there...)
    $self->{'_xml_parser'} ||= XML::Parser->new
        (Handlers => {
            Init  => \&Bio::Biblio::IO::medlinexml::handle_doc_start,
            Start => \&handle_start,
            End   => \&handle_end,
            Char  => \&Bio::Biblio::IO::medlinexml::handle_char,
            Final => \&Bio::Biblio::IO::medlinexml::handle_doc_end,
        });

    # If there is an argument '-callback' then start parsing at once -
    # the registered event handlers will use it to report back after
    # each citation.
    #
    # The callback must be stored in the global of the medlinexml
    # package (where the shared event handlers live). '$SUPER::Callback'
    # would NOT do that: 'SUPER::' is resolved specially only for method
    # calls, as a variable name it is just a package called 'SUPER'.
    # NOTE(review): assumes medlinexml reads its own $Callback global,
    # like it does for $Convert above - confirm against medlinexml.pm.
    $Bio::Biblio::IO::medlinexml::Callback = $self->{'_callback'};
    if ($Bio::Biblio::IO::medlinexml::Callback) {
        $self->_parse;
    }
}
145
146 # ---------------------------------------------------------------------
147 #
148 # Here are the event handlers (they do the real job!)
149 #
150 # Note that these methods do not know anything about the object they
151 # are part of - they are called as plain subroutines, not as methods.
152 # It also means that they need to use global variables to store and
153 # exchange intermediate results.
154 #
155 # ---------------------------------------------------------------------
156
#
# Names of the #PCDATA elements: for each of them handle_start()
# pushes a new, empty entry on the @PCDataStack.
#
%PCDATA_NAMES = map { ($_ => 1) } qw(
    PublicationStatus
    ProviderId
    ArticleId
    URL
);
167
# Elements for which handle_start() just pushes a new, empty hashtable
# on the @ObjectStack.
%SIMPLE_TREATMENT = map { ($_ => 1) } qw(
    History
    PubMedArticle
    PubmedArticle
    PubmedData
);
175
# Elements for which handle_end() calls medlinexml's _data2obj() with
# the element name, its first letter lowercased.
%POP_DATA_AND_PEEK_OBJ = map { ($_ => 1) } qw(
    Year Month Day
    Hour Minute Second
    ProviderId
    PublicationStatus
);
187
# Elements for which handle_end() pops the top of the @ObjectStack and
# hands it to medlinexml's _add_element() under the name given here
# (element name, then the name it is added under).
%POP_AND_ADD_DATA_ELEMENT = qw(
    PubMedPubDate   pubDates
    History         histories
);
193
194
195 =head2 VERSION and Revision
196
197 Usage : print $Bio::Biblio::IO::pubmedxml::VERSION;
198 print $Bio::Biblio::IO::pubmedxml::Revision;
199
200 =cut
201
202
# Event handler registered for the start tag of an XML element.
#
# Args : ($expat, $element, %attrs) - the parser object, the element
#        name and the element's attributes as name/value pairs.
#
# It is called as a plain subroutine, so all state is kept on the
# global stacks of the medlinexml package. Elements not known here are
# passed on to Bio::Biblio::IO::medlinexml::handle_start.
sub handle_start {
    my ($expat, $element, %attrs) = @_;

    #
    # The #PCDATA elements which have an attribute list come first:
    # they get an entry on the @ObjectStack here _and_, further below,
    # one on the @PCDataStack.
    #
    if ($element eq 'ArticleId') {
        my $id_type = defined $attrs{'IdType'} ? $attrs{'IdType'} : 'pubmed';
        push @Bio::Biblio::IO::medlinexml::ObjectStack, { 'idType' => $id_type };
    }
    elsif ($element eq 'URL') {
        # only attributes with a true value are remembered
        my %url = map  { ($_ => $attrs{$_}) }
                  grep { $attrs{$_} } qw(type lang);
        push @Bio::Biblio::IO::medlinexml::ObjectStack, \%url;
    }

    if (exists $PCDATA_NAMES{$element}) {
        # every #PCDATA element gets an (empty) entry on @PCDataStack
        push @Bio::Biblio::IO::medlinexml::PCDataStack, '';
    }
    elsif (exists $SIMPLE_TREATMENT{$element}) {
        # non-PCDATA elements go to the @ObjectStack
        push @Bio::Biblio::IO::medlinexml::ObjectStack, {};
    }
    elsif ($element eq 'ArticleIdList') {
        # nothing to do for the list wrapper itself
    }
    elsif ($element eq 'PubMedPubDate') {
        my %pub_date = $attrs{'PubStatus'}
                     ? ('pubStatus' => $attrs{'PubStatus'})
                     : ();
        push @Bio::Biblio::IO::medlinexml::ObjectStack, \%pub_date;
    }
    else {
        Bio::Biblio::IO::medlinexml::handle_start ($expat, $element, %attrs);
    }
}
250
# Event handler registered for the end tag of an XML element.
#
# Args : ($expat, $element) - the parser object and the element name.
#
# It is called as a plain subroutine and works on the global stacks of
# the medlinexml package through that package's helper subroutines.
# Elements not known here are passed on to
# Bio::Biblio::IO::medlinexml::handle_end.
sub handle_end {
    my ($expat, $element) = @_;

    #
    # First the elements which are both #PCDATA (and therefore are on
    # the @PCDataStack) and have an attribute list (and therefore also
    # have their own object on the @ObjectStack). For each of them:
    # the name given to _data2obj(), then the name given to
    # _add_element() together with the object popped from the stack.
    #
    my %attributed_pcdata = (
        'ArticleId' => [ 'id',  'pubmedArticleIds' ],
        'URL'       => [ 'URL', 'pubmedURLs' ],
    );
    if (my $names = $attributed_pcdata{$element}) {
        my ($data_name, $list_name) = @{$names};
        Bio::Biblio::IO::medlinexml::_data2obj ($data_name);
        Bio::Biblio::IO::medlinexml::_add_element
            ($list_name, pop @Bio::Biblio::IO::medlinexml::ObjectStack);
        return;
    }

    #
    # elements of both the object and the pcdata stacks are mixed here
    #
    if (exists $POP_DATA_AND_PEEK_OBJ{$element}) {
        Bio::Biblio::IO::medlinexml::_data2obj (lcfirst $element);
    }
    elsif (exists $POP_AND_ADD_DATA_ELEMENT{$element}) {
        my $finished = pop @Bio::Biblio::IO::medlinexml::ObjectStack;
        Bio::Biblio::IO::medlinexml::_add_element
            ($POP_AND_ADD_DATA_ELEMENT{$element}, $finished);
    }
    elsif (grep { $element eq $_ } qw(MedlineCitation NCBIArticle)) {
        Bio::Biblio::IO::medlinexml::_obj2obj ('Citation');
    }
    elsif ($element eq 'PubmedData') {
        Bio::Biblio::IO::medlinexml::_obj2obj ('PubmedData');
    }
    elsif (grep { $element eq $_ } qw(PubMedArticle PubmedArticle)) {
        # here we finally have the whole citation ready
        Bio::Biblio::IO::medlinexml::_process_citation
            (pop @Bio::Biblio::IO::medlinexml::ObjectStack);
    }
    else {
        Bio::Biblio::IO::medlinexml::handle_end ($expat, $element);
    }
}
307
308 1;
309 __END__