Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/Biblio/IO/pubmedxml.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 # $Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $ | |
2 # | |
3 # BioPerl module Bio::Biblio::IO::pubmedxml.pm | |
4 # | |
5 # Cared for by Martin Senger <senger@ebi.ac.uk> | |
6 # For copyright and disclaimer see below. | |
7 | |
8 # POD documentation - main docs before the code | |
9 | |
10 =head1 NAME | |
11 | |
12 Bio::Biblio::IO::pubmedxml - A converter of XML files with PUBMED citations | |
13 | |
14 =head1 SYNOPSIS | |
15 | |
16 Do not use this object directly; it is recommended to access it and use | |
17 it through the I<Bio::Biblio::IO> module: | |
18 | |
19 use Bio::Biblio::IO; | |
20 my $io = new Bio::Biblio::IO (-format => 'pubmedxml'); | |
21 | |
22 =head1 DESCRIPTION | |
23 | |
24 This object reads bibliographic citations in XML/MEDLINE format and | |
25 converts them into I<Bio::Biblio::RefI> objects. It is an | |
26 implementation of methods defined in I<Bio::Biblio::IO>. | |
27 | |
28 =head1 FEEDBACK | |
29 | |
30 =head2 Mailing Lists | |
31 | |
32 User feedback is an integral part of the evolution of this and other | |
33 Bioperl modules. Send your comments and suggestions preferably to | |
34 the Bioperl mailing list. Your participation is much appreciated. | |
35 | |
36 bioperl-l@bioperl.org - General discussion | |
37 http://bioperl.org/MailList.shtml - About the mailing lists | |
38 | |
39 =head2 Reporting Bugs | |
40 | |
41 Report bugs to the Bioperl bug tracking system to help us keep track | |
42 of the bugs and their resolution. Bug reports can be submitted via | |
43 email or the web: | |
44 | |
45 bioperl-bugs@bioperl.org | |
46 http://bugzilla.bioperl.org/ | |
47 | |
48 =head1 AUTHOR | |
49 | |
50 Martin Senger (senger@ebi.ac.uk) | |
51 | |
52 =head1 COPYRIGHT | |
53 | |
54 Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. | |
55 | |
56 This module is free software; you can redistribute it and/or modify | |
57 it under the same terms as Perl itself. | |
58 | |
59 =head1 DISCLAIMER | |
60 | |
61 This software is provided "as is" without warranty of any kind. | |
62 | |
63 =head1 APPENDIX | |
64 | |
65 The main documentation details are to be found in | |
66 L<Bio::Biblio::IO>. | |
67 | |
68 Here are the rest of the object methods. Internal methods are preceded | |
69 with an underscore _. | |
70 | |
71 =cut | |
72 | |
73 | |
74 # Let the code begin... | |
75 | |
76 | |
77 package Bio::Biblio::IO::pubmedxml; | |
78 use vars qw(@ISA $VERSION $Revision); | |
79 use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_AND_ADD_DATA_ELEMENT); | |
80 | |
81 use strict; | |
82 | |
83 use Bio::Biblio::IO::medlinexml; | |
84 | |
85 @ISA = qw(Bio::Biblio::IO::medlinexml); | |
86 | |
BEGIN {
    # Derive the module version from the CVS keyword strings. The first
    # literal expands to the revision number; every run of digits in it
    # becomes one argument to sprintf.
    my @revision_digits = (q$Revision: 1.4 $ =~ /\d+/g);
    $VERSION = sprintf "%d.%-02d", @revision_digits;

    # The full CVS identification string, exposed as $Revision.
    $Revision = q$Id: pubmedxml.pm,v 1.4 2002/10/22 07:45:13 lapp Exp $;
}
92 | |
# Initialize this reader from the constructor arguments.
#
# Arguments: $self, followed by a flat list of '-key' => value pairs.
#   Every pair is copied into $self under a lowercased key whose leading
#   '-' is replaced by '_' (so '-Result' ends up in $self->{'_result'}).
#   Keys interpreted here:
#     -result     name of the converter module under Bio::Biblio::IO::
#                 (default 'pubmed2ref'); 'raw' means no converter is loaded
#     -xml_parser an already created parser to use instead of a new one
#     -callback   if true, parsing is started immediately via $self->_parse
#
# Side effects: writes the package globals
#   $Bio::Biblio::IO::medlinexml::Convert and
#   $Bio::Biblio::IO::medlinexml::Callback, because the XML event handlers
#   are plain subroutines that have no access to $self.
sub _initialize {
    my ($self, @args) = @_;

    # make a hashtable from @args and add a lowercase twin for every key
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;

    # copy all @args into this object (overwriting what may already be
    # there) - changing '-key' into '_key', and making keys lowercase
    foreach my $key (keys %param) {
        (my $new_key = $key) =~ s/^-/_/;
        $self->{ lc $new_key } = $param{$key};
    }

    # find the format for output; the converter is stored in a global
    # because it is used by the event handlers, which know nothing about
    # this object
    my $result = lc($self->{'_result'} || 'pubmed2ref');

    # a special case is 'raw' when no converting module is loaded
    # and citations will be returned as a hashtable (the one which
    # is created during parsing XML file/stream)
    unless ($result eq 'raw') {

        # load module with output converter - as defined in $result
        if (defined Bio::Biblio::IO::_load_format_module($result)) {
            $Bio::Biblio::IO::medlinexml::Convert =
                "Bio::Biblio::IO::$result"->new(@args);
        }
    }

    # create an instance of the XML parser (unless it is already there);
    # only the Start and End handlers are specific to this module, the
    # remaining ones are reused from medlinexml
    unless ($self->{'_xml_parser'}) {
        $self->{'_xml_parser'} = XML::Parser->new(
            Handlers => {
                Init  => \&Bio::Biblio::IO::medlinexml::handle_doc_start,
                Start => \&handle_start,
                End   => \&handle_end,
                Char  => \&Bio::Biblio::IO::medlinexml::handle_char,
                Final => \&Bio::Biblio::IO::medlinexml::handle_doc_end,
            },
        );
    }

    # if there is an argument '-callback' then start parsing at once -
    # the registered event handlers will use 'callback' to report
    # back after each citation
    #
    # The callback must live in a global the handlers can see. 'SUPER::'
    # is only meaningful in method calls; '$SUPER::Callback' would be a
    # variable in an unrelated package literally named 'SUPER'.
    # NOTE(review): assumes the medlinexml handlers read
    # $Bio::Biblio::IO::medlinexml::Callback (as they do for $Convert) -
    # confirm against medlinexml.pm.
    if ($Bio::Biblio::IO::medlinexml::Callback = $self->{'_callback'}) {
        $self->_parse;
    }
}
145 | |
146 # --------------------------------------------------------------------- | |
147 # | |
148 # Here are the event handlers (they do the real job!) | |
149 # | |
150 # Note that these methods do not know anything about the object they | |
151 # are part of - they are called as subroutines, not as methods. | |
152 # It also means that they need to use global variables to store and | |
153 # exchange intermediate results. | |
154 # | |
155 # --------------------------------------------------------------------- | |
156 | |
157 # | |
158 # This is a list of #PCDATA elements. | |
159 # | |
# Elements for which handle_start pushes an (empty) entry onto the
# PCDATA stack.
%PCDATA_NAMES = map { $_ => 1 } qw(
    PublicationStatus
    ProviderId
    ArticleId
    URL
);

# Elements for which handle_start just pushes a fresh, empty hash onto
# the object stack.
%SIMPLE_TREATMENT = map { $_ => 1 } qw(
    History
    PubMedArticle
    PubmedArticle
    PubmedData
);

# Elements whose end tag makes handle_end call _data2obj with the
# element name (first letter lowercased).
%POP_DATA_AND_PEEK_OBJ = map { $_ => 1 } qw(
    Year
    Month
    Day
    Hour
    Minute
    Second
    ProviderId
    PublicationStatus
);

# Elements whose end tag makes handle_end pop the object stack and pass
# the popped object to _add_element under the name given here.
%POP_AND_ADD_DATA_ELEMENT = (
    'PubMedPubDate' => 'pubDates',
    'History'       => 'histories',
);
193 | |
194 | |
195 =head2 VERSION and Revision | |
196 | |
197 Usage : print $Bio::Biblio::IO::pubmedxml::VERSION; | |
198 print $Bio::Biblio::IO::pubmedxml::Revision; | |
199 | |
200 =cut | |
201 | |
202 | |
# XML start-tag handler (registered as the parser's 'Start' handler).
#
# Arguments: the expat parser object, the element name, and the
# element's attributes as a flat name => value list.
#
# Elements specific to this module are handled here; every other element
# is passed on to Bio::Biblio::IO::medlinexml::handle_start.
sub handle_start {
    my ($expat, $e, %attrs) = @_;

    #
    # 'ArticleId' and 'URL' are #PCDATA elements with an attribute list.
    # They get an object (holding the attributes) on the object stack
    # here, and - being listed in %PCDATA_NAMES - also an entry on the
    # PCDATA stack further below.
    #
    if ($e eq 'ArticleId') {
        my $id_type = defined $attrs{'IdType'} ? $attrs{'IdType'} : 'pubmed';
        push @Bio::Biblio::IO::medlinexml::ObjectStack, { 'idType' => $id_type };
    }

    if ($e eq 'URL') {
        my $url = {};
        foreach my $attr_name ('type', 'lang') {
            # only attributes with a true value are recorded
            $url->{$attr_name} = $attrs{$attr_name} if $attrs{$attr_name};
        }
        push @Bio::Biblio::IO::medlinexml::ObjectStack, $url;
    }

    if (exists $PCDATA_NAMES{$e}) {
        # #PCDATA element: start with an empty entry on the PCDATA stack
        push @Bio::Biblio::IO::medlinexml::PCDataStack, '';

    } elsif (exists $SIMPLE_TREATMENT{$e}) {
        # non-PCDATA element without attributes of interest
        push @Bio::Biblio::IO::medlinexml::ObjectStack, {};

    } elsif ($e eq 'ArticleIdList') {
        # nothing to do at the start of the list
        ;

    } elsif ($e eq 'PubMedPubDate') {
        my $pub_date = $attrs{'PubStatus'}
                     ? { 'pubStatus' => $attrs{'PubStatus'} }
                     : {};
        push @Bio::Biblio::IO::medlinexml::ObjectStack, $pub_date;

    } else {
        # not a PubMed-specific element: let the MEDLINE handler deal with it
        Bio::Biblio::IO::medlinexml::handle_start($expat, $e, %attrs);
    }
}
250 | |
# XML end-tag handler (registered as the parser's 'End' handler).
#
# Arguments: the expat parser object and the name of the element being
# closed.
#
# Elements specific to this module are finished here using the helper
# subroutines of Bio::Biblio::IO::medlinexml; every other element is
# passed on to Bio::Biblio::IO::medlinexml::handle_end.
sub handle_end {
    my ($expat, $e) = @_;

    #
    # Elements which are both PCDATA (so they have an entry on the PCDATA
    # stack) and carry attributes (so they also have their own object on
    # the object stack). Each maps to the name handed to _data2obj and
    # the name handed to _add_element.
    #
    my %attributed_pcdata = (
        'ArticleId' => [ 'id',  'pubmedArticleIds' ],
        'URL'       => [ 'URL', 'pubmedURLs' ],
    );

    if (my $names = $attributed_pcdata{$e}) {
        my ($data_name, $list_name) = @$names;
        Bio::Biblio::IO::medlinexml::_data2obj($data_name);
        Bio::Biblio::IO::medlinexml::_add_element(
            $list_name, pop @Bio::Biblio::IO::medlinexml::ObjectStack);
        return;
    }

    #
    # From here on, elements living on the object stack and on the
    # PCDATA stack are handled side by side.
    #
    if (exists $POP_DATA_AND_PEEK_OBJ{$e}) {
        # the name passed on is the element name with a lowercase first letter
        Bio::Biblio::IO::medlinexml::_data2obj(lcfirst $e);

    } elsif (exists $POP_AND_ADD_DATA_ELEMENT{$e}) {
        my $list_name = $POP_AND_ADD_DATA_ELEMENT{$e};
        Bio::Biblio::IO::medlinexml::_add_element(
            $list_name, pop @Bio::Biblio::IO::medlinexml::ObjectStack);

    } elsif ($e eq 'MedlineCitation' || $e eq 'NCBIArticle') {
        Bio::Biblio::IO::medlinexml::_obj2obj('Citation');

    } elsif ($e eq 'PubmedData') {
        Bio::Biblio::IO::medlinexml::_obj2obj('PubmedData');

    } elsif ($e eq 'PubMedArticle' || $e eq 'PubmedArticle') {
        # the whole citation is complete - hand it over for processing
        Bio::Biblio::IO::medlinexml::_process_citation(
            pop @Bio::Biblio::IO::medlinexml::ObjectStack);

    } else {
        # not a PubMed-specific element: let the MEDLINE handler deal with it
        Bio::Biblio::IO::medlinexml::handle_end($expat, $e);
    }
}
307 | |
308 1; | |
309 __END__ |