Mercurial > repos > willmclaren > ensembl_vep
comparison variant_effect_predictor/Bio/Biblio/IO/medlinexml.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:21066c0abaf5 |
---|---|
1 # $Id: medlinexml.pm,v 1.5 2002/10/22 07:45:13 lapp Exp $ | |
2 # | |
3 # BioPerl module Bio::Biblio::IO::medlinexml.pm | |
4 # | |
5 # Cared for by Martin Senger <senger@ebi.ac.uk> | |
6 # For copyright and disclaimer see below. | |
7 | |
8 # POD documentation - main docs before the code | |
9 | |
10 =head1 NAME | |
11 | |
12 Bio::Biblio::IO::medlinexml - A converter of XML files with MEDLINE citations | |
13 | |
14 =head1 SYNOPSIS | |
15 | |
16 Do not use this object directly, it is recommended to access it and use | |
17 it through the I<Bio::Biblio::IO> module: | |
18 | |
19 use Bio::Biblio::IO; | |
20 my $io = new Bio::Biblio::IO (-format => 'medlinexml'); | |
21 | |
22 =head1 DESCRIPTION | |
23 | |
24 This object reads bibliographic citations in XML/MEDLINE format and | |
25 converts them into I<Bio::Biblio::RefI> objects. It is an | |
26 implementation of methods defined in I<Bio::Biblio::IO>. | |
27 | |
28 =head1 FEEDBACK | |
29 | |
30 =head2 Mailing Lists | |
31 | |
32 User feedback is an integral part of the evolution of this and other | |
33 Bioperl modules. Send your comments and suggestions preferably to | |
34 the Bioperl mailing list. Your participation is much appreciated. | |
35 | |
36 bioperl-l@bioperl.org - General discussion | |
37 http://bioperl.org/MailList.shtml - About the mailing lists | |
38 | |
39 =head2 Reporting Bugs | |
40 | |
41 Report bugs to the Bioperl bug tracking system to help us keep track | |
42 of the bugs and their resolution. Bug reports can be submitted via | |
43 email or the web: | |
44 | |
45 bioperl-bugs@bioperl.org | |
46 http://bugzilla.bioperl.org/ | |
47 | |
48 =head1 AUTHOR | |
49 | |
50 Martin Senger (senger@ebi.ac.uk) | |
51 | |
52 =head1 COPYRIGHT | |
53 | |
54 Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. | |
55 | |
56 This module is free software; you can redistribute it and/or modify | |
57 it under the same terms as Perl itself. | |
58 | |
59 =head1 DISCLAIMER | |
60 | |
61 This software is provided "as is" without warranty of any kind. | |
62 | |
63 =head1 APPENDIX | |
64 | |
65 The main documentation details are to be found in | |
66 L<Bio::Biblio::IO>. | |
67 | |
68 Here is the rest of the object methods. Internal methods are preceded | |
69 with an underscore _. | |
70 | |
71 =cut | |
72 | |
73 | |
74 # Let the code begin... | |
75 | |
76 | |
77 package Bio::Biblio::IO::medlinexml; | |
78 use vars qw(@ISA $VERSION $Revision); | |
79 use vars qw(@Citations $Callback $Convert @ObjectStack @PCDataStack); | |
80 use vars qw(%PCDATA_NAMES %SIMPLE_TREATMENT %POP_DATA_AND_PEEK_OBJ %POP_OBJ_AND_PEEK_OBJ); | |
81 use vars qw(%POP_AND_ADD_ELEMENT %POP_AND_ADD_DATA_ELEMENT); | |
82 | |
83 use strict; | |
84 | |
85 use Bio::Biblio::IO; | |
86 use XML::Parser; | |
87 | |
88 @ISA = qw(Bio::Biblio::IO); | |
89 | |
90 BEGIN { | |
91 # set the version for version checking | |
92 $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; | |
93 $Revision = q$Id: medlinexml.pm,v 1.5 2002/10/22 07:45:13 lapp Exp $; | |
94 } | |
95 | |
96 # ----------------------------------------------------------------------------- | |
97 | |
98 sub _initialize { | |
99 my ($self, @args) = @_; | |
100 | |
101 # make a hashtable from @args | |
102 my %param = @args; | |
103 @param { map { lc $_ } keys %param } = values %param; # lowercase keys | |
104 | |
105 # copy all @args into this object (overwriting what may already be | |
106 # there) - changing '-key' into '_key', and making keys lowercase | |
107 my $new_key; | |
108 foreach my $key (keys %param) { | |
109 ($new_key = $key) =~ s/^-/_/; | |
110 $self->{ lc $new_key } = $param { $key }; | |
111 } | |
112 | |
113 # find the format for output - and put it into a global $Convert | |
114 # because it will be used by the event handler who knows nothing | |
115 # about this object | |
116 my $result = $self->{'_result'} || 'medline2ref'; | |
117 $result = "\L$result"; # normalize capitalization to lower case | |
118 | |
119 # a special case is 'raw' when no converting module is loaded | |
120 # and citations will be returned as a hashtable (the one which | |
121 # is created during parsing XML file/stream) | |
122 unless ($result eq 'raw') { | |
123 | |
124 # load module with output converter - as defined in $result | |
125 if (defined &Bio::Biblio::IO::_load_format_module ($result)) { | |
126 $Convert = "Bio::Biblio::IO::$result"->new (@args); | |
127 } | |
128 } | |
129 | |
130 # create an instance of the XML parser | |
131 # (unless it is already there...) | |
132 $self->{'_xml_parser'} = new XML::Parser (Handlers => {Init => \&handle_doc_start, | |
133 Start => \&handle_start, | |
134 End => \&handle_end, | |
135 Char => \&handle_char, | |
136 Final => \&handle_doc_end}) | |
137 unless $self->{'_xml_parser'}; | |
138 | |
139 # if there is an argument '-callback' then start parsing at once - | |
140 # the registered event handlers will use 'callback' to report | |
141 # back after each citation | |
142 # | |
143 # we need to remember this situation also in a global variable | |
144 # because the event handler subroutines know nothing about this | |
145 # object (unfortunately) | |
146 if ($Callback = $self->{'_callback'}) { | |
147 $self->_parse; | |
148 } | |
149 } | |
150 | |
151 # ----------------------------------------------------------------------------- | |
152 | |
153 sub _parse { | |
154 my ($self) = shift; | |
155 | |
156 | |
157 if (defined $self->{'_file'}) { | |
158 $self->{'_xml_parser'}->parsefile ($self->{'_file'}); | |
159 } elsif (defined $self->{'_fh'}) { | |
160 my $fh = $self->{'_fh'}; | |
161 if (ref ($fh) and UNIVERSAL::isa ($fh, 'IO::Handler')) { | |
162 $self->{'_xml_parser'}->parse ($fh); | |
163 } else { | |
164 my $data; | |
165 $data .= $_ while <$fh>; | |
166 $self->{'_xml_parser'}->parse ($data); | |
167 } | |
168 } elsif ($self->{'_data'}) { | |
169 $self->{'_xml_parser'}->parse ($self->{'_data'}); | |
170 } else { | |
171 $self->throw ("XML source to be parsed is unknown. Should be given in the new()."); | |
172 } | |
173 | |
174 # when parsing is done all citations have already been delivered | |
175 # to the caller using her callbacks - and nothing to be stored | |
176 # here, or parser put all citations into global @Cittaions where | |
177 # we want to copy there into this instance - so any caller can | |
178 # start parsing other XML input without overwriting already read | |
179 # citations from the first parser | |
180 if (@Citations) { | |
181 $self->{'_citations'} = []; | |
182 foreach my $cit (@Citations) { | |
183 push (@{ $self->{'_citations'} }, $cit); | |
184 undef $cit; | |
185 } | |
186 undef @Citations; | |
187 } | |
188 } | |
189 | |
190 # --------------------------------------------------------------------- | |
191 # | |
192 # Here is an implementation of Bio::Biblio::IO methods | |
193 # | |
194 # --------------------------------------------------------------------- | |
195 | |
196 # global variables used by the XML event handlers | |
197 # TBD: make them accessible at least ONLY from this module... | |
198 @Citations = (); | |
199 $Callback = undef; | |
200 $Convert = undef; | |
201 @ObjectStack = (); # it has Hash-Ref elements | |
202 @PCDataStack = (); # it has String elements | |
203 | |
204 sub next_bibref { | |
205 my ($self) = @_; | |
206 $self->throw ("Method 'next_bibref' should not be called when a '-callback' argument given.") | |
207 if $self->{'_callback'}; | |
208 | |
209 # parse the whole input into memory (global @Citations) | |
210 # and then copy it into this object | |
211 $self->_parse unless $self->{'_citations'}; | |
212 | |
213 # return the next citation (and forget it here) | |
214 shift (@{ $self->{'_citations'} }); | |
215 } | |
216 | |
217 # --------------------------------------------------------------------- | |
218 # | |
219 # Here are the event handlers (they do the real job!) | |
220 # | |
221 # Note that these methods do not know anything about the object they | |
222 # are part of - they are called as subroutines. not as methods. | |
223 # It also means that they need to use global variables to store and | |
224 # exchnage intermediate results. | |
225 # | |
226 # --------------------------------------------------------------------- | |
227 | |
228 # | |
229 # This is a list of #PCDATA elements. | |
230 # | |
231 %PCDATA_NAMES = ( | |
232 'AbstractText' => 1, | |
233 'AccessionNumber' => 1, | |
234 'Acronym' => 1, | |
235 'Affiliation' => 1, | |
236 'Agency' => 1, | |
237 'ArticleTitle' => 1, | |
238 'CASRegistryNumber' => 1, | |
239 'CitationSubset' => 1, | |
240 'Coden' => 1, | |
241 'CollectionTitle' => 1, | |
242 'CollectiveName' => 1, | |
243 'CopyrightInformation' => 1, | |
244 'Country' => 1, | |
245 'DataBankName' => 1, | |
246 'DateOfElectronicPublication' => 1, | |
247 'Day' => 1, | |
248 'Descriptor' => 1, | |
249 'DescriptorName' => 1, | |
250 'EndPage' => 1, | |
251 'FirstName' => 1, | |
252 'ForeName' => 1, | |
253 'GeneralNote' => 1, | |
254 'GeneSymbol' => 1, | |
255 'GrantID' => 1, | |
256 'Hour' => 1, | |
257 'ISOAbbreviation' => 1, | |
258 'ISSN' => 1, | |
259 'Initials' => 1, | |
260 'Issue' => 1, | |
261 'Keyword' => 1, | |
262 'Language' => 1, | |
263 'LastName' => 1, | |
264 'MedlineCode' => 1, | |
265 'MedlineDate' => 1, | |
266 'MedlineID' => 1, | |
267 'MedlinePgn' => 1, | |
268 'MedlineTA' => 1, | |
269 'MiddleName' => 1, | |
270 'Minute' => 1, | |
271 'Month' => 1, | |
272 'NameOfSubstance' => 1, | |
273 'NlmUniqueID' => 1, | |
274 'Note' => 1, | |
275 'NumberOfReferences' => 1, | |
276 'OtherID' => 1, | |
277 'PMID' => 1, | |
278 'PublicationType' => 1, | |
279 'Publisher' => 1, | |
280 'QualifierName' => 1, | |
281 'RefSource' => 1, | |
282 'RegistryNumber' => 1, | |
283 'Season' => 1, | |
284 'Second' => 1, | |
285 'SpaceFlightMission' => 1, | |
286 'StartPage' => 1, | |
287 'SubHeading' => 1, | |
288 'Suffix' => 1, | |
289 'Title' => 1, | |
290 'VernacularTitle' => 1, | |
291 'Volume' => 1, | |
292 'Year' => 1, | |
293 ); | |
294 | |
295 %SIMPLE_TREATMENT = ( | |
296 'MeshHeading' => 1, | |
297 'Author' => 1, | |
298 'Article' => 1, | |
299 'Book' => 1, | |
300 'Investigator' => 1, | |
301 'Chemical' => 1, | |
302 'Pagination' => 1, | |
303 'MedlineJournalInfo' => 1, | |
304 'JournalIssue' => 1, | |
305 'Journal' => 1, | |
306 'DateCreated' => 1, | |
307 'DateCompleted' => 1, | |
308 'DateRevised' => 1, | |
309 'PubDate' => 1, | |
310 'Abstract' => 1, | |
311 'Grant' => 1, | |
312 'CommentsCorrections' => 1, | |
313 'CommentOn' => 1, | |
314 'CommentIn' => 1, | |
315 'ErratumFor' => 1, | |
316 'ErratumIn' => 1, | |
317 'OriginalReportIn' => 1, | |
318 'RepublishedFrom' => 1, | |
319 'RepublishedIn' => 1, | |
320 'RetractionOf' => 1, | |
321 'RetractionIn' => 1, | |
322 'SummaryForPatientsIn' => 1, | |
323 'UpdateIn' => 1, | |
324 'UpdateOf' => 1, | |
325 'DataBank' => 1, | |
326 'KeywordList' => 1, | |
327 'DeleteCitation' => 1, | |
328 ); | |
329 | |
330 %POP_DATA_AND_PEEK_OBJ = ( | |
331 'Descriptor' => 1, | |
332 'DescriptorName' => 1, | |
333 'Year' => 1, | |
334 'Month' => 1, | |
335 'Day' => 1, | |
336 'LastName' => 1, | |
337 'Initials' => 1, | |
338 'FirstName' => 1, | |
339 'ForeName' => 1, | |
340 'NameOfSubstance' => 1, | |
341 'RegistryNumber' => 1, | |
342 'CASRegistryNumber' => 1, | |
343 'MiddleName' => 1, | |
344 'NlmUniqueID' => 1, | |
345 'MedlineTA' => 1, | |
346 'MedlinePgn' => 1, | |
347 'MedlineCode' => 1, | |
348 'Country' => 1, | |
349 'ISSN' => 1, | |
350 'ArticleTitle' => 1, | |
351 'Issue' => 1, | |
352 'AbstractText' => 1, | |
353 'VernacularTitle' => 1, | |
354 'GrantID' => 1, | |
355 'Agency' => 1, | |
356 'Acronym' => 1, | |
357 'MedlineDate' => 1, | |
358 'NumberOfReferences' => 1, | |
359 'RefSource' => 1, | |
360 'DataBankName' => 1, | |
361 'CopyrightInformation' => 1, | |
362 'Suffix' => 1, | |
363 'Note' => 1, | |
364 'CollectiveName' => 1, | |
365 'Hour' => 1, | |
366 'Minute' => 1, | |
367 'Second' => 1, | |
368 'Season' => 1, | |
369 'Coden' => 1, | |
370 'ISOAbbreviation' => 1, | |
371 'Publisher' => 1, | |
372 'CollectionTitle' => 1, | |
373 'DateOfElectronicPublication' => 1, | |
374 'StartPage' => 1, | |
375 'EndPage' => 1, | |
376 'Volume' => 1, | |
377 'Title' => 1, | |
378 ); | |
379 | |
380 %POP_OBJ_AND_PEEK_OBJ = ( | |
381 'Pagination' => 1, | |
382 'JournalIssue' => 1, | |
383 'Journal' => 1, | |
384 'DateCreated' => 1, | |
385 'Article' => 1, | |
386 'DateCompleted' => 1, | |
387 'DateRevised' => 1, | |
388 'CommentsCorrections' => 1, | |
389 'Book' => 1, | |
390 'PubDate' => 1, | |
391 'Abstract' => 1, | |
392 ); | |
393 | |
394 %POP_AND_ADD_DATA_ELEMENT = ( | |
395 'Keyword' => 'keywords', | |
396 'PublicationType' => 'publicationTypes', | |
397 'CitationSubset' => 'citationSubsets', | |
398 'Language' => 'languages', | |
399 'AccessionNumber' => 'accessionNumbers', | |
400 'GeneSymbol' => 'geneSymbols', | |
401 'SpaceFlightMission' => 'spaceFlightMissions', | |
402 ); | |
403 | |
404 | |
405 %POP_AND_ADD_ELEMENT = ( | |
406 'OtherAbstract' => 'otherAbstracts', | |
407 'Chemical' => 'chemicals', | |
408 'KeywordList' => 'keywordLists', | |
409 'Grant' => 'grants', | |
410 'UpdateIn' => 'updateIns', | |
411 'CommentOn' => 'commentOns', | |
412 'CommentIn' => 'commentIns', | |
413 'DataBank' => 'dataBanks', | |
414 'PersonalNameSubject' => 'personalNameSubjects', | |
415 'ErratumFor' => 'erratumFors', | |
416 'ErratumIn' => 'erratumIns', | |
417 'RepublishedFrom' => 'republishedFroms', | |
418 'RepublishedIn' => 'republishedIns', | |
419 'RetractionOf' => 'retractionOfs', | |
420 'RetractionIn' => 'retractionIns', | |
421 'UpdateOf' => 'updateOfs', | |
422 'OriginalReportIn' => 'originalReportIns', | |
423 'SummaryForPatientsIn' => 'summaryForPatientsIns', | |
424 'MeshHeading' => 'meshHeadings', | |
425 ); | |
426 | |
427 sub handle_doc_start { | |
428 @Citations = (); | |
429 @ObjectStack = (); | |
430 @PCDataStack = (); | |
431 } | |
432 | |
433 sub handle_doc_end { | |
434 undef @ObjectStack; | |
435 undef @PCDataStack; | |
436 } | |
437 | |
438 sub handle_char { | |
439 my ($expat, $str) = @_; | |
440 | |
441 # this may happen with whitespaces between tags; | |
442 # but because I have not created an entry for data on the stack | |
443 # I can also ignore such data, can't I | |
444 return if $#PCDataStack < 0; | |
445 | |
446 $PCDataStack [$#PCDataStack] .= $str; | |
447 } | |
448 | |
449 | |
450 | |
451 | |
452 =head2 VERSION and Revision | |
453 | |
454 Usage : print $Bio::Biblio::IO::medlinexml::VERSION; | |
455 print $Bio::Biblio::IO::medlinexml::Revision; | |
456 | |
457 =cut | |
458 | |
459 | |
460 sub handle_start { | |
461 my ($expat, $e, %attrs) = @_; | |
462 # &_debug_object_stack ("START", $e); | |
463 | |
464 # | |
465 # The #PCDATA elements which have an attribute list must | |
466 # be first here - because for them I create entries both on | |
467 # the @PCDataStack _and_ on @ObjectStack. | |
468 # | |
469 if ($e eq 'QualifierName' or | |
470 $e eq 'SubHeading') { | |
471 my %p = (); | |
472 $p{'majorTopic'} = "Y" if $attrs{'MajorTopicYN'}; | |
473 push (@ObjectStack, \%p); | |
474 } | |
475 | |
476 if ($e eq 'GeneralNote') { | |
477 my %p = (); | |
478 $p{'owner'} = $attrs{'Owner'} if $attrs{'Owner'}; | |
479 push (@ObjectStack, \%p); | |
480 } | |
481 | |
482 if ($e eq 'OtherID') { | |
483 my %p = (); | |
484 $p{'source'} = $attrs{'Source'}; | |
485 push (@ObjectStack, \%p); | |
486 } | |
487 | |
488 # | |
489 # A special treatment is for attributes for personal name. | |
490 # Because there is no XML element 'PersonalName' I need to | |
491 # to put yet another object on @ObjectStack unless there is | |
492 # already one. | |
493 # | |
494 if ($e eq 'LastName' or | |
495 $e eq 'FirstName' or | |
496 $e eq 'MidleName' or | |
497 $e eq 'Initials' or | |
498 $e eq 'ForeName' or | |
499 $e eq 'Suffix') { | |
500 my $peek = $ObjectStack[$#ObjectStack]; | |
501 push (@ObjectStack, {'type' => 'PersonalName'}) | |
502 unless (ref $peek and &_eq_hash_elem ($peek, 'type', 'PersonalName')); | |
503 } | |
504 | |
505 # | |
506 # Then we have #PCDATA elements without an attribute list. | |
507 # For them I create an entry on @PCDataStack. | |
508 # | |
509 if (exists $PCDATA_NAMES{$e}) { | |
510 push (@PCDataStack, ''); | |
511 | |
512 # | |
513 # And finally, all non-PCDATA elements go to the objectStack | |
514 # | |
515 } elsif (exists $SIMPLE_TREATMENT{$e}) { | |
516 push (@ObjectStack, {}); | |
517 | |
518 } elsif ($e eq 'PersonalNameSubject') { | |
519 push (@ObjectStack, {'type' => 'PersonalName'}); | |
520 | |
521 } elsif ($e eq 'DescriptorName' or | |
522 $e eq 'Descriptor') { | |
523 if (&_eq_hash_elem (\%attrs, 'MajorTopicYN', "Y")) { | |
524 my $peek = $ObjectStack[$#ObjectStack]; | |
525 $$peek{'descriptorMajorTopic'} = "Y"; | |
526 } | |
527 | |
528 } elsif ($e eq 'MedlineCitation' || | |
529 $e eq 'NCBIArticle') { | |
530 my %p = ( 'type' => 'MedlineCitation' ); | |
531 $p{'owner'} = $attrs{'Owner'} if $attrs{'Owner'}; | |
532 $p{'status'} = $attrs{'Status'} if $attrs{'Status'}; | |
533 push (@ObjectStack, \%p); | |
534 | |
535 } elsif ($e eq 'GrantList') { | |
536 if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { | |
537 my $peek = $ObjectStack[$#ObjectStack]; | |
538 $$peek{'grantListComplete'} = "N"; | |
539 } | |
540 | |
541 } elsif ($e eq 'DataBankList') { | |
542 if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { | |
543 my $peek = $ObjectStack[$#ObjectStack]; | |
544 $$peek{'dataBankListComplete'} = "N"; | |
545 } | |
546 | |
547 } elsif ($e eq 'AuthorList') { | |
548 if (&_eq_hash_elem (\%attrs, 'CompleteYN', "N")) { | |
549 my $peek = $ObjectStack[$#ObjectStack]; | |
550 $$peek{'authorListComplete'} = "N"; | |
551 } | |
552 | |
553 } elsif ($e eq 'OtherAbstract') { | |
554 my %p = (); | |
555 $p{'type'} = $attrs{'Type'} if $attrs{'Type'}; | |
556 push (@ObjectStack, \%p); | |
557 # push (@ObjectStack, { 'type' => 'Abstract' }); | |
558 | |
559 } | |
560 } | |
561 | |
562 sub handle_end { | |
563 my ($expat, $e) = @_; | |
564 # | |
565 # First I have to deal with those elements which are both PCDATA | |
566 # (and therefore they are on the pcdataStack) and which have an | |
567 # attribute list (therefore they are also known as a separate | |
568 # p-object on the objectStack. | |
569 # | |
570 if ($e eq 'QualifierName' or | |
571 $e eq 'SubHeading') { | |
572 my $p = pop @ObjectStack; # pSubHeading | |
573 $$p{'subHeading'} = pop @PCDataStack; | |
574 &_add_element ('subHeadings', $p); # adding to pMeshHeadings | |
575 # &_debug_object_stack ("END", $e); | |
576 return; | |
577 | |
578 } elsif ($e eq 'GeneralNote') { | |
579 my $p = pop @ObjectStack; # pGeneralNote | |
580 $$p{'generalNote'} = pop @PCDataStack; | |
581 &_add_element ('generalNotes', $p); # adding to pMedlineCitation | |
582 # &_debug_object_stack ("END", $e); | |
583 return; | |
584 | |
585 } elsif ($e eq 'OtherID') { | |
586 my $p = pop @ObjectStack; # pOtherID | |
587 $$p{'otherID'} = pop @PCDataStack; | |
588 &_add_element ('otherIDs', $p); # adding to pMedlineCitation | |
589 # &_debug_object_stack ("END", $e); | |
590 return; | |
591 } | |
592 | |
593 # | |
594 # both object and pcdata stacks elements mixed here together | |
595 # (the element names appear in the order of frequency in the | |
596 # medline data set) | |
597 # | |
598 | |
599 if (exists $POP_DATA_AND_PEEK_OBJ{$e}) { | |
600 &_data2obj ("\l$e"); | |
601 | |
602 } elsif (exists $POP_OBJ_AND_PEEK_OBJ{$e}) { | |
603 &_obj2obj ("\l$e"); | |
604 | |
605 } elsif (exists $POP_AND_ADD_ELEMENT{$e}) { | |
606 &_add_element ($POP_AND_ADD_ELEMENT{$e}, pop @ObjectStack); | |
607 | |
608 } elsif (exists $POP_AND_ADD_DATA_ELEMENT{$e}) { | |
609 &_add_element ($POP_AND_ADD_DATA_ELEMENT{$e}); | |
610 | |
611 } elsif ($e eq 'Author' or | |
612 $e eq 'Investigator') { | |
613 my $pAuthor; | |
614 my $p = pop @ObjectStack; # pPersonalName or pAuthor | |
615 if (&_eq_hash_elem ($p, 'type', 'PersonalName')) { | |
616 $pAuthor = pop @ObjectStack; | |
617 $$pAuthor{'personalName'} = $p; | |
618 } else { | |
619 $pAuthor = $p; | |
620 } | |
621 my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pArticle or pBook | |
622 if (&_eq_hash_elem ($peek, 'type', 'MedlineCitation')) { | |
623 &_add_element ('investigators', $pAuthor); | |
624 } else { | |
625 &_add_element ('authors', $pAuthor); | |
626 } | |
627 | |
628 } elsif ($e eq 'MedlineJournalInfo') { | |
629 &_obj2obj ('journalInfo'); | |
630 | |
631 } elsif ($e eq 'PMID') { | |
632 my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pReference or pDeleteCitation | |
633 if (&_eq_hash_elem ($peek, 'type', 'DeleteCitation')) { | |
634 &_add_element ('PMIDs'); | |
635 } else { | |
636 $$peek{'PMID'} = pop @PCDataStack; | |
637 } | |
638 | |
639 } elsif ($e eq 'MedlineID') { | |
640 my $peek = $ObjectStack[$#ObjectStack]; # pMedlineCitation, pReference or pDeleteCitation | |
641 if (&_eq_hash_elem ($peek, 'type', 'DeleteCitation')) { | |
642 &_add_element ('MedlineIDs'); | |
643 } else { | |
644 $$peek{'medlineID'} = pop @PCDataStack; | |
645 } | |
646 | |
647 # } elsif ($e eq 'OtherAbstract') { | |
648 # my $pAbstract = pop @ObjectStack; | |
649 # my $pOtherAbstract = pop @ObjectStack; | |
650 # $$pOtherAbstract{'abstract'} = $pAbstract | |
651 # &_add_element ('otherAbstracts', $pOtherAbstract); | |
652 | |
653 } elsif ($e eq 'Affiliation') { | |
654 my $peek = $ObjectStack[$#ObjectStack]; | |
655 if (&_eq_hash_elem ($peek, 'type', 'PersonalName')) { | |
656 my $peek2 = $ObjectStack[$#ObjectStack - 1]; | |
657 $$peek2{'affiliation'} = pop @PCDataStack; | |
658 } else { | |
659 $$peek{'affiliation'} = pop @PCDataStack; | |
660 } | |
661 | |
662 } elsif ($e eq 'DeleteCitation') { | |
663 pop @ObjectStack; | |
664 ### warn ("'DeleteCitation' tag found. Not known what to do with it."); # silently ignored | |
665 | |
666 } elsif ($e eq 'MedlineCitation') { | |
667 | |
668 # | |
669 # Here we finally have the whole citation ready. | |
670 # | |
671 &_process_citation (pop @ObjectStack); | |
672 | |
673 # | |
674 # ERROR: if we are here, there was an unexpected element | |
675 # | |
676 } elsif (exists $PCDATA_NAMES{$e}) { | |
677 pop @PCDataStack; | |
678 warn ("An unexpected element found: $e"); | |
679 } | |
680 # &_debug_object_stack ("END", $e); | |
681 | |
682 } | |
683 | |
684 # what to do when we have the whole $citation ready | |
685 sub _process_citation { | |
686 my ($citation) = @_; | |
687 $citation = $Convert->convert ($citation) if defined $Convert; | |
688 | |
689 if ($Callback) { | |
690 &$Callback ($citation); | |
691 } else { | |
692 push (@Citations, $citation); | |
693 } | |
694 } | |
695 | |
696 # add $element into an array named $key to the top object at @ObjectStack; | |
697 # if $element is empty, take it from @PCDataStack | |
698 sub _add_element { | |
699 my ($key, $element) = @_; | |
700 my $peek = $ObjectStack[$#ObjectStack]; | |
701 $$peek{$key} = [] unless $$peek{$key}; | |
702 push (@{ $$peek{$key} }, (defined $element ? $element : pop @PCDataStack)); | |
703 } | |
704 | |
705 # remove top of @PCDataStack and put it into top object at @ObjectStack under name $key | |
706 sub _data2obj { | |
707 my ($key) = @_; | |
708 my $peek = $ObjectStack[$#ObjectStack]; | |
709 $$peek{$key} = pop @PCDataStack; | |
710 } | |
711 | |
712 # remove top of @ObjectStack and put it into now-top at @ObjectStack under name $key | |
713 sub _obj2obj { | |
714 my ($key) = @_; | |
715 my $p = pop @ObjectStack; | |
716 my $peek = $ObjectStack[$#ObjectStack]; | |
717 $$peek{$key} = $p; | |
718 } | |
719 | |
720 # check if a $key exists in a ref-hash $rh and if it is equal to $value | |
721 sub _eq_hash_elem { | |
722 my ($rh, $key, $value) = @_; | |
723 return (defined $$rh{$key} and $$rh{$key} eq $value); | |
724 } | |
725 | |
726 # | |
727 # --- only for debugging | |
728 # | |
729 use vars qw(%DEBUGSTACK); | |
730 %DEBUGSTACK = (); | |
731 sub _debug_object_stack { | |
732 my ($action, $element) = @_; | |
733 if ($action =~ /^START/o) { | |
734 $DEBUGSTACK{$element} = (@ObjectStack+0); | |
735 } else { | |
736 return if $element eq 'LastName'; | |
737 print "Element $element starts on " . | |
738 $DEBUGSTACK{$element} . 'and ends on ' . (@ObjectStack+0) . "\n" | |
739 if $DEBUGSTACK{$element} != (@ObjectStack+0); | |
740 } | |
741 } | |
742 | |
743 1; | |
744 __END__ |