0
|
1 # $Id: IO.pm,v 1.8 2002/10/22 07:45:11 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Biblio::IO
|
|
4 #
|
|
5 # Cared for by Martin Senger <senger@ebi.ac.uk>
|
|
6 # For copyright and disclaimer see below.
|
|
7
|
|
8 # POD documentation - main docs before the code
|
|
9
|
|
10 =head1 NAME
|
|
11
|
|
12 Bio::Biblio::IO - Handling the bibliographic references
|
|
13
|
|
14 =head1 SYNOPSIS
|
|
15
|
|
16 use Bio::Biblio::IO;
|
|
17
|
|
18 # getting citations from a file
|
|
19 $in = Bio::Biblio::IO->new ('-file' => 'myfile.xml' ,
|
|
20 '-format' => 'medlinexml');
|
|
21 --- OR ---
|
|
22
|
|
23 # getting citations from a string
|
|
24 $in = Bio::Biblio::IO->new ('-data' => '<MedlineCitation>...</MedlineCitation>' ,
|
|
25 '-format' => 'medlinexml');
|
|
26 --- OR ---
|
|
27
|
|
28 # getting citations from a string if IO::String is installed
|
|
29 use IO::String;
|
|
30 $in = Bio::Biblio::IO->new ('-fh' => IO::String->new ($citation),
|
|
31 '-format' => 'medlinexml');
|
|
32
|
|
33 $in = Bio::Biblio::IO->new(-fh => $io_handle , '-format' => 'medlinexml');
|
|
34
|
|
35 --- OR ---
|
|
36
|
|
37 # getting citations from any IO handler
|
|
38 $in = Bio::Biblio::IO->new('-fh' => $io_handle ,
|
|
39 '-format' => 'medlinexml');
|
|
40
|
|
41
|
|
42 # now, having $in, we can read all citations
|
|
43 while ( my $citation = $in->next_bibref() ) {
|
|
44 &do_something_with_citation ($citation);
|
|
45 }
|
|
46
|
|
47 --- OR ---
|
|
48
|
|
49 # again reading all citations but now a callback defined in your
|
|
50 # code is used (note that the reading starts already when new()
|
|
51 # is called)
|
|
52 $io = new Bio::Biblio::IO ('-format' => 'medlinexml',
|
|
53 '-file' => $testfile,
|
|
54 '-callback' => \&callback);
|
|
55 sub callback {
|
|
56 my $citation = shift;
|
|
57 print $citation->{'_identifier'} . "\n";
|
|
58 }
|
|
59
|
|
60 Now, to actually get a citation in an XML format,
|
|
61 use I<Bio::Biblio> module which returns an XML string:
|
|
62
|
|
63 use Bio::Biblio;
|
|
64 my $xml = new Bio::Biblio->get_by_id ('94033980');
|
|
65 my $reader = Bio::Biblio::IO->new ('-data' => $xml,
|
|
66 '-format' => 'medlinexml');
|
|
67
|
|
68 while (my $citation = $reader->next_bibref()) {
|
|
69 ... do something here with $citation
|
|
70 }
|
|
71
|
|
72 And, finally, the resulting citation can be received in different
|
|
73 output formats:
|
|
74
|
|
75 $io = new Bio::Biblio::IO ('-format' => 'medlinexml',
|
|
76 '-result' => 'raw');
|
|
77 --- OR ---
|
|
78
|
|
79 $io = new Bio::Biblio::IO ('-format' => 'medlinexml',
|
|
80 '-result' => 'medline2ref');
|
|
81
|
|
82 --- OR ---
|
|
83
|
|
84 $io = new Bio::Biblio::IO ('-format' => 'pubmedxml',
|
|
85 '-result' => 'pubmed2ref');
|
|
86
|
|
87 =head1 DESCRIPTION
|
|
88
|
|
89 Bio::Biblio::IO is a handler module for accessing bibliographic
|
|
90 citations. The citations can be in different formats - assuming that
|
|
91 there is a corresponding module knowing that format in Bio::Biblio::IO
|
|
92 directory (e.g. Bio::Biblio::IO::medlinexml). The format (and the
|
|
93 module name) is given by the argument I<-format>.
|
|
94
|
|
95 Once an instance of C<Bio::Biblio::IO> class is available, the
|
|
96 citations can be read by calling repeatedly method I<next_bibref>:
|
|
97
|
|
98 while (my $citation = $reader->next_bibref()) {
|
|
99 ... do something here with $citation
|
|
100 }
|
|
101
|
|
102 However, this may imply that all citations were already read into the
|
|
103 memory. If you expect a huge amount of citations to be read, you may
|
|
104 choose a I<callback> option. Your subroutine is specified in the
|
|
105 C<new()> method and is called every time a new citation is available
|
|
106 (see an example above in SYNOPSIS).
|
|
107
|
|
108 The citations returned by I<next_bibref> or given to your callback
|
|
109 routine can be of different formats depending on the argument
|
|
110 I<-result>. One result type is I<raw> and it is represented by a
|
|
111 simple, not blessed hash table:
|
|
112
|
|
113 $io = new Bio::Biblio::IO ('-result' => 'raw');
|
|
114
|
|
115 What other result formats are available depends on the module that
|
|
116 reads the citations in the first place. At the moment, the following
|
|
117 ones are available:
|
|
118
|
|
119 $io = new Bio::Biblio::IO ('-result' => 'medline2ref');
|
|
120
|
|
121 This is a default result format for reading citations by the
|
|
122 I<medlinexml> module. The C<medlinexml> module is again the default
|
|
123 one. Which means that you can almost omit arguments (you still need to
|
|
124 say where the citations come from):
|
|
125
|
|
126 $io = new Bio::Biblio::IO ('-file' => 'data/medline_data.xml');
|
|
127
|
|
128 Another result format available is for PUBMED citations (which is a
|
|
129 super-set of the MEDLINE citations having few more tags):
|
|
130
|
|
131 $io = new Bio::Biblio::IO ('-format' => 'pubmedxml',
|
|
132 '-result' => 'pubmed2ref',
|
|
133 '-data' => $citation);
|
|
134
|
|
135 Or, because C<pubmed2ref> is a default one for PUBMED citations, you can say just:
|
|
136
|
|
137 $io = new Bio::Biblio::IO ('-format' => 'pubmedxml',
|
|
138 '-data' => $citation);
|
|
139
|
|
140 Both C<medline2ref> and C<pubmed2ref> results are objects defined in
|
|
141 the directory C<Bio::Biblio>.
|
|
142
|
|
143 =head1 SEE ALSO
|
|
144
|
|
145 =over
|
|
146
|
|
147 =item *
|
|
148
|
|
149 An example script I<examples/biblio.pl>. It has many options and its
|
|
150 own help. The relevant options to this IO module are I<-f>
|
|
151 (specifying what file to read) and I<-O> (specifying what result
|
|
152 format to achieve).
|
|
153
|
|
154 =item *
|
|
155
|
|
156 OpenBQS home page: http://industry.ebi.ac.uk/openBQS
|
|
157
|
|
158 =item *
|
|
159
|
|
160 Comments to the Perl client: http://industry.ebi.ac.uk/openBQS/Client_perl.html
|
|
161
|
|
162 =back
|
|
163
|
|
164 =head1 FEEDBACK
|
|
165
|
|
166 =head2 Mailing Lists
|
|
167
|
|
168 User feedback is an integral part of the evolution of this
|
|
169 and other Bioperl modules. Send your comments and suggestions preferably
|
|
170 to one of the Bioperl mailing lists.
|
|
171 Your participation is much appreciated.
|
|
172
|
|
173 bioperl-l@bioperl.org - General discussion
|
|
174 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
175
|
|
176 =head2 Reporting Bugs
|
|
177
|
|
178 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
179 of the bugs and their resolution.
|
|
180 Bug reports can be submitted via email or the web:
|
|
181
|
|
182 bioperl-bugs@bioperl.org
|
|
183 http://bugzilla.bioperl.org/
|
|
184
|
|
185 =head1 AUTHOR
|
|
186
|
|
187 Martin Senger (senger@ebi.ac.uk)
|
|
188
|
|
189 =head1 COPYRIGHT
|
|
190
|
|
191 Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved.
|
|
192
|
|
193 This module is free software; you can redistribute it and/or modify
|
|
194 it under the same terms as Perl itself.
|
|
195
|
|
196 =head1 DISCLAIMER
|
|
197
|
|
198 This software is provided "as is" without warranty of any kind.
|
|
199
|
|
200 =head1 APPENDIX
|
|
201
|
|
202 The rest of the documentation details each of the object
|
|
203 methods. Internal methods are preceded with a _
|
|
204
|
|
205 =cut
|
|
206
|
|
207
|
|
208 # Let the code begin...
|
|
209
|
|
210 package Bio::Biblio::IO;
|
|
211
|
|
212 use strict;
|
|
213 use vars qw(@ISA);
|
|
214
|
|
215 use Bio::Root::Root;
|
|
216 use Bio::Root::IO;
|
|
217 use Symbol();
|
|
218
|
|
219 @ISA = qw(Bio::Root::Root Bio::Root::IO);
|
|
220
|
|
221 my $entry = 0;
|
|
222
|
|
# Constructor and factory for citation readers.
#
# Behaviour depends on the class it is invoked on:
#   * on a class (or object) whose name contains 'Bio::Biblio::IO::<something>',
#     e.g. Bio::Biblio::IO::medlinexml, the object is created by SUPER::new()
#     and then initialized by _initialize();
#   * otherwise (typically 'Bio::Biblio::IO' itself) it works as a factory:
#     it determines the format, loads Bio::Biblio::IO::<format> and delegates
#     to that class's new() with the original, unmodified arguments.
#
# Args    : a list of '-name' => value pairs; the factory branch itself reads
#           only '-format' and '-file' (argument names are case-insensitive).
# Returns : the newly created object; an empty list/undef if the format
#           module could not be loaded without an exception being raised.
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # An underlying 'real-work-doing' class (or an instance of one): let
    # SUPER create and bless the object, then run the chained initialization.
    if ($class =~ /Bio::Biblio::IO::\S+/) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    # Factory branch: reached when somebody calls new() on the generic class.
    # It loads a 'real-work-doing' module and calls new() again on it (which
    # ends up in the branch above unless that module has its own new()).
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;    # lowercase keys

    # Explicit format first, then a guess from the file name (or from the
    # first command-line argument), then the default.
    my $format = $param{'-format'}
        || $class->_guess_format($param{'-file'} || $ARGV[0])
        || 'medlinexml';
    $format = lc $format;    # normalize capitalization to lower case

    # Load the module with the real implementation, as defined in $format.
    # A bare return gives undef in scalar context and an empty (false) list
    # in list context.
    return unless _load_format_module($format);

    # This calls this same method new() again - but its upper (object) branch.
    return "Bio::Biblio::IO::$format"->new(@args);
}
|
|
255
|
|
# Builds a reader with new() (passing all arguments through) and returns the
# result of calling fh() on it. Returns nothing if new() yields a false value.
sub newFh {
    my $class  = shift;
    my $reader = $class->new(@_);
    return unless $reader;
    return $reader->fh;
}
|
|
261
|
|
262
|
|
# Returns a freshly generated glob reference that is tied to this object's
# class, with the object itself passed as the extra argument to tie().
sub fh {
    my ($invocant) = @_;
    my $tie_class = ref($invocant) || $invocant;
    my $glob      = Symbol::gensym();

    # NOTE(review): dereferencing the glob reference with a scalar sigil is
    # deliberate and kept as-is; it appears to hand the glob itself to tie(),
    # so the class's TIEHANDLE constructor is the one used - confirm before
    # changing the sigil.
    tie ${$glob}, $tie_class, $invocant;

    return $glob;
}
|
|
270
|
|
271 # _initialize is chained for all Bio::Biblio::IO classes
|
|
272
|
|
# Initializes a newly created object: currently only the IO part, by handing
# all constructor arguments to _initialize_io().
sub _initialize {
    my $self    = shift;
    my @io_args = @_;
    return $self->_initialize_io(@io_args);
}
|
|
278
|
|
279 =head2 next_bibref
|
|
280
|
|
281 Usage : $citation = $stream->next_bibref
|
|
282 Function: Reads the next citation object from the stream and returns it.
|
|
283 Returns : a Bio::Biblio::Ref citation object, or something else
|
|
284 (depending on the '-result' argument given in the 'new()'
|
|
285 method).
|
|
286 Args : none
|
|
287
|
|
288 =cut
|
|
289
|
|
# Placeholder implementation for the generic class: it never returns a
# citation, it only raises an exception through the object's throw() method.
sub next_bibref {
    my $self    = shift;
    my $message = "Sorry, you cannot read from a generic Bio::Biblio::IO object.";
    $self->throw($message);
}
|
|
294
|
|
295 # -----------------------------------------------------------------------------
|
|
296
|
|
297 =head2 _load_format_module
|
|
298
|
|
299 Usage : $class->_load_format_module ($format)
|
|
300 Returns : 1 on success, undef on failure
|
|
301 Args : 'format' should contain the last part of the
|
|
302 name of a module that does the real implementation
|
|
303
|
|
304 It does (in run-time) a similar thing as
|
|
305
|
|
306 require Bio::Biblio::IO::$format
|
|
307
|
|
308 It throws an exception if it fails to find and load the module
|
|
309 (for example, because of the compilation errors in the module).
|
|
310
|
|
311 =cut
|
|
312
|
|
# Loads (at run-time) the module Bio::Biblio::IO::<format>.
#
# Args    : the format name, i.e. the last part of the module name. The last
#           argument is used, so the routine works both as a plain function,
#           _load_format_module($format), and as a class method,
#           $class->_load_format_module($format).
# Returns : 1 if the module is, or already was, loaded.
# Throws  : via Bio::Root::Root->throw() if the module cannot be found or
#           loaded; the trailing bare return only matters if throw() returns.
sub _load_format_module {
    my $format = $_[-1];
    my $load   = "Bio/Biblio/IO/$format.pm";

    # %INC is the record 'require' itself keeps of the files it has loaded,
    # keyed by exactly the relative path used here.
    return 1 if $INC{$load};

    # Rely on the eval's own result rather than on $@ being set, and capture
    # $@ straight away, before anything else has a chance to overwrite it.
    my $loaded = eval { require $load; 1 };
    return 1 if $loaded;
    my $error = $@;

    Bio::Root::Root->throw(<<END);
$load: $format cannot be found or loaded
Exception $error
For more information about the Biblio system please see the Bio::Biblio::IO docs.
END
    return;
}
|
|
335
|
|
336 =head2 _guess_format
|
|
337
|
|
338 Usage : $class->_guess_format ($filename)
|
|
339 Returns : string with a guessed format of the input data (e.g. 'medlinexml')
|
|
340 Args : a file name whose extension can help to guess its format
|
|
341
|
|
342 It makes an expert guess what kind of data are in the given file
|
|
343 (but be prepared that $filename may be empty).
|
|
344
|
|
345 =cut
|
|
346
|
|
# Guesses the format of the input data from a file name.
#
# Args    : (invoked as a method) a file name; may be missing or empty.
# Returns : 'medlinexml' if the name ends with '.xml' or '.medlinexml'
#           (case-insensitive); otherwise nothing (undef / empty list).
sub _guess_format {
    my ($class, $filename) = @_;

    # A lexical is used instead of $_ so that the caller's $_ (possibly an
    # alias to an element of a list being iterated) is never overwritten.
    return unless $filename;
    return 'medlinexml' if $filename =~ /\.(?:xml|medlinexml)$/i;
    return;
}
|
|
353
|
|
# Destructor: closes the object by calling its close() method.
sub DESTROY {
    my $self = shift;

    # A destructor can run implicitly at almost any point (scope exit, die
    # unwinding, program exit), so whatever close() does must not leak into
    # the caller's error state or the process exit status.
    local ($@, $!, $?);

    return $self->close();
}
|
|
359
|
|
# Constructor used by tie(): wraps the given value in a new hash (under the
# key 'biblio') and blesses that hash into the requested class.
sub TIEHANDLE {
    my $class   = shift;
    my $stream  = shift;
    my %wrapper = ('biblio' => $stream);
    return bless \%wrapper, $class;
}
|
|
364
|
|
# Read operation of the tied handle.
# In scalar context it returns whatever a single next_bibref() call on the
# wrapped 'biblio' object returns; in list context it keeps calling
# next_bibref() until a false value comes back and returns everything
# collected up to that point.
sub READLINE {
    my ($tied) = @_;
    my $source = $tied->{'biblio'};

    if (!wantarray) {
        return $source->next_bibref();
    }

    my @citations;
    while (my $citation = $source->next_bibref()) {
        push @citations, $citation;
    }
    return @citations;
}
|
|
372
|
|
373 1;
|