0
|
1 # $Id: IO.pm,v 1.3 2002/10/22 07:45:21 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Structure::IO
|
|
4 #
|
|
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
|
|
6 # and Lincoln Stein <lstein@cshl.org>
|
|
7 # and Kris Boulez <kris.boulez@algonomics.com>
|
|
8 #
|
|
9 # Copyright 2001, 2002 Kris Boulez
|
|
10 #
|
|
11 # You may distribute this module under the same terms as perl itself
|
|
12 #
|
|
13 # _history
|
|
14 # October 18, 1999 Largely rewritten by Lincoln Stein
|
|
15 # November 16, 2001 Copied Bio::SeqIO to Bio::Structure::IO and modified
|
|
16 # where needed. Factoring out common methods
|
|
17 # (to Bio::Root::IO) might be a good idea.
|
|
18
|
|
19 # POD documentation - main docs before the code
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::Structure::IO - Handler for Structure Formats
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 use Bio::Structure::IO;
|
|
28
|
|
29 $in = Bio::Structure::IO->new(-file => "inputfilename" , '-format' => 'pdb');
|
|
30 $out = Bio::Structure::IO->new(-file => ">outputfilename" , '-format' => 'pdb');
|
|
31 # note: we quote -format to keep older perl's from complaining.
|
|
32
|
|
33 while ( my $struc = $in->next_structure() ) {
|
|
34 $out->write_structure($struc);
|
|
35 }
|
|
36
|
|
37 now, to actually get at the structure object, use the standard Bio::Structure
|
|
38 methods (look at L<Bio::Structure> if you don't know what they are)
|
|
39
|
|
40 use Bio::Structure::IO;
|
|
41
|
|
42 $in = Bio::Structure::IO->new(-file => "inputfilename" , '-format' => 'pdb');
|
|
43
|
|
44 while ( my $struc = $in->next_structure() ) {
|
|
45 print "Structure ",$struc->id," number of models: ",scalar $struc->model,"\n";
|
|
46 }
|
|
47
|
|
48
|
|
49
|
|
50 =head1 DESCRIPTION
|
|
51
|
|
52 [ The following description is a copy-paste from the Bio::SeqIO description.
|
|
53 This is not surprising as the code is also mostly a copy. ]
|
|
54
|
|
55 Bio::Structure::IO is a handler module for the formats in the Structure::IO set
|
|
56 (eg, Bio::Structure::IO::pdb). It is the officially sanctioned way of getting at
|
|
57 the format objects, which most people should use.
|
|
58
|
|
59 The Bio::Structure::IO system can be thought of like biological file handles.
|
|
60 They are attached to filehandles with smart formatting rules (eg, PDB format)
|
|
61 and can either read or write structure objects (Bio::Structure objects, or
|
|
62 more correctly, Bio::Structure::StructureI implementing objects, of which
|
|
63 Bio::Structure is one such object). If you want to know what to do with a
|
|
64 Bio::Structure object, read L<Bio::Structure>
|
|
65
|
|
66 The idea is that you request a stream object for a particular format.
|
|
67 All the stream objects have a notion of an internal file that is read
|
|
68 from or written to. A particular Structure::IO object instance is configured
|
|
69 for either input or output. A specific example of a stream object is
|
|
70 the Bio::Structure::IO::pdb object.
|
|
71
|
|
72 Each stream object has functions
|
|
73
|
|
74 $stream->next_structure();
|
|
75
|
|
76 and
|
|
77
|
|
78 $stream->write_structure($struc);
|
|
79
|
|
80 also
|
|
81
|
|
82 $stream->type() # returns 'INPUT' or 'OUTPUT'
|
|
83
|
|
84 As an added bonus, you can recover a filehandle that is tied to the
|
|
85 Bio::Structure::IO object, allowing you to use the standard E<lt>E<gt> and print operations
|
|
86 to read and write structure objects:
|
|
87
|
|
88 use Bio::Structure::IO;
|
|
89
|
|
90 $stream = Bio::Structure::IO->newFh(-format => 'pdb'); # read from standard input
|
|
91
|
|
92 while ( $structure = <$stream> ) {
|
|
93 # do something with $structure
|
|
94 }
|
|
95
|
|
96 and
|
|
97
|
|
98 print $stream $structure; # when stream is in output mode
|
|
99
|
|
100
|
|
101 =head1 CONSTRUCTORS
|
|
102
|
|
103 =head2 Bio::Structure::IO-E<gt>new()
|
|
104
|
|
105 $stream = Bio::Structure::IO->new(-file => 'filename', -format=>$format);
|
|
106 $stream = Bio::Structure::IO->new(-fh => \*FILEHANDLE, -format=>$format);
|
|
107 $stream = Bio::Structure::IO->new(-format => $format);
|
|
108
|
|
109 The new() class method constructs a new Bio::Structure::IO object. The
|
|
110 returned object can be used to retrieve or print Bio::Structure objects.
|
|
111 new() accepts the following parameters:
|
|
112
|
|
113 =over 4
|
|
114
|
|
115 =item -file
|
|
116
|
|
117 A file path to be opened for reading or writing. The usual Perl
|
|
118 conventions apply:
|
|
119
|
|
120 'file' # open file for reading
|
|
121 '>file' # open file for writing
|
|
122 '>>file' # open file for appending
|
|
123 '+<file' # open file read/write
|
|
124 'command |' # open a pipe from the command
|
|
125 '| command' # open a pipe to the command
|
|
126
|
|
127 =item -fh
|
|
128
|
|
129 You may provide new() with a previously-opened filehandle. For
|
|
130 example, to read from STDIN:
|
|
131
|
|
132 $strucIO = Bio::Structure::IO->new(-fh => \*STDIN);
|
|
133
|
|
134 Note that you must pass filehandles as references to globs.
|
|
135
|
|
136 If neither a filehandle nor a filename is specified, then the module
|
|
137 will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt>
|
|
138 semantics.
|
|
139
|
|
140 A string filehandle is handy if you want to modify the output in the
|
|
141 memory, before printing it out. The following program reads in EMBL
|
|
142 formatted entries from a file and prints them out in fasta format with
|
|
143 some HTML tags:
|
|
144 [ not relevant for Bio::Structure::IO as only one format is supported
|
|
145 at the moment ]
|
|
146
|
|
147 use Bio::SeqIO;
|
|
148 use IO::String;
|
|
149 my $in = Bio::SeqIO->new('-file' => "emblfile" ,
|
|
150 '-format' => 'EMBL');
|
|
151 while ( my $seq = $in->next_seq() ) {
|
|
152 # the output handle is reset for every file
|
|
153 my $stringio = IO::String->new($string);
|
|
154 my $out = Bio::SeqIO->new('-fh' => $stringio,
|
|
155 '-format' => 'fasta');
|
|
156 # output goes into $string
|
|
157 $out->write_seq($seq);
|
|
158 # modify $string
|
|
159 $string =~ s|(>)(\w+)|$1<font color="Red">$2</font>|g;
|
|
160 # print into STDOUT
|
|
161 print $string;
|
|
162 }
|
|
163
|
|
164 =item -format
|
|
165
|
|
166 Specify the format of the file. Supported formats include:
|
|
167
|
|
168 PDB Protein Data Bank format
|
|
169
|
|
170 If no format is specified and a filename is given, then the module
|
|
171 will attempt to deduce it from the filename. If this is unsuccessful,
|
|
172 PDB format is assumed.
|
|
173
|
|
174 The format name is case insensitive. 'PDB', 'Pdb' and 'pdb' are
|
|
175 all supported.
|
|
176
|
|
177 =back
|
|
178
|
|
179 =head2 Bio::Structure::IO-E<gt>newFh()
|
|
180
|
|
181 $fh = Bio::Structure::IO->newFh(-fh => \*FILEHANDLE, -format=>$format);
|
|
182 $fh = Bio::Structure::IO->newFh(-format => $format);
|
|
183 # etc.
|
|
184
|
|
185 This constructor behaves like new(), but returns a tied filehandle
|
|
186 rather than a Bio::Structure::IO object. You can read structures from this
|
|
187 object using the familiar E<lt>E<gt> operator, and write to it using
|
|
188 print(). The usual array and $_ semantics work. For example, you can
|
|
189 read all structure objects into an array like this:
|
|
190
|
|
191 @structures = <$fh>;
|
|
192
|
|
193 Other operations, such as read(), sysread(), write(), close(), and printf()
|
|
194 are not supported.
|
|
195
|
|
196 =head1 OBJECT METHODS
|
|
197
|
|
198 See below for more detailed summaries. The main methods are:
|
|
199
|
|
200 =head2 $structure = $structIO-E<gt>next_structure()
|
|
201
|
|
202 Fetch the next structure from the stream.
|
|
203
|
|
204 =head2 $structIO-E<gt>write_structure($struc [,$another_struc,...])
|
|
205
|
|
206 Write the specified structure(s) to the stream.
|
|
207
|
|
208 =head2 TIEHANDLE(), READLINE(), PRINT()
|
|
209
|
|
210 These provide the tie interface. See L<perltie> for more details.
|
|
211
|
|
212 =head1 FEEDBACK
|
|
213
|
|
214 =head2 Mailing Lists
|
|
215
|
|
216 User feedback is an integral part of the evolution of this
|
|
217 and other Bioperl modules. Send your comments and suggestions preferably
|
|
218 to one of the Bioperl mailing lists.
|
|
219 Your participation is much appreciated.
|
|
220
|
|
221 bioperl-l@bioperl.org - General discussion
|
|
222 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
223
|
|
224 =head2 Reporting Bugs
|
|
225
|
|
226 Report bugs to the Bioperl bug tracking system to help us keep track of
|
|
227 the bugs and their resolution.
|
|
228 Bug reports can be submitted via email or the web:
|
|
229
|
|
230 bioperl-bugs@bioperl.org
|
|
231 http://bugzilla.bioperl.org/
|
|
232
|
|
233 =head1 AUTHOR - Ewan Birney, Lincoln Stein, Kris Boulez
|
|
234
|
|
235 Email birney@ebi.ac.uk, kris.boulez@algonomics.com
|
|
236
|
|
237 Describe contact details here
|
|
238
|
|
239 =head1 APPENDIX
|
|
240
|
|
241 The rest of the documentation details each of the object
|
|
242 methods. Internal methods are usually preceded with a _
|
|
243
|
|
244 =cut
|
|
245
|
|
246 # Let the code begin...
|
|
247
|
|
248 package Bio::Structure::IO;
|
|
249
|
|
250 use strict;
|
|
251 use vars qw(@ISA);
|
|
252
|
|
253 use Bio::Root::Root;
|
|
254 use Bio::Root::IO;
|
|
255 use Bio::PrimarySeq;
|
|
256 use Symbol();
|
|
257
|
|
258 @ISA = qw(Bio::Root::Root Bio::Root::IO);
|
|
259
|
|
260 =head2 new
|
|
261
|
|
262 Title : new
|
|
263 Usage : $stream = Bio::Structure::IO->new(-file => $filename, -format => 'Format')
|
|
264 Function: Returns a new structIOstream
|
|
265 Returns : A Bio::Structure::IO handler initialised with the appropriate format
|
|
266 Args : -file => $filename
|
|
267 -format => format
|
|
268 -fh => filehandle to attach to
|
|
269
|
|
270 =cut
|
|
271
|
|
272 my $entry = 0;
|
|
273
|
|
# Constructor and format dispatcher.
#
# When invoked on a format driver class (any class name matching
# Bio::Structure::IO::<something>) the object is built by the parent
# constructor and then passed through _initialize().
#
# When invoked on any other class name, the format is taken from -format,
# or guessed from -file (or $ARGV[0]), or defaults to 'pdb'. The matching
# driver module is loaded and the untouched argument list is handed to that
# driver's new().
#
# Args    : -file, -format, -fh (key case is ignored for the format lookup)
# Returns : the new stream object; undef if the driver module cannot be loaded
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # Driver classes construct themselves directly.
    if ( $class =~ /Bio::Structure::IO::(\S+)/ ) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    # Dispatch path: add lower-cased copies of all keys so that -FORMAT,
    # -Format and -format are all found by the lookups below.
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;

    my $format = $param{'-format'};
    $format ||= $class->_guess_format( $param{-file} || $ARGV[0] );
    $format ||= 'pdb';
    $format = lc $format;    # driver module names are lower case

    return undef unless _load_format_module($format);
    return "Bio::Structure::IO::$format"->new(@args);
}
|
|
298
|
|
299 =head2 newFh
|
|
300
|
|
301 Title : newFh
|
|
302 Usage : $fh = Bio::Structure::IO->newFh(-file=>$filename,-format=>'Format')
|
|
303 Function: does a new() followed by an fh()
|
|
304 Example : $fh = Bio::Structure::IO->newFh(-file=>$filename,-format=>'Format')
|
|
305 $structure = <$fh>; # read a structure object
|
|
306 print $fh $structure; # write a structure object
|
|
307 Returns : filehandle tied to the Bio::Structure::IO::Fh class
|
|
308 Args :
|
|
309
|
|
310 =cut
|
|
311
|
|
# Convenience constructor: build a stream object with new() and return the
# tied filehandle produced by its fh() method. All arguments are passed to
# new() as given. Returns nothing (undef in scalar context, the empty list
# in list context) when new() does not return a true value.
sub newFh {
    my ($class, @args) = @_;

    my $stream = $class->new(@args);
    return unless $stream;

    return $stream->fh;
}
|
|
317
|
|
318 =head2 fh
|
|
319
|
|
320 Title : fh
|
|
321 Usage : $obj->fh
|
|
322 Function:
|
|
323 Example : $fh = $obj->fh; # make a tied filehandle
|
|
324 $structure = <$fh>; # read a structure object
|
|
325 print $fh $structure; # write a structure object
|
|
326 Returns : filehandle tied to the Bio::Structure::IO::Fh class
|
|
327 Args :
|
|
328
|
|
329 =cut
|
|
330
|
|
331
|
|
# Create a fresh anonymous glob and tie it to this object's class, passing
# the object itself to TIEHANDLE. The returned glob reference can then be
# used with <> and print.
sub fh {
    my ($self) = @_;

    my $glob = Symbol::gensym();
    tie $$glob, ( ref($self) || $self ), $self;

    return $glob;
}
|
|
339
|
|
340
|
|
341 # _initialize is chained for all Bio::Structure::IO classes
|
|
342
|
|
# Chained initializer: first lets the parent class initialize itself with
# the constructor arguments, then sets up the I/O layer with the same
# arguments. The result of _initialize_io() is what the caller gets back.
sub _initialize {
    my ($self, @ctor_args) = @_;

    # not really necessary unless we put more in RootI
    $self->SUPER::_initialize(@ctor_args);

    # initialize the IO part
    return $self->_initialize_io(@ctor_args);
}
|
|
352
|
|
353 =head2 next_structure
|
|
354
|
|
355 Title : next_structure
|
|
356 Usage : $structure = $stream->next_structure
|
|
357 Function: Reads the next structure object from the stream and returns it.
|
|
358
|
|
359 Certain driver modules may encounter entries in the stream that
|
|
360 are either misformatted or that use syntax not yet understood
|
|
361 by the driver. If such an incident is recoverable, e.g., by
|
|
362 dismissing a feature of a feature table or some other non-mandatory
|
|
363 part of an entry, the driver will issue a warning. In the case
|
|
364 of a non-recoverable situation an exception will be thrown.
|
|
365 Do not assume that you can resume parsing the same stream after
|
|
366 catching the exception. Note that you can always turn recoverable
|
|
367 errors into exceptions by calling $stream->verbose(2) (see
|
|
368 Bio::RootI POD page).
|
|
369 Returns : a Bio::Structure structure object
|
|
370 Args : none
|
|
371
|
|
372 =cut
|
|
373
|
|
# Placeholder reader: this implementation only raises an exception through
# the object's throw() method, because reading needs a format-specific
# implementation.
sub next_structure {
    my $self = shift;
    $self->throw("Sorry, you cannot read from a generic Bio::Structure::IO object.");
}
|
|
378
|
|
379 # Do we want people to read out the sequence directly from a $structIO stream
|
|
380 #
|
|
381 ##=head2 next_primary_seq
|
|
382 ##
|
|
383 ## Title : next_primary_seq
|
|
384 ## Usage : $seq = $stream->next_primary_seq
|
|
385 ## Function: Provides a primaryseq type of sequence object
|
|
386 ## Returns : A Bio::PrimarySeqI object
|
|
387 ## Args : none
|
|
388 ##
|
|
389 ##
|
|
390 ##=cut
|
|
391 ##
|
|
392 ##sub next_primary_seq {
|
|
393 ## my ($self) = @_;
|
|
394 ##
|
|
395 ## # in this case, we default to next_seq. This is because
|
|
396 ## # Bio::Seq's are Bio::PrimarySeqI objects. However we
|
|
397 ## # expect certain sub classes to override this method to provide
|
|
398 ## # less parsing heavy methods to retrieving the objects
|
|
399 ##
|
|
400 ## return $self->next_seq();
|
|
401 ##}
|
|
402
|
|
403 =head2 write_structure
|
|
404
|
|
405 Title : write_structure
|
|
406 Usage : $stream->write_structure($structure)
|
|
407 Function: writes the $structure object into the stream
|
|
408 Returns : 1 for success and 0 for error
|
|
409 Args : Bio::Structure object
|
|
410
|
|
411 =cut
|
|
412
|
|
# write_structure
#
# Placeholder writer, matching the write_structure() method described in
# this module's documentation. This implementation only raises an exception
# through the object's throw() method, because writing needs a
# format-specific implementation.
#
# Args    : one or more structure objects (ignored here)
# Throws  : always
sub write_structure {
    my ($self, @structures) = @_;
    $self->throw("Sorry, you cannot write to a generic Bio::Structure::IO object.");
}

# write_seq
#
# Legacy name kept for backward compatibility. It forwards to
# write_structure() on the same object, so an object whose class overrides
# write_structure() writes normally, and one that does not still gets the
# "cannot write to a generic ..." exception as before.
sub write_seq {
    my ($self, @structures) = @_;
    return $self->write_structure(@structures);
}
|
|
417
|
|
418
|
|
419 # Do we need this here?
|
|
420 #
|
|
421 ##=head2 alphabet
|
|
422 ##
|
|
423 ## Title : alphabet
|
|
424 ## Usage : $self->alphabet($newval)
|
|
425 ## Function: Set/get the molecule type for the Seq objects to be created.
|
|
426 ## Example : $seqio->alphabet('protein')
|
|
427 ## Returns : value of alphabet: 'dna', 'rna', or 'protein'
|
|
428 ## Args : newvalue (optional)
|
|
429 ## Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein'
|
|
430 ##
|
|
431 ##=cut
|
|
432 ##
|
|
433 ##sub alphabet {
|
|
434 ## my ($self, $value) = @_;
|
|
435 ##
|
|
436 ## if ( defined $value) {
|
|
437 ## # instead of hard-coding the allowed values once more, we check by
|
|
438 ## # creating a dummy sequence object
|
|
439 ## eval {
|
|
440 ## my $seq = Bio::PrimarySeq->new('-alphabet' => $value);
|
|
441 ## };
|
|
442 ## if($@) {
|
|
443 ## $self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values.");
|
|
444 ## }
|
|
445 ## $self->{'alphabet'} = "\L$value";
|
|
446 ## }
|
|
447 ## return $self->{'alphabet'};
|
|
448 ##}
|
|
449
|
|
450 =head2 _load_format_module
|
|
451
|
|
452 Title : _load_format_module
|
|
453 Usage : *INTERNAL Structure::IO stuff*
|
|
454 Function: Loads up (like use) a module at run time on demand
|
|
455 Example :
|
|
456 Returns :
|
|
457 Args :
|
|
458
|
|
459 =cut
|
|
460
|
|
# Load the driver module for a format at run time.
#
# Called as a plain function (not a method) with the format name. The
# module path is Bio/Structure/IO/<format>.pm. On failure a diagnostic that
# includes the exception text is printed to STDERR.
#
# Args    : $format - driver name, used verbatim in the module path
# Returns : 1 when the module is available; nothing (bare return) when the
#           require failed
sub _load_format_module {
    my ($format) = @_;
    my $load = "Bio/Structure/IO/$format.pm";

    # Shortcut when main's symbol table already has a "_<file" entry for it.
    return 1 if $main::{"_<$load"};

    eval { require $load };
    return 1 unless $@;

    print STDERR <<END;
$load: $format cannot be found
Exception $@
For more information about the Structure::IO system please see the
Bio::Structure::IO docs. This includes ways of checking for formats at
compile time, not run time
END
    return;
}
|
|
485
|
|
486 =head2 _concatenate_lines
|
|
487
|
|
488 Title : _concatenate_lines
|
|
489 Usage : $s = $obj->_concatenate_lines($line, $continuation_line)
|
|
490 Function: Private. Concatenates two strings assuming that the second stems
|
|
491 from a continuation line of the first. Adds a space between both
|
|
492 unless the first ends with a dash.
|
|
493
|
|
494 Takes care of either arg being empty.
|
|
495 Example :
|
|
496 Returns : A string.
|
|
497 Args :
|
|
498
|
|
499 =cut
|
|
500
|
|
# Join a line with its continuation line.
#
# A single space is inserted between the two parts unless the first part
# ends with a dash or either part is empty. Undefined arguments are treated
# as empty strings.
#
# Emptiness is decided by defined/length rather than by truth, so a part
# that consists of the string "0" is kept instead of being dropped.
#
# Args    : $s1 - the first line (may be undef or empty)
#           $s2 - the continuation line (may be undef or empty)
# Returns : the concatenated string (possibly empty, never undef)
sub _concatenate_lines {
    my ($self, $s1, $s2) = @_;

    $s1 = '' unless defined $s1;
    $s2 = '' unless defined $s2;

    # Only separate two non-empty parts, and never after a trailing dash.
    $s1 .= ' ' if length($s1) && length($s2) && $s1 !~ /-$/;

    return $s1 . $s2;
}
|
|
506
|
|
507 =head2 _filehandle
|
|
508
|
|
509 Title : _filehandle
|
|
510 Usage : $obj->_filehandle($newval)
|
|
511 Function: This method is deprecated. Call _fh() instead.
|
|
512 Example :
|
|
513 Returns : value of _filehandle
|
|
514 Args : newvalue (optional)
|
|
515
|
|
516
|
|
517 =cut
|
|
518
|
|
# Deprecated accessor: forwards every argument to _fh() on the same object
# and returns whatever _fh() returns.
sub _filehandle {
    my $self = shift;
    my @args = @_;
    return $self->_fh(@args);
}
|
|
523
|
|
524 =head2 _guess_format
|
|
525
|
|
526 Title : _guess_format
|
|
527 Usage : $obj->_guess_format($filename)
|
|
528 Function:
|
|
529 Example :
|
|
530 Returns : guessed format of filename (lower case)
|
|
531 Args :
|
|
532
|
|
533 =cut
|
|
534
|
|
# Guess a format name from a file name's extension.
#
# The file name is held in a lexical variable, so the caller's $_ (and any
# loop element it is aliased to) is left untouched.
#
# NOTE(review): apart from 'pdb', the names below look carried over from
# the sequence I/O extension table; whether a matching
# Bio/Structure/IO/<name>.pm driver exists for them is not visible here.
#
# Args    : $filename - file name to inspect (optional)
# Returns : lower-case format name, or nothing (bare return) when no file
#           name is given or the extension is not recognised
sub _guess_format {
    my ($class, $filename) = @_;
    return unless $filename;

    return 'fasta'   if $filename =~ /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i;
    return 'genbank' if $filename =~ /\.(gb|gbank|genbank)$/i;
    return 'scf'     if $filename =~ /\.scf$/i;
    return 'pir'     if $filename =~ /\.pir$/i;
    return 'embl'    if $filename =~ /\.(embl|ebl|emb|dat)$/i;
    return 'raw'     if $filename =~ /\.(txt)$/i;
    return 'gcg'     if $filename =~ /\.gcg$/i;
    return 'ace'     if $filename =~ /\.ace$/i;
    return 'bsml'    if $filename =~ /\.(bsm|bsml)$/i;
    return 'pdb'     if $filename =~ /\.(ent|pdb)$/i;

    # Unknown extension: no guess.
    return;
}
|
|
549
|
|
# Destructor: calls the object's close() method when the object goes away.
sub DESTROY {
    my ($self) = @_;
    $self->close();
}
|
|
555
|
|
# Tie constructor: wraps the given value (the stream object passed by fh())
# in a new hash under the 'structio' key and blesses it into $class.
sub TIEHANDLE {
    my $class = shift;
    my $val   = shift;

    my %state = ( 'structio' => $val );
    return bless \%state, $class;
}
|
|
560
|
|
# Tie interface for <$fh>.
#
# Reads through the wrapped stream object stored under 'structio'. The
# reader used is next_structure(), the read method this module defines and
# documents. A wrapped object that provides a next_seq() method keeps being
# read through that method, as before.
#
# Returns : scalar context - the next object from the stream
#           list context   - all remaining objects, read until the reader
#                            returns a false value
sub READLINE {
    my $self = shift;
    my $io   = $self->{'structio'};

    my $reader = $io->can('next_seq') ? 'next_seq' : 'next_structure';

    return $io->$reader() unless wantarray;

    my @list;
    while ( my $obj = $io->$reader() ) {
        push @list, $obj;
    }
    return @list;
}
|
|
568
|
|
# Tie interface for "print $fh ...".
#
# Hands the printed items to the wrapped stream object stored under
# 'structio'. The writer used is write_structure(), the write method shown
# in this module's synopsis. A wrapped object without write_structure() is
# written through write_seq(), as before.
#
# Returns : whatever the writer method returns
sub PRINT {
    my $self = shift;
    my $io   = $self->{'structio'};

    my $writer = $io->can('write_structure') ? 'write_structure' : 'write_seq';

    return $io->$writer(@_);
}
|
|
573
|
|
574 1;
|
|
575
|