0
|
1 # $Id: AlignIO.pm,v 1.28 2002/10/22 07:38:23 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::AlignIO
|
|
4 #
|
|
5 # based on the Bio::SeqIO module
|
|
6 # by Ewan Birney <birney@sanger.ac.uk>
|
|
7 # and Lincoln Stein <lstein@cshl.org>
|
|
8 #
|
|
9 # Copyright Peter Schattner
|
|
10 #
|
|
11 # You may distribute this module under the same terms as perl itself
|
|
12 #
|
|
13 # _history
|
|
14 # October 18, 1999 SeqIO largely rewritten by Lincoln Stein
|
|
15 # September, 2000 AlignIO written by Peter Schattner
|
|
16
|
|
17 # POD documentation - main docs before the code
|
|
18
|
|
19 =head1 NAME
|
|
20
|
|
21 Bio::AlignIO - Handler for AlignIO Formats
|
|
22
|
|
23 =head1 SYNOPSIS
|
|
24
|
|
25 use Bio::AlignIO;
|
|
26
|
|
27 $inputfilename = "testaln.fasta";
|
|
28 $in = Bio::AlignIO->new(-file => $inputfilename ,
|
|
29 '-format' => 'fasta');
|
|
30 $out = Bio::AlignIO->new(-file => ">out.aln.pfam" ,
|
|
31 '-format' => 'pfam');
|
|
32 # note: we quote -format to keep older perl's from complaining.
|
|
33
|
|
34 while ( my $aln = $in->next_aln() ) {
|
|
35 $out->write_aln($aln);
|
|
36 }
|
|
37
|
|
38 or
|
|
39
|
|
40 use Bio::AlignIO;
|
|
41
|
|
42 $inputfilename = "testaln.fasta";
|
|
43 $in = Bio::AlignIO->newFh(-file => $inputfilename ,
|
|
44 '-format' => 'fasta');
|
|
45 $out = Bio::AlignIO->newFh('-format' => 'pfam');
|
|
46
|
|
47 # World's shortest Fasta<->pfam format converter:
|
|
48 print $out $_ while <$in>;
|
|
49
|
|
50 =head1 DESCRIPTION
|
|
51
|
|
52 Bio::AlignIO is a handler module for the formats in the AlignIO set
|
|
53 (eg, Bio::AlignIO::fasta). It is the officially sanctioned way of
|
|
54 getting at the alignment objects, which most people should use. The
|
|
55 resulting alignment is a Bio::Align::AlignI compliant object. See
|
|
56 L<Bio::Align::AlignI> for more information.
|
|
57
|
|
58 The idea is that you request a stream object for a particular format.
|
|
59 All the stream objects have a notion of an internal file that is read
|
|
60 from or written to. A particular AlignIO object instance is configured
|
|
61 for either input or output. A specific example of a stream object is
|
|
62 the Bio::AlignIO::fasta object.
|
|
63
|
|
64 Each stream object has functions
|
|
65
|
|
66 $stream->next_aln();
|
|
67
|
|
68 and
|
|
69
|
|
70 $stream->write_aln($aln);
|
|
71
|
|
72 also
|
|
73
|
|
74 $stream->type() # returns 'INPUT' or 'OUTPUT'
|
|
75
|
|
76 As an added bonus, you can recover a filehandle that is tied to the
|
|
77 AlignIO object, allowing you to use the standard E<lt>E<gt> and print
|
|
78 operations to read and write alignment objects:
|
|
79
|
|
80 use Bio::AlignIO;
|
|
81
|
|
82 # read from standard input
|
|
83 $stream = Bio::AlignIO->newFh(-format => 'Fasta');
|
|
84
|
|
85 while ( $aln = <$stream> ) {
|
|
86 # do something with $aln
|
|
87 }
|
|
88
|
|
89 and
|
|
90
|
|
91 print $stream $aln; # when stream is in output mode
|
|
92
|
|
93 This makes the simplest ever reformatter
|
|
94
|
|
95 #!/usr/local/bin/perl
|
|
96
|
|
97 $format1 = shift;
|
|
98 $format2 = shift ||
|
|
99 die "Usage: reformat format1 format2 < input > output";
|
|
100
|
|
101 use Bio::AlignIO;
|
|
102
|
|
103 $in = Bio::AlignIO->newFh(-format => $format1 );
|
|
104 $out = Bio::AlignIO->newFh(-format => $format2 );
|
|
105 # note: you might want to quote -format to keep
|
|
106 # older perl's from complaining.
|
|
107
|
|
108 print $out $_ while <$in>;
|
|
109
|
|
110 AlignIO.pm is patterned on the module SeqIO.pm and shares most of the
|
|
111 SeqIO.pm features. One significant difference currently is that
|
|
112 AlignIO.pm usually handles IO for only a single alignment at a time
|
|
113 (SeqIO.pm handles IO for multiple sequences in a single stream.) The
|
|
114 principal reason for this is that whereas simultaneously handling
|
|
115 multiple sequences is a common requirement, simultaneous handling of
|
|
116 multiple alignments is not. The only current exception is format
|
|
117 "bl2seq" which parses results of the Blast bl2seq program and which
|
|
118 may produce several alignment pairs. This set of alignment pairs can
|
|
119 be read using multiple calls to next_aln.
|
|
120
|
|
121 Capability for IO for more than one multiple alignment - other than
|
|
122 for bl2seq format -(which may be of use for certain applications such
|
|
123 as IO for Pfam libraries) may be included in the future. For this
|
|
124 reason we keep the name "next_aln()" for the alignment input routine,
|
|
125 even though in most cases only one alignment is read (or written) at a
|
|
126 time and the name "read_aln()" might be more appropriate.
|
|
127
|
|
128 =head1 CONSTRUCTORS
|
|
129
|
|
130 =head2 Bio::AlignIO-E<gt>new()
|
|
131
|
|
132 $seqIO = Bio::AlignIO->new(-file => 'filename', -format=>$format);
|
|
133 $seqIO = Bio::AlignIO->new(-fh => \*FILEHANDLE, -format=>$format);
|
|
134 $seqIO = Bio::AlignIO->new(-format => $format);
|
|
135
|
|
136 The new() class method constructs a new Bio::AlignIO object. The
|
|
137 returned object can be used to retrieve or print BioAlign
|
|
138 objects. new() accepts the following parameters:
|
|
139
|
|
140 =over 4
|
|
141
|
|
142 =item -file
|
|
143
|
|
144 A file path to be opened for reading or writing. The usual Perl
|
|
145 conventions apply:
|
|
146
|
|
147 'file' # open file for reading
|
|
148 '>file' # open file for writing
|
|
149 '>>file' # open file for appending
|
|
150 '+<file' # open file read/write
|
|
151 'command |' # open a pipe from the command
|
|
152 '| command' # open a pipe to the command
|
|
153
|
|
154 =item -fh
|
|
155
|
|
156 You may provide new() with a previously-opened filehandle. For
|
|
157 example, to read from STDIN:
|
|
158
|
|
159 $seqIO = Bio::AlignIO->new(-fh => \*STDIN);
|
|
160
|
|
161 Note that you must pass filehandles as references to globs.
|
|
162
|
|
163 If neither a filehandle nor a filename is specified, then the module
|
|
164 will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt>
|
|
165 semantics.
|
|
166
|
|
167 =item -format
|
|
168
|
|
169 Specify the format of the file. Supported formats include:
|
|
170
|
|
171 fasta FASTA format
|
|
172 selex selex (hmmer) format
|
|
173 stockholm stockholm format
|
|
174 prodom prodom (protein domain) format
|
|
175 clustalw clustalw (.aln) format
|
|
176 msf msf (GCG) format
|
|
177 mase mase (seaview) format
|
|
178 bl2seq Bl2seq Blast output
|
|
179 nexus Swofford et al NEXUS format
|
|
180 pfam Pfam sequence alignment format
|
|
181 phylip Felsenstein's PHYLIP format
|
|
182 emboss EMBOSS water and needle format
|
|
183 mega MEGA format
|
|
184 meme MEME format
|
|
185 psi PSI-BLAST format
|
|
186
|
|
187 Currently only those formats which were implemented in L<Bio::SimpleAlign>
|
|
188 have been incorporated in AlignIO.pm. Specifically, mase, stockholm
|
|
189 and prodom have only been implemented for input. See the specific module
|
|
190 (e.g. L<Bio::AlignIO::meme>) for notes on supported versions.
|
|
191
|
|
192 If no format is specified and a filename is given, then the module
|
|
193 will attempt to deduce it from the filename suffix. If this is unsuccessful,
|
|
194 Fasta format is assumed.
|
|
195
|
|
196 The format name is case insensitive. 'FASTA', 'Fasta' and 'fasta' are
|
|
197 all supported.
|
|
198
|
|
199 =back
|
|
200
|
|
201 =head2 Bio::AlignIO-E<gt>newFh()
|
|
202
|
|
203 $fh = Bio::AlignIO->newFh(-fh => \*FILEHANDLE, -format=>$format);
|
|
204 $fh = Bio::AlignIO->newFh(-format => $format);
|
|
205 # etc.
|
|
206
|
|
207 This constructor behaves like new(), but returns a tied filehandle
|
|
208 rather than a Bio::AlignIO object. You can read sequences from this
|
|
209 object using the familiar E<lt>E<gt> operator, and write to it using print().
|
|
210 The usual array and $_ semantics work. For example, you can read all
|
|
211 sequence objects into an array like this:
|
|
212
|
|
213 @sequences = <$fh>;
|
|
214
|
|
215 Other operations, such as read(), sysread(), write(), close(), and printf()
|
|
216 are not supported.
|
|
217
|
|
218 =over 1
|
|
219
|
|
220 =item -flush
|
|
221
|
|
222 By default, all files (or filehandles) opened for writing alignments
|
|
223 will be flushed after each write_aln() (making the file immediately
|
|
224 usable). If you don't need this facility and would like to marginally
|
|
225 improve the efficiency of writing multiple sequences to the same file
|
|
226 (or filehandle), pass the -flush option '0' or any other value that
|
|
227 evaluates as defined but false:
|
|
228
|
|
229 my $clustal = new Bio::AlignIO -file => "<prot.aln",
|
|
230 -format => "clustalw";
|
|
231 my $msf = new Bio::AlignIO -file => ">prot.msf",
|
|
232 -format => "msf",
|
|
233 -flush => 0; # go as fast as we can!
|
|
234 while($seq = $clustal->next_aln) { $msf->write_aln($seq) }
|
|
235
|
|
236 =back
|
|
237
|
|
238 =head1 OBJECT METHODS
|
|
239
|
|
240 See below for more detailed summaries. The main methods are:
|
|
241
|
|
242 =head2 $alignment = $AlignIO-E<gt>next_aln()
|
|
243
|
|
244 Fetch an alignment from a formatted file.
|
|
245
|
|
246 =head2 $AlignIO-E<gt>write_aln($aln)
|
|
247
|
|
248 Write the specified alignment to a file.
|
|
249
|
|
250 =head2 TIEHANDLE(), READLINE(), PRINT()
|
|
251
|
|
252 These provide the tie interface. See L<perltie> for more details.
|
|
253
|
|
254 =head1 FEEDBACK
|
|
255
|
|
256 =head2 Mailing Lists
|
|
257
|
|
258 User feedback is an integral part of the evolution of this and other
|
|
259 Bioperl modules. Send your comments and suggestions preferably to one
|
|
260 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
261
|
|
262 bioperl-l@bioperl.org - General discussion
|
|
263 http://bio.perl.org/MailList.html - About the mailing lists
|
|
264
|
|
265 =head2 Reporting Bugs
|
|
266
|
|
267 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
268 of the bugs and their resolution.
|
|
269 Bug reports can be submitted via email or the web:
|
|
270
|
|
271 bioperl-bugs@bio.perl.org
|
|
272 http://bugzilla.bioperl.org/
|
|
273
|
|
274 =head1 AUTHOR - Peter Schattner
|
|
275
|
|
276 Email: schattner@alum.mit.edu
|
|
277
|
|
278 =head1 CONTRIBUTORS
|
|
279
|
|
280 Jason Stajich, jason@bioperl.org
|
|
281
|
|
282 =head1 APPENDIX
|
|
283
|
|
284 The rest of the documentation details each of the object
|
|
285 methods. Internal methods are usually preceded with a _
|
|
286
|
|
287 =cut
|
|
288
|
|
289 # 'Let the code begin...
|
|
290
|
|
291 package Bio::AlignIO;
|
|
292
|
|
293 use strict;
|
|
294 use vars qw(@ISA);
|
|
295
|
|
296 use Bio::Root::Root;
|
|
297 use Bio::Seq;
|
|
298 use Bio::LocatableSeq;
|
|
299 use Bio::SimpleAlign;
|
|
300 use Bio::Root::IO;
|
|
301 @ISA = qw(Bio::Root::Root Bio::Root::IO);
|
|
302
|
|
303 =head2 new
|
|
304
|
|
305 Title : new
|
|
306 Usage : $stream = Bio::AlignIO->new(-file => $filename,
|
|
307 '-format' => 'Format')
|
|
308 Function: Returns a new seqstream
|
|
309 Returns : A Bio::AlignIO::Handler initialised with
|
|
310 the appropriate format
|
|
311 Args : -file => $filename
|
|
312 -format => format
|
|
313 -fh => filehandle to attach to
|
|
314
|
|
315 =cut
|
|
316
|
|
# Constructor / factory for alignment streams.
#
# Called on a class whose name matches Bio::AlignIO::<something>, it
# builds the object through the parent constructor and then runs
# _initialize() with the same arguments.
#
# Called on any other class it acts as a factory: the format is taken
# from the -format argument (argument names are matched case
# insensitively), else guessed from -file or $ARGV[0], else defaults to
# 'fasta'. The format name is lower-cased, the matching
# Bio::AlignIO::<format> module is loaded, and construction is delegated
# to that class with the original, unmodified argument list.
#
# Returns : the new stream object, or undef if the format module could
#           not be loaded.
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # or do we want to call SUPER on an object if $caller is an
    # object?
    if ( $class =~ /Bio::AlignIO::(\S+)/ ) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    # Factory path: add lower-cased copies of every argument name so the
    # lookups below work whatever capitalisation the caller used.
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;

    my $format = $param{'-format'};
    $format ||= $class->_guess_format( $param{-file} || $ARGV[0] );
    $format ||= 'fasta';
    $format = lc $format;    # normalize capitalization to lower case

    return undef unless $class->_load_format_module($format);
    return "Bio::AlignIO::$format"->new(@args);
}
|
|
341
|
|
342
|
|
343 =head2 newFh
|
|
344
|
|
345 Title : newFh
|
|
346 Usage : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
|
|
347 Function: does a new() followed by an fh()
|
|
348 Example : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
|
|
349 $sequence = <$fh>; # read a sequence object
|
|
350 print $fh $sequence; # write a sequence object
|
|
351 Returns : filehandle tied to the Bio::AlignIO::Fh class
|
|
352 Args :
|
|
353
|
|
354 =cut
|
|
355
|
|
# Convenience constructor: builds a stream with new() and returns the
# tied filehandle obtained from that stream's fh() method.
#
# Args    : the same arguments as new().
# Returns : whatever fh() returns, or nothing when new() yields a false
#           value.
sub newFh {
    my ($class, @args) = @_;

    my $stream = $class->new(@args);
    return unless $stream;

    return $stream->fh;
}
|
|
361
|
|
362 =head2 fh
|
|
363
|
|
364 Title : fh
|
|
365 Usage : $obj->fh
|
|
366 Function:
|
|
367 Example : $fh = $obj->fh; # make a tied filehandle
|
|
368 $sequence = <$fh>; # read a sequence object
|
|
369 print $fh $sequence; # write a sequence object
|
|
370 Returns : filehandle tied to the Bio::AlignIO::Fh class
|
|
371 Args :
|
|
372
|
|
373 =cut
|
|
374
|
|
375
|
|
# Create a filehandle tied to this stream.
#
# A fresh anonymous glob is generated and tied to the invocant's class,
# passing the invocant itself to TIEHANDLE, so that <$fh> and print $fh
# are routed through that class's READLINE and PRINT methods.
#
# Returns : the tied glob reference.
sub fh {
    my $self  = shift;
    my $class = ref($self) || $self;

    # Load Symbol here rather than relying on some other module having
    # pulled it in already; require is a no-op once it is loaded.
    require Symbol;
    my $handle = Symbol::gensym();

    tie *$handle, $class, $self;
    return $handle;
}
|
|
383
|
|
384 # _initialize is where the heavy stuff will happen when new is called
|
|
385
|
|
# Per-object set-up run by new(): forwards the constructor arguments to
# _initialize_io() and reports success.
#
# Returns : 1
sub _initialize {
    my $self = shift;

    $self->_initialize_io(@_);
    return 1;
}
|
|
392
|
|
393 =head2 _load_format_module
|
|
394
|
|
395 Title : _load_format_module
|
|
396 Usage : *INTERNAL AlignIO stuff*
|
|
397 Function: Loads up (like use) a module at run time on demand
|
|
398 Example :
|
|
399 Returns :
|
|
400 Args :
|
|
401
|
|
402 =cut
|
|
403
|
|
# Load the Bio::AlignIO::<format> module at run time.
#
# Args    : the format name (appended verbatim to "Bio::AlignIO::").
# Returns : 1 if _load_module() completed without throwing; nothing
#           (undef / empty list) after printing a diagnostic to STDERR
#           if it threw.
#
# Only an exception counts as failure; the value returned by
# _load_module() is not consulted (it never was).
sub _load_format_module {
    my ($self, $format) = @_;
    my $module = "Bio::AlignIO::" . $format;

    # Use the eval's own value as the success flag instead of testing $@
    # afterwards: $@ can be reset while the stack unwinds (for instance
    # by a destructor that runs its own eval), which would make a failed
    # load look like a success.
    my $loaded = eval {
        my $status = $self->_load_module($module);
        1;
    };

    unless ($loaded) {
        my $exception = $@;    # snapshot before anything else can touch it
        print STDERR <<END;
$self: $format cannot be found
Exception $exception
For more information about the AlignIO system please see the AlignIO docs.
This includes ways of checking for formats at compile time, not run time
END
        return;
    }
    return 1;
}
|
|
424
|
|
425 =head2 next_aln
|
|
426
|
|
427 Title : next_aln
|
|
428 Usage : $aln = stream->next_aln
|
|
429 Function: reads the next $aln object from the stream
|
|
430 Returns : a Bio::Align::AlignI compliant object
|
|
431 Args :
|
|
432
|
|
433 =cut
|
|
434
|
|
# Placeholder reader for the generic class: reading is only implemented
# by the format-specific subclasses, so this version always reports an
# error through the object's throw() method.
sub next_aln {
    my $self = shift;

    my $complaint = "Sorry, you cannot read from a generic Bio::AlignIO object.";
    return $self->throw($complaint);
}
|
|
439
|
|
440 =head2 write_aln
|
|
441
|
|
442 Title : write_aln
|
|
443 Usage : $stream->write_aln($aln)
|
|
444 Function: writes the $aln object into the stream
|
|
445 Returns : 1 for success and 0 for error
|
|
446 Args : Bio::Align::AlignI compliant object
|
|
447
|
|
448 =cut
|
|
449
|
|
# Placeholder writer for the generic class: writing is only implemented
# by the format-specific subclasses, so this version always reports an
# error through the object's throw() method. The alignment argument is
# accepted but not used.
sub write_aln {
    my $self = shift;
    my $aln  = shift;

    my $complaint = "Sorry, you cannot write to a generic Bio::AlignIO object.";
    return $self->throw($complaint);
}
|
|
454
|
|
455 =head2 _guess_format
|
|
456
|
|
457 Title : _guess_format
|
|
458 Usage : $obj->_guess_format($filename)
|
|
459 Function:
|
|
460 Example :
|
|
461 Returns : guessed format of filename (lower case)
|
|
462 Args :
|
|
463
|
|
464 =cut
|
|
465
|
|
# Guess an alignment format from a file name's extension.
#
# Args    : the file name to inspect (the invocant is not used).
# Returns : the lower-case format name for a recognised extension;
#           nothing (undef / empty list) for a missing or false file
#           name or an unrecognised extension.
#
# The file name is kept in a lexical rather than assigned to the global
# $_, so the caller's $_ (for example inside a foreach or while(<FH>)
# loop) is left untouched.
sub _guess_format {
    my ($class, $filename) = @_;
    return unless $filename;

    # [ format, extension pattern ] pairs, tried in order; matching is
    # case-insensitive and anchored at the end of the name.
    my @suffix_rules = (
        [ 'fasta',    qr/\.(?:fasta|fast|seq|fa|fsa|nt|aa)$/i ],
        [ 'msf',      qr/\.(?:msf|pileup|gcg)$/i ],
        [ 'pfam',     qr/\.(?:pfam|pfm)$/i ],
        [ 'selex',    qr/\.(?:selex|slx|selx|slex|sx)$/i ],
        [ 'phylip',   qr/\.(?:phylip|phlp|phyl|phy|ph)$/i ],
        [ 'nexus',    qr/\.(?:nexus|nex)$/i ],
        [ 'mega',     qr/\.(?:meg|mega)$/i ],
        [ 'clustalw', qr/\.aln$/i ],
        [ 'meme',     qr/\.meme$/i ],
        [ 'emboss',   qr/\.(?:water|needle)$/i ],
        [ 'psi',      qr/\.psi$/i ],
    );

    for my $rule (@suffix_rules) {
        my ($format, $pattern) = @$rule;
        return $format if $filename =~ $pattern;
    }

    # No known extension: the caller falls back to its own default.
    return;
}
|
|
481
|
|
# Destructor: calls close() on the object when it is destroyed.
# NOTE(review): close() is not defined in this file; presumably it is
# inherited from Bio::Root::IO - confirm.
sub DESTROY {
    my ($self) = @_;
    return $self->close();
}
|
|
486
|
|
# Tie interface: builds the object behind a tied filehandle. It is a
# small hash, blessed into the given class, that keeps the stream passed
# as the second argument under the 'alignio' key.
sub TIEHANDLE {
    my ($class, $alignio) = @_;

    my %wrapper = ( 'alignio' => $alignio );
    return bless \%wrapper, $class;
}
|
|
491
|
|
# Tie interface for <$fh>.
#
# In scalar (or void) context a single next_aln() call on the wrapped
# stream is made and its result returned. In list context next_aln() is
# called repeatedly, collecting results until it returns a false value,
# and the collected list is returned.
sub READLINE {
    my ($tied) = @_;
    my $stream = $tied->{'alignio'};

    if (wantarray) {
        my @alignments;
        while ( my $aln = $stream->next_aln() ) {
            push @alignments, $aln;
        }
        return @alignments;
    }

    return $stream->next_aln();
}
|
|
499
|
|
# Tie interface for print $fh ...: hands every printed item to the
# wrapped stream's write_aln() and returns its result.
sub PRINT {
    my $tied   = shift;
    my $stream = $tied->{'alignio'};

    return $stream->write_aln(@_);
}
|
|
504
|
|
505 1;
|