Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/AlignIO.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: AlignIO.pm,v 1.28 2002/10/22 07:38:23 lapp Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::AlignIO | |
| 4 # | |
| 5 # based on the Bio::SeqIO module | |
| 6 # by Ewan Birney <birney@sanger.ac.uk> | |
| 7 # and Lincoln Stein <lstein@cshl.org> | |
| 8 # | |
| 9 # Copyright Peter Schattner | |
| 10 # | |
| 11 # You may distribute this module under the same terms as perl itself | |
| 12 # | |
| 13 # _history | |
| 14 # October 18, 1999 SeqIO largely rewritten by Lincoln Stein | |
| 15 # September, 2000 AlignIO written by Peter Schattner | |
| 16 | |
| 17 # POD documentation - main docs before the code | |
| 18 | |
| 19 =head1 NAME | |
| 20 | |
| 21 Bio::AlignIO - Handler for AlignIO Formats | |
| 22 | |
| 23 =head1 SYNOPSIS | |
| 24 | |
| 25 use Bio::AlignIO; | |
| 26 | |
| 27 $inputfilename = "testaln.fasta"; | |
| 28 $in = Bio::AlignIO->new(-file => $inputfilename , | |
| 29 '-format' => 'fasta'); | |
| 30 $out = Bio::AlignIO->new(-file => ">out.aln.pfam" , | |
| 31 '-format' => 'pfam'); | |
| 32 # note: we quote -format to keep older perl's from complaining. | |
| 33 | |
| 34 while ( my $aln = $in->next_aln() ) { | |
| 35 $out->write_aln($aln); | |
| 36 } | |
| 37 | |
| 38 or | |
| 39 | |
| 40 use Bio::AlignIO; | |
| 41 | |
| 42 $inputfilename = "testaln.fasta"; | |
| 43 $in = Bio::AlignIO->newFh(-file => $inputfilename , | |
| 44 '-format' => 'fasta'); | |
| 45 $out = Bio::AlignIO->newFh('-format' => 'pfam'); | |
| 46 | |
| 47 # World's shortest Fasta<->pfam format converter: | |
| 48 print $out $_ while <$in>; | |
| 49 | |
| 50 =head1 DESCRIPTION | |
| 51 | |
| 52 Bio::AlignIO is a handler module for the formats in the AlignIO set | |
| 53 (eg, Bio::AlignIO::fasta). It is the officially sanctioned way of | |
| 54 getting at the alignment objects, which most people should use. The | |
| 55 resulting alignment is a Bio::Align::AlignI compliant object. See | |
| 56 L<Bio::Align::AlignI> for more information. | |
| 57 | |
| 58 The idea is that you request a stream object for a particular format. | |
| 59 All the stream objects have a notion of an internal file that is read | |
| 60 from or written to. A particular AlignIO object instance is configured | |
| 61 for either input or output. A specific example of a stream object is | |
| 62 the Bio::AlignIO::fasta object. | |
| 63 | |
| 64 Each stream object has functions | |
| 65 | |
| 66 $stream->next_aln(); | |
| 67 | |
| 68 and | |
| 69 | |
| 70 $stream->write_aln($aln); | |
| 71 | |
| 72 also | |
| 73 | |
| 74 $stream->type() # returns 'INPUT' or 'OUTPUT' | |
| 75 | |
| 76 As an added bonus, you can recover a filehandle that is tied to the | |
| 77 AlignIO object, allowing you to use the standard E<lt>E<gt> and print | |
| 78 operations to read and write sequence objects: | |
| 79 | |
| 80 use Bio::AlignIO; | |
| 81 | |
| 82 # read from standard input | |
| 83 $stream = Bio::AlignIO->newFh(-format => 'Fasta'); | |
| 84 | |
| 85 while ( $aln = <$stream> ) { | |
| 86 # do something with $aln | |
| 87 } | |
| 88 | |
| 89 and | |
| 90 | |
| 91 print $stream $aln; # when stream is in output mode | |
| 92 | |
| 93 This makes the simplest ever reformatter | |
| 94 | |
| 95 #!/usr/local/bin/perl | |
| 96 | |
| 97 $format1 = shift; | |
| 98 $format2 = shift || | |
| 99 die "Usage: reformat format1 format2 < input > output"; | |
| 100 | |
| 101 use Bio::AlignIO; | |
| 102 | |
| 103 $in = Bio::AlignIO->newFh(-format => $format1 ); | |
| 104 $out = Bio::AlignIO->newFh(-format => $format2 ); | |
| 105 # note: you might want to quote -format to keep | |
| 106 # older perl's from complaining. | |
| 107 | |
| 108 print $out $_ while <$in>; | |
| 109 | |
| 110 AlignIO.pm is patterned on the module SeqIO.pm and shares most of the | |
| 111 SeqIO.pm features. One significant difference currently is that | |
| 112 AlignIO.pm usually handles IO for only a single alignment at a time | |
| 113 (SeqIO.pm handles IO for multiple sequences in a single stream.) The | |
| 114 principal reason for this is that whereas simultaneously handling | |
| 115 multiple sequences is a common requirement, simultaneous handling of | |
| 116 multiple alignments is not. The only current exception is format | |
| 117 "bl2seq" which parses results of the Blast bl2seq program and which | |
| 118 may produce several alignment pairs. This set of alignment pairs can | |
| 119 be read using multiple calls to next_aln. | |
| 120 | |
| 121 Capability for IO for more than one multiple alignment - other than | |
| 122 for bl2seq format -(which may be of use for certain applications such | |
| 123 as IO for Pfam libraries) may be included in the future. For this | |
| 124 reason we keep the name "next_aln()" for the alignment input routine, | |
| 125 even though in most cases only one alignment is read (or written) at a | |
| 126 time and the name "read_aln()" might be more appropriate. | |
| 127 | |
| 128 =head1 CONSTRUCTORS | |
| 129 | |
| 130 =head2 Bio::AlignIO-E<gt>new() | |
| 131 | |
| 132 $seqIO = Bio::AlignIO->new(-file => 'filename', -format=>$format); | |
| 133 $seqIO = Bio::AlignIO->new(-fh => \*FILEHANDLE, -format=>$format); | |
| 134 $seqIO = Bio::AlignIO->new(-format => $format); | |
| 135 | |
| 136 The new() class method constructs a new Bio::AlignIO object. The | |
| 137 returned object can be used to retrieve or print BioAlign | |
| 138 objects. new() accepts the following parameters: | |
| 139 | |
| 140 =over 4 | |
| 141 | |
| 142 =item -file | |
| 143 | |
| 144 A file path to be opened for reading or writing. The usual Perl | |
| 145 conventions apply: | |
| 146 | |
| 147 'file' # open file for reading | |
| 148 '>file' # open file for writing | |
| 149 '>>file' # open file for appending | |
| 150 '+<file' # open file read/write | |
| 151 'command |' # open a pipe from the command | |
| 152 '| command' # open a pipe to the command | |
| 153 | |
| 154 =item -fh | |
| 155 | |
| 156 You may provide new() with a previously-opened filehandle. For | |
| 157 example, to read from STDIN: | |
| 158 | |
| 159 $seqIO = Bio::AlignIO->new(-fh => \*STDIN); | |
| 160 | |
| 161 Note that you must pass filehandles as references to globs. | |
| 162 | |
| 163 If neither a filehandle nor a filename is specified, then the module | |
| 164 will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt> | |
| 165 semantics. | |
| 166 | |
| 167 =item -format | |
| 168 | |
| 169 Specify the format of the file. Supported formats include: | |
| 170 | |
| 171 fasta FASTA format | |
| 172 selex selex (hmmer) format | |
| 173 stockholm stockholm format | |
| 174 prodom prodom (protein domain) format | |
| 175 clustalw clustalw (.aln) format | |
| 176 msf msf (GCG) format | |
| 177 mase mase (seaview) format | |
| 178 bl2seq Bl2seq Blast output | |
| 179 nexus Swofford et al NEXUS format | |
| 180 pfam Pfam sequence alignment format | |
| 181 phylip Felsenstein's PHYLIP format | |
| 182 emboss EMBOSS water and needle format | |
| 183 mega MEGA format | |
| 184 meme MEME format | |
| 185 psi PSI-BLAST format | |
| 186 | |
| 187 Currently only those formats which were implemented in L<Bio::SimpleAlign> | |
| 188 have been incorporated in AlignIO.pm. Specifically, mase, stockholm | |
| 189 and prodom have only been implemented for input. See the specific module | |
| 190 (e.g. L<Bio::AlignIO::meme>) for notes on supported versions. | |
| 191 | |
| 192 If no format is specified and a filename is given, then the module | |
| 193 will attempt to deduce it from the filename suffix. If this is unsuccessful, | |
| 194 Fasta format is assumed. | |
| 195 | |
| 196 The format name is case insensitive. 'FASTA', 'Fasta' and 'fasta' are | |
| 197 all supported. | |
| 198 | |
| 199 =back | |
| 200 | |
| 201 =head2 Bio::AlignIO-E<gt>newFh() | |
| 202 | |
| 203 $fh = Bio::AlignIO->newFh(-fh => \*FILEHANDLE, -format=>$format); | |
| 204 $fh = Bio::AlignIO->newFh(-format => $format); | |
| 205 # etc. | |
| 206 | |
| 207 This constructor behaves like new(), but returns a tied filehandle | |
| 208 rather than a Bio::AlignIO object. You can read sequences from this | |
| 209 object using the familiar E<lt>E<gt> operator, and write to it using print(). | |
| 210 The usual array and $_ semantics work. For example, you can read all | |
| 211 sequence objects into an array like this: | |
| 212 | |
| 213 @sequences = <$fh>; | |
| 214 | |
| 215 Other operations, such as read(), sysread(), write(), close(), and printf() | |
| 216 are not supported. | |
| 217 | |
| 218 =over 1 | |
| 219 | |
| 220 =item -flush | |
| 221 | |
| 222 By default, all files (or filehandles) opened for writing alignments | |
| 223 will be flushed after each write_aln() (making the file immediately | |
| 224 usable). If you don't need this facility and would like to marginally | |
| 225 improve the efficiency of writing multiple sequences to the same file | |
| 226 (or filehandle), pass the -flush option '0' or any other value that | |
| 227 evaluates as defined but false: | |
| 228 | |
| 229 my $clustal = new Bio::AlignIO -file => "<prot.aln", | |
| 230 -format => "clustalw"; | |
| 231 my $msf = new Bio::AlignIO -file => ">prot.msf", | |
| 232 -format => "msf", | |
| 233 -flush => 0; # go as fast as we can! | |
| 234 while($seq = $clustal->next_aln) { $msf->write_aln($seq) } | |
| 235 | |
| 236 =back | |
| 237 | |
| 238 =head1 OBJECT METHODS | |
| 239 | |
| 240 See below for more detailed summaries. The main methods are: | |
| 241 | |
| 242 =head2 $alignment = $AlignIO-E<gt>next_aln() | |
| 243 | |
| 244 Fetch an alignment from a formatted file. | |
| 245 | |
| 246 =head2 $AlignIO-E<gt>write_aln($aln) | |
| 247 | |
| 248 Write the specified alignment to a file. | |
| 249 | |
| 250 =head2 TIEHANDLE(), READLINE(), PRINT() | |
| 251 | |
| 252 These provide the tie interface. See L<perltie> for more details. | |
| 253 | |
| 254 =head1 FEEDBACK | |
| 255 | |
| 256 =head2 Mailing Lists | |
| 257 | |
| 258 User feedback is an integral part of the evolution of this and other | |
| 259 Bioperl modules. Send your comments and suggestions preferably to one | |
| 260 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 261 | |
| 262 bioperl-l@bioperl.org - General discussion | |
| 263 http://bio.perl.org/MailList.html - About the mailing lists | |
| 264 | |
| 265 =head2 Reporting Bugs | |
| 266 | |
| 267 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 268 of the bugs and their resolution. | |
| 269 Bug reports can be submitted via email or the web: | |
| 270 | |
| 271 bioperl-bugs@bio.perl.org | |
| 272 http://bugzilla.bioperl.org/ | |
| 273 | |
| 274 =head1 AUTHOR - Peter Schattner | |
| 275 | |
| 276 Email: schattner@alum.mit.edu | |
| 277 | |
| 278 =head1 CONTRIBUTORS | |
| 279 | |
| 280 Jason Stajich, jason@bioperl.org | |
| 281 | |
| 282 =head1 APPENDIX | |
| 283 | |
| 284 The rest of the documentation details each of the object | |
| 285 methods. Internal methods are usually preceded with a _ | |
| 286 | |
| 287 =cut | |
| 288 | |
| 289 # 'Let the code begin... | |
| 290 | |
| 291 package Bio::AlignIO; | |
| 292 | |
| 293 use strict; | |
| 294 use vars qw(@ISA); | |
| 295 | |
| 296 use Bio::Root::Root; | |
| 297 use Bio::Seq; | |
| 298 use Bio::LocatableSeq; | |
| 299 use Bio::SimpleAlign; | |
| 300 use Bio::Root::IO; | |
| 301 @ISA = qw(Bio::Root::Root Bio::Root::IO); | |
| 302 | |
| 303 =head2 new | |
| 304 | |
| 305 Title : new | |
| 306 Usage : $stream = Bio::AlignIO->new(-file => $filename, | |
| 307 '-format' => 'Format') | |
| 308 Function: Returns a new seqstream | |
| 309 Returns : A Bio::AlignIO::Handler initialised with | |
| 310 the appropriate format | |
| 311 Args : -file => $filename | |
| 312 -format => format | |
| 313 -fh => filehandle to attach to | |
| 314 | |
| 315 =cut | |
| 316 | |
# Constructor and format factory.
#
# Called on a class whose name matches Bio::AlignIO::<something>, it
# builds the object through the parent constructor and then runs
# _initialize() with the same arguments.
#
# Called on any other class (normally Bio::AlignIO itself), it works out
# which format is wanted, loads Bio::AlignIO::<format> on demand and
# delegates construction to that class.  The format is taken from, in
# order: the -format argument, a guess based on -file (or $ARGV[0]), and
# finally the default 'fasta'.
#
# Returns the new stream object, or undef when the format module could
# not be loaded.
sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # Generic entry point: resolve the format and hand over to the
    # matching format-specific class.
    unless ( $class =~ /Bio::AlignIO::(\S+)/ ) {
        my %param = @args;

        # Add lower-cased copies of every key so that -FORMAT, -Format
        # and -format are all recognised.
        @param{ map { lc $_ } keys %param } = values %param;

        my $format = $param{'-format'}
            || $class->_guess_format( $param{-file} || $ARGV[0] )
            || 'fasta';

        # Format names are case insensitive; module names are lower case.
        $format = lc $format;

        return undef unless( $class->_load_format_module($format) );
        return "Bio::AlignIO::$format"->new(@args);
    }

    # Format-specific subclass: construct and initialise directly.
    my ($self) = $class->SUPER::new(@args);
    $self->_initialize(@args);
    return $self;
}
| 341 | |
| 342 | |
| 343 =head2 newFh | |
| 344 | |
| 345 Title : newFh | |
| 346 Usage : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format') | |
| 347 Function: does a new() followed by an fh() | |
| 348 Example : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format') | |
| 349 $sequence = <$fh>; # read a sequence object | |
| 350 print $fh $sequence; # write a sequence object | |
| 351 Returns : filehandle tied to the Bio::AlignIO::Fh class | |
| 352 Args : | |
| 353 | |
| 354 =cut | |
| 355 | |
# Convenience constructor: builds a stream with new() using the given
# arguments and returns the tied filehandle produced by its fh() method.
# Returns nothing when new() does not yield a true value.
sub newFh {
    my $class = shift;

    my $stream = $class->new(@_);
    return unless $stream;

    return $stream->fh;
}
| 361 | |
| 362 =head2 fh | |
| 363 | |
| 364 Title : fh | |
| 365 Usage : $obj->fh | |
| 366 Function: | |
| 367 Example : $fh = $obj->fh; # make a tied filehandle | |
| 368 $sequence = <$fh>; # read a sequence object | |
| 369 print $fh $sequence; # write a sequence object | |
| 370 Returns : filehandle tied to the Bio::AlignIO::Fh class | |
| 371 Args : | |
| 372 | |
| 373 =cut | |
| 374 | |
| 375 | |
# Returns a fresh anonymous glob tied to this object's class, so that
# reading from / printing to the handle is routed through the
# TIEHANDLE / READLINE / PRINT methods of that class.
#
# $self is handed to tie() as the extra argument, which makes it the
# argument TIEHANDLE receives after the class name.
sub fh {
    my $self  = shift;
    my $class = ref($self) || $self;

    # This file does not import Symbol itself, so load it on demand
    # instead of relying on some other module having pulled it in.
    require Symbol;

    my $glob = Symbol::gensym();
    tie $$glob, $class, $self;
    return $glob;
}
| 383 | |
| 384 # _initialize is where the heavy stuff will happen when new is called | |
| 385 | |
# Per-object set-up run by new() for format-specific classes.
# Forwards all constructor arguments to _initialize_io() and always
# returns 1.
sub _initialize {
    my ($self, @io_args) = @_;
    $self->_initialize_io(@io_args);
    return 1;
}
| 392 | |
| 393 =head2 _load_format_module | |
| 394 | |
| 395 Title : _load_format_module | |
| 396 Usage : *INTERNAL AlignIO stuff* | |
| 397 Function: Loads up (like use) a module at run time on demand | |
| 398 Example : | |
| 399 Returns : | |
| 400 Args : | |
| 401 | |
| 402 =cut | |
| 403 | |
# Loads the module Bio::AlignIO::<format> at run time via _load_module().
#
# Returns 1 when _load_module() did not raise an exception.  If it did,
# a diagnostic naming the format and the exception is printed to STDERR
# and nothing (undef / empty list) is returned.
sub _load_format_module {
    my ($self, $format) = @_;
    my $module = "Bio::AlignIO::$format";

    my $loaded;
    eval { $loaded = $self->_load_module($module); };

    # Success is decided solely by whether an exception was raised.
    return 1 unless $@;

    print STDERR <<END;
$self: $format cannot be found
Exception $@
For more information about the AlignIO system please see the AlignIO docs.
This includes ways of checking for formats at compile time, not run time
END
    return;
}
| 424 | |
| 425 =head2 next_aln | |
| 426 | |
| 427 Title : next_aln | |
| 428 Usage : $aln = $stream->next_aln | |
| 429 Function: reads the next $aln object from the stream | |
| 430 Returns : a Bio::Align::AlignI compliant object | |
| 431 Args : | |
| 432 | |
| 433 =cut | |
| 434 | |
# Placeholder reader for the generic class: always raises an exception
# through throw().  The message says a generic Bio::AlignIO object
# cannot be read from.
sub next_aln {
    my $self = shift;
    $self->throw("Sorry, you cannot read from a generic Bio::AlignIO object.");
}
| 439 | |
| 440 =head2 write_aln | |
| 441 | |
| 442 Title : write_aln | |
| 443 Usage : $stream->write_aln($aln) | |
| 444 Function: writes the $aln object into the stream | |
| 445 Returns : 1 for success and 0 for error | |
| 446 Args : Bio::Align::AlignI compliant object | |
| 447 | |
| 448 =cut | |
| 449 | |
# Placeholder writer for the generic class: always raises an exception
# through throw().  The message says a generic Bio::AlignIO object
# cannot be written to.
sub write_aln {
    my $self = shift;
    $self->throw("Sorry, you cannot write to a generic Bio::AlignIO object.");
}
| 454 | |
| 455 =head2 _guess_format | |
| 456 | |
| 457 Title : _guess_format | |
| 458 Usage : $obj->_guess_format($filename) | |
| 459 Function: | |
| 460 Example : | |
| 461 Returns : guessed format of filename (lower case) | |
| 462 Args : | |
| 463 | |
| 464 =cut | |
| 465 | |
# Guesses an alignment format from a file name's suffix.
#
# Args    : the file name (called as a class or object method)
# Returns : the lower-case format name, or nothing (undef / empty list)
#           when no file name is given or the suffix is not recognised.
#
# Matching is case insensitive.  The file name is held in a lexical so
# the caller's $_ is left untouched.
sub _guess_format {
    my ($class, $filename) = @_;
    return unless $filename;

    return 'fasta'    if $filename =~ /\.(?:fasta|fast|seq|fa|fsa|nt|aa)$/i;
    return 'msf'      if $filename =~ /\.(?:msf|pileup|gcg)$/i;
    return 'pfam'     if $filename =~ /\.(?:pfam|pfm)$/i;
    return 'selex'    if $filename =~ /\.(?:selex|slx|selx|slex|sx)$/i;
    return 'phylip'   if $filename =~ /\.(?:phylip|phlp|phyl|phy|ph)$/i;
    return 'nexus'    if $filename =~ /\.(?:nexus|nex)$/i;
    return 'mega'     if $filename =~ /\.(?:meg|mega)$/i;
    return 'clustalw' if $filename =~ /\.aln$/i;
    return 'meme'     if $filename =~ /\.meme$/i;
    return 'emboss'   if $filename =~ /\.(?:water|needle)$/i;
    return 'psi'      if $filename =~ /\.psi$/i;

    # Unknown suffix: return nothing explicitly rather than leaking the
    # value of the last failed match.
    return;
}
| 481 | |
# Destructor: calls close() on the object being destroyed.
sub DESTROY {
    my ($self) = @_;
    $self->close();
}
| 486 | |
# tie() hook: wraps the argument given after the class name in a small
# hash, stored under the key 'alignio', and blesses it into the class.
sub TIEHANDLE {
    my ($class, $alignio) = @_;
    my %wrapper = ( 'alignio' => $alignio );
    return bless \%wrapper, $class;
}
| 491 | |
# Tied-handle read hook (<$fh>).
#
# In list context it keeps calling next_aln() on the wrapped stream
# until a false value comes back and returns everything collected.
# Otherwise it returns the result of a single next_aln() call.
sub READLINE {
    my ($self) = @_;
    my $stream = $self->{'alignio'};

    if (wantarray) {
        my @alignments;
        while ( my $aln = $stream->next_aln() ) {
            push @alignments, $aln;
        }
        return @alignments;
    }

    return $stream->next_aln();
}
| 499 | |
# Tied-handle write hook (print $fh ...): passes everything printed to
# write_aln() on the wrapped stream and returns that call's result.
sub PRINT {
    my $self   = shift;
    my $stream = $self->{'alignio'};
    return $stream->write_aln(@_);
}
| 504 | |
| 505 1; |
