Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/AlignIO/phylip.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: phylip.pm,v 1.24.2.1 2003/01/26 15:52:30 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::AlignIO::phylip | |
| 4 # | |
| 5 # Copyright Heikki Lehvaslaiho | |
| 6 # | |
| 7 | |
| 8 =head1 NAME | |
| 9 | |
| 10 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream | |
| 11 | |
| 12 =head1 SYNOPSIS | |
| 13 | |
| 14 # Do not use this module directly. Use it via the Bio::AlignIO class. | |
| 15 | |
| 16 use Bio::AlignIO; | |
| 17 use Bio::SimpleAlign; | |
| 18 #you can set the name length to something other than the default 10 | |
| 19 #if you use a version of phylip (hacked) that accepts ids > 10 | |
| 20 my $phylipstream = new Bio::AlignIO(-format => 'phylip', | |
| 21 -fh => \*STDOUT, | |
| 22 -idlength=>30); | |
| 23 # convert data from one format to another | |
| 24 my $gcgstream = new Bio::AlignIO(-format => 'msf', | |
| 25 -file => 't/data/cysprot1a.msf'); | |
| 26 | |
| 27 while( my $aln = $gcgstream->next_aln ) { | |
| 28 $phylipstream->write_aln($aln); | |
| 29 } | |
| 30 | |
| 31 # do it again with phylip sequential format format | |
| 32 $phylipstream->interleaved(0); | |
| 33 # can also initialize the object like this | |
| 34 $phylipstream = new Bio::AlignIO(-interleaved => 0, | |
| 35 -format => 'phylip', | |
| 36 -fh => \*STDOUT, | |
| 37 -idlength=>10); | |
| 38 $gcgstream = new Bio::AlignIO(-format => 'msf', | |
| 39 -file => 't/data/cysprot1a.msf'); | |
| 40 | |
| 41 while( my $aln = $gcgstream->next_aln ) { | |
| 42 $phylipstream->write_aln($aln); | |
| 43 } | |
| 44 | |
| 45 =head1 DESCRIPTION | |
| 46 | |
| 47 This object can transform Bio::SimpleAlign objects to and from PHYLIP | |
| 48 interleaved format. It will not work with PHYLIP sequencial format. | |
| 49 | |
| 50 This module will output PHYLIP sequential format. By specifying the | |
| 51 flag -interleaved =E<gt> 0 in the initialization the module can output | |
| 52 data in interleaved format. | |
| 53 | |
| 54 =head1 FEEDBACK | |
| 55 | |
| 56 =head2 Reporting Bugs | |
| 57 | |
| 58 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 59 the bugs and their resolution. | |
| 60 Bug reports can be submitted via email or the web: | |
| 61 | |
| 62 bioperl-bugs@bio.perl.org | |
| 63 http://bugzilla.bioperl.org/ | |
| 64 | |
| 65 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich | |
| 66 | |
| 67 Email: heikki@ebi.ac.uk | |
| 68 Email: jason@bioperl.org | |
| 69 | |
| 70 =head1 APPENDIX | |
| 71 | |
| 72 The rest of the documentation details each of the object | |
| 73 methods. Internal methods are usually preceded with a _ | |
| 74 | |
| 75 =cut | |
| 76 | |
| 77 # Let the code begin... | |
| 78 | |
| 79 package Bio::AlignIO::phylip; | |
| 80 use vars qw(@ISA $DEFAULTIDLENGTH $DEFAULTLINELEN); | |
| 81 use strict; | |
| 82 | |
| 83 use Bio::SimpleAlign; | |
| 84 use Bio::AlignIO; | |
| 85 | |
| 86 @ISA = qw(Bio::AlignIO); | |
| 87 | |
| 88 BEGIN { | |
| 89 $DEFAULTIDLENGTH = 10; | |
| 90 $DEFAULTLINELEN = 60; | |
| 91 } | |
| 92 | |
| 93 =head2 new | |
| 94 | |
| 95 Title : new | |
| 96 Usage : my $alignio = new Bio::AlignIO(-format => 'phylip' | |
| 97 -file => '>file', | |
| 98 -idlength => 10, | |
| 99 -idlinebreak => 1); | |
| 100 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer | |
| 101 Returns : L<Bio::AlignIO> object | |
| 102 Args : [specific for writing of phylip format files] | |
| 103 -idlength => integer - length of the id (will pad w/ | |
| 104 spaces if needed) | |
| 105 -interleaved => boolean - whether or not write as interleaved | |
| 106 or sequential format | |
| 107 -linelength => integer of how long a sequence lines should be | |
| 108 -idlinebreak => insert a line break after the sequence id | |
| 109 so that sequence starts on the next line | |
| 110 | |
| 111 =cut | |
| 112 | |
| 113 sub _initialize { | |
| 114 my($self,@args) = @_; | |
| 115 $self->SUPER::_initialize(@args); | |
| 116 | |
| 117 my ($interleave,$linelen,$idlinebreak, | |
| 118 $idlength) = $self->_rearrange([qw(INTERLEAVED | |
| 119 LINELENGTH | |
| 120 IDLINEBREAK | |
| 121 IDLENGTH)],@args); | |
| 122 $self->interleaved(1) if( $interleave || ! defined $interleave); | |
| 123 $self->idlength($idlength || $DEFAULTIDLENGTH); | |
| 124 $self->id_linebreak(1) if( $idlinebreak ); | |
| 125 $self->line_length($linelen) if defined $linelen && $linelen > 0; | |
| 126 1; | |
| 127 } | |
| 128 | |
| 129 =head2 next_aln | |
| 130 | |
| 131 Title : next_aln | |
| 132 Usage : $aln = $stream->next_aln() | |
| 133 Function: returns the next alignment in the stream. | |
| 134 Throws an exception if trying to read in PHYLIP | |
| 135 sequential format. | |
| 136 Returns : L<Bio::SimpleAlign> object | |
| 137 Args : | |
| 138 | |
| 139 =cut | |
| 140 | |
| 141 sub next_aln { | |
| 142 my $self = shift; | |
| 143 my $entry; | |
| 144 my ($seqcount, $residuecount, %hash, $name,$str, | |
| 145 @names,$seqname,$start,$end,$count,$seq); | |
| 146 | |
| 147 my $aln = Bio::SimpleAlign->new(-source => 'phylip'); | |
| 148 $entry = $self->_readline and | |
| 149 ($seqcount, $residuecount) = $entry =~ /\s*(\d+)\s+(\d+)/; | |
| 150 return 0 unless $seqcount and $residuecount; | |
| 151 | |
| 152 # first alignment section | |
| 153 my $idlen = $self->idlength; | |
| 154 $count = 0; | |
| 155 my $non_interleaved = ! $self->interleaved ; | |
| 156 while( $entry = $self->_readline) { | |
| 157 last if( $entry =~ /^\s?$/ && ! $non_interleaved ); | |
| 158 | |
| 159 if( $entry =~ /^\s+(.+)$/ ) { | |
| 160 $str = $1; | |
| 161 $non_interleaved = 1; | |
| 162 $str =~ s/\s//g; | |
| 163 $count = scalar @names; | |
| 164 $hash{$count} .= $str; | |
| 165 } elsif( $entry =~ /^(.{$idlen})\s+(.*)\s$/ ) { | |
| 166 $name = $1; | |
| 167 $str = $2; | |
| 168 $name =~ s/[\s\/]/_/g; | |
| 169 $name =~ s/_+$//; # remove any trailing _'s | |
| 170 push @names, $name; | |
| 171 $str =~ s/\s//g; | |
| 172 $count = scalar @names; | |
| 173 $hash{$count} = $str; | |
| 174 } | |
| 175 $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; | |
| 176 } | |
| 177 | |
| 178 unless( $non_interleaved ) { | |
| 179 # interleaved sections | |
| 180 $count = 0; | |
| 181 while( $entry = $self->_readline) { | |
| 182 # finish current entry | |
| 183 if($entry =~/\s*\d+\s+\d+/){ | |
| 184 $self->_pushback($entry); | |
| 185 last; | |
| 186 } | |
| 187 $count = 0, next if $entry =~ /^\s$/; | |
| 188 | |
| 189 $entry =~ /\s*(.*)$/ && do { | |
| 190 $str = $1; | |
| 191 $str =~ s/\s//g; | |
| 192 $count++; | |
| 193 $hash{$count} .= $str; | |
| 194 }; | |
| 195 $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; | |
| 196 } | |
| 197 } | |
| 198 return 0 if scalar @names < 1; | |
| 199 | |
| 200 # sequence creation | |
| 201 $count = 0; | |
| 202 foreach $name ( @names ) { | |
| 203 $count++; | |
| 204 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { | |
| 205 $seqname = $1; | |
| 206 $start = $2; | |
| 207 $end = $3; | |
| 208 } else { | |
| 209 $seqname=$name; | |
| 210 $start = 1; | |
| 211 $str = $hash{$count}; | |
| 212 $str =~ s/[^A-Za-z]//g; | |
| 213 $end = length($str); | |
| 214 } | |
| 215 # consistency test | |
| 216 $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") | |
| 217 unless CORE::length($hash{$count}) == $residuecount; | |
| 218 | |
| 219 $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, | |
| 220 '-id'=>$seqname, | |
| 221 '-start'=>$start, | |
| 222 '-end'=>$end, | |
| 223 ); | |
| 224 | |
| 225 $aln->add_seq($seq); | |
| 226 | |
| 227 } | |
| 228 return $aln; | |
| 229 } | |
| 230 | |
| 231 | |
| 232 =head2 write_aln | |
| 233 | |
| 234 Title : write_aln | |
| 235 Usage : $stream->write_aln(@aln) | |
| 236 Function: writes the $aln object into the stream in MSF format | |
| 237 Returns : 1 for success and 0 for error | |
| 238 Args : L<Bio::Align::AlignI> object | |
| 239 | |
| 240 =cut | |
| 241 | |
| 242 sub write_aln { | |
| 243 my ($self,@aln) = @_; | |
| 244 my $count = 0; | |
| 245 my $wrapped = 0; | |
| 246 my $maxname; | |
| 247 my ($length,$date,$name,$seq,$miss,$pad, | |
| 248 %hash,@arr,$tempcount,$index,$idlength); | |
| 249 | |
| 250 foreach my $aln (@aln) { | |
| 251 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { | |
| 252 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); | |
| 253 next; | |
| 254 } | |
| 255 $self->throw("All sequences in the alignment must be the same length") | |
| 256 unless $aln->is_flush(1) ; | |
| 257 | |
| 258 $aln->set_displayname_flat(); # plain | |
| 259 $length = $aln->length(); | |
| 260 $self->_print (sprintf(" %s %s\n", $aln->no_sequences, $aln->length)); | |
| 261 | |
| 262 $idlength = $self->idlength(); | |
| 263 foreach $seq ( $aln->each_seq() ) { | |
| 264 $name = $aln->displayname($seq->get_nse); | |
| 265 $name = substr($name, 0, $idlength) if length($name) > $idlength; | |
| 266 $name = sprintf("%-".$idlength."s",$name); | |
| 267 if( $self->interleaved() ) { | |
| 268 $name .= ' ' ; | |
| 269 } elsif( $self->id_linebreak) { | |
| 270 $name .= "\n"; | |
| 271 } | |
| 272 | |
| 273 #phylip needs dashes not dots | |
| 274 my $seq = $seq->seq(); | |
| 275 $seq=~s/\./-/g; | |
| 276 $hash{$name} = $seq; | |
| 277 push(@arr,$name); | |
| 278 } | |
| 279 | |
| 280 if( $self->interleaved() ) { | |
| 281 while( $count < $length ) { | |
| 282 | |
| 283 # there is another block to go! | |
| 284 foreach $name ( @arr ) { | |
| 285 my $dispname = $name; | |
| 286 $dispname = '' if $wrapped; | |
| 287 $self->_print (sprintf("%".($idlength+3)."s",$dispname)); | |
| 288 $tempcount = $count; | |
| 289 $index = 0; | |
| 290 while( ($tempcount + $idlength < $length) && ($index < 5) ) { | |
| 291 $self->_print (sprintf("%s ",substr($hash{$name}, | |
| 292 $tempcount, | |
| 293 $idlength))); | |
| 294 $tempcount += $idlength; | |
| 295 $index++; | |
| 296 } | |
| 297 # last | |
| 298 if( $index < 5) { | |
| 299 # space to print! | |
| 300 $self->_print (sprintf("%s ",substr($hash{$name}, | |
| 301 $tempcount))); | |
| 302 $tempcount += $idlength; | |
| 303 } | |
| 304 $self->_print ("\n"); | |
| 305 } | |
| 306 $self->_print ("\n"); | |
| 307 $count = $tempcount; | |
| 308 $wrapped = 1; | |
| 309 } | |
| 310 } else { | |
| 311 foreach $name ( @arr ) { | |
| 312 my $dispname = $name; | |
| 313 $dispname = '' if $wrapped; | |
| 314 $self->_print (sprintf("%s%s\n",$dispname,$hash{$name})); | |
| 315 } | |
| 316 } | |
| 317 } | |
| 318 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
| 319 return 1; | |
| 320 } | |
| 321 | |
| 322 =head2 interleaved | |
| 323 | |
| 324 Title : interleaved | |
| 325 Usage : my $interleaved = $obj->interleaved | |
| 326 Function: Get/Set Interleaved status | |
| 327 Returns : boolean | |
| 328 Args : boolean | |
| 329 | |
| 330 | |
| 331 =cut | |
| 332 | |
| 333 sub interleaved{ | |
| 334 my ($self,$value) = @_; | |
| 335 my $previous = $self->{'_interleaved'}; | |
| 336 if( defined $value ) { | |
| 337 $self->{'_interleaved'} = $value; | |
| 338 } | |
| 339 return $previous; | |
| 340 } | |
| 341 | |
| 342 =head2 idlength | |
| 343 | |
| 344 Title : idlength | |
| 345 Usage : my $idlength = $obj->interleaved | |
| 346 Function: Get/Set value of id length | |
| 347 Returns : string | |
| 348 Args : string | |
| 349 | |
| 350 | |
| 351 =cut | |
| 352 | |
| 353 sub idlength { | |
| 354 my($self,$value) = @_; | |
| 355 if (defined $value){ | |
| 356 $self->{'_idlength'} = $value; | |
| 357 } | |
| 358 return $self->{'_idlength'}; | |
| 359 } | |
| 360 | |
| 361 =head2 line_length | |
| 362 | |
| 363 Title : line_length | |
| 364 Usage : $obj->line_length($newval) | |
| 365 Function: | |
| 366 Returns : value of line_length | |
| 367 Args : newvalue (optional) | |
| 368 | |
| 369 | |
| 370 =cut | |
| 371 | |
| 372 sub line_length{ | |
| 373 my ($self,$value) = @_; | |
| 374 if( defined $value) { | |
| 375 $self->{'line_length'} = $value; | |
| 376 } | |
| 377 return $self->{'line_length'} || $DEFAULTLINELEN; | |
| 378 | |
| 379 } | |
| 380 | |
| 381 =head2 id_linebreak | |
| 382 | |
| 383 Title : id_linebreak | |
| 384 Usage : $obj->id_linebreak($newval) | |
| 385 Function: | |
| 386 Returns : value of id_linebreak | |
| 387 Args : newvalue (optional) | |
| 388 | |
| 389 | |
| 390 =cut | |
| 391 | |
| 392 sub id_linebreak{ | |
| 393 my ($self,$value) = @_; | |
| 394 if( defined $value) { | |
| 395 $self->{'_id_linebreak'} = $value; | |
| 396 } | |
| 397 return $self->{'_id_linebreak'} || 0; | |
| 398 } | |
| 399 | |
| 400 1; |
