Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/AlignIO/phylip.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: phylip.pm,v 1.24.2.1 2003/01/26 15:52:30 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::AlignIO::phylip | |
4 # | |
5 # Copyright Heikki Lehvaslaiho | |
6 # | |
7 | |
8 =head1 NAME | |
9 | |
10 Bio::AlignIO::phylip - PHYLIP format sequence input/output stream | |
11 | |
12 =head1 SYNOPSIS | |
13 | |
14 # Do not use this module directly. Use it via the Bio::AlignIO class. | |
15 | |
16 use Bio::AlignIO; | |
17 use Bio::SimpleAlign; | |
18 #you can set the name length to something other than the default 10 | |
19 #if you use a version of phylip (hacked) that accepts ids > 10 | |
20 my $phylipstream = new Bio::AlignIO(-format => 'phylip', | |
21 -fh => \*STDOUT, | |
22 -idlength=>30); | |
23 # convert data from one format to another | |
24 my $gcgstream = new Bio::AlignIO(-format => 'msf', | |
25 -file => 't/data/cysprot1a.msf'); | |
26 | |
27 while( my $aln = $gcgstream->next_aln ) { | |
28 $phylipstream->write_aln($aln); | |
29 } | |
30 | |
31 # do it again with phylip sequential format format | |
32 $phylipstream->interleaved(0); | |
33 # can also initialize the object like this | |
34 $phylipstream = new Bio::AlignIO(-interleaved => 0, | |
35 -format => 'phylip', | |
36 -fh => \*STDOUT, | |
37 -idlength=>10); | |
38 $gcgstream = new Bio::AlignIO(-format => 'msf', | |
39 -file => 't/data/cysprot1a.msf'); | |
40 | |
41 while( my $aln = $gcgstream->next_aln ) { | |
42 $phylipstream->write_aln($aln); | |
43 } | |
44 | |
45 =head1 DESCRIPTION | |
46 | |
47 This object can transform Bio::SimpleAlign objects to and from PHYLIP | |
48 interleaved format. It will not work with PHYLIP sequencial format. | |
49 | |
50 This module will output PHYLIP sequential format. By specifying the | |
51 flag -interleaved =E<gt> 0 in the initialization the module can output | |
52 data in interleaved format. | |
53 | |
54 =head1 FEEDBACK | |
55 | |
56 =head2 Reporting Bugs | |
57 | |
58 Report bugs to the Bioperl bug tracking system to help us keep track | |
59 the bugs and their resolution. | |
60 Bug reports can be submitted via email or the web: | |
61 | |
62 bioperl-bugs@bio.perl.org | |
63 http://bugzilla.bioperl.org/ | |
64 | |
65 =head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich | |
66 | |
67 Email: heikki@ebi.ac.uk | |
68 Email: jason@bioperl.org | |
69 | |
70 =head1 APPENDIX | |
71 | |
72 The rest of the documentation details each of the object | |
73 methods. Internal methods are usually preceded with a _ | |
74 | |
75 =cut | |
76 | |
77 # Let the code begin... | |
78 | |
79 package Bio::AlignIO::phylip; | |
80 use vars qw(@ISA $DEFAULTIDLENGTH $DEFAULTLINELEN); | |
81 use strict; | |
82 | |
83 use Bio::SimpleAlign; | |
84 use Bio::AlignIO; | |
85 | |
86 @ISA = qw(Bio::AlignIO); | |
87 | |
88 BEGIN { | |
89 $DEFAULTIDLENGTH = 10; | |
90 $DEFAULTLINELEN = 60; | |
91 } | |
92 | |
93 =head2 new | |
94 | |
95 Title : new | |
96 Usage : my $alignio = new Bio::AlignIO(-format => 'phylip' | |
97 -file => '>file', | |
98 -idlength => 10, | |
99 -idlinebreak => 1); | |
100 Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer | |
101 Returns : L<Bio::AlignIO> object | |
102 Args : [specific for writing of phylip format files] | |
103 -idlength => integer - length of the id (will pad w/ | |
104 spaces if needed) | |
105 -interleaved => boolean - whether or not write as interleaved | |
106 or sequential format | |
107 -linelength => integer of how long a sequence lines should be | |
108 -idlinebreak => insert a line break after the sequence id | |
109 so that sequence starts on the next line | |
110 | |
111 =cut | |
112 | |
113 sub _initialize { | |
114 my($self,@args) = @_; | |
115 $self->SUPER::_initialize(@args); | |
116 | |
117 my ($interleave,$linelen,$idlinebreak, | |
118 $idlength) = $self->_rearrange([qw(INTERLEAVED | |
119 LINELENGTH | |
120 IDLINEBREAK | |
121 IDLENGTH)],@args); | |
122 $self->interleaved(1) if( $interleave || ! defined $interleave); | |
123 $self->idlength($idlength || $DEFAULTIDLENGTH); | |
124 $self->id_linebreak(1) if( $idlinebreak ); | |
125 $self->line_length($linelen) if defined $linelen && $linelen > 0; | |
126 1; | |
127 } | |
128 | |
129 =head2 next_aln | |
130 | |
131 Title : next_aln | |
132 Usage : $aln = $stream->next_aln() | |
133 Function: returns the next alignment in the stream. | |
134 Throws an exception if trying to read in PHYLIP | |
135 sequential format. | |
136 Returns : L<Bio::SimpleAlign> object | |
137 Args : | |
138 | |
139 =cut | |
140 | |
141 sub next_aln { | |
142 my $self = shift; | |
143 my $entry; | |
144 my ($seqcount, $residuecount, %hash, $name,$str, | |
145 @names,$seqname,$start,$end,$count,$seq); | |
146 | |
147 my $aln = Bio::SimpleAlign->new(-source => 'phylip'); | |
148 $entry = $self->_readline and | |
149 ($seqcount, $residuecount) = $entry =~ /\s*(\d+)\s+(\d+)/; | |
150 return 0 unless $seqcount and $residuecount; | |
151 | |
152 # first alignment section | |
153 my $idlen = $self->idlength; | |
154 $count = 0; | |
155 my $non_interleaved = ! $self->interleaved ; | |
156 while( $entry = $self->_readline) { | |
157 last if( $entry =~ /^\s?$/ && ! $non_interleaved ); | |
158 | |
159 if( $entry =~ /^\s+(.+)$/ ) { | |
160 $str = $1; | |
161 $non_interleaved = 1; | |
162 $str =~ s/\s//g; | |
163 $count = scalar @names; | |
164 $hash{$count} .= $str; | |
165 } elsif( $entry =~ /^(.{$idlen})\s+(.*)\s$/ ) { | |
166 $name = $1; | |
167 $str = $2; | |
168 $name =~ s/[\s\/]/_/g; | |
169 $name =~ s/_+$//; # remove any trailing _'s | |
170 push @names, $name; | |
171 $str =~ s/\s//g; | |
172 $count = scalar @names; | |
173 $hash{$count} = $str; | |
174 } | |
175 $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; | |
176 } | |
177 | |
178 unless( $non_interleaved ) { | |
179 # interleaved sections | |
180 $count = 0; | |
181 while( $entry = $self->_readline) { | |
182 # finish current entry | |
183 if($entry =~/\s*\d+\s+\d+/){ | |
184 $self->_pushback($entry); | |
185 last; | |
186 } | |
187 $count = 0, next if $entry =~ /^\s$/; | |
188 | |
189 $entry =~ /\s*(.*)$/ && do { | |
190 $str = $1; | |
191 $str =~ s/\s//g; | |
192 $count++; | |
193 $hash{$count} .= $str; | |
194 }; | |
195 $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; | |
196 } | |
197 } | |
198 return 0 if scalar @names < 1; | |
199 | |
200 # sequence creation | |
201 $count = 0; | |
202 foreach $name ( @names ) { | |
203 $count++; | |
204 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { | |
205 $seqname = $1; | |
206 $start = $2; | |
207 $end = $3; | |
208 } else { | |
209 $seqname=$name; | |
210 $start = 1; | |
211 $str = $hash{$count}; | |
212 $str =~ s/[^A-Za-z]//g; | |
213 $end = length($str); | |
214 } | |
215 # consistency test | |
216 $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") | |
217 unless CORE::length($hash{$count}) == $residuecount; | |
218 | |
219 $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, | |
220 '-id'=>$seqname, | |
221 '-start'=>$start, | |
222 '-end'=>$end, | |
223 ); | |
224 | |
225 $aln->add_seq($seq); | |
226 | |
227 } | |
228 return $aln; | |
229 } | |
230 | |
231 | |
232 =head2 write_aln | |
233 | |
234 Title : write_aln | |
235 Usage : $stream->write_aln(@aln) | |
236 Function: writes the $aln object into the stream in MSF format | |
237 Returns : 1 for success and 0 for error | |
238 Args : L<Bio::Align::AlignI> object | |
239 | |
240 =cut | |
241 | |
242 sub write_aln { | |
243 my ($self,@aln) = @_; | |
244 my $count = 0; | |
245 my $wrapped = 0; | |
246 my $maxname; | |
247 my ($length,$date,$name,$seq,$miss,$pad, | |
248 %hash,@arr,$tempcount,$index,$idlength); | |
249 | |
250 foreach my $aln (@aln) { | |
251 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { | |
252 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); | |
253 next; | |
254 } | |
255 $self->throw("All sequences in the alignment must be the same length") | |
256 unless $aln->is_flush(1) ; | |
257 | |
258 $aln->set_displayname_flat(); # plain | |
259 $length = $aln->length(); | |
260 $self->_print (sprintf(" %s %s\n", $aln->no_sequences, $aln->length)); | |
261 | |
262 $idlength = $self->idlength(); | |
263 foreach $seq ( $aln->each_seq() ) { | |
264 $name = $aln->displayname($seq->get_nse); | |
265 $name = substr($name, 0, $idlength) if length($name) > $idlength; | |
266 $name = sprintf("%-".$idlength."s",$name); | |
267 if( $self->interleaved() ) { | |
268 $name .= ' ' ; | |
269 } elsif( $self->id_linebreak) { | |
270 $name .= "\n"; | |
271 } | |
272 | |
273 #phylip needs dashes not dots | |
274 my $seq = $seq->seq(); | |
275 $seq=~s/\./-/g; | |
276 $hash{$name} = $seq; | |
277 push(@arr,$name); | |
278 } | |
279 | |
280 if( $self->interleaved() ) { | |
281 while( $count < $length ) { | |
282 | |
283 # there is another block to go! | |
284 foreach $name ( @arr ) { | |
285 my $dispname = $name; | |
286 $dispname = '' if $wrapped; | |
287 $self->_print (sprintf("%".($idlength+3)."s",$dispname)); | |
288 $tempcount = $count; | |
289 $index = 0; | |
290 while( ($tempcount + $idlength < $length) && ($index < 5) ) { | |
291 $self->_print (sprintf("%s ",substr($hash{$name}, | |
292 $tempcount, | |
293 $idlength))); | |
294 $tempcount += $idlength; | |
295 $index++; | |
296 } | |
297 # last | |
298 if( $index < 5) { | |
299 # space to print! | |
300 $self->_print (sprintf("%s ",substr($hash{$name}, | |
301 $tempcount))); | |
302 $tempcount += $idlength; | |
303 } | |
304 $self->_print ("\n"); | |
305 } | |
306 $self->_print ("\n"); | |
307 $count = $tempcount; | |
308 $wrapped = 1; | |
309 } | |
310 } else { | |
311 foreach $name ( @arr ) { | |
312 my $dispname = $name; | |
313 $dispname = '' if $wrapped; | |
314 $self->_print (sprintf("%s%s\n",$dispname,$hash{$name})); | |
315 } | |
316 } | |
317 } | |
318 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
319 return 1; | |
320 } | |
321 | |
322 =head2 interleaved | |
323 | |
324 Title : interleaved | |
325 Usage : my $interleaved = $obj->interleaved | |
326 Function: Get/Set Interleaved status | |
327 Returns : boolean | |
328 Args : boolean | |
329 | |
330 | |
331 =cut | |
332 | |
333 sub interleaved{ | |
334 my ($self,$value) = @_; | |
335 my $previous = $self->{'_interleaved'}; | |
336 if( defined $value ) { | |
337 $self->{'_interleaved'} = $value; | |
338 } | |
339 return $previous; | |
340 } | |
341 | |
342 =head2 idlength | |
343 | |
344 Title : idlength | |
345 Usage : my $idlength = $obj->interleaved | |
346 Function: Get/Set value of id length | |
347 Returns : string | |
348 Args : string | |
349 | |
350 | |
351 =cut | |
352 | |
353 sub idlength { | |
354 my($self,$value) = @_; | |
355 if (defined $value){ | |
356 $self->{'_idlength'} = $value; | |
357 } | |
358 return $self->{'_idlength'}; | |
359 } | |
360 | |
361 =head2 line_length | |
362 | |
363 Title : line_length | |
364 Usage : $obj->line_length($newval) | |
365 Function: | |
366 Returns : value of line_length | |
367 Args : newvalue (optional) | |
368 | |
369 | |
370 =cut | |
371 | |
372 sub line_length{ | |
373 my ($self,$value) = @_; | |
374 if( defined $value) { | |
375 $self->{'line_length'} = $value; | |
376 } | |
377 return $self->{'line_length'} || $DEFAULTLINELEN; | |
378 | |
379 } | |
380 | |
381 =head2 id_linebreak | |
382 | |
383 Title : id_linebreak | |
384 Usage : $obj->id_linebreak($newval) | |
385 Function: | |
386 Returns : value of id_linebreak | |
387 Args : newvalue (optional) | |
388 | |
389 | |
390 =cut | |
391 | |
392 sub id_linebreak{ | |
393 my ($self,$value) = @_; | |
394 if( defined $value) { | |
395 $self->{'_id_linebreak'} = $value; | |
396 } | |
397 return $self->{'_id_linebreak'} || 0; | |
398 } | |
399 | |
400 1; |