0
|
1 # $Id: nexus.pm,v 1.12.2.1 2003/04/07 15:17:17 heikki Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::AlignIO::nexus
|
|
4 #
|
|
5 # Copyright Heikki Lehvaslaiho
|
|
6 #
|
|
7
|
|
8 =head1 NAME
|
|
9
|
|
10 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
|
|
11
|
|
12 =head1 SYNOPSIS
|
|
13
|
|
14 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
|
|
15
|
|
16 =head1 DESCRIPTION
|
|
17
|
|
18 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
|
|
19 data blocks. See method documentation for supported NEXUS features.
|
|
20
|
|
21 =head1 ACKNOWLEDGEMENTS
|
|
22
|
|
23 Will Fisher has written an excellent standalone NEXUS format parser in
|
|
24 perl, readnexus. A number of tricks were adapted from it.
|
|
25
|
|
26 =head1 FEEDBACK
|
|
27
|
|
28 =head2 Reporting Bugs
|
|
29
|
|
30 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
31 the bugs and their resolution.
|
|
32 Bug reports can be submitted via email or the web:
|
|
33
|
|
34 bioperl-bugs@bio.perl.org
|
|
35 http://bugzilla.bioperl.org/
|
|
36
|
|
37 =head1 AUTHORS - Heikki Lehvaslaiho
|
|
38
|
|
39 Email: heikki@ebi.ac.uk
|
|
40
|
|
41
|
|
42 =head1 APPENDIX
|
|
43
|
|
44 The rest of the documentation details each of the object
|
|
45 methods. Internal methods are usually preceded with a _
|
|
46
|
|
47 =cut
|
|
48
|
|
49 # Let the code begin...
|
|
50
|
|
51 package Bio::AlignIO::nexus;
|
|
52 use vars qw(@ISA %valid_type);
|
|
53 use strict;
|
|
54 no strict "refs";
|
|
55
|
|
56 use Bio::AlignIO;
|
|
57
|
|
58 @ISA = qw(Bio::AlignIO);
|
|
59
|
|
60 BEGIN {
|
|
61 %valid_type = map {$_, 1} qw( dna rna protein standard);
|
|
62 }
|
|
63
|
|
64 =head2 next_aln
|
|
65
|
|
66 Title : next_aln
|
|
67 Usage : $aln = $stream->next_aln()
|
|
68 Function: Returns the next alignment in the stream.
|
|
69
|
|
70 Supports the following NEXUS format features:
|
|
71 - The file has to start with '#NEXUS'
|
|
72 - Reads in the name of the alignment from a comment
|
|
73 (anything after 'TITLE: ') .
|
|
74 - Sequence names can be given in a taxa block, too.
|
|
75 - If matchchar notation is used, converts
|
|
76 them back to sequence characters.
|
|
77 - Does character conversions specified in the
|
|
78 NEXUS equate command.
|
|
79 - Sequence names of type 'Homo sapiens' and
|
|
80 Homo_sapiens are treated identically.
|
|
81
|
|
82 Returns : L<Bio::Align::AlignI> object
|
|
83 Args :
|
|
84
|
|
85 =cut
|
|
86
|
|
87 sub next_aln {
|
|
88 my $self = shift;
|
|
89 my $entry;
|
|
90 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
|
|
91 $match, $gap, $missing, $equate, $interleave,
|
|
92 $name,$str,@names,$seqname,$start,$end,$count,$seq);
|
|
93
|
|
94 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
|
|
95
|
|
96 # file starts with '#NEXUS' but we allow white space only lines before it
|
|
97 $entry = $self->_readline;
|
|
98 $entry = $self->_readline while $entry =~ /^\s+$/;
|
|
99
|
|
100 return unless $entry;
|
|
101 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
|
|
102 unless $entry =~ /^#NEXUS/i;
|
|
103
|
|
104 # skip anything before either the taxa or data block
|
|
105 # but read in the optional title in a comment
|
|
106 while (defined($entry = $self->_readline)) {
|
|
107 local ($_) = $entry;
|
|
108 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
|
|
109 last if /^begin +data/i || /^begin +taxa/i;
|
|
110 }
|
|
111 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
|
|
112
|
|
113 # data and taxa blocks
|
|
114 my $taxlabels;
|
|
115 while ($entry = $self->_readline) {
|
|
116 local ($_) = $entry;
|
|
117
|
|
118 # read in seq names if in taxa block
|
|
119 $taxlabels = 1 if /taxlabels/i;
|
|
120 if ($taxlabels) {
|
|
121 @names = $self->_read_taxlabels;
|
|
122 $taxlabels = 0;
|
|
123 }
|
|
124
|
|
125 /ntax ?= ?(\d+)/i and $seqcount = $1;
|
|
126 /nchar ?= ?(\d+)/i and $residuecount = $1;
|
|
127 /matchchar ?= ?(.)/i and $match = $1;
|
|
128 /gap ?= ?(.)/i and $gap = $1;
|
|
129 /missing ?= ?(.)/i and $missing = $1;
|
|
130 /equate ?= ?"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
|
|
131 /datatype ?= ?(\w+)/i and $alphabet = lc $1;
|
|
132 /interleave/i and $interleave = 1 ;
|
|
133
|
|
134 last if /matrix/i;
|
|
135 }
|
|
136 $self->throw("Not a valid NEXUS sequence file. Datatype not specified")
|
|
137 unless $alphabet;
|
|
138 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
|
|
139 unless $valid_type{$alphabet};
|
|
140
|
|
141 $aln->gap_char($gap);
|
|
142 $aln->missing_char($missing);
|
|
143
|
|
144 #
|
|
145 # if data is not right after the matrix line
|
|
146 # read the empty lines out
|
|
147 #
|
|
148 while ($entry = $self->_readline) {
|
|
149 unless ($entry =~ /^\s+$/) {
|
|
150 $self->_pushback($entry);
|
|
151 last;
|
|
152 }
|
|
153 }
|
|
154
|
|
155 #
|
|
156 # matrix command
|
|
157 #
|
|
158 # first alignment section
|
|
159 if (@names == 0) { # taxa block did not exist
|
|
160 while ($entry = $self->_readline) {
|
|
161 local ($_) = $entry;
|
|
162
|
|
163 s/\[[^[]+\]//g; #] remove comments
|
|
164 if ($interleave) {
|
|
165 /^\s+$/ and last;
|
|
166 } else {
|
|
167 /^\s+$/ and next;
|
|
168 }
|
|
169 /^\s*;\s*$/ and last;
|
|
170 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #'
|
|
171 $name = ($2 || $3);
|
|
172 $str = $4;
|
|
173 $name =~ s/ /_/g;
|
|
174 push @names, $name;
|
|
175
|
|
176 $str =~ s/\s//g;
|
|
177 $count = @names;
|
|
178 $hash{$count} = $str;
|
|
179 };
|
|
180 $self->throw("Not a valid interleaved NEXUS file!
|
|
181 seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
|
|
182 }
|
|
183 }
|
|
184
|
|
185 # interleaved sections
|
|
186 $count = 0;
|
|
187 while( $entry = $self->_readline) {
|
|
188 local ($_) = $entry;
|
|
189 s/\[[^[]+\]//g; #] remove comments
|
|
190 last if /^\s*;/;
|
|
191
|
|
192 $count = 0, next if $entry =~ /^\s*$/;
|
|
193 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #'
|
|
194 $str = $4;
|
|
195 $str =~ s/\s//g;
|
|
196 $count++;
|
|
197 $hash{$count} .= $str;
|
|
198 };
|
|
199 $self->throw("Not a valid interleaved NEXUS file!
|
|
200 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
|
|
201
|
|
202 }
|
|
203
|
|
204 return 0 if @names < 1;
|
|
205
|
|
206 # sequence creation
|
|
207 $count = 0;
|
|
208 foreach $name ( @names ) {
|
|
209 $count++;
|
|
210 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
|
|
211 $seqname = $1;
|
|
212 $start = $2;
|
|
213 $end = $3;
|
|
214 } else {
|
|
215 $seqname=$name;
|
|
216 $start = 1;
|
|
217 $str = $hash{$count};
|
|
218 $str =~ s/[^A-Za-z]//g;
|
|
219 $end = length($str);
|
|
220 }
|
|
221
|
|
222 # consistency test
|
|
223 $self->throw("Length of sequence [$seqname] is not [$residuecount]! ")
|
|
224 unless CORE::length($hash{$count}) == $residuecount;
|
|
225
|
|
226 $seq = new Bio::LocatableSeq('-seq'=>$hash{$count},
|
|
227 '-id'=>$seqname,
|
|
228 '-start'=>$start,
|
|
229 '-end'=>$end,
|
|
230 'alphabet'=>$alphabet
|
|
231 );
|
|
232 $aln->add_seq($seq);
|
|
233 }
|
|
234
|
|
235 # if matchchar is used
|
|
236 $aln->unmatch($match) if $match;
|
|
237
|
|
238 # if equate ( e.g. equate="T=C G=A") is used
|
|
239 if ($equate) {
|
|
240 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
|
|
241 }
|
|
242
|
|
243 while ($entry !~ /endblock/i) {
|
|
244 $entry = $self->_readline;
|
|
245 }
|
|
246
|
|
247 return $aln;
|
|
248 }
|
|
249
|
|
250 sub _read_taxlabels {
|
|
251 my ($self) = @_;
|
|
252 my ($name, @names);
|
|
253 while (my $entry = $self->_readline) {
|
|
254 ($name) = $entry =~ /\s*(\S+)\s+/;
|
|
255 $name =~ s/\[[^\[]+\]//g;
|
|
256 $name =~ s/\W/_/g;
|
|
257 push @names, $name;
|
|
258 last if /^\s*;/;
|
|
259 }
|
|
260 return @names;
|
|
261 }
|
|
262
|
|
263 =head2 write_aln
|
|
264
|
|
265 Title : write_aln
|
|
266 Usage : $stream->write_aln(@aln)
|
|
267 Function: Writes the $aln object into the stream in interleaved NEXUS
|
|
268 format. Everything is written into a data block.
|
|
269 SimpleAlign methods match_char, missing_char and gap_char must be set
|
|
270 if you want to see them in the output.
|
|
271 Returns : 1 for success and 0 for error
|
|
272 Args : L<Bio::Align::AlignI> object
|
|
273
|
|
274 =cut
|
|
275
|
|
276 sub write_aln {
|
|
277 my ($self,@aln) = @_;
|
|
278 my $count = 0;
|
|
279 my $wrapped = 0;
|
|
280 my $maxname;
|
|
281 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
|
|
282 my ($match, $missing, $gap,$symbols) = ('', '', '','');
|
|
283
|
|
284 foreach my $aln (@aln) {
|
|
285 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
|
|
286 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
|
|
287 next;
|
|
288 }
|
|
289 $self->throw("All sequences in the alignment must be the same length")
|
|
290 unless $aln->is_flush($self->verbose);
|
|
291
|
|
292 $length = $aln->length();
|
|
293
|
|
294 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
|
|
295 $aln->id, $aln->no_sequences, $length));
|
|
296 $match = "match=". $aln->match_char if $aln->match_char;
|
|
297 $missing = "missing=". $aln->missing_char if $aln->missing_char;
|
|
298 $gap = "gap=". $aln->gap_char if $aln->gap_char;
|
|
299 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"' if( $aln->symbol_chars);
|
|
300 $self->_print (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
|
|
301 $aln->get_seq_by_pos(1)->alphabet, $match, $missing, $gap, $symbols));
|
|
302
|
|
303 my $indent = $aln->maxdisplayname_length;
|
|
304 $aln->set_displayname_flat();
|
|
305 foreach $seq ( $aln->each_seq() ) {
|
|
306 $name = $aln->displayname($seq->get_nse());
|
|
307 $name = sprintf("%-${indent}s", $name);
|
|
308 $hash{$name} = $seq->seq();
|
|
309 push(@arr,$name);
|
|
310 }
|
|
311
|
|
312 while( $count < $length ) {
|
|
313 # there is another block to go!
|
|
314 foreach $name ( @arr ) {
|
|
315 my $dispname = $name;
|
|
316 # $dispname = '' if $wrapped;
|
|
317 $self->_print (sprintf("%${indent}s ",$dispname));
|
|
318 $tempcount = $count;
|
|
319 $index = 0;
|
|
320 while( ($tempcount + 10 < $length) && ($index < 5) ) {
|
|
321 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
|
|
322 $tempcount += 10;
|
|
323 $index++;
|
|
324 }
|
|
325 # last
|
|
326 if( $index < 5) {
|
|
327 # space to print!
|
|
328 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
|
|
329 $tempcount += 10;
|
|
330 }
|
|
331 $self->_print ("\n");
|
|
332 }
|
|
333 $self->_print ("\n\n");
|
|
334 $count = $tempcount;
|
|
335 $wrapped = 1;
|
|
336 }
|
|
337 $self->_print (";\n\nendblock;\n");
|
|
338 }
|
|
339 $self->flush if $self->_flush_on_write && defined $self->_fh;
|
|
340 return 1;
|
|
341 }
|
|
342
|
|
343 1;
|