comparison variant_effect_predictor/Bio/AlignIO/nexus.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: nexus.pm,v 1.12.2.1 2003/04/07 15:17:17 heikki Exp $
2 #
3 # BioPerl module for Bio::AlignIO::nexus
4 #
5 # Copyright Heikki Lehvaslaiho
6 #
7
8 =head1 NAME
9
10 Bio::AlignIO::nexus - NEXUS format sequence input/output stream
11
12 =head1 SYNOPSIS
13
14 Do not use this module directly. Use it via the L<Bio::AlignIO> class.
15
16 =head1 DESCRIPTION
17
18 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS
19 data blocks. See method documentation for supported NEXUS features.
20
21 =head1 ACKNOWLEDGEMENTS
22
23 Will Fisher has written an excellent standalone NEXUS format parser in
24 perl, readnexus. A number of tricks were adapted from it.
25
26 =head1 FEEDBACK
27
28 =head2 Reporting Bugs
29
30 Report bugs to the Bioperl bug tracking system to help us keep track
31 the bugs and their resolution.
32 Bug reports can be submitted via email or the web:
33
34 bioperl-bugs@bio.perl.org
35 http://bugzilla.bioperl.org/
36
37 =head1 AUTHORS - Heikki Lehvaslaiho
38
39 Email: heikki@ebi.ac.uk
40
41
42 =head1 APPENDIX
43
44 The rest of the documentation details each of the object
45 methods. Internal methods are usually preceded with a _
46
47 =cut
48
49 # Let the code begin...
50
51 package Bio::AlignIO::nexus;
52 use vars qw(@ISA %valid_type);
53 use strict;
54 no strict "refs";
55
56 use Bio::AlignIO;
57
58 @ISA = qw(Bio::AlignIO);
59
60 BEGIN {
61 %valid_type = map {$_, 1} qw( dna rna protein standard);
62 }
63
64 =head2 next_aln
65
66 Title : next_aln
67 Usage : $aln = $stream->next_aln()
68 Function: Returns the next alignment in the stream.
69
70 Supports the following NEXUS format features:
71 - The file has to start with '#NEXUS'
72 - Reads in the name of the alignment from a comment
73 (anything after 'TITLE: ') .
74 - Sequence names can be given in a taxa block, too.
75 - If matchchar notation is used, converts
76 them back to sequence characters.
77 - Does character conversions specified in the
78 NEXUS equate command.
79 - Sequence names of type 'Homo sapiens' and
80 Homo_sapiens are treated identically.
81
82 Returns : L<Bio::Align::AlignI> object
83 Args :
84
85 =cut
86
87 sub next_aln {
88 my $self = shift;
89 my $entry;
90 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet,
91 $match, $gap, $missing, $equate, $interleave,
92 $name,$str,@names,$seqname,$start,$end,$count,$seq);
93
94 my $aln = Bio::SimpleAlign->new(-source => 'nexus');
95
96 # file starts with '#NEXUS' but we allow white space only lines before it
97 $entry = $self->_readline;
98 $entry = $self->_readline while $entry =~ /^\s+$/;
99
100 return unless $entry;
101 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry")
102 unless $entry =~ /^#NEXUS/i;
103
104 # skip anything before either the taxa or data block
105 # but read in the optional title in a comment
106 while (defined($entry = $self->_readline)) {
107 local ($_) = $entry;
108 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1;
109 last if /^begin +data/i || /^begin +taxa/i;
110 }
111 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name;
112
113 # data and taxa blocks
114 my $taxlabels;
115 while ($entry = $self->_readline) {
116 local ($_) = $entry;
117
118 # read in seq names if in taxa block
119 $taxlabels = 1 if /taxlabels/i;
120 if ($taxlabels) {
121 @names = $self->_read_taxlabels;
122 $taxlabels = 0;
123 }
124
125 /ntax ?= ?(\d+)/i and $seqcount = $1;
126 /nchar ?= ?(\d+)/i and $residuecount = $1;
127 /matchchar ?= ?(.)/i and $match = $1;
128 /gap ?= ?(.)/i and $gap = $1;
129 /missing ?= ?(.)/i and $missing = $1;
130 /equate ?= ?"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A";
131 /datatype ?= ?(\w+)/i and $alphabet = lc $1;
132 /interleave/i and $interleave = 1 ;
133
134 last if /matrix/i;
135 }
136 $self->throw("Not a valid NEXUS sequence file. Datatype not specified")
137 unless $alphabet;
138 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]")
139 unless $valid_type{$alphabet};
140
141 $aln->gap_char($gap);
142 $aln->missing_char($missing);
143
144 #
145 # if data is not right after the matrix line
146 # read the empty lines out
147 #
148 while ($entry = $self->_readline) {
149 unless ($entry =~ /^\s+$/) {
150 $self->_pushback($entry);
151 last;
152 }
153 }
154
155 #
156 # matrix command
157 #
158 # first alignment section
159 if (@names == 0) { # taxa block did not exist
160 while ($entry = $self->_readline) {
161 local ($_) = $entry;
162
163 s/\[[^[]+\]//g; #] remove comments
164 if ($interleave) {
165 /^\s+$/ and last;
166 } else {
167 /^\s+$/ and next;
168 }
169 /^\s*;\s*$/ and last;
170 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #'
171 $name = ($2 || $3);
172 $str = $4;
173 $name =~ s/ /_/g;
174 push @names, $name;
175
176 $str =~ s/\s//g;
177 $count = @names;
178 $hash{$count} = $str;
179 };
180 $self->throw("Not a valid interleaved NEXUS file!
181 seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount;
182 }
183 }
184
185 # interleaved sections
186 $count = 0;
187 while( $entry = $self->_readline) {
188 local ($_) = $entry;
189 s/\[[^[]+\]//g; #] remove comments
190 last if /^\s*;/;
191
192 $count = 0, next if $entry =~ /^\s*$/;
193 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #'
194 $str = $4;
195 $str =~ s/\s//g;
196 $count++;
197 $hash{$count} .= $str;
198 };
199 $self->throw("Not a valid interleaved NEXUS file!
200 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount;
201
202 }
203
204 return 0 if @names < 1;
205
206 # sequence creation
207 $count = 0;
208 foreach $name ( @names ) {
209 $count++;
210 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
211 $seqname = $1;
212 $start = $2;
213 $end = $3;
214 } else {
215 $seqname=$name;
216 $start = 1;
217 $str = $hash{$count};
218 $str =~ s/[^A-Za-z]//g;
219 $end = length($str);
220 }
221
222 # consistency test
223 $self->throw("Length of sequence [$seqname] is not [$residuecount]! ")
224 unless CORE::length($hash{$count}) == $residuecount;
225
226 $seq = new Bio::LocatableSeq('-seq'=>$hash{$count},
227 '-id'=>$seqname,
228 '-start'=>$start,
229 '-end'=>$end,
230 'alphabet'=>$alphabet
231 );
232 $aln->add_seq($seq);
233 }
234
235 # if matchchar is used
236 $aln->unmatch($match) if $match;
237
238 # if equate ( e.g. equate="T=C G=A") is used
239 if ($equate) {
240 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g;
241 }
242
243 while ($entry !~ /endblock/i) {
244 $entry = $self->_readline;
245 }
246
247 return $aln;
248 }
249
250 sub _read_taxlabels {
251 my ($self) = @_;
252 my ($name, @names);
253 while (my $entry = $self->_readline) {
254 ($name) = $entry =~ /\s*(\S+)\s+/;
255 $name =~ s/\[[^\[]+\]//g;
256 $name =~ s/\W/_/g;
257 push @names, $name;
258 last if /^\s*;/;
259 }
260 return @names;
261 }
262
263 =head2 write_aln
264
265 Title : write_aln
266 Usage : $stream->write_aln(@aln)
267 Function: Writes the $aln object into the stream in interleaved NEXUS
268 format. Everything is written into a data block.
269 SimpleAlign methods match_char, missing_char and gap_char must be set
270 if you want to see them in the output.
271 Returns : 1 for success and 0 for error
272 Args : L<Bio::Align::AlignI> object
273
274 =cut
275
276 sub write_aln {
277 my ($self,@aln) = @_;
278 my $count = 0;
279 my $wrapped = 0;
280 my $maxname;
281 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index );
282 my ($match, $missing, $gap,$symbols) = ('', '', '','');
283
284 foreach my $aln (@aln) {
285 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) {
286 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
287 next;
288 }
289 $self->throw("All sequences in the alignment must be the same length")
290 unless $aln->is_flush($self->verbose);
291
292 $length = $aln->length();
293
294 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n",
295 $aln->id, $aln->no_sequences, $length));
296 $match = "match=". $aln->match_char if $aln->match_char;
297 $missing = "missing=". $aln->missing_char if $aln->missing_char;
298 $gap = "gap=". $aln->gap_char if $aln->gap_char;
299 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"' if( $aln->symbol_chars);
300 $self->_print (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n",
301 $aln->get_seq_by_pos(1)->alphabet, $match, $missing, $gap, $symbols));
302
303 my $indent = $aln->maxdisplayname_length;
304 $aln->set_displayname_flat();
305 foreach $seq ( $aln->each_seq() ) {
306 $name = $aln->displayname($seq->get_nse());
307 $name = sprintf("%-${indent}s", $name);
308 $hash{$name} = $seq->seq();
309 push(@arr,$name);
310 }
311
312 while( $count < $length ) {
313 # there is another block to go!
314 foreach $name ( @arr ) {
315 my $dispname = $name;
316 # $dispname = '' if $wrapped;
317 $self->_print (sprintf("%${indent}s ",$dispname));
318 $tempcount = $count;
319 $index = 0;
320 while( ($tempcount + 10 < $length) && ($index < 5) ) {
321 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10)));
322 $tempcount += 10;
323 $index++;
324 }
325 # last
326 if( $index < 5) {
327 # space to print!
328 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount)));
329 $tempcount += 10;
330 }
331 $self->_print ("\n");
332 }
333 $self->_print ("\n\n");
334 $count = $tempcount;
335 $wrapped = 1;
336 }
337 $self->_print (";\n\nendblock;\n");
338 }
339 $self->flush if $self->_flush_on_write && defined $self->_fh;
340 return 1;
341 }
342
343 1;