Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/AlignIO/nexus.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: nexus.pm,v 1.12.2.1 2003/04/07 15:17:17 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::AlignIO::nexus | |
4 # | |
5 # Copyright Heikki Lehvaslaiho | |
6 # | |
7 | |
8 =head1 NAME | |
9 | |
10 Bio::AlignIO::nexus - NEXUS format sequence input/output stream | |
11 | |
12 =head1 SYNOPSIS | |
13 | |
14 Do not use this module directly. Use it via the L<Bio::AlignIO> class. | |
15 | |
16 =head1 DESCRIPTION | |
17 | |
18 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS | |
19 data blocks. See method documentation for supported NEXUS features. | |
20 | |
21 =head1 ACKNOWLEDGEMENTS | |
22 | |
23 Will Fisher has written an excellent standalone NEXUS format parser in | |
24 perl, readnexus. A number of tricks were adapted from it. | |
25 | |
26 =head1 FEEDBACK | |
27 | |
28 =head2 Reporting Bugs | |
29 | |
30 Report bugs to the Bioperl bug tracking system to help us keep track | |
31 the bugs and their resolution. | |
32 Bug reports can be submitted via email or the web: | |
33 | |
34 bioperl-bugs@bio.perl.org | |
35 http://bugzilla.bioperl.org/ | |
36 | |
37 =head1 AUTHORS - Heikki Lehvaslaiho | |
38 | |
39 Email: heikki@ebi.ac.uk | |
40 | |
41 | |
42 =head1 APPENDIX | |
43 | |
44 The rest of the documentation details each of the object | |
45 methods. Internal methods are usually preceded with a _ | |
46 | |
47 =cut | |
48 | |
49 # Let the code begin... | |
50 | |
51 package Bio::AlignIO::nexus; | |
52 use vars qw(@ISA %valid_type); | |
53 use strict; | |
54 no strict "refs"; | |
55 | |
56 use Bio::AlignIO; | |
57 | |
58 @ISA = qw(Bio::AlignIO); | |
59 | |
60 BEGIN { | |
61 %valid_type = map {$_, 1} qw( dna rna protein standard); | |
62 } | |
63 | |
64 =head2 next_aln | |
65 | |
66 Title : next_aln | |
67 Usage : $aln = $stream->next_aln() | |
68 Function: Returns the next alignment in the stream. | |
69 | |
70 Supports the following NEXUS format features: | |
71 - The file has to start with '#NEXUS' | |
72 - Reads in the name of the alignment from a comment | |
73 (anything after 'TITLE: ') . | |
74 - Sequence names can be given in a taxa block, too. | |
75 - If matchchar notation is used, converts | |
76 them back to sequence characters. | |
77 - Does character conversions specified in the | |
78 NEXUS equate command. | |
79 - Sequence names of type 'Homo sapiens' and | |
80 Homo_sapiens are treated identically. | |
81 | |
82 Returns : L<Bio::Align::AlignI> object | |
83 Args : | |
84 | |
85 =cut | |
86 | |
87 sub next_aln { | |
88 my $self = shift; | |
89 my $entry; | |
90 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet, | |
91 $match, $gap, $missing, $equate, $interleave, | |
92 $name,$str,@names,$seqname,$start,$end,$count,$seq); | |
93 | |
94 my $aln = Bio::SimpleAlign->new(-source => 'nexus'); | |
95 | |
96 # file starts with '#NEXUS' but we allow white space only lines before it | |
97 $entry = $self->_readline; | |
98 $entry = $self->_readline while $entry =~ /^\s+$/; | |
99 | |
100 return unless $entry; | |
101 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry") | |
102 unless $entry =~ /^#NEXUS/i; | |
103 | |
104 # skip anything before either the taxa or data block | |
105 # but read in the optional title in a comment | |
106 while (defined($entry = $self->_readline)) { | |
107 local ($_) = $entry; | |
108 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1; | |
109 last if /^begin +data/i || /^begin +taxa/i; | |
110 } | |
111 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name; | |
112 | |
113 # data and taxa blocks | |
114 my $taxlabels; | |
115 while ($entry = $self->_readline) { | |
116 local ($_) = $entry; | |
117 | |
118 # read in seq names if in taxa block | |
119 $taxlabels = 1 if /taxlabels/i; | |
120 if ($taxlabels) { | |
121 @names = $self->_read_taxlabels; | |
122 $taxlabels = 0; | |
123 } | |
124 | |
125 /ntax ?= ?(\d+)/i and $seqcount = $1; | |
126 /nchar ?= ?(\d+)/i and $residuecount = $1; | |
127 /matchchar ?= ?(.)/i and $match = $1; | |
128 /gap ?= ?(.)/i and $gap = $1; | |
129 /missing ?= ?(.)/i and $missing = $1; | |
130 /equate ?= ?"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A"; | |
131 /datatype ?= ?(\w+)/i and $alphabet = lc $1; | |
132 /interleave/i and $interleave = 1 ; | |
133 | |
134 last if /matrix/i; | |
135 } | |
136 $self->throw("Not a valid NEXUS sequence file. Datatype not specified") | |
137 unless $alphabet; | |
138 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]") | |
139 unless $valid_type{$alphabet}; | |
140 | |
141 $aln->gap_char($gap); | |
142 $aln->missing_char($missing); | |
143 | |
144 # | |
145 # if data is not right after the matrix line | |
146 # read the empty lines out | |
147 # | |
148 while ($entry = $self->_readline) { | |
149 unless ($entry =~ /^\s+$/) { | |
150 $self->_pushback($entry); | |
151 last; | |
152 } | |
153 } | |
154 | |
155 # | |
156 # matrix command | |
157 # | |
158 # first alignment section | |
159 if (@names == 0) { # taxa block did not exist | |
160 while ($entry = $self->_readline) { | |
161 local ($_) = $entry; | |
162 | |
163 s/\[[^[]+\]//g; #] remove comments | |
164 if ($interleave) { | |
165 /^\s+$/ and last; | |
166 } else { | |
167 /^\s+$/ and next; | |
168 } | |
169 /^\s*;\s*$/ and last; | |
170 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' | |
171 $name = ($2 || $3); | |
172 $str = $4; | |
173 $name =~ s/ /_/g; | |
174 push @names, $name; | |
175 | |
176 $str =~ s/\s//g; | |
177 $count = @names; | |
178 $hash{$count} = $str; | |
179 }; | |
180 $self->throw("Not a valid interleaved NEXUS file! | |
181 seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount; | |
182 } | |
183 } | |
184 | |
185 # interleaved sections | |
186 $count = 0; | |
187 while( $entry = $self->_readline) { | |
188 local ($_) = $entry; | |
189 s/\[[^[]+\]//g; #] remove comments | |
190 last if /^\s*;/; | |
191 | |
192 $count = 0, next if $entry =~ /^\s*$/; | |
193 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' | |
194 $str = $4; | |
195 $str =~ s/\s//g; | |
196 $count++; | |
197 $hash{$count} .= $str; | |
198 }; | |
199 $self->throw("Not a valid interleaved NEXUS file! | |
200 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount; | |
201 | |
202 } | |
203 | |
204 return 0 if @names < 1; | |
205 | |
206 # sequence creation | |
207 $count = 0; | |
208 foreach $name ( @names ) { | |
209 $count++; | |
210 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { | |
211 $seqname = $1; | |
212 $start = $2; | |
213 $end = $3; | |
214 } else { | |
215 $seqname=$name; | |
216 $start = 1; | |
217 $str = $hash{$count}; | |
218 $str =~ s/[^A-Za-z]//g; | |
219 $end = length($str); | |
220 } | |
221 | |
222 # consistency test | |
223 $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") | |
224 unless CORE::length($hash{$count}) == $residuecount; | |
225 | |
226 $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, | |
227 '-id'=>$seqname, | |
228 '-start'=>$start, | |
229 '-end'=>$end, | |
230 'alphabet'=>$alphabet | |
231 ); | |
232 $aln->add_seq($seq); | |
233 } | |
234 | |
235 # if matchchar is used | |
236 $aln->unmatch($match) if $match; | |
237 | |
238 # if equate ( e.g. equate="T=C G=A") is used | |
239 if ($equate) { | |
240 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g; | |
241 } | |
242 | |
243 while ($entry !~ /endblock/i) { | |
244 $entry = $self->_readline; | |
245 } | |
246 | |
247 return $aln; | |
248 } | |
249 | |
250 sub _read_taxlabels { | |
251 my ($self) = @_; | |
252 my ($name, @names); | |
253 while (my $entry = $self->_readline) { | |
254 ($name) = $entry =~ /\s*(\S+)\s+/; | |
255 $name =~ s/\[[^\[]+\]//g; | |
256 $name =~ s/\W/_/g; | |
257 push @names, $name; | |
258 last if /^\s*;/; | |
259 } | |
260 return @names; | |
261 } | |
262 | |
263 =head2 write_aln | |
264 | |
265 Title : write_aln | |
266 Usage : $stream->write_aln(@aln) | |
267 Function: Writes the $aln object into the stream in interleaved NEXUS | |
268 format. Everything is written into a data block. | |
269 SimpleAlign methods match_char, missing_char and gap_char must be set | |
270 if you want to see them in the output. | |
271 Returns : 1 for success and 0 for error | |
272 Args : L<Bio::Align::AlignI> object | |
273 | |
274 =cut | |
275 | |
276 sub write_aln { | |
277 my ($self,@aln) = @_; | |
278 my $count = 0; | |
279 my $wrapped = 0; | |
280 my $maxname; | |
281 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index ); | |
282 my ($match, $missing, $gap,$symbols) = ('', '', '',''); | |
283 | |
284 foreach my $aln (@aln) { | |
285 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { | |
286 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); | |
287 next; | |
288 } | |
289 $self->throw("All sequences in the alignment must be the same length") | |
290 unless $aln->is_flush($self->verbose); | |
291 | |
292 $length = $aln->length(); | |
293 | |
294 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n", | |
295 $aln->id, $aln->no_sequences, $length)); | |
296 $match = "match=". $aln->match_char if $aln->match_char; | |
297 $missing = "missing=". $aln->missing_char if $aln->missing_char; | |
298 $gap = "gap=". $aln->gap_char if $aln->gap_char; | |
299 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"' if( $aln->symbol_chars); | |
300 $self->_print (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n", | |
301 $aln->get_seq_by_pos(1)->alphabet, $match, $missing, $gap, $symbols)); | |
302 | |
303 my $indent = $aln->maxdisplayname_length; | |
304 $aln->set_displayname_flat(); | |
305 foreach $seq ( $aln->each_seq() ) { | |
306 $name = $aln->displayname($seq->get_nse()); | |
307 $name = sprintf("%-${indent}s", $name); | |
308 $hash{$name} = $seq->seq(); | |
309 push(@arr,$name); | |
310 } | |
311 | |
312 while( $count < $length ) { | |
313 # there is another block to go! | |
314 foreach $name ( @arr ) { | |
315 my $dispname = $name; | |
316 # $dispname = '' if $wrapped; | |
317 $self->_print (sprintf("%${indent}s ",$dispname)); | |
318 $tempcount = $count; | |
319 $index = 0; | |
320 while( ($tempcount + 10 < $length) && ($index < 5) ) { | |
321 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10))); | |
322 $tempcount += 10; | |
323 $index++; | |
324 } | |
325 # last | |
326 if( $index < 5) { | |
327 # space to print! | |
328 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount))); | |
329 $tempcount += 10; | |
330 } | |
331 $self->_print ("\n"); | |
332 } | |
333 $self->_print ("\n\n"); | |
334 $count = $tempcount; | |
335 $wrapped = 1; | |
336 } | |
337 $self->_print (";\n\nendblock;\n"); | |
338 } | |
339 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
340 return 1; | |
341 } | |
342 | |
343 1; |