Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/AlignIO/nexus.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: nexus.pm,v 1.12.2.1 2003/04/07 15:17:17 heikki Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::AlignIO::nexus | |
| 4 # | |
| 5 # Copyright Heikki Lehvaslaiho | |
| 6 # | |
| 7 | |
| 8 =head1 NAME | |
| 9 | |
| 10 Bio::AlignIO::nexus - NEXUS format sequence input/output stream | |
| 11 | |
| 12 =head1 SYNOPSIS | |
| 13 | |
| 14 Do not use this module directly. Use it via the L<Bio::AlignIO> class. | |
| 15 | |
| 16 =head1 DESCRIPTION | |
| 17 | |
| 18 This object can transform L<Bio::Align::AlignI> objects to and from NEXUS | |
| 19 data blocks. See method documentation for supported NEXUS features. | |
| 20 | |
| 21 =head1 ACKNOWLEDGEMENTS | |
| 22 | |
| 23 Will Fisher has written an excellent standalone NEXUS format parser in | |
| 24 perl, readnexus. A number of tricks were adapted from it. | |
| 25 | |
| 26 =head1 FEEDBACK | |
| 27 | |
| 28 =head2 Reporting Bugs | |
| 29 | |
| 30 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 31 the bugs and their resolution. | |
| 32 Bug reports can be submitted via email or the web: | |
| 33 | |
| 34 bioperl-bugs@bio.perl.org | |
| 35 http://bugzilla.bioperl.org/ | |
| 36 | |
| 37 =head1 AUTHORS - Heikki Lehvaslaiho | |
| 38 | |
| 39 Email: heikki@ebi.ac.uk | |
| 40 | |
| 41 | |
| 42 =head1 APPENDIX | |
| 43 | |
| 44 The rest of the documentation details each of the object | |
| 45 methods. Internal methods are usually preceded with a _ | |
| 46 | |
| 47 =cut | |
| 48 | |
| 49 # Let the code begin... | |
| 50 | |
| 51 package Bio::AlignIO::nexus; | |
| 52 use vars qw(@ISA %valid_type); | |
| 53 use strict; | |
| 54 no strict "refs"; | |
| 55 | |
| 56 use Bio::AlignIO; | |
| 57 | |
| 58 @ISA = qw(Bio::AlignIO); | |
| 59 | |
| 60 BEGIN { | |
| 61 %valid_type = map {$_, 1} qw( dna rna protein standard); | |
| 62 } | |
| 63 | |
| 64 =head2 next_aln | |
| 65 | |
| 66 Title : next_aln | |
| 67 Usage : $aln = $stream->next_aln() | |
| 68 Function: Returns the next alignment in the stream. | |
| 69 | |
| 70 Supports the following NEXUS format features: | |
| 71 - The file has to start with '#NEXUS' | |
| 72 - Reads in the name of the alignment from a comment | |
| 73 (anything after 'TITLE: ') . | |
| 74 - Sequence names can be given in a taxa block, too. | |
| 75 - If matchchar notation is used, converts | |
| 76 them back to sequence characters. | |
| 77 - Does character conversions specified in the | |
| 78 NEXUS equate command. | |
| 79 - Sequence names of type 'Homo sapiens' and | |
| 80 Homo_sapiens are treated identically. | |
| 81 | |
| 82 Returns : L<Bio::Align::AlignI> object | |
| 83 Args : | |
| 84 | |
| 85 =cut | |
| 86 | |
| 87 sub next_aln { | |
| 88 my $self = shift; | |
| 89 my $entry; | |
| 90 my ($aln_name, $seqcount, $residuecount, %hash, $alphabet, | |
| 91 $match, $gap, $missing, $equate, $interleave, | |
| 92 $name,$str,@names,$seqname,$start,$end,$count,$seq); | |
| 93 | |
| 94 my $aln = Bio::SimpleAlign->new(-source => 'nexus'); | |
| 95 | |
| 96 # file starts with '#NEXUS' but we allow white space only lines before it | |
| 97 $entry = $self->_readline; | |
| 98 $entry = $self->_readline while $entry =~ /^\s+$/; | |
| 99 | |
| 100 return unless $entry; | |
| 101 $self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry") | |
| 102 unless $entry =~ /^#NEXUS/i; | |
| 103 | |
| 104 # skip anything before either the taxa or data block | |
| 105 # but read in the optional title in a comment | |
| 106 while (defined($entry = $self->_readline)) { | |
| 107 local ($_) = $entry; | |
| 108 /\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1; | |
| 109 last if /^begin +data/i || /^begin +taxa/i; | |
| 110 } | |
| 111 $aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name; | |
| 112 | |
| 113 # data and taxa blocks | |
| 114 my $taxlabels; | |
| 115 while ($entry = $self->_readline) { | |
| 116 local ($_) = $entry; | |
| 117 | |
| 118 # read in seq names if in taxa block | |
| 119 $taxlabels = 1 if /taxlabels/i; | |
| 120 if ($taxlabels) { | |
| 121 @names = $self->_read_taxlabels; | |
| 122 $taxlabels = 0; | |
| 123 } | |
| 124 | |
| 125 /ntax ?= ?(\d+)/i and $seqcount = $1; | |
| 126 /nchar ?= ?(\d+)/i and $residuecount = $1; | |
| 127 /matchchar ?= ?(.)/i and $match = $1; | |
| 128 /gap ?= ?(.)/i and $gap = $1; | |
| 129 /missing ?= ?(.)/i and $missing = $1; | |
| 130 /equate ?= ?"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A"; | |
| 131 /datatype ?= ?(\w+)/i and $alphabet = lc $1; | |
| 132 /interleave/i and $interleave = 1 ; | |
| 133 | |
| 134 last if /matrix/i; | |
| 135 } | |
| 136 $self->throw("Not a valid NEXUS sequence file. Datatype not specified") | |
| 137 unless $alphabet; | |
| 138 $self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]") | |
| 139 unless $valid_type{$alphabet}; | |
| 140 | |
| 141 $aln->gap_char($gap); | |
| 142 $aln->missing_char($missing); | |
| 143 | |
| 144 # | |
| 145 # if data is not right after the matrix line | |
| 146 # read the empty lines out | |
| 147 # | |
| 148 while ($entry = $self->_readline) { | |
| 149 unless ($entry =~ /^\s+$/) { | |
| 150 $self->_pushback($entry); | |
| 151 last; | |
| 152 } | |
| 153 } | |
| 154 | |
| 155 # | |
| 156 # matrix command | |
| 157 # | |
| 158 # first alignment section | |
| 159 if (@names == 0) { # taxa block did not exist | |
| 160 while ($entry = $self->_readline) { | |
| 161 local ($_) = $entry; | |
| 162 | |
| 163 s/\[[^[]+\]//g; #] remove comments | |
| 164 if ($interleave) { | |
| 165 /^\s+$/ and last; | |
| 166 } else { | |
| 167 /^\s+$/ and next; | |
| 168 } | |
| 169 /^\s*;\s*$/ and last; | |
| 170 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' | |
| 171 $name = ($2 || $3); | |
| 172 $str = $4; | |
| 173 $name =~ s/ /_/g; | |
| 174 push @names, $name; | |
| 175 | |
| 176 $str =~ s/\s//g; | |
| 177 $count = @names; | |
| 178 $hash{$count} = $str; | |
| 179 }; | |
| 180 $self->throw("Not a valid interleaved NEXUS file! | |
| 181 seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount; | |
| 182 } | |
| 183 } | |
| 184 | |
| 185 # interleaved sections | |
| 186 $count = 0; | |
| 187 while( $entry = $self->_readline) { | |
| 188 local ($_) = $entry; | |
| 189 s/\[[^[]+\]//g; #] remove comments | |
| 190 last if /^\s*;/; | |
| 191 | |
| 192 $count = 0, next if $entry =~ /^\s*$/; | |
| 193 if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)\s$/) { #' | |
| 194 $str = $4; | |
| 195 $str =~ s/\s//g; | |
| 196 $count++; | |
| 197 $hash{$count} .= $str; | |
| 198 }; | |
| 199 $self->throw("Not a valid interleaved NEXUS file! | |
| 200 seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount; | |
| 201 | |
| 202 } | |
| 203 | |
| 204 return 0 if @names < 1; | |
| 205 | |
| 206 # sequence creation | |
| 207 $count = 0; | |
| 208 foreach $name ( @names ) { | |
| 209 $count++; | |
| 210 if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { | |
| 211 $seqname = $1; | |
| 212 $start = $2; | |
| 213 $end = $3; | |
| 214 } else { | |
| 215 $seqname=$name; | |
| 216 $start = 1; | |
| 217 $str = $hash{$count}; | |
| 218 $str =~ s/[^A-Za-z]//g; | |
| 219 $end = length($str); | |
| 220 } | |
| 221 | |
| 222 # consistency test | |
| 223 $self->throw("Length of sequence [$seqname] is not [$residuecount]! ") | |
| 224 unless CORE::length($hash{$count}) == $residuecount; | |
| 225 | |
| 226 $seq = new Bio::LocatableSeq('-seq'=>$hash{$count}, | |
| 227 '-id'=>$seqname, | |
| 228 '-start'=>$start, | |
| 229 '-end'=>$end, | |
| 230 'alphabet'=>$alphabet | |
| 231 ); | |
| 232 $aln->add_seq($seq); | |
| 233 } | |
| 234 | |
| 235 # if matchchar is used | |
| 236 $aln->unmatch($match) if $match; | |
| 237 | |
| 238 # if equate ( e.g. equate="T=C G=A") is used | |
| 239 if ($equate) { | |
| 240 $aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g; | |
| 241 } | |
| 242 | |
| 243 while ($entry !~ /endblock/i) { | |
| 244 $entry = $self->_readline; | |
| 245 } | |
| 246 | |
| 247 return $aln; | |
| 248 } | |
| 249 | |
| 250 sub _read_taxlabels { | |
| 251 my ($self) = @_; | |
| 252 my ($name, @names); | |
| 253 while (my $entry = $self->_readline) { | |
| 254 ($name) = $entry =~ /\s*(\S+)\s+/; | |
| 255 $name =~ s/\[[^\[]+\]//g; | |
| 256 $name =~ s/\W/_/g; | |
| 257 push @names, $name; | |
| 258 last if /^\s*;/; | |
| 259 } | |
| 260 return @names; | |
| 261 } | |
| 262 | |
| 263 =head2 write_aln | |
| 264 | |
| 265 Title : write_aln | |
| 266 Usage : $stream->write_aln(@aln) | |
| 267 Function: Writes the $aln object into the stream in interleaved NEXUS | |
| 268 format. Everything is written into a data block. | |
| 269 SimpleAlign methods match_char, missing_char and gap_char must be set | |
| 270 if you want to see them in the output. | |
| 271 Returns : 1 for success and 0 for error | |
| 272 Args : L<Bio::Align::AlignI> object | |
| 273 | |
| 274 =cut | |
| 275 | |
| 276 sub write_aln { | |
| 277 my ($self,@aln) = @_; | |
| 278 my $count = 0; | |
| 279 my $wrapped = 0; | |
| 280 my $maxname; | |
| 281 my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index ); | |
| 282 my ($match, $missing, $gap,$symbols) = ('', '', '',''); | |
| 283 | |
| 284 foreach my $aln (@aln) { | |
| 285 if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { | |
| 286 $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); | |
| 287 next; | |
| 288 } | |
| 289 $self->throw("All sequences in the alignment must be the same length") | |
| 290 unless $aln->is_flush($self->verbose); | |
| 291 | |
| 292 $length = $aln->length(); | |
| 293 | |
| 294 $self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n", | |
| 295 $aln->id, $aln->no_sequences, $length)); | |
| 296 $match = "match=". $aln->match_char if $aln->match_char; | |
| 297 $missing = "missing=". $aln->missing_char if $aln->missing_char; | |
| 298 $gap = "gap=". $aln->gap_char if $aln->gap_char; | |
| 299 $symbols = 'symbols="'.join('',$aln->symbol_chars). '"' if( $aln->symbol_chars); | |
| 300 $self->_print (sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n", | |
| 301 $aln->get_seq_by_pos(1)->alphabet, $match, $missing, $gap, $symbols)); | |
| 302 | |
| 303 my $indent = $aln->maxdisplayname_length; | |
| 304 $aln->set_displayname_flat(); | |
| 305 foreach $seq ( $aln->each_seq() ) { | |
| 306 $name = $aln->displayname($seq->get_nse()); | |
| 307 $name = sprintf("%-${indent}s", $name); | |
| 308 $hash{$name} = $seq->seq(); | |
| 309 push(@arr,$name); | |
| 310 } | |
| 311 | |
| 312 while( $count < $length ) { | |
| 313 # there is another block to go! | |
| 314 foreach $name ( @arr ) { | |
| 315 my $dispname = $name; | |
| 316 # $dispname = '' if $wrapped; | |
| 317 $self->_print (sprintf("%${indent}s ",$dispname)); | |
| 318 $tempcount = $count; | |
| 319 $index = 0; | |
| 320 while( ($tempcount + 10 < $length) && ($index < 5) ) { | |
| 321 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10))); | |
| 322 $tempcount += 10; | |
| 323 $index++; | |
| 324 } | |
| 325 # last | |
| 326 if( $index < 5) { | |
| 327 # space to print! | |
| 328 $self->_print (sprintf("%s ",substr($hash{$name},$tempcount))); | |
| 329 $tempcount += 10; | |
| 330 } | |
| 331 $self->_print ("\n"); | |
| 332 } | |
| 333 $self->_print ("\n\n"); | |
| 334 $count = $tempcount; | |
| 335 $wrapped = 1; | |
| 336 } | |
| 337 $self->_print (";\n\nendblock;\n"); | |
| 338 } | |
| 339 $self->flush if $self->_flush_on_write && defined $self->_fh; | |
| 340 return 1; | |
| 341 } | |
| 342 | |
| 343 1; |
