2
|
1 #!/usr/local/bin/perl -w
|
|
2
|
|
3 =head1 NAME
|
|
4
|
|
5 logo.pm - organizes data in FASTA and CLUSTAL formats into height data.
|
|
6
|
|
7 =head1 SYNOPSIS
|
|
8
|
|
9 Perl module
|
|
10
|
|
11 =head1 DESCRIPTION
|
|
12
|
|
13 logo.pm: Takes in strings of aligned sequences and sorts them vertically
|
|
14 based on height as assigned by the following equations found in
|
|
15 Schneider and Stephens paper "Sequence Logos: A New Way to Display
|
|
16 Consensus Sequences":
|
|
17
|
|
18 height = f(b,l) * R(l) (1)
|
|
19
|
|
20 where f(b,l) is the frequency of base or amino acid "b" at position
|
|
21 "l". R(l) is amount of information present at position "l" and can
|
|
22 be quantified as follows:
|
|
23
|
|
24 R(l) for amino acids = log(20) - (H(l) + e(n)) (2a)
|
|
25 R(l) for nucleic acids = 2 - (H(l) + e(n)) (2b)
|
|
26
|
|
27 where log is taken base 2, H(l) is the uncertainty at position "l",
|
|
28 and e(n) is the error correction factor for small "n". H(l) is
|
|
29 computed as follows:
|
|
30
|
|
31 H(l) = - (Sum f(b,l) * log[ f(b,l) ]) (3)
|
|
32
|
|
33 where again, log is taken base 2. f(b,l) is the frequency of base
|
|
34 "b" at position "l". The sum is taken over all amino acids or
|
|
35 bases, depending on which the data is.
|
|
36
|
|
37 Currently, logo.pm uses an approximation for e(n), given by:
|
|
38
|
|
39 e(n) = (s-1) / (2 * ln2 * n) (4)
|
|
40
|
|
41 Where s is 4 for nucleotides, 20 for amino acids ; n is the number
|
|
42 of sequences in the alignment. e(n) also gives the height of error
|
|
43 bars.
|
|
44
|
|
45 =cut
|
|
46
|
|
47 package logo;
|
|
48
|
|
49 use strict;
|
|
50 use Carp;
|
|
51
|
|
52 ################################################################################
|
|
53 ###### SOME VARIABLES ######
|
|
54 ################################################################################
|
|
55
|
|
56 my $DEBUG = 0;
|
|
57
|
|
58 my $AA = 0;
|
|
59 my $NA = 1;
|
|
60
|
|
61 my %BASES = ("a" => "adenine",
|
|
62 "t" => "thymine",
|
|
63 "g" => "guanine",
|
|
64 "c" => "cytosine",
|
|
65 "u" => "uracil");
|
|
66
|
|
67 # does not include B or Z
|
|
68 my %AMINOACIDS = ("a" => "", "c" => "", "d" => "", "e" => "", "f" => "",
|
|
69 "g" => "", "h" => "", "i" => "", "k" => "", "l" => "",
|
|
70 "m" => "", "n" => "", "p" => "", "q" => "", "r" => "",
|
|
71 "s" => "", "t" => "", "v" => "", "w" => "", "y" => "");
|
|
72
|
|
73 my @data;
|
|
74 my $kind;
|
|
75 my ($seqs_r, $desc_r);
|
|
76
|
|
77 my $CONFIDENCE_LIMIT = 0.90;
|
|
78
|
|
79 ################################################################################
|
|
80 ###### SOME FUNCTIONS ######
|
|
81 ################################################################################
|
|
82
|
|
83 =head1 APPENDIX
|
|
84
|
|
85 =cut
|
|
86
|
|
87 =head2 getHeightData()
|
|
88
|
|
89 Usage : my ($height_data_r, $description_data_r, $kind) =
|
|
90 logo::getHeightData($input_data_r, $params);
|
|
91 Returns : * REFERENCE TO array of height data
|
|
92 * REFERENCE TO array of input description strings
|
|
93 * $AA if the data is amino acid, $NA otherise
|
|
94 Args : $input_data_r : input data in CLUSTAL or FASTA formats
|
|
95 : $params : hash of parameters
|
|
96
|
|
97 getHeightData is the entry point into the logo.pm module. $input_data_r is a
|
|
98 reference to an array of strings containing FASTA or CLUSTAL data, where all
|
|
99 lines whose first character is "#" is considered a comment line.
|
|
100
|
|
101 $params is a hash of parameters with the following keys:
|
|
102 * smallsampletoggle : 0 to turn off small sample correction, otherwise
|
|
103 small sample correction is on
|
|
104 * input_kind : 0 for amino acids, 1 for nucleic acids; if undefined,
|
|
105 logo.pm will attempt to automatically detect whether the
|
|
106 input consists of amino or nucleic acid data. If
|
|
107 $input_kind is defined, only those residues defined by
|
|
108 $input_kind will be in the output -- all other residues are
|
|
109 considered as spaces. For example, if $input_kind is $NA,
|
|
110 the residue "I" or "i" are considered spaces, since "I" and
|
|
111 "i" are not nucleic acid residues.
|
|
112 * stretch : stretch all characters so they are flush at the maximum number
|
|
113 of bits allowed
|
|
114
|
|
115 Sample use:
|
|
116
|
|
117 # get FASTA data
|
|
118 open (FASTA, "$fastafile");
|
|
119 my @inputdata = <FASTA>;
|
|
120 close (FASTA);
|
|
121
|
|
122 my %heightparams = (
|
|
123 smallsamplecorrection => 0,
|
|
124 input_kind => 0,
|
|
125 stretch => 0
|
|
126 );
|
|
127
|
|
128 # get height data
|
|
129 my ($heightdata_r, $desc_r, $kind) = logo::getHeightData(\@inputdata, \%heightparams);
|
|
130
|
|
131 =cut
|
|
132
|
|
133 # entry point into module
|
|
134 sub getHeightData {
|
|
135
|
|
136 # $smallsampletoggle is toggle to turn off small sample correction (1 to turn off)
|
|
137 # $input_kind can be $AA or $NA or undef
|
|
138 my ($input_data_r, $params) = @_;
|
|
139
|
|
140 # gary 040119: adjust for formats (Unix is \n, Mac is \r, Windows is \r\n)
|
|
141 $input_data_r = normalizeData($input_data_r);
|
|
142
|
|
143 # gets sequences, sets $kind temporarily
|
|
144 my ($goodlength, $maxreslength, $badline, $validformat);
|
|
145 ($seqs_r, $desc_r, $maxreslength, $goodlength, $badline, $validformat) =
|
|
146 getSeqs($input_data_r, $params->{input_kind});
|
|
147
|
|
148 # for(my $i = 0; $i < scalar @$seqs_r ; $i++) {
|
|
149 # print STDERR ($desc_r->[$i] . "\n" . $seqs_r->[$i] . "\n");
|
|
150 # }
|
|
151 # print STDERR "maxreslength = $maxreslength\n";
|
|
152 #
|
|
153 # exit(0);
|
|
154
|
|
155 if ($DEBUG) { print STDERR ("point 1\n");}
|
|
156
|
|
157 # check for valid format
|
|
158 if ((defined $validformat) && ($validformat == 1)) {
|
|
159 # print("returning\n");
|
|
160 return (undef, undef, undef, undef, undef, 1);
|
|
161 }
|
|
162
|
|
163 if ($DEBUG) { print STDERR ("point 2\n");}
|
|
164
|
|
165 # check for bad length
|
|
166 if (!$goodlength) {
|
|
167 return (undef, undef, undef, $goodlength, $badline);
|
|
168 }
|
|
169
|
|
170 # reset $kind if in $input_kind
|
|
171 if (defined $params->{input_kind} && isLegalKind($params->{input_kind}) ) {
|
|
172 $kind = $params->{input_kind};
|
|
173 }
|
|
174
|
|
175 # build data
|
|
176 buildData(@$seqs_r, $params->{smallsampletoggle}, $params->{stretch}, $maxreslength);
|
|
177
|
|
178 if ($DEBUG) { print STDERR ("point 3\n");}
|
|
179
|
|
180 # print STDERR ("data size = ", scalar @data, "\n");
|
|
181 # foreach (@data) {
|
|
182 # print STDERR ("$_\n");
|
|
183 # }
|
|
184 #
|
|
185 # exit(0);
|
|
186 #
|
|
187 # print STDERR ("return at 2\n");
|
|
188 return (\@data, $desc_r, $kind, $goodlength, $badline);
|
|
189 }
|
|
190
|
|
191 sub isLegalKind {
|
|
192 return ($_[0] =~ /^[01]$/);
|
|
193 }
|
|
194
|
|
195 ################################################################################
|
|
196 #
|
|
197 # sub normalizeData($data_r) returns $data_r, with Mac/Unix/Windows newline
|
|
198 # style normalized to standard Unix-style newline style
|
|
199 #
|
|
200 ################################################################################
|
|
201 sub normalizeData {
|
|
202 my ($data_r) = @_;
|
|
203
|
|
204 # check args
|
|
205 if (not defined $data_r) {
|
|
206 die "data_r must be defined\n";
|
|
207 }
|
|
208
|
|
209 my @normalized = ();
|
|
210 foreach my $pseudo_line (@$data_r) {
|
|
211 my @split_line = split(/[\r\n]+/, $pseudo_line);
|
|
212 push(@normalized, @split_line);
|
|
213 }
|
|
214
|
|
215 return \@normalized;
|
|
216 }
|
|
217
|
|
218
|
|
219 ################################################################################
|
|
220 #
|
|
221 # sub getSeqs($data_r, $kind) returns 5 values:
|
|
222 #
|
|
223 # * array reference to sequence strings
|
|
224 # * array reference to sequence names
|
|
225 # * length of sequence
|
|
226 # * 1 if all sequences have the same length, 0 else
|
|
227 # * line number L where sequenceLength(L) != sequenceLength(other lines), else
|
|
228 # undef
|
|
229 #
|
|
230 ################################################################################
|
|
231 sub getSeqs {
|
|
232 my ($input_data_r, $kind) = @_;
|
|
233
|
|
234 unless( $input_data_r->[0] ){
|
|
235 return (undef, undef, undef, undef, undef, 1);
|
|
236 }
|
|
237
|
|
238 # skip all comment chars and lines of all spaces
|
|
239 while ( ($input_data_r->[0] =~ /^\s*\#/) || ($input_data_r->[0] =~ /^\s*$/) ) {
|
|
240 shift @$input_data_r;
|
|
241 if( !defined $input_data_r->[0]) {return (undef, undef, undef, undef, undef, 1);}
|
|
242 }
|
|
243
|
|
244 if (isFormat_FASTA($input_data_r)) {
|
|
245 return getSeqs_FASTA($input_data_r, $kind);
|
|
246
|
|
247 } elsif (isFormat_CLUSTAL($input_data_r)) {
|
|
248 return getSeqs_CLUSTAL($input_data_r, $kind);
|
|
249
|
|
250 } elsif (isFormat_FLAT($input_data_r)) {
|
|
251 return getSeqs_FLAT($input_data_r, $kind);
|
|
252
|
|
253 } else {
|
|
254 if ($DEBUG) {print STDERR ("format nothing\n");}
|
|
255 return (undef, undef, undef, undef, undef, 1);
|
|
256 }
|
|
257
|
|
258 # if ($_[0] =~ />/) {
|
|
259 # return getSeqs_FASTA(@_);
|
|
260 # } else {
|
|
261 # return getSeqs_CLUSTAL(@_);
|
|
262 # }
|
|
263 }
|
|
264
|
|
265 ################################################################################
|
|
266 #
|
|
267 # sub isFormat_FASTA($data_r) returns 1 if $data_r is in FASTA format
|
|
268 #
|
|
269 ################################################################################s
|
|
270 sub isFormat_FASTA {
|
|
271 my ($input_data_r) = @_;
|
|
272
|
|
273 # check args
|
|
274 if (not defined $input_data_r) {
|
|
275 Carp::confess("logo::isFormat_FASTA : input_data_r must be defined\n");
|
|
276 }
|
|
277
|
|
278 if ($input_data_r->[0] =~ />/) {
|
|
279 return 1;
|
|
280 } else {
|
|
281 return 0;
|
|
282 }
|
|
283 }
|
|
284
|
|
285 ################################################################################
|
|
286 #
|
|
287 # sub isFormat_CLUSTAL($data_r) returns 1 if $data_r is in CLUSTAL format
|
|
288 #
|
|
289 ################################################################################
|
|
290 sub isFormat_CLUSTAL {
|
|
291 my ($input_data_r) = @_;
|
|
292
|
|
293 # check args
|
|
294 if (not defined $input_data_r) {
|
|
295 Carp::confess("logo::isFormat_CLUSTAL : input_data_r must be " .
|
|
296 "defined\n");
|
|
297 }
|
|
298
|
|
299 my $i=0;
|
|
300
|
|
301 # # skip spaces or just "*" and "." and ":"
|
|
302 # while ($input_data_r->[$i] =~ /^[\*\:\s]*$/) {
|
|
303 # $i++;
|
|
304 # }
|
|
305
|
|
306 # if it looks like CLUSTAL W (version) ... , then it must be clustal
|
|
307 if ($input_data_r->[$i] =~ /^\s*CLUSTAL/) {
|
|
308 return 1;
|
|
309 }
|
|
310
|
|
311 # CLUSTAL looks like: "name seq"
|
|
312 if ($input_data_r->[$i] =~ /^\s*(\S+)\s+(\S+)\s*$/) {
|
|
313 return 1;
|
|
314 } else {
|
|
315 return 0;
|
|
316 }
|
|
317 }
|
|
318
|
|
319 ################################################################################
|
|
320 #
|
|
321 # sub isFormat_FLAT($data_r) returns 1 if $data_r is in FLAT format
|
|
322 #
|
|
323 ################################################################################
|
|
324 sub isFormat_FLAT {
|
|
325 my ($input_data_r) = @_;
|
|
326
|
|
327 # check args
|
|
328 if (not defined $input_data_r) {
|
|
329 Carp::confess("logo::isFormat_FLAT : input_data_r must be defined\n");
|
|
330 }
|
|
331
|
|
332 # print("checking flat\n");
|
|
333 # print("first element = -->", $input_data_r->[0], "<--\n");
|
|
334
|
|
335 if ($input_data_r->[0] =~ /^[a-zA-Z\-]+\s*$/) {
|
|
336 return 1;
|
|
337 } else {
|
|
338 return 0;
|
|
339 }
|
|
340 }
|
|
341
|
|
342 ################################################################################
|
|
343 ###### FORMATTING FUNCTIONS ######
|
|
344 ################################################################################
|
|
345
|
|
346 # the flat sequence format is as follows:
|
|
347 # sequence1
|
|
348 # sequence2
|
|
349 # sequence3
|
|
350 # ...
|
|
351 # sequenceN
|
|
352 sub getSeqs_FLAT {
|
|
353
|
|
354 if ($DEBUG) {print STDERR "DOING FLAT\n";}
|
|
355
|
|
356 my ($input_data_r, $input_kind) = @_;
|
|
357
|
|
358 my $linelength = 0;
|
|
359 my $seqCount = 0;
|
|
360 my $total_residues = 0;
|
|
361 my (@returnVal, @desc) = ();
|
|
362 my $prevlinelength = undef;
|
|
363 my $NA_count = 0;
|
|
364
|
|
365 foreach my $seq (@$input_data_r) {
|
|
366 # chomp $seq;
|
|
367 $seq =~ s/\s+$//;
|
|
368
|
|
369 my @chars = split(//,$seq);
|
|
370
|
|
371 my $char;
|
|
372 foreach (@chars) {
|
|
373 $total_residues++;
|
|
374 $linelength++;
|
|
375
|
|
376 # set $char
|
|
377 if (defined $input_kind) {
|
|
378 if ($input_kind == $AA) {
|
|
379 $char = (isAA($_)) ? $_ : "-";
|
|
380 } else { # == $NA
|
|
381 $char = (isNA($_)) ? $_ : "-";
|
|
382 }
|
|
383 } else {
|
|
384 $char = $_;
|
|
385 if (isNA($char)) {
|
|
386 $NA_count++;
|
|
387 }
|
|
388 }
|
|
389
|
|
390 $returnVal[$seqCount] .= $char;
|
|
391 }
|
|
392 $desc[$seqCount] = "no name";
|
|
393
|
|
394 if ($seqCount == 0) {
|
|
395 $prevlinelength = $linelength;
|
|
396 } elsif ($prevlinelength != $linelength) { # different number of residues, so complain
|
|
397 return (undef, undef, undef, 0, $seq); # 0 for not same length, $seq is name
|
|
398 }
|
|
399 $linelength=0;
|
|
400
|
|
401 $seqCount++;
|
|
402 }
|
|
403
|
|
404 # determine whether to use $NA or $AA
|
|
405 if (!defined $input_kind) {
|
|
406 if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
|
|
407 $kind = $NA;
|
|
408 } else {
|
|
409 $kind = $AA;
|
|
410 }
|
|
411 }
|
|
412
|
|
413 return (\@returnVal, \@desc, $prevlinelength, 1, undef);
|
|
414 }
|
|
415
|
|
416 sub getSeqs_CLUSTAL {
|
|
417
|
|
418 if ($DEBUG) {print STDERR "DOING CLUSTAL\n";}
|
|
419
|
|
420 my ($input_data_r, $input_kind) = @_;
|
|
421
|
|
422 my @returnVal;
|
|
423 my @desc;
|
|
424 my $seqCount=0;
|
|
425 my $reslength = 0;
|
|
426 my ($name, $seq);
|
|
427
|
|
428 # my $input_kind = pop @_;
|
|
429 # my $CONFIDENCE_LIMIT = 0.90;
|
|
430 my $NA_count = 0;
|
|
431 my $total_residues = 0;
|
|
432 my ($prevlinelength, $linelength) = (0,0);
|
|
433
|
|
434 # foreach (@_) {
|
|
435 foreach (@$input_data_r) {
|
|
436 # chomp;
|
|
437
|
|
438 if ($DEBUG) {print STDERR ("line = $_\n")};
|
|
439
|
|
440 $_ =~ s/\s+$//;
|
|
441
|
|
442 # skip if it is a comment character -- first character is "#"
|
|
443 next if (/^\s*\#/);
|
|
444
|
|
445 # skil if it is a CLUSTAL W header line
|
|
446 next if (/^\s*CLUSTAL/);
|
|
447
|
|
448 # if spaces or just "*" and "." and ":"
|
|
449 if (/^[\*\.\:\s]*$/) {
|
|
450 $seqCount=0;
|
|
451 $prevlinelength=0;
|
|
452 next;
|
|
453 }
|
|
454
|
|
455 ($name,$seq) = (/^\s*(\S+)\s+(\S+)\s*$/);
|
|
456
|
|
457 if ($DEBUG) { print STDERR ("name, seq = $name, $seq\n"); }
|
|
458
|
|
459 # add new entry
|
|
460 if (!defined $desc[$seqCount]) {
|
|
461 $desc[$seqCount] = $name;
|
|
462 $returnVal[$seqCount] = "";
|
|
463 }
|
|
464
|
|
465 if(!defined $seq) {return (undef, undef, undef, undef, undef, 1);} # Something has gone terrible wrong
|
|
466
|
|
467 my @chars = split(//,$seq);
|
|
468 my $char;
|
|
469 foreach (@chars) {
|
|
470 if ($seqCount == 0) {
|
|
471 $reslength++; # all sequences have same residue length, so only count first one
|
|
472 }
|
|
473
|
|
474 $total_residues++;
|
|
475 $linelength++;
|
|
476
|
|
477 # set $char
|
|
478 if (defined $input_kind) {
|
|
479 if ($input_kind == $AA) {
|
|
480 $char = (isAA($_)) ? $_ : "-";
|
|
481 } else { # == $NA
|
|
482 $char = (isNA($_)) ? $_ : "-";
|
|
483 }
|
|
484 } else {
|
|
485 $char = $_;
|
|
486 if (isNA($char)) {
|
|
487 $NA_count++;
|
|
488 }
|
|
489 }
|
|
490
|
|
491 $returnVal[$seqCount] .= $char;
|
|
492 }
|
|
493
|
|
494 if ($seqCount == 0) {
|
|
495 $prevlinelength = $linelength;
|
|
496 } elsif ($prevlinelength != $linelength) { # different number of residues, so complain
|
|
497 return (undef, undef, undef, 0, $name);
|
|
498 }
|
|
499 $linelength=0;
|
|
500
|
|
501 $seqCount++;
|
|
502 }
|
|
503
|
|
504 # determine whether to use $NA or $AA
|
|
505 if (!defined $input_kind ) {
|
|
506 if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
|
|
507 $kind = $NA;
|
|
508 } else {
|
|
509 $kind = $AA;
|
|
510 }
|
|
511 }
|
|
512
|
|
513 return (\@returnVal, \@desc, $reslength, 1, undef);
|
|
514 }
|
|
515
|
|
516 # if $input_kind is defined, residues that are not defined are set to space
|
|
517 sub getSeqs_FASTA {
|
|
518
|
|
519 if ($DEBUG) {print STDERR "DOING FASTA\n";}
|
|
520
|
|
521 my ($input_data_r, $input_kind) = @_;
|
|
522
|
|
523 my @returnVal;
|
|
524 my @desc;
|
|
525 my $count=-1;
|
|
526 my $newElem=0;
|
|
527
|
|
528 # my $input_kind = pop @_;
|
|
529
|
|
530 # my $CONFIDENCE_LIMIT = 0.90;
|
|
531 my $NA_count = 0;
|
|
532 my $total_residues = 0;
|
|
533 my $reslength = 0;
|
|
534 my $maxreslength = 0;
|
|
535
|
|
536 my ($goodlength, $currline, $prevline);
|
|
537
|
|
538
|
|
539 # # skip all lines that are all spaces
|
|
540 # while ($_[0] =~ /^\s*$/) {
|
|
541 # shift @_;
|
|
542 # }
|
|
543
|
|
544 # foreach (@_) {
|
|
545 foreach (@$input_data_r) {
|
|
546
|
|
547 $_ =~ s/\s+$//;
|
|
548
|
|
549 # skip if it is a comment character -- first character is "#"
|
|
550 next if (/^\s*\#/);
|
|
551
|
|
552 # skip all lines that are all spaces
|
|
553 next if (/^\s*$/);
|
|
554
|
|
555 $_ =~ s/\s+$//; # cut trailing white space
|
|
556 $_ =~ s/^\s+//; # cut leading white space
|
|
557 if (/>/) {
|
|
558 $currline = $_;
|
|
559 ($desc[scalar @desc]) = ($_ =~ />\s*(.+)$/);
|
|
560
|
|
561 if (not $newElem) {
|
|
562 $count++;
|
|
563 $newElem = 1;
|
|
564 }
|
|
565 } else {
|
|
566 if ($newElem) {
|
|
567 $maxreslength = $reslength if $maxreslength == 0;
|
|
568 if (($maxreslength != 0) && ($maxreslength != $reslength)) {
|
|
569 return (undef, undef, undef, 0, $prevline);
|
|
570 }
|
|
571
|
|
572 $maxreslength = $reslength;
|
|
573 $reslength = 0;
|
|
574 }
|
|
575
|
|
576 my @chars = split(//,$_);
|
|
577 my $char;
|
|
578 foreach (@chars) {
|
|
579 $reslength++;
|
|
580 $total_residues++;
|
|
581
|
|
582 # set $char
|
|
583 if (defined $input_kind) {
|
|
584 if ($input_kind == $AA) {
|
|
585 $char = (isAA($_)) ? $_ : "-";
|
|
586 } else { # == $NA
|
|
587 $char = (isNA($_)) ? $_ : "-";
|
|
588 }
|
|
589 } else {
|
|
590 $char = ($_ =~ /[a-zA-Z]/) ? $_ : "-"; # if not alpha char, use space
|
|
591 if (isNA($char) && !isSpace($char)) {
|
|
592 $NA_count++;
|
|
593 }
|
|
594 }
|
|
595
|
|
596 if ($newElem) {
|
|
597 $returnVal[$count] = $char;
|
|
598 } else {
|
|
599 $returnVal[$count] .= $char;
|
|
600 }
|
|
601 $newElem = 0;
|
|
602 }
|
|
603 $prevline = $currline if $currline =~ />/;
|
|
604 }
|
|
605 }
|
|
606
|
|
607 # check if last is biggest
|
|
608 if (($maxreslength != 0) && ($maxreslength != $reslength)) {
|
|
609 return (undef, undef, undef, 0, $prevline);
|
|
610 }
|
|
611 # $maxreslength = ($reslength > $maxreslength) ? $reslength : $maxreslength;
|
|
612
|
|
613 # determine whether to use $NA or $AA
|
|
614 if (!defined $input_kind) {
|
|
615 if ($NA_count / ($total_residues+1) >= $CONFIDENCE_LIMIT) {
|
|
616 $kind = $NA;
|
|
617 } else {
|
|
618 $kind = $AA;
|
|
619 }
|
|
620 }
|
|
621
|
|
622 return (\@returnVal, \@desc, $maxreslength || $reslength, 1, undef); # 1 for good lengths
|
|
623 }
|
|
624
|
|
625 sub isSpace {
|
|
626 return $_[0] =~ /[ \-]/;
|
|
627 }
|
|
628
|
|
629 sub isAA {
|
|
630 return (defined $AMINOACIDS{lc $_[0]});
|
|
631 }
|
|
632
|
|
633 sub isNA {
|
|
634 return (defined $BASES{lc $_[0]});
|
|
635 }
|
|
636
|
|
637 ################################################################################
|
|
638 ###### DATA BUILDING FUNCTIONS ######
|
|
639 ################################################################################
|
|
640
|
|
641
|
|
642 # arguments: takes reference to array and lines aligned sequences of bases or
|
|
643 # amino acids
|
|
644 # returns: updated reference array to reflect contents of sequences sorted
|
|
645 # vertically by height as described by (1)
|
|
646 sub buildData {
|
|
647
|
|
648 my $currentx = 0;
|
|
649 my $h;
|
|
650 my $count=0;
|
|
651 my $maxreslength = pop (@_);
|
|
652 my $stretch = pop(@_);
|
|
653 my $smallsampletoggle = pop (@_);
|
|
654 my $totalsize = $#_+1;
|
|
655
|
|
656 while ($currentx < $maxreslength) { #(length $_[0])) {
|
|
657 my $allspaces = 1;
|
|
658 my $spaceCount=0;
|
|
659
|
|
660 # get vertical sequence
|
|
661 my @vert=();
|
|
662 foreach (@_) { # foreach sequence
|
|
663 my $currentchar;
|
|
664
|
|
665 # set currentchar, set to " " if $_ is not long enough
|
|
666 if ($currentx >= (length $_)) {
|
|
667 $currentchar = " ";
|
|
668 } else {
|
|
669 $currentchar = substr($_,$currentx,1);
|
|
670 }
|
|
671
|
|
672 # in all sequences, "-" is considered as a space
|
|
673 # don't count " " and "-"
|
|
674 if (($currentchar ne "-") && ($currentchar ne " ")) {
|
|
675 $vert[scalar @vert] = uc substr($_,$currentx,1);
|
|
676 $allspaces = 0;
|
|
677 } else {
|
|
678 $spaceCount++;
|
|
679 }
|
|
680 }
|
|
681
|
|
682 if ($allspaces) {
|
|
683 # build @vert
|
|
684 @vert = (" 0", ">0");
|
|
685
|
|
686 # place in @data
|
|
687 $data[scalar @data] = \@vert;
|
|
688
|
|
689 $currentx++;
|
|
690 next;
|
|
691 }
|
|
692
|
|
693 my $error;
|
|
694 if ($smallsampletoggle) {
|
|
695 $error=getError($kind,$totalsize - $spaceCount);
|
|
696 } else {
|
|
697 $error = 0;
|
|
698 }
|
|
699
|
|
700 # sort vertical sequence by amino acid or base
|
|
701 @vert = sort(@vert);
|
|
702 my $total = $#vert + 1;
|
|
703
|
|
704 # find H(l) -- must be done before collapsing
|
|
705 $h = getH(@vert);
|
|
706
|
|
707 # collect like terms
|
|
708 @vert = collapse(@vert);
|
|
709
|
|
710 # get R(l)
|
|
711 my $r;
|
|
712 if (!defined $stretch || !$stretch) {
|
|
713 $r= getR($kind, $h, $error);
|
|
714 } else {
|
|
715 $r = ($kind == $NA) ? 2 : (log(20) / log(2));
|
|
716 }
|
|
717
|
|
718 # place heights
|
|
719 my $count=0;
|
|
720 my $height;
|
|
721 my $elem;
|
|
722 foreach $elem (@vert) {
|
|
723 $height = getHeight(substr($elem, 2) / $total,$r);
|
|
724 $vert[$count] = substr($elem,0,1) . $height;
|
|
725 $count++;
|
|
726 }
|
|
727
|
|
728 # sort by height
|
|
729 @vert = sort height_sort @vert;
|
|
730
|
|
731 # put in error bar size
|
|
732 $vert[$count] = ">$error";
|
|
733
|
|
734 # place in @data
|
|
735 $data[scalar @data] = \@vert;
|
|
736
|
|
737 $currentx++;
|
|
738 }
|
|
739 }
|
|
740
|
|
741 # uses error approximation given by:
|
|
742 # e := (s-1) / (2 * ln2 * ntrue);
|
|
743 sub getError {
|
|
744 return ((($_[0] == $NA) ? 4 : 20) - 1) / (2 * log(2) * $_[1]);
|
|
745 }
|
|
746
|
|
747 sub height_sort {
|
|
748 my ($lettera, $heighta) = ($a =~ /^(.{1})(\S+)$/); #substr($a,1);
|
|
749 my ($letterb, $heightb) = ($b =~ /^(.{1})(\S+)$/); #substr($b,1);
|
|
750
|
|
751 # compare by height first, then letter
|
|
752 if ($heighta <=> $heightb) {
|
|
753 return $heighta <=> $heightb;
|
|
754 } else {
|
|
755 return $letterb cmp $lettera; #reversed for some reason...
|
|
756 }
|
|
757 }
|
|
758
|
|
759 sub collapse {
|
|
760 my @returnVal;
|
|
761 my $current = $_[0];
|
|
762 my $count=0;
|
|
763 my $freq;
|
|
764
|
|
765 foreach (@_) {
|
|
766 if ($current eq $_) {
|
|
767 $count++;
|
|
768 } else {
|
|
769 $returnVal[scalar @returnVal] = "$current $count";
|
|
770
|
|
771 $current = $_;
|
|
772 $count=1;
|
|
773 }
|
|
774 }
|
|
775
|
|
776 # add last element
|
|
777 $returnVal[scalar @returnVal] = "$current $count";
|
|
778
|
|
779 return @returnVal;
|
|
780 }
|
|
781
|
|
782 # arguments : $_[0] : list of bases or amino acids
|
|
783 sub getH {
|
|
784 my $h = 0;
|
|
785 my (@vert) = @_; # vertical sequence (comparing multiple sequences)
|
|
786
|
|
787 my $current = $vert[0];
|
|
788 my $count=0;
|
|
789 my $freq;
|
|
790
|
|
791 foreach (@vert) {
|
|
792 if ($current eq $_) {
|
|
793 $count++;
|
|
794 } else {
|
|
795 $freq = $count / ($#vert + 1);
|
|
796 $h += $freq * (log ($freq) / log(2));
|
|
797
|
|
798 $current = $_;
|
|
799 $count=1;
|
|
800 }
|
|
801 }
|
|
802
|
|
803 # add last element
|
|
804 $freq = $count / ($#vert + 1);
|
|
805 $h += $freq * (log ($freq) / log(2));
|
|
806
|
|
807 return -$h;
|
|
808 }
|
|
809
|
|
810 # arguments : $_[0] : $AA or $NA
|
|
811 # $_[1] : uncertainty = H(l)
|
|
812 # $_[2] : error correction factor for small number of sequences
|
|
813 # = e(n) ; assummed to be 0 if not given
|
|
814 sub getR {
|
|
815 my $max = ($_[0] == $NA) ? 2 : (log(20) / log(2));
|
|
816 my $e = (defined $_[2]) ? $_[2] : 0;
|
|
817 return ($max - ($_[1] + $e));
|
|
818 }
|
|
819
|
|
820 # arguments: $_[0] : frequency = f(b,l)
|
|
821 # $_[1] : R(l)
|
|
822 sub getHeight {
|
|
823 return $_[0] * $_[1]
|
|
824 }
|
|
825
|
|
826 1;
|