|
3
|
1 #!/opt/local/bin/perl
|
|
|
2 #Create a BreakDancer configuration file from a set of bam files
|
|
|
3
|
|
|
4 use strict;
|
|
|
5 use warnings;
|
|
|
6 use Getopt::Std;
|
|
|
7 use Statistics::Descriptive;
|
|
|
8 use GD::Graph::histogram;
|
|
|
9 use FindBin qw($Bin);
|
|
|
10 use lib "$FindBin::Bin";
|
|
|
11 #use lib '/gscuser/kchen/1000genomes/analysis/scripts/';
|
|
|
12 use AlnParser;
|
|
|
13
|
|
|
14 my %opts = (q=>35, n=>10000, v=>1, c=>4, b=>50, s=>50);
|
|
|
15 getopts('q:n:c:b:p:hmf:gCv:', \%opts);
|
|
|
16 die("
|
|
|
17 Usage: bam2cfg.pl <bam files>
|
|
|
18 Options:
|
|
|
19 -q INT Minimum mapping quality [$opts{q}]
|
|
|
20 -m Using mapping quality instead of alternative mapping quality
|
|
|
21 -s Minimal mean insert size [$opts{s}]
|
|
|
22 -C Change default system from Illumina to SOLiD
|
|
|
23 -c FLOAT Cutoff in unit of standard deviation [$opts{c}]
|
|
|
24 -n INT Number of observation required to estimate mean and s.d. insert size [$opts{n}]
|
|
|
25 -v FLOAT Cutoff on coefficients of variation [$opts{v}]
|
|
|
26 -f STRING A two column tab-delimited text file (RG, LIB) specify the RG=>LIB mapping, useful when BAM header is incomplete
|
|
|
27 -b INT Number of bins in the histogram [$opts{b}]
|
|
|
28 -g Output mapping flag distribution
|
|
|
29 -h Plot insert size histogram for each BAM library
|
|
|
30 \n
|
|
|
31 ") unless (@ARGV);
|
|
|
32
|
|
|
33 my $AP=new AlnParser(platform=>$opts{C});
|
|
|
34
|
|
|
35 my %cRGlib; my %clibs;
|
|
|
36 if($opts{f}){
|
|
|
37 open(RGLIB,"<$opts{f}") || die "unable to open $opts{f}\n";
|
|
|
38 while(<RGLIB>){
|
|
|
39 chomp;
|
|
|
40 my ($rg,$lib)=split;
|
|
|
41 $clibs{$lib}=1;
|
|
|
42 $cRGlib{$rg}=$lib;
|
|
|
43 }
|
|
|
44 }
|
|
|
45
|
|
|
46 foreach my $fbam(@ARGV){
|
|
|
47 my %RGlib;
|
|
|
48 my %RGplatform;
|
|
|
49 my %libs;
|
|
|
50 my %insert_stat;
|
|
|
51 my %readlen_stat;
|
|
|
52 my %libpos;
|
|
|
53 my %flagHgram;
|
|
|
54 my $recordcounter=0;
|
|
|
55 my $expected_max;
|
|
|
56 if(defined $opts{f}){
|
|
|
57 %RGlib=%cRGlib;
|
|
|
58 %libs=%clibs;
|
|
|
59 }
|
|
|
60 my $ppos=0;
|
|
|
61 open(BAM,"samtools view -h $fbam |") || die "unable to open $fbam\n";
|
|
|
62 while(<BAM>){
|
|
|
63 chomp;
|
|
|
64 if(/^\@RG/){ #getting RG=>LIB mapping from the bam header
|
|
|
65 my ($id)=($_=~/ID\:(\S+)/);
|
|
|
66 my ($lib)=($_=~/LB\:(\S+)/);
|
|
|
67 my ($platform)=($_=~/PL\:(\S+)/);
|
|
|
68 my ($sample)=($_=~/SM\:(\S+)/);
|
|
|
69 my ($insertsize)=($_=~/PI\:(\d+)/);
|
|
|
70 #if(defined $insertsize && $insertsize>0){
|
|
|
71 #$lib=$sample . '_'. $lib;
|
|
|
72 $libs{$lib}=1;
|
|
|
73 $RGlib{$id}=$lib;
|
|
|
74 $RGplatform{$id}=$platform;
|
|
|
75 #}
|
|
|
76 }
|
|
|
77 else{
|
|
|
78 next if(/^\@/);
|
|
|
79 my @libas=keys %libs;
|
|
|
80 my @selected_libs=keys %insert_stat;
|
|
|
81 if($#libas<0){
|
|
|
82 if($#selected_libs>=0){
|
|
|
83 last;
|
|
|
84 }
|
|
|
85 else{
|
|
|
86 $libs{'NA'}=1;
|
|
|
87 $RGlib{'NA'}='NA';
|
|
|
88 $RGplatform{'NA'}=($opts{C})?'solid':'illumina';
|
|
|
89 }
|
|
|
90 }
|
|
|
91 if(!defined $expected_max || $expected_max<=0){
|
|
|
92 $expected_max=3*($#libas+1)*$opts{n};
|
|
|
93 }
|
|
|
94 last if($recordcounter>$expected_max);
|
|
|
95
|
|
|
96 my $t=$AP->in($_,'sam',\%RGplatform,$opts{m});
|
|
|
97 #die "Please sort bam by position\n" if($t->{pos}<$ppos);
|
|
|
98 next if($t->{pos}<$ppos);
|
|
|
99 $ppos=$t->{pos};
|
|
|
100 my $lib;
|
|
|
101 if(defined $t->{readgroup}){
|
|
|
102 if(defined $RGlib{$t->{readgroup}}){
|
|
|
103 $lib=$RGlib{$t->{readgroup}};
|
|
|
104 }
|
|
|
105 else{
|
|
|
106 $lib=$t->{readgroup};
|
|
|
107 $libs{$t->{readgroup}}=1;
|
|
|
108 $RGlib{$t->{readgroup}}=$t->{readgroup};
|
|
|
109 }
|
|
|
110 }
|
|
|
111 else{
|
|
|
112 $lib='NA';
|
|
|
113 }
|
|
|
114 #my $lib=($t->{readgroup})?$RGlib{$t->{readgroup}}:'NA'; #when multiple libraries are in a BAM file
|
|
|
115 next unless(defined $lib && $libs{$lib});
|
|
|
116 $readlen_stat{$lib}=Statistics::Descriptive::Full->new() if(!defined $readlen_stat{$lib});
|
|
|
117 $readlen_stat{$lib}->add_data($t->{readlen});
|
|
|
118 next if ($t->{qual}<=$opts{q}); #skip low quality mapped reads
|
|
|
119 $recordcounter++;
|
|
|
120 $libpos{$lib}++;
|
|
|
121 if(defined $t->{readgroup}){
|
|
|
122 $flagHgram{$t->{readgroup}}{$t->{flag}}++;
|
|
|
123 $flagHgram{$t->{readgroup}}{all}++;
|
|
|
124 }
|
|
|
125
|
|
|
126 my $nreads=(defined $insert_stat{$lib})?$insert_stat{$lib}->count():1;
|
|
|
127 if($nreads/$libpos{$lib}<1e-4){ #single-end lane
|
|
|
128 delete $libs{$lib};
|
|
|
129 delete $insert_stat{$lib};
|
|
|
130 }
|
|
|
131 next unless(($t->{flag}==18 || $t->{flag}==20) && $t->{dist}>=0);
|
|
|
132
|
|
|
133 $insert_stat{$lib}=Statistics::Descriptive::Full->new() if(!defined $insert_stat{$lib});
|
|
|
134 $insert_stat{$lib}->add_data($t->{dist});
|
|
|
135 if($insert_stat{$lib}->count()>$opts{n}){
|
|
|
136 delete $libs{$lib};
|
|
|
137 }
|
|
|
138 }
|
|
|
139 }
|
|
|
140 close(BAM);
|
|
|
141
|
|
|
142 my %stdms;
|
|
|
143 my %stdps;
|
|
|
144
|
|
|
145 foreach my $lib(keys %insert_stat){
|
|
|
146 my $readlen=$readlen_stat{$lib}->mean();
|
|
|
147 my @isize=$insert_stat{$lib}->get_data();
|
|
|
148 my $mean=$insert_stat{$lib}->mean();
|
|
|
149 my $std=$insert_stat{$lib}->standard_deviation();
|
|
|
150
|
|
|
151 delete $insert_stat{$lib};
|
|
|
152 my $insertsize=Statistics::Descriptive::Full->new();
|
|
|
153 foreach my $x(@isize){
|
|
|
154 next if($x>$mean+5*$std);
|
|
|
155 $insertsize->add_data($x);
|
|
|
156 }
|
|
|
157
|
|
|
158 $mean=$insertsize->mean();
|
|
|
159 $std=$insertsize->standard_deviation();
|
|
|
160 next if($mean<$opts{s});
|
|
|
161 my $cv=$std/$mean;
|
|
|
162 if($cv>=$opts{v}){
|
|
|
163 print STDERR "Coefficient of variation $cv in library $lib is larger than the cutoff $opts{v}, poor quality data, excluding from further analysis.\n";
|
|
|
164 next;
|
|
|
165 }
|
|
|
166
|
|
|
167 my $num=$insertsize->count();
|
|
|
168 next if($num<100);
|
|
|
169
|
|
|
170 my ($stdm,$stdp)=(0,0);
|
|
|
171 my ($nstdm,$nstdp)=(0,0);
|
|
|
172 foreach my $x($insertsize->get_data()){
|
|
|
173 if($x>$mean){
|
|
|
174 $stdp+=($x-$mean)**2;
|
|
|
175 $nstdp++;
|
|
|
176 }
|
|
|
177 else{
|
|
|
178 $stdm+=($x-$mean)**2;
|
|
|
179 $nstdm++;
|
|
|
180 }
|
|
|
181 }
|
|
|
182 $stdm=sqrt($stdm/($nstdm-1));
|
|
|
183 $stdp=sqrt($stdp/($nstdp-1));
|
|
|
184
|
|
|
185 $stdms{$lib}=$stdm;
|
|
|
186 $stdps{$lib}=$stdp;
|
|
|
187 $insert_stat{$lib}=$insertsize;
|
|
|
188 }
|
|
|
189
|
|
|
190 foreach my $rg(keys %RGlib){
|
|
|
191 my $lib=$RGlib{$rg};
|
|
|
192 my $platform=$RGplatform{$rg} || 'illumina'; #default illumina
|
|
|
193 next unless($insert_stat{$lib});
|
|
|
194 my $readlen=$readlen_stat{$lib}->mean();
|
|
|
195 my $mean=$insert_stat{$lib}->mean();
|
|
|
196 my $std=$insert_stat{$lib}->standard_deviation();
|
|
|
197 my $num=$insert_stat{$lib}->count();
|
|
|
198
|
|
|
199 my $upper=$mean+$opts{c}*$stdps{$lib} if(defined $opts{c});
|
|
|
200 my $lower=$mean-$opts{c}*$stdms{$lib} if(defined $opts{c});
|
|
|
201 $lower=0 if(defined $lower && $lower<0);
|
|
|
202
|
|
|
203 printf "readgroup\:%s\tplatform:%s\tmap\:%s\treadlen\:%.2f\tlib\:%s\tnum:%d",$rg,$platform,$fbam,$readlen,$lib,$num;
|
|
|
204 printf "\tlower\:%.2f\tupper\:%.2f",$lower,$upper if(defined $upper && defined $lower);
|
|
|
205 printf "\tmean\:%.2f\tstd\:%.2f",$mean,$std;
|
|
|
206
|
|
|
207
|
|
|
208 # compute the normality
|
|
|
209 my @data=$insert_stat{$lib}->get_data();
|
|
|
210 my $n_data = @data;
|
|
|
211 @data = sort {$a <=> $b} @data;
|
|
|
212 my $n_data_ = @data;
|
|
|
213 my $p_value = ShapiroWilk(\@data);
|
|
|
214 if($p_value>0) {
|
|
|
215 my $log_p_value = log($p_value)/log(10);
|
|
|
216 printf "\tSWnormality\:%.2f",$log_p_value;
|
|
|
217 }
|
|
|
218 elsif($p_value == -1) {
|
|
|
219 printf "\tSWnormality\:data not qualified -1";
|
|
|
220 }
|
|
|
221 elsif($p_value == -2.1){
|
|
|
222 printf "\tSWnormality\:data not qualified -2.1";
|
|
|
223 }
|
|
|
224 elsif($p_value == -2.2){
|
|
|
225 printf "\tSWnormality\:data not qualified -2.2";
|
|
|
226 }
|
|
|
227 elsif($p_value == -2.3){
|
|
|
228 printf "\tSWnormality\:data not qualified -2.3";
|
|
|
229 }
|
|
|
230 elsif($p_value == 0) {
|
|
|
231 printf "\tSWnormality\:minus infinity";
|
|
|
232 }
|
|
|
233
|
|
|
234
|
|
|
235
|
|
|
236 if($opts{g}){
|
|
|
237 printf "\tflag:";
|
|
|
238 foreach my $f(sort keys %{$flagHgram{$rg}}){
|
|
|
239 next if($f eq 'all');
|
|
|
240 printf "%d(%.2f%%)",$f,($flagHgram{$rg}{$f} || 0)*100/$flagHgram{$rg}{all};
|
|
|
241 }
|
|
|
242 printf "%d",$flagHgram{$rg}{all};
|
|
|
243 }
|
|
|
244 printf "\texe:samtools view\n";
|
|
|
245 }
|
|
|
246
|
|
|
247 if($opts{h}){ # plot insert size histogram for each library
|
|
|
248 foreach my $lib(keys %insert_stat){
|
|
|
249 my $graph = new GD::Graph::histogram(1000,600);
|
|
|
250 my $library="$fbam.$lib";
|
|
|
251 $graph->set(
|
|
|
252 x_label => 'Insert Size (bp)',
|
|
|
253 y_label => 'Count',
|
|
|
254 title => $library,
|
|
|
255 x_labels_vertical => 1,
|
|
|
256 bar_spacing => 0,
|
|
|
257 shadow_depth => 1,
|
|
|
258 shadowclr => 'dred',
|
|
|
259 transparent => 0,
|
|
|
260 histogram_bins => $opts{b},
|
|
|
261 ) or warn $graph->error;
|
|
|
262 my @data=$insert_stat{$lib}->get_data();
|
|
|
263 my $gd = $graph->plot(\@data) or die $graph->error;
|
|
|
264
|
|
|
265 $library=~s/.*\///g;
|
|
|
266 my $imagefile="$library.insertsize_histogram.png";
|
|
|
267 open(IMG, ">$imagefile") or die $!;
|
|
|
268 binmode IMG;
|
|
|
269 print IMG $gd->png;
|
|
|
270
|
|
|
271 my $datafile="$library.insertsize_histogram";
|
|
|
272 open(OUT,">$datafile");
|
|
|
273 foreach my $x(@data){
|
|
|
274 print OUT "$x\n";
|
|
|
275 }
|
|
|
276 close(OUT);
|
|
|
277 }
|
|
|
278 }
|
|
|
279 }
|
|
|
280
|
|
|
281 ###################################################### Shapiro Wilk Normality test (including functions ppnd, alnorm, poly, min, sign and asin) #########################################################
|
|
|
282 sub ShapiroWilk {
|
|
|
283
|
|
|
284 # read input
|
|
|
285 my ($x) = @_;
|
|
|
286
|
|
|
287 my $n=@$x;
|
|
|
288 my $w = 0;
|
|
|
289 my $ifault = 2;
|
|
|
290 my $init = 0;
|
|
|
291 my $n2 = $n/2;
|
|
|
292 my @a;
|
|
|
293 my $n1 = $n;
|
|
|
294
|
|
|
295 my $upper = 1;
|
|
|
296 my @c1 = (0.0, 0.221157, -0.147981, -2.07119, 4.434685, -2.706056);
|
|
|
297 my @c2 = (0.0, 0.042981, -0.293762, -1.752461, 5.682633, -3.582633);
|
|
|
298 my @c3 = (0.5440, -0.39978, 0.025054, -0.6714*(10**(-3)));
|
|
|
299 my @c4 = (1.3822, -0.77857, 0.062767, -0.0020322);
|
|
|
300 my @c5 = (-1.5861, -0.31082, -0.083751, 0.0038915);
|
|
|
301 my @c6 = (-0.4803, -0.082676, 0.0030302);
|
|
|
302 my @c7 = (0.164, 0.533);
|
|
|
303 my @c8 = (0.1736, 0.315);
|
|
|
304 my @c9 = (0.256, -0.00635);
|
|
|
305 my @g = (-2.273, 0.459);
|
|
|
306 my $z90 = 1.2816;
|
|
|
307 my $z95 = 1.6449;
|
|
|
308 my $z99 = 2.3263;
|
|
|
309 my $zm = 1.7509;
|
|
|
310 my $zss = 0.56268;
|
|
|
311 my $bf1 = 0.8378;
|
|
|
312 my $xx90 = 0.556;
|
|
|
313 my $xx95 = 0.622;
|
|
|
314 my $zero = 0.0;
|
|
|
315 my $one = 1.0;
|
|
|
316 my $two = 2.0;
|
|
|
317 my $three = 3.0;
|
|
|
318 my $sqrth = 0.70711;
|
|
|
319 my $qtr = 0.25;
|
|
|
320 my $th = 0.375;
|
|
|
321 my $small = 1*(10**(-19));
|
|
|
322 my $pi6 = 1.909859;
|
|
|
323 my $stqr = 1.047198;
|
|
|
324
|
|
|
325 my $summ2;
|
|
|
326 my $ssumm2;
|
|
|
327 my $fac;
|
|
|
328 my $rsn;
|
|
|
329 my $an;
|
|
|
330 my $an25;
|
|
|
331 my $a1;
|
|
|
332 my $a2;
|
|
|
333 my $delta;
|
|
|
334 my $range;
|
|
|
335 my $sa;
|
|
|
336 my $sx;
|
|
|
337 my $ssx;
|
|
|
338 my $ssa;
|
|
|
339 my $sax;
|
|
|
340 my $asa;
|
|
|
341 my $xsx;
|
|
|
342 my $ssassx;
|
|
|
343 my $w1;
|
|
|
344 my $y;
|
|
|
345 my $xx;
|
|
|
346 my $xi;
|
|
|
347 my $gamma;
|
|
|
348 my $m;
|
|
|
349 my $s;
|
|
|
350 my $ld;
|
|
|
351 my $bf;
|
|
|
352 my $z90f;
|
|
|
353 my $z95f;
|
|
|
354 my $z99f;
|
|
|
355 my $zfm;
|
|
|
356 my $zsd;
|
|
|
357 my $zbar;
|
|
|
358 my $ncens;
|
|
|
359 my $nn2;
|
|
|
360 my $i;
|
|
|
361 my $i1;
|
|
|
362 my $j;
|
|
|
363
|
|
|
364 my $pw = $one;
|
|
|
365 if($w >= $zero) {$w = $one;}
|
|
|
366 $an = $n;
|
|
|
367 $ifault = 3;
|
|
|
368 $nn2 = $n/2;
|
|
|
369 if($n2 < $nn2) {return (-1);}
|
|
|
370 $ifault = 1;
|
|
|
371 if($n < 3) {return (-1);}
|
|
|
372
|
|
|
373 # If INIT is false, calculates coefficients for the test
|
|
|
374
|
|
|
375 if(!$init){
|
|
|
376 if($n == 3) {
|
|
|
377 $a[1-1] = $sqrth;}
|
|
|
378 else {
|
|
|
379 $an25 = $an + $qtr;
|
|
|
380 $summ2 = $zero;
|
|
|
381 for(my $i = 1; $i <= $n2; $i++){
|
|
|
382 ($a[$i-1], $ifault) = ppnd(($i - $th)/$an25);
|
|
|
383 my $a_val = $a[$i-1];
|
|
|
384 $summ2 = $summ2 + $a[$i-1] ** 2;
|
|
|
385 }
|
|
|
386 $summ2 = $summ2 * $two;
|
|
|
387 $ssumm2 = $summ2 ** 0.5;
|
|
|
388 $rsn = $one / ($an ** 0.5);
|
|
|
389 my $poly_result = poly_(\@c1, 6, $rsn);
|
|
|
390 $a1 = $poly_result - $a[1-1] / $ssumm2;
|
|
|
391
|
|
|
392 # Normalize coefficients
|
|
|
393
|
|
|
394 if($n > 5) {
|
|
|
395 $i1 = 3;
|
|
|
396 $a2 = -$a[2-1]/$ssumm2 + poly_(\@c2, 6, $rsn);
|
|
|
397 my $a_val1_new = $a[1-1];
|
|
|
398 my $a_val2_new = $a[2-1];
|
|
|
399 my $temp1 = $summ2 - $two * $a[1-1] ** 2 - $two * $a[2-1] ** 2;
|
|
|
400 my $temp2 = $one - $two * $a1 ** 2 - $two * $a2 ** 2;
|
|
|
401 $fac = (($summ2 - $two * $a[1-1] ** 2 - $two * $a[2-1] ** 2)/($one - $two * $a1 ** 2 - $two * $a2 ** 2)) ** 0.5;
|
|
|
402 $a[1-1] = $a1;
|
|
|
403 $a[2-1] = $a2;
|
|
|
404 }
|
|
|
405 else {
|
|
|
406 $i1 = 2;
|
|
|
407 $fac = (($summ2 - $two * $a[1-1] ** 2)/ ($one - $two * $a1 ** 2)) ** 0.5;
|
|
|
408 $a[1-1] = $a1;
|
|
|
409 }
|
|
|
410 for($i = $i1; $i <= $nn2; $i++){
|
|
|
411 $a[$i-1] = -$a[$i-1]/$fac;
|
|
|
412 my $a_val = $a[$i-1];
|
|
|
413 }
|
|
|
414 for($i = 1; $i <= $nn2; $i++){
|
|
|
415 my $a_value = $a[$i-1];
|
|
|
416 }
|
|
|
417 }
|
|
|
418 $init = 1;
|
|
|
419 }
|
|
|
420 if($n1 < 3) {return (-1);}
|
|
|
421 $ncens = $n - $n1;
|
|
|
422 $ifault = 4;
|
|
|
423 if($ncens < 0 || ($ncens > 0 && $n < 20)) {return (-1);}
|
|
|
424 $ifault = 5;
|
|
|
425 $delta = $ncens/$an;
|
|
|
426 if($delta > 0.8) {return (-2.1);}
|
|
|
427
|
|
|
428 # If W input as negative, calculate significance level of -W
|
|
|
429
|
|
|
430 if($w < $zero) {
|
|
|
431 $w1 = $one + $w;
|
|
|
432 $ifault = 0;
|
|
|
433 }
|
|
|
434 else{
|
|
|
435 # Check for zero range
|
|
|
436 $ifault = 6;
|
|
|
437 $range = @$x[$n1-1] - @$x[1-1];
|
|
|
438 if($range < $small) {return (-2.2);}
|
|
|
439
|
|
|
440 # Check for correct sort order on range - scaled X
|
|
|
441
|
|
|
442 $ifault = 7;
|
|
|
443 $xx = @$x[1-1]/$range;
|
|
|
444 $sx = $xx;
|
|
|
445 $sa = -$a[1-1];
|
|
|
446 $j = $n - 1;
|
|
|
447 for($i = 2; $i <= $n1; $i++){
|
|
|
448 $xi = @$x[$i-1]/$range;
|
|
|
449 if($xx-$xi > $small){
|
|
|
450 return (-2.3);
|
|
|
451 }
|
|
|
452 $sx = $sx + $xi;
|
|
|
453 if($i != $j) {
|
|
|
454 my $sign_value = sign($i - $j);
|
|
|
455 my $min_value = $a[min($i, $j)-1];
|
|
|
456 my $min_ = min($i, $j);
|
|
|
457 $sa = $sa + sign($i - $j) * $a[min($i, $j)-1];
|
|
|
458 }
|
|
|
459 $xx = $xi;
|
|
|
460 $j = $j - 1;
|
|
|
461 }
|
|
|
462 $ifault = 0;
|
|
|
463 if($n > 5000) {$ifault = 2};
|
|
|
464
|
|
|
465 # Calculate W statistic as squared correlation between data and coefficients
|
|
|
466 $sa = $sa/$n1;
|
|
|
467 $sx = $sx/$n1;
|
|
|
468 $ssa = $zero;
|
|
|
469 $ssx = $zero;
|
|
|
470 $sax = $zero;
|
|
|
471 $j = $n;
|
|
|
472 for($i = 1; $i <= $n1; $i ++){
|
|
|
473 if($i != $j) {
|
|
|
474 my $sign_result = sign($i - $j);
|
|
|
475 my $min_result = min($i, $j);
|
|
|
476 $asa = sign($i - $j) * $a[min($i, $j)-1] - $sa;
|
|
|
477 }
|
|
|
478 else {
|
|
|
479 $asa = -$sa;
|
|
|
480 }
|
|
|
481 $xsx = @$x[$i-1]/$range - $sx;
|
|
|
482 $ssa = $ssa + $asa * $asa;
|
|
|
483 $ssx = $ssx + $xsx * $xsx;
|
|
|
484 $sax = $sax + $asa * $xsx;
|
|
|
485 $j = $j - 1;
|
|
|
486 }
|
|
|
487
|
|
|
488 # W1 equals (1-W) claculated to avoid excessive rounding error for W very near 1 (a potential problem in very large samples)
|
|
|
489 $ssassx = ($ssa * $ssx) ** 0.5;
|
|
|
490 $w1 = ($ssassx - $sax) * ($ssassx + $sax)/($ssa * $ssx);
|
|
|
491 }
|
|
|
492
|
|
|
493 $w = $one - $w1;
|
|
|
494
|
|
|
495 # Calculate significance level for W (exact for N=3)
|
|
|
496
|
|
|
497 if($n == 3){
|
|
|
498 $pw = $pi6 * (asin($w**0.5) - $stqr);
|
|
|
499 return ($pw);
|
|
|
500 }
|
|
|
501 $y = log($w1);
|
|
|
502 $xx = log($an);
|
|
|
503 $m = $zero;
|
|
|
504 $s = $one;
|
|
|
505 if($n <= 11){
|
|
|
506 $gamma = poly_(\@g, 2, $an);
|
|
|
507 if($y >= $gamma){
|
|
|
508 $pw = $small;
|
|
|
509 return ($pw);
|
|
|
510 }
|
|
|
511 $y = -log($gamma - $y);
|
|
|
512 $m = poly_(\@c3, 4, $an);
|
|
|
513 $s = exp(poly_(\@c4, 4, $an))
|
|
|
514 }
|
|
|
515 else{
|
|
|
516 $m = poly_(\@c5, 4, $xx);
|
|
|
517 $s = exp(poly_(\@c6, 3, $xx));
|
|
|
518 }
|
|
|
519 if($ncens > 0){
|
|
|
520
|
|
|
521 # Censoring by proportion NCENS/N. Calculate mean and sd of normal equivalent deviate of W.
|
|
|
522
|
|
|
523 $ld = -log($delta);
|
|
|
524 $bf = $one + $xx * $bf1;
|
|
|
525 $z90f = $z90 + $bf * poly_(\@c7, 2, $xx90 ** $xx) ** $ld;
|
|
|
526 $z95f = $z95 + $bf * poly_(\@c8, 2, $xx95 ** $xx) ** $ld;
|
|
|
527 $z99f = $z99 + $bf * poly_(\@c9, 2, $xx) ** $ld;
|
|
|
528
|
|
|
529 # Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get pseudo-mean and pseudo-sd of z as the slope and intercept
|
|
|
530
|
|
|
531 $zfm = ($z90f + $z95f + $z99f)/$three;
|
|
|
532 $zsd = ($z90*($z90f-$zfm)+$z95*($z95f-$zfm)+$z99*($z99f-$zfm))/$zss;
|
|
|
533 $zbar = $zfm - $zsd * $zm;
|
|
|
534 $m = $m + $zbar * $s;
|
|
|
535 $s = $s * $zsd;
|
|
|
536 }
|
|
|
537 $pw = alnorm(($y - $m)/$s, $upper);
|
|
|
538 return ($pw);
|
|
|
539
|
|
|
540 }
|
|
|
541
|
|
|
542 ############################################### ppnd function############################################################
|
|
|
543 sub ppnd {
|
|
|
544
|
|
|
545 # read input
|
|
|
546 my ($p) = @_;
|
|
|
547
|
|
|
548 # define output
|
|
|
549 my $ifault;
|
|
|
550 my $normal_dev;
|
|
|
551
|
|
|
552 # Local variables
|
|
|
553
|
|
|
554 my $zero = 0.0;
|
|
|
555 my $one = 1.0;
|
|
|
556 my $half = 0.5;
|
|
|
557 my $split1 = 0.425;
|
|
|
558 my $split2 = 5.0;
|
|
|
559 my $const1 = 0.180625;
|
|
|
560 my $const2 = 1.6;
|
|
|
561 my $q;
|
|
|
562 my $r;
|
|
|
563
|
|
|
564 # Coefficients for P close to 0.5
|
|
|
565
|
|
|
566 my $a0 = 3.3871327179;
|
|
|
567 my $a1 = 5.0434271938*10;
|
|
|
568 my $a2 = 1.5929113202*10**2;
|
|
|
569 my $a3 = 5.9109374720*10;
|
|
|
570 my $b1 = 1.7895169469*10;
|
|
|
571 my $b2 = 7.8757757664*10;
|
|
|
572 my $b3 = 6.7187563600*10;
|
|
|
573 # HASH SUM AB 32.3184577772
|
|
|
574
|
|
|
575 # Coefficients for P not close to 0, 0.5 or 1.
|
|
|
576
|
|
|
577 my $c0 = 1.4234372777;
|
|
|
578 my $c1 = 2.7568153900;
|
|
|
579 my $c2 = 1.3067284816;
|
|
|
580 my $c3 = 1.7023821103*(10**(-1));
|
|
|
581 my $d1 = 7.3700164250*(10**(-1));
|
|
|
582 my $d2 = 1.2021132975*(10**(-1));
|
|
|
583 # HASH SUM CD 15.7614929821
|
|
|
584
|
|
|
585 # Coefficients for P near 0 or 1.
|
|
|
586
|
|
|
587 my $e0 = 6.6579051150;
|
|
|
588 my $e1 = 3.0812263860;
|
|
|
589 my $e2 = 4.2868294337*(10**(-1));
|
|
|
590 my $e3 = 1.7337203997*(10**(-2));
|
|
|
591 my $f1 = 2.4197894225*(10**(-1));
|
|
|
592 my $f2 = 1.2258202635*(10**(-2));
|
|
|
593 # HASH SUM EF 19.4052910204
|
|
|
594
|
|
|
595 $ifault = 0;
|
|
|
596 $q = $p - $half;
|
|
|
597 if(abs($q) <= $split1) {
|
|
|
598 $r = $const1 - $q * $q;
|
|
|
599 $normal_dev = $q * ((($a3 * $r + $a2) * $r + $a1) * $r + $a0) / ((($b3 * $r + $b2) * $r + $b1) * $r + $one);
|
|
|
600 }
|
|
|
601 else{
|
|
|
602 if($q < $zero) {
|
|
|
603 $r = $p;}
|
|
|
604 else {
|
|
|
605 $r = $one - $p;
|
|
|
606 }
|
|
|
607 if($r <= $zero) {
|
|
|
608 $ifault = 1;
|
|
|
609 $normal_dev = $zero;
|
|
|
610 return ($normal_dev, $ifault);
|
|
|
611 }
|
|
|
612 $r = (-log($r))**0.5;
|
|
|
613 if($r <= $split2) {
|
|
|
614 $r = $r - $const2;
|
|
|
615 $normal_dev = ((($c3 * $r + $c2) * $r + $c1) * $r + $c0) / (($d2 * $r + $d1) * $r + $one);
|
|
|
616 }
|
|
|
617 else{
|
|
|
618 $r = $r - $split2;
|
|
|
619 $normal_dev = ((($e3 * $r + $e2) * $r + $e1) * $r + $e0) / (($f2 * $r + $f1) * $r + $one);
|
|
|
620 }
|
|
|
621 if($q < $zero) {$normal_dev = - $normal_dev;}
|
|
|
622 return ($normal_dev, $ifault);
|
|
|
623 }
|
|
|
624 }
|
|
|
625
|
|
|
626
|
|
|
627 ################################ alnorm function #####################################################################
|
|
|
628 #FUNCTION alnorm(x, upper) RESULT(fn_val)
|
|
|
629
|
|
|
630 # Evaluates the tail area of the standardised normal curve from x to infinity if upper is .true. or
|
|
|
631 # from minus infinity to x if upper is .false.
|
|
|
632
|
|
|
633 sub alnorm {
|
|
|
634
|
|
|
635 # get the inputs
|
|
|
636 my ($x, $upper) = @_;
|
|
|
637
|
|
|
638 # define the output
|
|
|
639 my $fn_val;
|
|
|
640
|
|
|
641 my $zero = 0.0;
|
|
|
642 my $one = 1.0;
|
|
|
643 my $half = 0.5;
|
|
|
644 my $con = 1.28;
|
|
|
645 my $z;
|
|
|
646 my $y;
|
|
|
647 my $up;
|
|
|
648
|
|
|
649 #!*** machine dependent constants
|
|
|
650 my $ltone = 7.0;
|
|
|
651 my $utzero = 18.66;
|
|
|
652
|
|
|
653 my $p = 0.398942280444;
|
|
|
654 my $q = 0.39990348504;
|
|
|
655 my $r = 0.398942280385;
|
|
|
656 my $a1 = 5.75885480458;
|
|
|
657 my $a2 = 2.62433121679;
|
|
|
658 my $a3 = 5.92885724438;
|
|
|
659 my $b1 = -29.8213557807;
|
|
|
660 my $b2 = 48.6959930692;
|
|
|
661 my $c1 = -3.8052*(10**(-8));
|
|
|
662 my $c2 = 3.98064794*(10**(-4));
|
|
|
663 my $c3 = -0.151679116635;
|
|
|
664 my $c4 = 4.8385912808;
|
|
|
665 my $c5 = 0.742380924027;
|
|
|
666 my $c6 = 3.99019417011;
|
|
|
667 my $d1 = 1.00000615302;
|
|
|
668 my $d2 = 1.98615381364;
|
|
|
669 my $d3 = 5.29330324926;
|
|
|
670 my $d4 = -15.1508972451;
|
|
|
671 my $d5 = 30.789933034;
|
|
|
672
|
|
|
673 $up = $upper;
|
|
|
674 $z = $x;
|
|
|
675 if($z < $zero){
|
|
|
676 if($up == 0){$up = 1;}
|
|
|
677 else {$up = 0;}
|
|
|
678 $z = -$z;
|
|
|
679 }
|
|
|
680 if($z <= $ltone || $up && $z <= $utzero){
|
|
|
681 $y = $half*$z*$z;
|
|
|
682 if($z > $con) {$fn_val = $r*exp(-$y)/($z+$c1+$d1/($z+$c2+$d2/($z+$c3+$d3/($z+$c4+$d4/($z+$c5+$d5/($z+$c6))))));}
|
|
|
683 else {$fn_val = $half - $z*($p - $q*$y/($y+$a1+$b1/($y+$a2+$b2/($y+$a3))));}
|
|
|
684 }
|
|
|
685 else{
|
|
|
686 $fn_val = $zero;
|
|
|
687 }
|
|
|
688 if(!$up) {$fn_val = $one - $fn_val};
|
|
|
689 return $fn_val;
|
|
|
690 }
|
|
|
691
|
|
|
692 ##################################### poly function #########################################################
|
|
|
693
|
|
|
694 # FUNCTION poly(c, nord, x) RESULT(fn_val)
|
|
|
695
|
|
|
696 # Calculates the algebraic polynomial of order nored-1 with array of coefficients c. Zero order coefficient is c(1)
|
|
|
697
|
|
|
698 sub poly_ {
|
|
|
699
|
|
|
700 my ($c, $nord, $x) = @_;
|
|
|
701
|
|
|
702 my $fn_val;
|
|
|
703
|
|
|
704 my $i;
|
|
|
705 my $j;
|
|
|
706 my $n2;
|
|
|
707 my $p;
|
|
|
708
|
|
|
709 $fn_val = @$c[1-1];
|
|
|
710 if($nord == 1) {return $fn_val;}
|
|
|
711 $p = $x*@$c[$nord-1];
|
|
|
712 if($nord == 2) {
|
|
|
713 $fn_val = $fn_val + $p;
|
|
|
714 }
|
|
|
715 else {
|
|
|
716 $n2 = $nord - 2;
|
|
|
717 $j = $n2 + 1;
|
|
|
718 for($i = 1; $i <= $n2; $i++){
|
|
|
719 $p = ($p + @$c[$j-1])*$x;
|
|
|
720 $j = $j - 1;
|
|
|
721 }
|
|
|
722 $fn_val = $fn_val + $p;
|
|
|
723 }
|
|
|
724
|
|
|
725 return $fn_val;
|
|
|
726 }
|
|
|
727
|
|
|
728 ###################################### min function ############################################
|
|
|
729
|
|
|
730 # FUNCTION min(a,b) RESULT min of a, b
|
|
|
731
|
|
|
732 sub min {
|
|
|
733
|
|
|
734 my ($a, $b) = @_;
|
|
|
735
|
|
|
736 if($a <= $b) { return $a; }
|
|
|
737 else { return $b; }
|
|
|
738 }
|
|
|
739
|
|
|
740 ###################################### sign function ############################################
|
|
|
741
|
|
|
742 # FUNCTION sign(a,b) RESULT sign of a, b
|
|
|
743
|
|
|
744 sub sign {
|
|
|
745
|
|
|
746 my ($a) = @_;
|
|
|
747
|
|
|
748 if($a >= 0) { return 1; }
|
|
|
749 else { return -1; }
|
|
|
750 }
|
|
|
751
|
|
|
752 ###################################### asin function ############################################
|
|
|
753
|
|
|
754 # FUNCTION asin(a) RESULT arcsin of a
|
|
|
755
|
|
|
756 sub asin {
|
|
|
757
|
|
|
758 my ($a) = @_;
|
|
|
759 if($a > 1 || $a < -1) {
|
|
|
760 print "error: abs sin value > 1";
|
|
|
761 return 0; }
|
|
|
762 else {
|
|
|
763 my $b = (1-$a**2)**0.5;
|
|
|
764 my $fn_val = atan2($a, (1-$a**2)**0.5);
|
|
|
765 my $fn_test = (1-$a**2)**0.5;
|
|
|
766 return $fn_val;
|
|
|
767 }
|
|
|
768 }
|
|
|
769
|