Mercurial > repos > devteam > snpfreq
annotate snpFreq2.pl @ 1:1e488c26e9d8 draft default tip
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
author | devteam |
---|---|
date | Tue, 13 Oct 2015 12:52:33 -0400 |
parents | afdfa7335319 |
children |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env perl |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 #using large SNP tables (~1G) may require large memory ~15G in R | |
7 | |
8 #expected input: path to file, cols of counts (2 sets of 3), threshold | |
9 if (!@ARGV or scalar @ARGV != 11) { | |
10 if (!@ARGV or scalar @ARGV != 6) { #snpTable usage | |
11 print "usage for tab separated allele counts\n", | |
12 "snpFreq.pl inputType #threshold /path/to/snps.txt outfile <6 column numbers(1 based) with counts for alleles, first one group then another>\n"; | |
13 print "OR for SNP tables\n"; | |
14 print "usage snpFreq.pl inputType #threshold /path/to/snpTable.txt outfile group1File group2File\n"; | |
15 exit 1; | |
16 } | |
17 } | |
18 | |
19 #get and verify inputs | |
20 my ($file, $a1, $a2, $a3, $b1, $b2, $b3, $thresh, $outfile); | |
21 if ($ARGV[0] eq 'tab') { | |
22 shift @ARGV; | |
23 $thresh = shift @ARGV; | |
24 if ($thresh !~ /^\d*\.?\d+$/) { | |
25 print "Error the threshold must be a number. Got $thresh\n"; | |
26 exit 1; | |
27 }elsif ($thresh > .3) { | |
28 print "Error the threshold can not be greater than 0.3 got $thresh\n"; | |
29 exit 1; | |
30 } | |
31 $file = shift @ARGV; | |
32 $outfile = shift @ARGV; | |
33 $a1 = shift @ARGV; | |
34 if ($a1 =~ /\D/ or $a1 < 1) { | |
35 print "Error the column number, must be an integer greater than or equal to 1. Got $a1\n"; | |
36 exit 1; | |
37 } | |
38 $a2 = shift @ARGV; | |
39 if ($a2 =~ /\D/ or $a2 < 1) { | |
40 print "Error the column number, must be an integer greater than or equal to 1. Got $a2\n"; | |
41 exit 1; | |
42 } | |
43 $a3 = shift @ARGV; | |
44 if ($a3 =~ /\D/ or $a3 < 1) { | |
45 print "Error the column number, must be an integer greater than or equal to 1. Got $a3\n"; | |
46 exit 1; | |
47 } | |
48 $b1 = shift @ARGV; | |
49 if ($b1 =~ /\D/ or $b1 < 1) { | |
50 print "Error the column number, must be an integer greater than or equal to 1. Got $b1\n"; | |
51 exit 1; | |
52 } | |
53 $b2 = shift @ARGV; | |
54 if ($b2 =~ /\D/ or $b2 < 1) { | |
55 print "Error the column number, must be an integer greater than or equal to 1. Got $b2\n"; | |
56 exit 1; | |
57 } | |
58 $b3 = shift @ARGV; | |
59 if ($b3 =~ /\D/ or $b3 < 1) { | |
60 print "Error the column number, must be an integer greater than or equal to 1. Got $b3\n"; | |
61 exit 1; | |
62 } | |
63 }else { #snp table convert and assign variables | |
64 #snpTable.txt #threshold outfile workingdir | |
65 shift @ARGV; | |
66 $thresh = shift @ARGV; | |
67 if ($thresh !~ /^\d*\.?\d+$/) { | |
68 print "Error the threshold must be a number. Got $thresh\n"; | |
69 exit 1; | |
70 }elsif ($thresh > .3) { | |
71 print "Error the threshold can not be greater than 0.3 got $thresh\n"; | |
72 exit 1; | |
73 } | |
74 $file = shift @ARGV; | |
75 $outfile = shift @ARGV; | |
76 my $grpFile = shift @ARGV; | |
77 my @g1; | |
78 open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n"; | |
79 while (<FH>) { | |
80 chomp; | |
81 if (/^(\d+)\s/) { push(@g1, $1); } | |
82 } | |
83 close FH or die "Couldn't close $grpFile, $!\n"; | |
84 $grpFile = shift @ARGV; | |
85 my @g2; | |
86 open(FH, $grpFile) or die "Couldn't open $grpFile, $!\n"; | |
87 while (<FH>) { | |
88 chomp; | |
89 if (/^(\d+)\s/) { push(@g2, $1); } | |
90 } | |
91 close FH or die "Couldn't close $grpFile, $!\n"; | |
92 if ($file =~ /.gz$/) { | |
1
1e488c26e9d8
planemo upload commit 33927a87ba2eee9bf0ecdd376a66241b17b3d734
devteam
parents:
0
diff
changeset
|
93 open(FH, "zcat < $file |") or die "Couldn't read $file, $!\n"; |
0 | 94 }else { |
95 open(FH, $file) or die "Couldn't read $file, $!\n"; | |
96 } | |
97 open(OUT, ">", "snpTable.txt") or die "Couldn't open snpTable.txt, $!\n"; | |
98 my $size; | |
99 while (<FH>) { | |
100 chomp; | |
101 if (/^#/) { next; } #header | |
102 my @f = split(/\t/); | |
103 $size = scalar @f; | |
104 my @gc1 = (0, 0, 0); | |
105 my @gc2 = (0, 0, 0); | |
106 foreach my $g (@g1) { | |
107 my $i = $g+1; #g is 1 based first col want 0 based snp call column | |
108 if ($i > $#f) { die "ERROR looking for index $i which is greater than the list $#f\n"; } | |
109 if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref | |
110 $gc1[0]++; | |
111 }elsif ($f[$i] == 1) { | |
112 $gc1[2]++; | |
113 }elsif ($f[$i] == 0) { | |
114 $gc1[1]++; | |
115 }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; } | |
116 } | |
117 foreach my $g (@g2) { | |
118 my $i = $g+1; #g is 1 based first col want 0 based snp call column | |
119 if ($f[$i] == -1 or $f[$i] == 2) { #treat unknown as ref | |
120 $gc2[0]++; | |
121 }elsif ($f[$i] == 1) { | |
122 $gc2[2]++; | |
123 }elsif ($f[$i] == 0) { | |
124 $gc2[1]++; | |
125 }else { die "Unexpected value for genotype $f[$i] in ", join(" ", @f), "\n"; } | |
126 } | |
127 print OUT join("\t", @f), "\t", join("\t", @gc1), "\t", join("\t", @gc2), | |
128 "\n"; | |
129 } | |
130 close FH or die "Couldn't close $file, $!\n"; | |
131 close OUT or die "Couldn't close snpTable.txt, $!\n"; | |
132 my $i = $size + 1; #next 1 based column after input data | |
133 $a1 = $i++; | |
134 $a2 = $i++; | |
135 $a3 = $i++; | |
136 $b1 = $i++; | |
137 $b2 = $i++; | |
138 $b3 = $i++; | |
139 $file = "snpTable.txt"; | |
140 } | |
141 | |
142 #run a fishers exact test (using R) on whole table | |
143 my $cmd = qq|options(warn=-1) | |
144 tab <- read.table('$file', sep="\t") | |
145 size <- length(tab[,1]) | |
146 width <- length(tab[1,]) | |
147 x <- 1:size | |
148 y <- matrix(data=0, nr=size, nc=6) | |
149 for(i in 1:size) { | |
150 m <- matrix(c(tab[i,$a1], tab[i,$b1], tab[i,$a2], tab[i,$b2], tab[i,$a3], tab[i,$b3]), nrow=2) | |
151 t <- fisher.test(m) | |
152 x[i] <- t\$p.value | |
153 if (x[i] >= 1) { | |
154 x[i] <- .999 | |
155 } | |
156 n <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3] + tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) | |
157 n_a <- (tab[i,$a1] + tab[i,$a2] + tab[i,$a3]) | |
158 y[i,1] <- ((tab[i,$a1] + tab[i,$b1])*(n_a))/n | |
159 y[i,1] <- round(y[i,1],3) | |
160 y[i,2] <- ((tab[i,$a2] + tab[i,$b2])*(n_a))/n | |
161 y[i,2] <- round(y[i,2],3) | |
162 y[i,3] <- ((tab[i,$a3] + tab[i,$b3])*(n_a))/n | |
163 y[i,3] <- round(y[i,3],3) | |
164 n_b <- (tab[i,$b1] + tab[i,$b2] + tab[i,$b3]) | |
165 y[i,4] <- ((tab[i,$a1] + tab[i,$b1])*(n_b))/n | |
166 y[i,4] <- round(y[i,4],3) | |
167 y[i,5] <- ((tab[i,$a2] + tab[i,$b2])*(n_b))/n | |
168 y[i,5] <- round(y[i,5],3) | |
169 y[i,6] <- ((tab[i,$a3] + tab[i,$b3])*(n_b))/n | |
170 y[i,6] <- round(y[i,6],3) | |
171 }|; | |
172 #results <- data.frame(tab[1:size,1:width], x[1:size]) | |
173 #write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") | |
174 #q()|; | |
175 | |
176 #my $cmd2 = qq|suppressPackageStartupMessages(library(lib.loc='/afs/bx.psu.edu/home/giardine/lib/R', qvalue)) | |
177 my $cmd2 = qq|suppressPackageStartupMessages(library(qvalue)) | |
178 qobj <- qvalue(x[1:size], lambda=seq(0,0.90,$thresh), pi0.method="bootstrap", fdr.level=0.1, robust=FALSE, smooth.log.pi0 = FALSE) | |
179 q <- qobj\$qvalues | |
180 results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size], q[1:size]) | |
181 write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") | |
182 q()|; | |
183 | |
184 #for TESTING | |
185 my $pr = qq|results <- data.frame(tab[1:size,1:width], y[1:size,1:6], x[1:size]) | |
186 write.table(results, file="$outfile", row.names = FALSE ,col.names = FALSE,quote = FALSE, sep="\t") | |
187 q()|; | |
188 | |
189 open(FT, "| R --slave --vanilla") | |
190 or die "Couldn't call fisher.text, $!\n"; | |
191 print FT $cmd, "\n"; #fisher test | |
192 print FT $cmd2, "\n"; #qvalues and results | |
193 #print FT $pr, "\n"; | |
194 close FT or die "Couldn't finish fisher.test, $!\n"; | |
195 | |
196 exit; |