annotate countwins.r @ 0:d42f4d78c85e draft

Uploaded
author messersc
date Wed, 17 Dec 2014 10:40:23 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
1 ###############################
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
2 #### Peak finder for NGS Data
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
3 #### R script
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
4 ###############################
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
5
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
6
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
7
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
8
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
9 # =======================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
10 # User-defined variables
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
11 # =======================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
12 cutoff = NA #To enforce an SNR cutoff ratio, delete NA and enter the number you want
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
13 strict = 1 #to make peak calling more / less strict, increase / decrease this number - Only use this if you are getting an unusually large or small number of peaks.
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
14 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
15 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
16
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
17
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
18
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
19
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
20 # ================================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
21 # Required Libraries check & load
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
22 # ================================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
23 if ((is.element('mclust', installed.packages()[,1]) == FALSE) || (is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('sqldf', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
24 stop("R package 'Mclust', 'signal', 'sqldf' and 'parallel' are required. Please install them!")
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
25 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
26 suppressPackageStartupMessages(library("mclust"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
27 suppressPackageStartupMessages(library("signal"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
28 suppressPackageStartupMessages(library("sqldf"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
29 suppressPackageStartupMessages(library("tcltk"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
30 suppressPackageStartupMessages(library("parallel"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
31 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
32
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
33
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
34
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
35
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
36
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
37 # =================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
38 # Custom Functions
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
39 # =================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
40 #Get per-row Geometric mean (takes list, returns vectors, not lists!)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
41 geomeanL <- function(mat){
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
42 n = length(mat)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
43 if (n > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
44 mult = (mat[[1]])*(mat[[2]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
45 if (n > 2) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
46 for (i in 3:n) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
47 mult = mult*(mat[[i]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
48 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
49 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
50 mult = mult^(1/n)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
51 mat = mult
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
52 return(mat)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
53 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
54 return(mat[[1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
55 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
56 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
57
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
58
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
59
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
60 #Get per-row Geometric mean (takes matrix, returns vectors, not lists!)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
61 geomean <- function(mat){
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
62 n = NCOL(mat)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
63 if (n > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
64 mult = (mat[,1])*(mat[,2])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
65 if (n > 2) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
66 for (i in 3:n) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
67 mult = mult*(mat[,i])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
68 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
69 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
70 mult = mult^(1/n)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
71 mat = mult
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
72 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
73 return(mat)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
74 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
75
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
76
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
77
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
78 #Read in bed(st2) file
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
79 parsein = function(bedfile) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
80 l = sqldf()
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
81 readsFile = file(bedfile)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
82 return(sqldf("SELECT V1 + 1 As 'start' FROM readsFile", file.format=list(header = FALSE, sep = "\t"))$start)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
83 l = sqldf()
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
84 close(readsFile)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
85 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
86
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
87
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
88 #Read in bedpe(st2) file
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
89 parseinpe = function(bedfile) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
90 l = sqldf()
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
91 readsFile = file(bedfile)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
92 sebastian = sqldf("SELECT * FROM readsFile", file.format=list(header = FALSE, sep = "\t"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
93 return(cbind(sebastian[[1]]+1, sebastian[[2]]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
94 l = sqldf()
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
95 close(readsFile)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
96 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
97
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
98
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
99 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
100 countreads = function(bedfile, reads, frag, chromsize, filelist) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
101
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
102 o = which(filelist == bedfile)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
103
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
104 counts = vector(mode = "numeric", length = chromsize)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
105
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
106 for (j in 1:length(reads[[o]])) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
107 if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
108 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
109 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
110 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
111 counts = counts/(mean(counts))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
112 return(counts)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
113 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
114
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
115
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
116
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
117 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
118 countreadspe = function(bedfile, reads, chromsize, filelist) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
119
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
120 o = which(filelist == bedfile)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
121
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
122 counts = vector(mode = "numeric", length = chromsize)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
123
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
124 for (j in 1:length(reads[[o]][,1])) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
125 counts[(reads[[o]][j,1]):(reads[[o]][j,2])] = counts[(reads[[o]][j,1]):(reads[[o]][j,2])] + 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
126 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
127 counts = counts/(mean(counts))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
128 return(counts)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
129 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
130
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
131
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
132
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
133 #find enriched bins
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
134 pickbins = function(winStart, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
135
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
136 if ((winStart + binSize) <= chromSize) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
137 winEnd = winStart + binSize
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
138 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
139 winEnd = chromSize
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
140 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
141 binSizeTemp = winEnd - winStart
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
142 tempend = winEnd - 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
143
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
144 #extract subset of the background
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
145 if (bkgd != "None") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
146 Cs = counts[[numdup+1]][winStart:tempend]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
147 mCs = mean(Cs)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
148 dCs = sd(Cs)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
149 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
150 Cs = sample(C, binSizeTemp, replace = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
151 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
152
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
153
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
154 #find out whether it's enriched
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
155 go = rep(0, numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
156 for (g in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
157 mS = (mean(counts[[g]][winStart:tempend]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
158 ratio = mS/dCs
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
159 if ((mS > (mCs * strict)) && (ratio > cutoff)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
160 go[g] = 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
161 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
162 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
163 veep = sum(go)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
164
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
165 #get counts when enriched
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
166 if (veep == numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
167 mS = rep(0, numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
168 for (g in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
169 mS[g] = (mean(counts[[g]][winStart:winEnd]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
170 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
171 cairo = mean(mS)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
172 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
173 mS = rep(0, numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
174 for (g in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
175 mS[g] = (mean(counts[[g]][winStart:winEnd]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
176 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
177 cairo = -(mean(mS))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
178 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
179 return(cairo)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
180 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
181
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
182
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
183
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
184 #find enriched wins
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
185 pickwins = function(winStart, coffeeshopSud, counts, numdup, startlist, winSize) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
186
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
187 plz = which(startlist == winStart)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
188 winEnd = coffeeshopSud[plz]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
189 rWinSize = winEnd - winStart + 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
190
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
191
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
192 if(rWinSize >= winSize) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
193 mS = rep(0, numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
194 for (g in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
195 mS[g] = (sum(counts[[g]][winStart:winEnd]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
196 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
197 veep = sum(mS)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
198 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
199 veep = FALSE
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
200 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
201
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
202 return(veep)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
203 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
204
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
205
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
206
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
207 #Initialize MClust clustering parameters
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
208 smoothcounts = function(winStart, coffeeshopSud, numdup, counts, startlist) { #helper function1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
209
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
210 plz = which(startlist == winStart)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
211 winEnd = coffeeshopSud[plz]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
212
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
213 #extract subset of the IP
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
214 Rs = matrix(0, nrow = (winEnd - winStart + 1), ncol = numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
215 for (j in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
216 Rs[,j] = counts[[j]][winStart:winEnd]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
217 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
218 #smooth extended read counts
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
219 for (j in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
220 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
221 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
222 return(Rs)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
223 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
224 cluster = function(model, sig, init, clustnummer, noise) { #helper function2
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
225 noisy = sample(noise, length(sig[,1]), replace = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
226 clust = me(model, sig+noisy, init)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
227 bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
228 out = list(bicc = bicc, param = clust$parameters)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
229 return(out)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
230 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
231 initparam = function(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) { #main function
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
232
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
233 n = length(coffeeshopNord)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
234 #smooth extended read counts
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
235 if (cornum > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
236 sig = mclapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord, mc.cores = cornum, mc.preschedule = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
237 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
238 sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
239 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
240 sig = do.call(rbind, sig)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
241
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
242 #kmeans initialization
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
243 init = kmeans(sig, clustnummer)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
244 init = unmap(init$cluster)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
245
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
246
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
247 if (cornum > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
248 param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
249 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
250 param = lapply(modelnames, cluster, sig, init, clustnummer, noise)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
251 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
252
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
253 bicc = vector(mode = "numeric", length = length(modelnames))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
254 for (i in 1:length(modelnames)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
255 bicc[i] = as.numeric(param[[i]]$bicc)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
256 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
257 bicc = which.max(bicc)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
258
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
259 out = list(initparam = param[[bicc]]$param, modelname = modelnames[bicc])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
260 return(out)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
261 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
262
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
263
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
264
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
265
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
266 #find peaks
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
267 findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
268
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
269
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
270 plz = which(startlist == winStart)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
271 winEnd = coffeeshopSud[plz]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
272
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
273
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
274 #will store peak information
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
275 writethis = list()
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
276
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
277
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
278 rWinSizeTemp = winEnd - winStart + 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
279
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
280 #extract subset of the IP
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
281 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
282 Rsr = Rs
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
283 for (j in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
284 Rsr[,j] = counts[[j]][winStart:winEnd]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
285 Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
286 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
287
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
288
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
289 ####CLUSTERING START HERE#####
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
290 noisy = sample(noise, rWinSizeTemp, replace = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
291 clust = em(param$modelname, Rs+noisy, param$initparam)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
292 clust$classification = map(clust$z)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
293 ####CLUSTERING DONE####
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
294
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
295 #check whether clutering succeeded
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
296 if((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
297
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
298
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
299 #check whether all components replicates agreed on clustering assignments
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
300 ccx = 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
301 if (numdup > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
302 cc = vector(mode = "numeric", length = numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
303 for (g in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
304 cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
305 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
306 ccx = sum(diff(cc))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
307 cluster = cc[1]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
308 rm(cc)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
309 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
310 cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
311 ccx = 0
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
312 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
313 if (ccx == 0) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
314
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
315 #extract subset of the background
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
316 if (bkgd != "None") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
317 Cs = counts[[numdup+1]][winStart:winEnd]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
318 Cs = filtfilt(rep(1,80)/80,1,Cs)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
319 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
320 Cs = sample(C, rWinSizeTemp, replace = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
321 Cs = filtfilt(rep(1,80)/80,1,Cs)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
322 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
323
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
324 #find region boundaries
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
325 loc = 1:length(clust$classification)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
326 gmclass = cbind(loc, clust$classification)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
327 locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
328 rStart = locPeak[1] #start position of the region
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
329 rEnd = locPeak[length(locPeak[,1]),1] #end position of the region
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
330
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
331 #peak resolution check
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
332 if (resol == "region") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
333 pSize = rEnd - rStart
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
334 signal = (geomean(Rs[rStart:rEnd,]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
335 signal2 = (signal) - (Cs[rStart:rEnd])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
336 gm = mean(signal2)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
337 summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
338 will2k = wilcox.test(signal, Cs[rStart:rEnd])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
339
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
340 #Is there signal in the region above background
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
341 if (gm > 0) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
342 writethis[[1]] = rStart + winStart - 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
343 writethis[[2]] = rEnd + winStart
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
344 writethis[[3]] = paste0(chromName, ".", rStart+winStart -1)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
345 writethis[[4]] = "1000"
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
346 writethis[[5]] = "."
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
347 writethis[[6]] = gm
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
348 writethis[[7]] = will2k$p.value
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
349 writethis[[8]] = "-1"
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
350 writethis[[9]] = summit
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
351 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
352 } else if (resol == "peak") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
353 #find out where separate peaks are
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
354 d = diff(locPeak[,1])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
355 d[length(d)+1] = 0
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
356 locPeak = cbind(locPeak, d)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
357 bound1 = which(locPeak[,3] > 1, arr.in=TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
358 bound2 = bound1 + 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
359 bound = locPeak[sort(c(bound1,bound2))]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
360 bound = c(rStart, bound, rEnd)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
361 w = 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
362 warum = 0
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
363 while (w < length(bound)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
364 pStart = bound[w] + winStart - 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
365 pEnd = bound[w+1] + winStart
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
366 pSize = pEnd - pStart
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
367 signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
368 signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
369 gm = mean(signal2)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
370 summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
371 will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
372
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
373 weil = warum * 9
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
374 #Is there signal in the region above background
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
375 if (gm > 0) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
376 writethis[[1+weil]] = pStart
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
377 writethis[[2+weil]] = pEnd
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
378 writethis[[3+weil]] = paste0(chromName, ".", pStart)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
379 writethis[[4+weil]] = "1000"
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
380 writethis[[5+weil]] = "."
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
381 writethis[[6+weil]] = gm
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
382 writethis[[7+weil]] = will2k$p.value
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
383 writethis[[8+weil]] = "-1"
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
384 writethis[[9+weil]] = summit
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
385 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
386 w = w + 2
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
387 warum = warum + 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
388 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
389
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
390 } #peak resolution check
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
391 } #all replicates agree on clustering assignments?
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
392 } #clustering worked?
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
393 return(writethis)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
394 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
395
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
396
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
397 #filter return value of findpeak()
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
398 processPeaks = function(peaks) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
399 peaks = matrix(unlist(peaks), ncol=9, byrow=TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
400 peaks = peaks[peaks[,1] != FALSE,,drop=FALSE]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
401 peaks = data.frame(peaks)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
402 return(peaks)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
403 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
404 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
405
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
406
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
407
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
408
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
409
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
410
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
411
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
412
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
413
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
414
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
415
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
416
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
417
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
418
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
419
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
420
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
421
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
422
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
423 # ==========================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
424 # Parse-in System Variables
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
425 # ==========================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
426 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
427
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
428 #Parsing arguments and storing values
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
429 for (each.arg in args) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
430 #chormosome size file
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
431 if (grepl('^-sfile=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
432 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
433 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
434 size.file <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
435 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
436 stop('No genome size file')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
437 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
438 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
439 #bed file names
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
440 if (grepl('^-bednames=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
441 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
442 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
443 bednames <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
444 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
445 stop('No bed file names')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
446 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
447 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
448 #bed files directory
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
449 if (grepl('^-frag=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
450 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
451 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
452 frag <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
453 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
454 stop('No fragment length given')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
455 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
456 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
457 #bakcground files directory
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
458 if (grepl('^-bkgd=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
459 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
460 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
461 bkgd <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
462 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
463 stop('No background file')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
464 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
465 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
466 #bakcground files directory
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
467 if (grepl('^-out=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
468 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
469 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
470 out <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
471 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
472 stop('No output directory given')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
473 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
474 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
475 #Cluster number
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
476 if (grepl('^-clustnummer=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
477 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
478 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
479 clustnummer <- as.numeric(arg.split[2])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
480 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
481 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
482 #resolution
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
483 if (grepl('^-resolution=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
484 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
485 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
486 resol <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
487 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
488 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
489 #processor cores
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
490 if (grepl('^-p=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
491 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
492 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
493 cornum <- as.numeric(arg.split[2])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
494 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
495 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
496 #minimum window size
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
497 if (grepl('^-window=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
498 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
499 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
500 winSize <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
501 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
502 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
503 #window size
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
504 if (grepl('^-bin=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
505 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
506 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
507 binsize <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
508 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
509 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
510 #type (paired / single)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
511 if (grepl('^-type=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
512 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on =
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
513 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
514 type <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
515 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
516 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
517 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
518
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
519 ##Parse in variables
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
520 chromosomes = read.table(size.file, header=FALSE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
521 chromName = chromosomes$V1; #which chromosome
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
522 chromSize = chromosomes$V2; #chromosomes size
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
523 rm(chromosomes)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
524
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
525 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
526 numdup = length(readsFiles) #number of replicates
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
527 if (bkgd != "None") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
528 readsFiles[[numdup+1]] = bkgd
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
529 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
530 winSize = as.numeric(winSize)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
531 binSize = as.numeric(binsize)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
532 winSize = binSize * winSize
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
533
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
534 if (type == "single") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
535 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
536 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
537 rm(bednames)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
538
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
539 if (numdup > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
540 modelnames = c("VVV","VEV")
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
541 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
542 modelnames = "V"
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
543 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
544
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
545 options(stringsAsFactors = FALSE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
546 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
547
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
548
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
549
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
550
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
551 # =======================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
552 # Some preliminary stuff
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
553 # =======================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
554 if (type == "single") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
555 if (cornum > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
556 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
557 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
558 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
559 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
560 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
561
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
562 if (type == "paired") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
563 if (cornum > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
564 datain = mclapply(readsFiles, parseinpe, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
565 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
566 datain = lapply(readsFiles, parseinpe) #read in all bed files (samples and control)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
567 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
568 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
569
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
570 #minimum peak size (only a recommendation)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
571 minpeak = floor(binSize / 4)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
572
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
573 #make bins vector
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
574 bins = seq(from = 1, to = (chromSize - 1), by = binSize)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
575 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
576
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
577
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
578
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
579
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
580 # ===============
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
581 # Counting Reads
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
582 # ===============
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
583 if (type == "single") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
584 if (cornum > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
585 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
586 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
587 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
588 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
589 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
590
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
591 if (type == "paired") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
592 if (cornum > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
593 counts = mclapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
594 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
595 counts = lapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
596 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
597 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
598
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
599 rm(datain)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
600
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
601 #get total counts
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
602 mS = vector(mode = "numeric", length = numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
603 for (g in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
604 mS[g] = sum(counts[[g]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
605 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
606 totalCounts = sum(mS)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
607 rm(mS)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
608 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
609
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
610
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
611
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
612 # ===================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
613 # Estimating Cutoff
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
614 # ===================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
615 if (is.na(cutoff)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
616 if (bkgd != "None") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
617 cutoff = vector(length = numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
618 sdC = sd(counts[[numdup+1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
619 for (x in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
620 cutoff[x] = (mean(counts[[x]]))/(sdC)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
621 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
622 cutoff = max(cutoff)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
623 C = NULL
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
624 mCs = NULL
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
625 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
626 cutoff = vector(length = numdup)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
627 mmV = var(geomeanL(counts))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
628 mmM = mean(geomeanL(counts))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
629 sigma = log(1+((mmV) / ((mmM)^2)))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
630 mu = (log(mmM)) - (0.5 * (sigma))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
631 C = rlnorm(100000, mu, sqrt(sigma))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
632 for (x in 1:numdup) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
633 cutoff[x] = (mean(counts[[x]]))/(sd(C))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
634 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
635 cutoff = max(cutoff)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
636 mCs = (mean(sample(C, binSize*5, replace = TRUE)))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
637 dCs = (sd(sample(C, binSize*5, replace = TRUE)))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
638 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
639 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
640 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
641
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
642
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
643
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
644
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
645 # ========================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
646 # Picking Enriched Windows
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
647 # ========================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
648 binNum = vector(mode = "numeric", length = 41)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
649 binTotalSize = vector(mode = "numeric", length = 41)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
650 binTotalSizeRatio = vector(mode = "numeric", length = 41)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
651 binTotalCount = vector(mode = "numeric", length = 41)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
652 binTotalCountNO = vector(mode = "numeric", length = 41)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
653 binTotalCountRatio = vector(mode = "numeric", length = 41)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
654 maxiFun = vector(mode = "numeric", length = 41)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
655 ggg = seq(1,5, by = 0.1)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
656 for (i in 1:41) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
657 if (cornum > 1) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
658 coffeeshop = mclapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd, mc.cores = cornum, mc.preschedule = TRUE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
659 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
660 coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
661 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
662 coffeeshop = as.numeric(unlist(coffeeshop))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
663 coffeeshopYES = coffeeshop[coffeeshop > 0]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
664 coffeeshopNO = -(coffeeshop[coffeeshop <= 0])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
665
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
666 binNum[i] = length(coffeeshopYES)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
667 binTotalSize[i] = binNum[i] * binSize
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
668 binTotalSizeRatio[i] = binTotalSize[i] / chromSize
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
669
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
670 binTotalCount[i] = sum(coffeeshopYES)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
671 binTotalCountNO[i] = sum(coffeeshopNO)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
672 binTotalCountRatio[i] = binTotalCount[i] / binTotalCountNO[i]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
673
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
674 maxiFun[i] = binTotalCountRatio[i] / binTotalSizeRatio[i]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
675 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
676 rm(bins)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
677 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
678
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
679
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
680 write(paste(ggg, binNum, binTotalSize, binTotalSizeRatio, binTotalCount, binTotalCountNO, binTotalCountRatio, maxiFun, sep = " "), file = paste0(file = paste0("/home/mibrahim/Documents/", chromName, ".GOYEAH"), ncolumns = 1))