Mercurial > repos > messersc > jamm
comparison signalmaker.r @ 0:d42f4d78c85e draft
Uploaded
author | messersc |
---|---|
date | Wed, 17 Dec 2014 10:40:23 -0500 |
parents | |
children | 243f75d0ed6e |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d42f4d78c85e |
---|---|
1 ############################### | |
2 #### Signal Maker for NGS Data | |
3 #### R script | |
4 ############################### | |
5 | |
6 | |
7 | |
8 | |
9 # ======================= | |
10 # User-defined variables | |
11 # ======================= | |
12 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) | |
13 #=======================> DONE! | |
14 | |
15 | |
16 | |
17 | |
18 # ================================ | |
19 # Required Libraries check & load | |
20 # ================================ | |
21 if ((is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) { | |
22 stop("R packages 'signal'and 'parallel' are required. Please install them!") | |
23 } | |
24 suppressPackageStartupMessages(library("signal")) | |
25 suppressPackageStartupMessages(library("parallel")) | |
26 #=======================> DONE! | |
27 | |
28 | |
29 | |
30 | |
31 | |
32 # ================= | |
33 # Custom Functions | |
34 # ================= | |
35 #Get per-row Geometric mean (takes list, returns vectors, not lists!) | |
36 geomeanL <- function(mat){ | |
37 n = length(mat) | |
38 if (n > 1) { | |
39 mult = (mat[[1]])*(mat[[2]]) | |
40 if (n > 2) { | |
41 for (i in 3:n) { | |
42 mult = mult*(mat[[i]]) | |
43 } | |
44 } | |
45 mult = mult^(1/n) | |
46 mat = mult | |
47 return(mat) | |
48 } else { | |
49 return(mat[[1]]) | |
50 } | |
51 } | |
52 | |
53 | |
54 | |
55 #Get per-row Geometric mean (takes matrix, returns vectors, not lists!) | |
56 geomean <- function(mat){ | |
57 n = NCOL(mat) | |
58 if (n > 1) { | |
59 mult = (mat[,1])*(mat[,2]) | |
60 if (n > 2) { | |
61 for (i in 3:n) { | |
62 mult = mult*(mat[,i]) | |
63 } | |
64 } | |
65 mult = mult^(1/n) | |
66 mat = mult | |
67 } | |
68 return(mat) | |
69 } | |
70 | |
71 | |
72 | |
73 #Read in bed(st2) file | |
74 parsein = function(bedfile) { | |
75 reads = read.table(bedfile, header = FALSE)[[1]] | |
76 return(reads) | |
77 } | |
78 | |
79 | |
80 | |
81 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats) | |
82 countreads = function(bedfile, reads, frag, chromsize, filelist) { | |
83 | |
84 o = which(filelist == bedfile) | |
85 | |
86 counts = vector(mode = "numeric", length = chromsize) | |
87 | |
88 for (j in 1:length(reads[[o]])) { | |
89 if ((reads[[o]][j]+frag[o]-1) <= chromsize) { | |
90 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1 | |
91 } | |
92 } | |
93 | |
94 mCount = mean(counts) | |
95 | |
96 if (chrcount == 1) { | |
97 counts = counts/mCount | |
98 write(paste(mCount), file = paste0(out, "/norma.", o, ".info")) | |
99 } else { | |
100 meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]])) | |
101 if ((mCount > (5*meanCounts)) || (mCount < (0.2*meanCounts))) { | |
102 mCount = meanCounts | |
103 } else { | |
104 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) | |
105 } | |
106 counts = counts/mCount | |
107 } | |
108 | |
109 return(counts) | |
110 } | |
111 | |
112 | |
113 | |
114 | |
115 #generate signal | |
116 getsig = function(winStart, winEnds, startList, stepSize, chromName) { | |
117 | |
118 | |
119 plz = which(startList == winStart) | |
120 winEnd = winEnds[plz] | |
121 rWinSizeTemp = winEnd - winStart + 1 | |
122 | |
123 #extract subset of the IP | |
124 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup) | |
125 for (j in 1:numdup) { | |
126 Rs[,j] = counts[[j]][winStart:winEnd] | |
127 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j]) | |
128 } | |
129 | |
130 #extract subset of the background and initial signal | |
131 if (bkgd != "None") { | |
132 Cs = counts[[numdup+1]][winStart:winEnd] | |
133 Cs = filtfilt(rep(1,80)/80,1,Cs) | |
134 signal = (geomean(Rs)) - (Cs) | |
135 } else { | |
136 signal = geomean(Rs) | |
137 } | |
138 | |
139 | |
140 #get binned signal | |
141 signal = round(colMeans(matrix(c(signal, rep(0, stepSize - length(signal) %% stepSize)), stepSize)), digits = 2) | |
142 steps = seq(winStart, winEnd, by = stepSize) | |
143 starts = steps - 1 | |
144 ends = steps + stepSize - 1 | |
145 if (ends[length(ends)] > winEnd) { | |
146 ends[length(ends)] = winEnd | |
147 } | |
148 | |
149 #write to file | |
150 write(paste(chromName, starts, ends, signal, sep = " "), file = paste0(out, "/", chromName, plz, ".bedSignal"), ncolumns = 1) | |
151 | |
152 | |
153 return(NULL) | |
154 } | |
155 #=======================> DONE! | |
156 | |
157 | |
158 | |
159 | |
160 | |
161 # ========================== | |
162 # Parse-in System Variables | |
163 # ========================== | |
164 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line | |
165 | |
166 #Parsing arguments and storing values | |
167 for (each.arg in args) { | |
168 #bed file names | |
169 if (grepl('^-bednames=',each.arg)) { | |
170 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
171 if (! is.na(arg.split[2]) ) { | |
172 bednames <- arg.split[2] | |
173 } else { | |
174 stop('No bed file names') | |
175 } | |
176 } | |
177 #bed file names | |
178 if (grepl('^-chromo=',each.arg)) { | |
179 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
180 if (! is.na(arg.split[2]) ) { | |
181 chromName <- arg.split[2] | |
182 } else { | |
183 stop('No bed file names') | |
184 } | |
185 } | |
186 #bed files directory | |
187 if (grepl('^-frag=',each.arg)) { | |
188 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
189 if (! is.na(arg.split[2]) ) { | |
190 frag <- arg.split[2] | |
191 } else { | |
192 stop('No fragment length given') | |
193 } | |
194 } | |
195 #bakcground files directory | |
196 if (grepl('^-bkgd=',each.arg)) { | |
197 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
198 if (! is.na(arg.split[2]) ) { | |
199 bkgd <- arg.split[2] | |
200 } else { | |
201 stop('No background file') | |
202 } | |
203 } | |
204 #bakcground files directory | |
205 if (grepl('^-out=',each.arg)) { | |
206 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
207 if (! is.na(arg.split[2]) ) { | |
208 out <- arg.split[2] | |
209 } else { | |
210 stop('No output directory given') | |
211 } | |
212 } | |
213 #Cluster number | |
214 if (grepl('^-clustnummer=',each.arg)) { | |
215 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
216 if (! is.na(arg.split[2]) ) { | |
217 clustnummer <- as.numeric(arg.split[2]) | |
218 } | |
219 } | |
220 #regions | |
221 if (grepl('^-regions=',each.arg)) { | |
222 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
223 if (! is.na(arg.split[2]) ) { | |
224 regions <- arg.split[2] | |
225 } | |
226 } | |
227 #processor cores | |
228 if (grepl('^-p=',each.arg)) { | |
229 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
230 if (! is.na(arg.split[2]) ) { | |
231 cornum <- as.numeric(arg.split[2]) | |
232 } | |
233 } | |
234 #chromSize | |
235 if (grepl('^-chromoS=',each.arg)) { | |
236 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
237 if (! is.na(arg.split[2]) ) { | |
238 chromSize <- as.numeric(arg.split[2]) | |
239 } | |
240 } | |
241 #stepsize | |
242 if (grepl('^-sig=',each.arg)) { | |
243 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
244 if (! is.na(arg.split[2]) ) { | |
245 sigstep <- as.numeric(arg.split[2]) | |
246 } | |
247 } | |
248 #chrcount | |
249 if (grepl('^-chrcount=',each.arg)) { | |
250 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] | |
251 if (! is.na(arg.split[2]) ) { | |
252 chrcount <- as.numeric(arg.split[2]) | |
253 } | |
254 } | |
255 } | |
256 | |
257 ##Parse in variables | |
258 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]]) | |
259 numdup = length(readsFiles) #number of replicates | |
260 if (bkgd != "None") { | |
261 readsFiles[[numdup+1]] = bkgd | |
262 } | |
263 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]]) | |
264 rm(bednames) | |
265 | |
266 options(stringsAsFactors = FALSE) | |
267 #=======================> DONE! | |
268 | |
269 | |
270 | |
271 | |
272 # ======================= | |
273 # Some preliminary stuff | |
274 # ======================= | |
275 #import regions | |
276 regions = read.table(regions, header = FALSE) | |
277 regions = regions[regions[[1]] == chromName, , drop = FALSE] | |
278 regions = cbind(regions[[2]] + 1, regions[[3]]) | |
279 | |
280 #import read data | |
281 if (cornum > 1) { | |
282 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control) | |
283 } else { | |
284 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control) | |
285 } | |
286 #=======================> DONE! | |
287 | |
288 | |
289 | |
290 # =============== | |
291 # Counting Reads | |
292 # =============== | |
293 if (cornum > 1) { | |
294 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE) | |
295 } else { | |
296 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles) | |
297 } | |
298 rm(datain) | |
299 #=======================> DONE! | |
300 | |
301 | |
302 | |
303 # ============== | |
304 # Getting signal | |
305 # ============== | |
306 if (cornum > 1) { | |
307 sig = mclapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName, mc.cores = cornum, mc.preschedule = TRUE) | |
308 } else { | |
309 sig = lapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName) | |
310 } | |
311 message("Done!") | |
312 #=======================> DONE! |