0
|
1 ###############################
|
|
2 #### Signal Maker for NGS Data
|
|
3 #### R script
|
|
4 ###############################
|
|
5
|
|
6
|
|
7
|
|
8
|
|
9 # =======================
|
|
10 # User-defined variables
|
|
11 # =======================
|
|
12 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000)
|
|
13 #=======================> DONE!
|
|
14
|
|
15
|
|
16
|
|
17
|
|
18 # ================================
|
|
19 # Required Libraries check & load
|
|
20 # ================================
|
|
21 if ((is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
|
|
22 stop("R packages 'signal'and 'parallel' are required. Please install them!")
|
|
23 }
|
|
24 suppressPackageStartupMessages(library("signal"))
|
|
25 suppressPackageStartupMessages(library("parallel"))
|
|
26 #=======================> DONE!
|
|
27
|
|
28
|
|
29
|
|
30
|
|
31
|
|
32 # =================
|
|
33 # Custom Functions
|
|
34 # =================
|
|
35 #Get per-row Geometric mean (takes list, returns vectors, not lists!)
|
|
36 geomeanL <- function(mat){
|
|
37 n = length(mat)
|
|
38 if (n > 1) {
|
|
39 mult = (mat[[1]])*(mat[[2]])
|
|
40 if (n > 2) {
|
|
41 for (i in 3:n) {
|
|
42 mult = mult*(mat[[i]])
|
|
43 }
|
|
44 }
|
|
45 mult = mult^(1/n)
|
|
46 mat = mult
|
|
47 return(mat)
|
|
48 } else {
|
|
49 return(mat[[1]])
|
|
50 }
|
|
51 }
|
|
52
|
|
53
|
|
54
|
|
55 #Get per-row Geometric mean (takes matrix, returns vectors, not lists!)
|
|
56 geomean <- function(mat){
|
|
57 n = NCOL(mat)
|
|
58 if (n > 1) {
|
|
59 mult = (mat[,1])*(mat[,2])
|
|
60 if (n > 2) {
|
|
61 for (i in 3:n) {
|
|
62 mult = mult*(mat[,i])
|
|
63 }
|
|
64 }
|
|
65 mult = mult^(1/n)
|
|
66 mat = mult
|
|
67 }
|
|
68 return(mat)
|
|
69 }
|
|
70
|
|
71
|
|
72
|
|
73 #Read in bed(st2) file
|
|
74 parsein = function(bedfile) {
|
|
75 reads = read.table(bedfile, header = FALSE)[[1]]
|
|
76 return(reads)
|
|
77 }
|
|
78
|
|
79
|
|
80
|
|
81 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
|
|
82 countreads = function(bedfile, reads, frag, chromsize, filelist) {
|
|
83
|
|
84 o = which(filelist == bedfile)
|
|
85
|
|
86 counts = vector(mode = "numeric", length = chromsize)
|
|
87
|
|
88 for (j in 1:length(reads[[o]])) {
|
|
89 if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
|
|
90 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
|
|
91 }
|
|
92 }
|
|
93
|
|
94 mCount = mean(counts)
|
|
95
|
|
96 if (chrcount == 1) {
|
|
97 counts = counts/mCount
|
|
98 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"))
|
|
99 } else {
|
|
100 meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]]))
|
|
101 if ((mCount > (5*meanCounts)) || (mCount < (0.2*meanCounts))) {
|
|
102 mCount = meanCounts
|
|
103 } else {
|
|
104 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
|
|
105 }
|
|
106 counts = counts/mCount
|
|
107 }
|
|
108
|
|
109 return(counts)
|
|
110 }
|
|
111
|
|
112
|
|
113
|
|
114
|
|
115 #generate signal
|
|
116 getsig = function(winStart, winEnds, startList, stepSize, chromName) {
|
|
117
|
|
118
|
|
119 plz = which(startList == winStart)
|
|
120 winEnd = winEnds[plz]
|
|
121 rWinSizeTemp = winEnd - winStart + 1
|
|
122
|
|
123 #extract subset of the IP
|
|
124 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
|
|
125 for (j in 1:numdup) {
|
|
126 Rs[,j] = counts[[j]][winStart:winEnd]
|
|
127 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
|
|
128 }
|
|
129
|
|
130 #extract subset of the background and initial signal
|
|
131 if (bkgd != "None") {
|
|
132 Cs = counts[[numdup+1]][winStart:winEnd]
|
|
133 Cs = filtfilt(rep(1,80)/80,1,Cs)
|
|
134 signal = (geomean(Rs)) - (Cs)
|
|
135 } else {
|
|
136 signal = geomean(Rs)
|
|
137 }
|
|
138
|
|
139
|
|
140 #get binned signal
|
|
141 signal = round(colMeans(matrix(c(signal, rep(0, stepSize - length(signal) %% stepSize)), stepSize)), digits = 2)
|
|
142 steps = seq(winStart, winEnd, by = stepSize)
|
|
143 starts = steps - 1
|
|
144 ends = steps + stepSize - 1
|
|
145 if (ends[length(ends)] > winEnd) {
|
|
146 ends[length(ends)] = winEnd
|
|
147 }
|
|
148
|
|
149 #write to file
|
|
150 write(paste(chromName, starts, ends, signal, sep = " "), file = paste0(out, "/", chromName, plz, ".bedSignal"), ncolumns = 1)
|
|
151
|
|
152
|
|
153 return(NULL)
|
|
154 }
|
|
155 #=======================> DONE!
|
|
156
|
|
157
|
|
158
|
|
159
|
|
160
|
|
161 # ==========================
|
|
162 # Parse-in System Variables
|
|
163 # ==========================
|
|
164 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
|
|
165
|
|
166 #Parsing arguments and storing values
|
|
167 for (each.arg in args) {
|
|
168 #bed file names
|
|
169 if (grepl('^-bednames=',each.arg)) {
|
|
170 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
171 if (! is.na(arg.split[2]) ) {
|
|
172 bednames <- arg.split[2]
|
|
173 } else {
|
|
174 stop('No bed file names')
|
|
175 }
|
|
176 }
|
|
177 #bed file names
|
|
178 if (grepl('^-chromo=',each.arg)) {
|
|
179 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
180 if (! is.na(arg.split[2]) ) {
|
|
181 chromName <- arg.split[2]
|
|
182 } else {
|
|
183 stop('No bed file names')
|
|
184 }
|
|
185 }
|
|
186 #bed files directory
|
|
187 if (grepl('^-frag=',each.arg)) {
|
|
188 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
189 if (! is.na(arg.split[2]) ) {
|
|
190 frag <- arg.split[2]
|
|
191 } else {
|
|
192 stop('No fragment length given')
|
|
193 }
|
|
194 }
|
|
195 #bakcground files directory
|
|
196 if (grepl('^-bkgd=',each.arg)) {
|
|
197 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
198 if (! is.na(arg.split[2]) ) {
|
|
199 bkgd <- arg.split[2]
|
|
200 } else {
|
|
201 stop('No background file')
|
|
202 }
|
|
203 }
|
|
204 #bakcground files directory
|
|
205 if (grepl('^-out=',each.arg)) {
|
|
206 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
207 if (! is.na(arg.split[2]) ) {
|
|
208 out <- arg.split[2]
|
|
209 } else {
|
|
210 stop('No output directory given')
|
|
211 }
|
|
212 }
|
|
213 #Cluster number
|
|
214 if (grepl('^-clustnummer=',each.arg)) {
|
|
215 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
216 if (! is.na(arg.split[2]) ) {
|
|
217 clustnummer <- as.numeric(arg.split[2])
|
|
218 }
|
|
219 }
|
|
220 #regions
|
|
221 if (grepl('^-regions=',each.arg)) {
|
|
222 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
223 if (! is.na(arg.split[2]) ) {
|
|
224 regions <- arg.split[2]
|
|
225 }
|
|
226 }
|
|
227 #processor cores
|
|
228 if (grepl('^-p=',each.arg)) {
|
|
229 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
230 if (! is.na(arg.split[2]) ) {
|
|
231 cornum <- as.numeric(arg.split[2])
|
|
232 }
|
|
233 }
|
|
234 #chromSize
|
|
235 if (grepl('^-chromoS=',each.arg)) {
|
|
236 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
237 if (! is.na(arg.split[2]) ) {
|
|
238 chromSize <- as.numeric(arg.split[2])
|
|
239 }
|
|
240 }
|
|
241 #stepsize
|
|
242 if (grepl('^-sig=',each.arg)) {
|
|
243 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
244 if (! is.na(arg.split[2]) ) {
|
|
245 sigstep <- as.numeric(arg.split[2])
|
|
246 }
|
|
247 }
|
|
248 #chrcount
|
|
249 if (grepl('^-chrcount=',each.arg)) {
|
|
250 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
251 if (! is.na(arg.split[2]) ) {
|
|
252 chrcount <- as.numeric(arg.split[2])
|
|
253 }
|
|
254 }
|
|
255 }
|
|
256
|
|
257 ##Parse in variables
|
|
258 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
|
|
259 numdup = length(readsFiles) #number of replicates
|
|
260 if (bkgd != "None") {
|
|
261 readsFiles[[numdup+1]] = bkgd
|
|
262 }
|
|
263 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
|
|
264 rm(bednames)
|
|
265
|
|
266 options(stringsAsFactors = FALSE)
|
|
267 #=======================> DONE!
|
|
268
|
|
269
|
|
270
|
|
271
|
|
272 # =======================
|
|
273 # Some preliminary stuff
|
|
274 # =======================
|
|
275 #import regions
|
|
276 regions = read.table(regions, header = FALSE)
|
|
277 regions = regions[regions[[1]] == chromName, , drop = FALSE]
|
|
278 regions = cbind(regions[[2]] + 1, regions[[3]])
|
|
279
|
|
280 #import read data
|
|
281 if (cornum > 1) {
|
|
282 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
|
|
283 } else {
|
|
284 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
|
|
285 }
|
|
286 #=======================> DONE!
|
|
287
|
|
288
|
|
289
|
|
290 # ===============
|
|
291 # Counting Reads
|
|
292 # ===============
|
|
293 if (cornum > 1) {
|
|
294 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
|
|
295 } else {
|
|
296 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles)
|
|
297 }
|
|
298 rm(datain)
|
|
299 #=======================> DONE!
|
|
300
|
|
301
|
|
302
|
|
303 # ==============
|
|
304 # Getting signal
|
|
305 # ==============
|
|
306 if (cornum > 1) {
|
|
307 sig = mclapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName, mc.cores = cornum, mc.preschedule = TRUE)
|
|
308 } else {
|
|
309 sig = lapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName)
|
|
310 }
|
|
311 message("Done!")
|
|
312 #=======================> DONE!
|