comparison signalmaker.r @ 0:d42f4d78c85e draft

Uploaded
author messersc
date Wed, 17 Dec 2014 10:40:23 -0500
parents
children 243f75d0ed6e
comparison
equal deleted inserted replaced
-1:000000000000 0:d42f4d78c85e
1 ###############################
2 #### Signal Maker for NGS Data
3 #### R script
4 ###############################
5
6
7
8
9 # =======================
10 # User-defined variables
11 # =======================
12 options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000)
13 #=======================> DONE!
14
15
16
17
18 # ================================
19 # Required Libraries check & load
20 # ================================
21 if ((is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
22 stop("R packages 'signal'and 'parallel' are required. Please install them!")
23 }
24 suppressPackageStartupMessages(library("signal"))
25 suppressPackageStartupMessages(library("parallel"))
26 #=======================> DONE!
27
28
29
30
31
32 # =================
33 # Custom Functions
34 # =================
35 #Get per-row Geometric mean (takes list, returns vectors, not lists!)
36 geomeanL <- function(mat){
37 n = length(mat)
38 if (n > 1) {
39 mult = (mat[[1]])*(mat[[2]])
40 if (n > 2) {
41 for (i in 3:n) {
42 mult = mult*(mat[[i]])
43 }
44 }
45 mult = mult^(1/n)
46 mat = mult
47 return(mat)
48 } else {
49 return(mat[[1]])
50 }
51 }
52
53
54
55 #Get per-row Geometric mean (takes matrix, returns vectors, not lists!)
56 geomean <- function(mat){
57 n = NCOL(mat)
58 if (n > 1) {
59 mult = (mat[,1])*(mat[,2])
60 if (n > 2) {
61 for (i in 3:n) {
62 mult = mult*(mat[,i])
63 }
64 }
65 mult = mult^(1/n)
66 mat = mult
67 }
68 return(mat)
69 }
70
71
72
73 #Read in bed(st2) file
74 parsein = function(bedfile) {
75 reads = read.table(bedfile, header = FALSE)[[1]]
76 return(reads)
77 }
78
79
80
81 #Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
82 countreads = function(bedfile, reads, frag, chromsize, filelist) {
83
84 o = which(filelist == bedfile)
85
86 counts = vector(mode = "numeric", length = chromsize)
87
88 for (j in 1:length(reads[[o]])) {
89 if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
90 counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
91 }
92 }
93
94 mCount = mean(counts)
95
96 if (chrcount == 1) {
97 counts = counts/mCount
98 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"))
99 } else {
100 meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]]))
101 if ((mCount > (5*meanCounts)) || (mCount < (0.2*meanCounts))) {
102 mCount = meanCounts
103 } else {
104 write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
105 }
106 counts = counts/mCount
107 }
108
109 return(counts)
110 }
111
112
113
114
115 #generate signal
116 getsig = function(winStart, winEnds, startList, stepSize, chromName) {
117
118
119 plz = which(startList == winStart)
120 winEnd = winEnds[plz]
121 rWinSizeTemp = winEnd - winStart + 1
122
123 #extract subset of the IP
124 Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
125 for (j in 1:numdup) {
126 Rs[,j] = counts[[j]][winStart:winEnd]
127 Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
128 }
129
130 #extract subset of the background and initial signal
131 if (bkgd != "None") {
132 Cs = counts[[numdup+1]][winStart:winEnd]
133 Cs = filtfilt(rep(1,80)/80,1,Cs)
134 signal = (geomean(Rs)) - (Cs)
135 } else {
136 signal = geomean(Rs)
137 }
138
139
140 #get binned signal
141 signal = round(colMeans(matrix(c(signal, rep(0, stepSize - length(signal) %% stepSize)), stepSize)), digits = 2)
142 steps = seq(winStart, winEnd, by = stepSize)
143 starts = steps - 1
144 ends = steps + stepSize - 1
145 if (ends[length(ends)] > winEnd) {
146 ends[length(ends)] = winEnd
147 }
148
149 #write to file
150 write(paste(chromName, starts, ends, signal, sep = " "), file = paste0(out, "/", chromName, plz, ".bedSignal"), ncolumns = 1)
151
152
153 return(NULL)
154 }
155 #=======================> DONE!
156
157
158
159
160
161 # ==========================
162 # Parse-in System Variables
163 # ==========================
164 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
165
166 #Parsing arguments and storing values
167 for (each.arg in args) {
168 #bed file names
169 if (grepl('^-bednames=',each.arg)) {
170 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
171 if (! is.na(arg.split[2]) ) {
172 bednames <- arg.split[2]
173 } else {
174 stop('No bed file names')
175 }
176 }
177 #bed file names
178 if (grepl('^-chromo=',each.arg)) {
179 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
180 if (! is.na(arg.split[2]) ) {
181 chromName <- arg.split[2]
182 } else {
183 stop('No bed file names')
184 }
185 }
186 #bed files directory
187 if (grepl('^-frag=',each.arg)) {
188 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
189 if (! is.na(arg.split[2]) ) {
190 frag <- arg.split[2]
191 } else {
192 stop('No fragment length given')
193 }
194 }
195 #bakcground files directory
196 if (grepl('^-bkgd=',each.arg)) {
197 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
198 if (! is.na(arg.split[2]) ) {
199 bkgd <- arg.split[2]
200 } else {
201 stop('No background file')
202 }
203 }
204 #bakcground files directory
205 if (grepl('^-out=',each.arg)) {
206 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
207 if (! is.na(arg.split[2]) ) {
208 out <- arg.split[2]
209 } else {
210 stop('No output directory given')
211 }
212 }
213 #Cluster number
214 if (grepl('^-clustnummer=',each.arg)) {
215 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
216 if (! is.na(arg.split[2]) ) {
217 clustnummer <- as.numeric(arg.split[2])
218 }
219 }
220 #regions
221 if (grepl('^-regions=',each.arg)) {
222 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
223 if (! is.na(arg.split[2]) ) {
224 regions <- arg.split[2]
225 }
226 }
227 #processor cores
228 if (grepl('^-p=',each.arg)) {
229 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
230 if (! is.na(arg.split[2]) ) {
231 cornum <- as.numeric(arg.split[2])
232 }
233 }
234 #chromSize
235 if (grepl('^-chromoS=',each.arg)) {
236 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
237 if (! is.na(arg.split[2]) ) {
238 chromSize <- as.numeric(arg.split[2])
239 }
240 }
241 #stepsize
242 if (grepl('^-sig=',each.arg)) {
243 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
244 if (! is.na(arg.split[2]) ) {
245 sigstep <- as.numeric(arg.split[2])
246 }
247 }
248 #chrcount
249 if (grepl('^-chrcount=',each.arg)) {
250 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
251 if (! is.na(arg.split[2]) ) {
252 chrcount <- as.numeric(arg.split[2])
253 }
254 }
255 }
256
257 ##Parse in variables
258 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
259 numdup = length(readsFiles) #number of replicates
260 if (bkgd != "None") {
261 readsFiles[[numdup+1]] = bkgd
262 }
263 frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
264 rm(bednames)
265
266 options(stringsAsFactors = FALSE)
267 #=======================> DONE!
268
269
270
271
272 # =======================
273 # Some preliminary stuff
274 # =======================
275 #import regions
276 regions = read.table(regions, header = FALSE)
277 regions = regions[regions[[1]] == chromName, , drop = FALSE]
278 regions = cbind(regions[[2]] + 1, regions[[3]])
279
280 #import read data
281 if (cornum > 1) {
282 datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
283 } else {
284 datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
285 }
286 #=======================> DONE!
287
288
289
290 # ===============
291 # Counting Reads
292 # ===============
293 if (cornum > 1) {
294 counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
295 } else {
296 counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles)
297 }
298 rm(datain)
299 #=======================> DONE!
300
301
302
303 # ==============
304 # Getting signal
305 # ==============
306 if (cornum > 1) {
307 sig = mclapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName, mc.cores = cornum, mc.preschedule = TRUE)
308 } else {
309 sig = lapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName)
310 }
311 message("Done!")
312 #=======================> DONE!