Mercurial > repos > messersc > jamm
diff signalmaker.r @ 0:d42f4d78c85e draft
Uploaded
author | messersc |
---|---|
date | Wed, 17 Dec 2014 10:40:23 -0500 |
parents | |
children | 243f75d0ed6e |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/signalmaker.r Wed Dec 17 10:40:23 2014 -0500 @@ -0,0 +1,312 @@ +############################### +#### Signal Maker for NGS Data +#### R script +############################### + + + + +# ======================= +# User-defined variables +# ======================= +options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) +#=======================> DONE! + + + + +# ================================ +# Required Libraries check & load +# ================================ +if ((is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) { + stop("R packages 'signal'and 'parallel' are required. Please install them!") +} +suppressPackageStartupMessages(library("signal")) +suppressPackageStartupMessages(library("parallel")) +#=======================> DONE! + + + + + +# ================= +# Custom Functions +# ================= +#Get per-row Geometric mean (takes list, returns vectors, not lists!) +geomeanL <- function(mat){ + n = length(mat) + if (n > 1) { + mult = (mat[[1]])*(mat[[2]]) + if (n > 2) { + for (i in 3:n) { + mult = mult*(mat[[i]]) + } + } + mult = mult^(1/n) + mat = mult + return(mat) + } else { + return(mat[[1]]) + } +} + + + +#Get per-row Geometric mean (takes matrix, returns vectors, not lists!) +geomean <- function(mat){ + n = NCOL(mat) + if (n > 1) { + mult = (mat[,1])*(mat[,2]) + if (n > 2) { + for (i in 3:n) { + mult = mult*(mat[,i]) + } + } + mult = mult^(1/n) + mat = mult + } +return(mat) +} + + + +#Read in bed(st2) file +parsein = function(bedfile) { + reads = read.table(bedfile, header = FALSE)[[1]] + return(reads) +} + + + +#Produces normalized extended read counts (takes output of parsein(), return a vector of floats) +countreads = function(bedfile, reads, frag, chromsize, filelist) { + + o = which(filelist == bedfile) + + counts = vector(mode = "numeric", length = chromsize) + + for (j in 1:length(reads[[o]])) { + if ((reads[[o]][j]+frag[o]-1) <= chromsize) { + counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1 + } + } + + mCount = mean(counts) + + if (chrcount == 1) { + counts = counts/mCount + write(paste(mCount), file = paste0(out, "/norma.", o, ".info")) + } else { + meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]])) + if ((mCount > (5*meanCounts)) || (mCount < (0.2*meanCounts))) { + mCount = meanCounts + } else { + write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE) + } + counts = counts/mCount + } + + return(counts) +} + + + + +#generate signal +getsig = function(winStart, winEnds, startList, stepSize, chromName) { + + + plz = which(startList == winStart) + winEnd = winEnds[plz] + rWinSizeTemp = winEnd - winStart + 1 + + #extract subset of the IP + Rs = matrix(nrow = rWinSizeTemp, ncol = numdup) + for (j in 1:numdup) { + Rs[,j] = counts[[j]][winStart:winEnd] + Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j]) + } + + #extract subset of the background and initial signal + if (bkgd != "None") { + Cs = counts[[numdup+1]][winStart:winEnd] + Cs = filtfilt(rep(1,80)/80,1,Cs) + signal = (geomean(Rs)) - (Cs) + } else { + signal = geomean(Rs) + } + + + #get binned signal + signal = round(colMeans(matrix(c(signal, rep(0, stepSize - length(signal) %% stepSize)), stepSize)), digits = 2) + steps = seq(winStart, winEnd, by = stepSize) + starts = steps - 1 + ends = steps + stepSize - 1 + if (ends[length(ends)] > winEnd) { + ends[length(ends)] = winEnd + } + + #write to file + write(paste(chromName, starts, ends, signal, sep = " "), file = paste0(out, "/", chromName, plz, ".bedSignal"), ncolumns = 1) + + +return(NULL) +} +#=======================> DONE! + + + + + +# ========================== +# Parse-in System Variables +# ========================== +args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line + +#Parsing arguments and storing values +for (each.arg in args) { + #bed file names + if (grepl('^-bednames=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + bednames <- arg.split[2] + } else { + stop('No bed file names') + } + } + #bed file names + if (grepl('^-chromo=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + chromName <- arg.split[2] + } else { + stop('No bed file names') + } + } + #bed files directory + if (grepl('^-frag=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + frag <- arg.split[2] + } else { + stop('No fragment length given') + } + } + #bakcground files directory + if (grepl('^-bkgd=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + bkgd <- arg.split[2] + } else { + stop('No background file') + } + } + #bakcground files directory + if (grepl('^-out=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + out <- arg.split[2] + } else { + stop('No output directory given') + } + } + #Cluster number + if (grepl('^-clustnummer=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + clustnummer <- as.numeric(arg.split[2]) + } + } + #regions + if (grepl('^-regions=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + regions <- arg.split[2] + } + } + #processor cores + if (grepl('^-p=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + cornum <- as.numeric(arg.split[2]) + } + } + #chromSize + if (grepl('^-chromoS=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + chromSize <- as.numeric(arg.split[2]) + } + } + #stepsize + if (grepl('^-sig=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + sigstep <- as.numeric(arg.split[2]) + } + } + #chrcount + if (grepl('^-chrcount=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] + if (! is.na(arg.split[2]) ) { + chrcount <- as.numeric(arg.split[2]) + } + } +} + +##Parse in variables +readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]]) +numdup = length(readsFiles) #number of replicates +if (bkgd != "None") { + readsFiles[[numdup+1]] = bkgd +} +frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]]) +rm(bednames) + +options(stringsAsFactors = FALSE) +#=======================> DONE! + + + + +# ======================= +# Some preliminary stuff +# ======================= +#import regions +regions = read.table(regions, header = FALSE) +regions = regions[regions[[1]] == chromName, , drop = FALSE] +regions = cbind(regions[[2]] + 1, regions[[3]]) + +#import read data +if (cornum > 1) { + datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control) +} else { + datain = lapply(readsFiles, parsein) #read in all bed files (samples and control) +} +#=======================> DONE! + + + +# =============== +# Counting Reads +# =============== +if (cornum > 1) { + counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE) +} else { + counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles) +} +rm(datain) +#=======================> DONE! + + + +# ============== +# Getting signal +# ============== +if (cornum > 1) { + sig = mclapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName, mc.cores = cornum, mc.preschedule = TRUE) +} else { + sig = lapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName) +} +message("Done!") +#=======================> DONE!