diff signalmaker.r @ 0:d42f4d78c85e draft

Uploaded
author messersc
date Wed, 17 Dec 2014 10:40:23 -0500
parents
children 243f75d0ed6e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/signalmaker.r	Wed Dec 17 10:40:23 2014 -0500
@@ -0,0 +1,312 @@
+###############################
+#### Signal Maker for NGS Data
+#### R script		
+###############################
+
+
+
+
+# ======================= 
+# User-defined variables
+# ======================= 
+options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) 
+#=======================> DONE! 
+
+
+
+
+# ================================ 
+# Required Libraries check & load
+# ================================ 
+if ((is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
+	stop("R packages 'signal'and 'parallel' are required. Please install them!")
+}
+suppressPackageStartupMessages(library("signal"))
+suppressPackageStartupMessages(library("parallel"))
+#=======================> DONE! 
+
+
+
+
+
+# ================= 
+# Custom Functions
+# =================
+#Get per-row Geometric mean (takes list, returns vectors, not lists!)
+geomeanL <- function(mat){
+	n = length(mat)
+	if (n > 1) {
+		mult = (mat[[1]])*(mat[[2]])
+		if (n > 2) {
+			for (i in 3:n) {
+				mult = mult*(mat[[i]])
+			}
+		}
+		mult = mult^(1/n)
+		mat = mult
+		return(mat)
+	} else {
+		return(mat[[1]])
+	}
+}
+
+
+
+#Get per-row Geometric mean (takes matrix, returns vectors, not lists!)
+geomean <- function(mat){
+	n = NCOL(mat)
+	if (n > 1) {
+		mult = (mat[,1])*(mat[,2])
+		if (n > 2) {
+			for (i in 3:n) {
+				mult = mult*(mat[,i])
+			}
+		}
+	mult = mult^(1/n)
+	mat = mult
+	}
+return(mat)
+}
+
+
+
+#Read in bed(st2) file
+parsein = function(bedfile) {
+	reads = read.table(bedfile, header = FALSE)[[1]]
+	return(reads)
+}
+
+
+
+#Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
+countreads = function(bedfile, reads, frag, chromsize, filelist) {
+	
+	o = which(filelist == bedfile)
+	
+	counts = vector(mode = "numeric", length = chromsize)
+
+	for (j in 1:length(reads[[o]])) {
+		if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
+			counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] =  counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
+		}
+	}
+	
+	mCount = mean(counts)
+	
+	if (chrcount == 1) {
+		counts = counts/mCount
+		write(paste(mCount), file = paste0(out, "/norma.", o, ".info"))
+	} else {
+		meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]]))
+		if ((mCount >  (5*meanCounts)) || (mCount <  (0.2*meanCounts))) {
+			mCount = meanCounts
+		} else {
+			write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
+		}
+		counts = counts/mCount
+	}
+		
+	return(counts)
+}
+
+
+
+
+#generate signal
+getsig = function(winStart, winEnds, startList, stepSize, chromName) {
+	
+	
+	plz = which(startList == winStart)
+	winEnd = winEnds[plz]
+	rWinSizeTemp = winEnd - winStart + 1
+	
+	#extract subset of the IP
+	Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
+	for (j in 1:numdup) {
+		Rs[,j] = counts[[j]][winStart:winEnd]
+		Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
+	}
+
+	#extract subset of the background and initial signal
+	if (bkgd != "None") {
+		Cs = counts[[numdup+1]][winStart:winEnd]
+		Cs = filtfilt(rep(1,80)/80,1,Cs)
+		signal = (geomean(Rs)) - (Cs)
+	} else {
+		signal = geomean(Rs)
+	}
+			
+
+	#get binned signal
+	signal = round(colMeans(matrix(c(signal, rep(0, stepSize - length(signal) %% stepSize)), stepSize)), digits = 2)
+	steps = seq(winStart, winEnd, by = stepSize)
+	starts = steps - 1
+	ends = steps + stepSize - 1
+	if (ends[length(ends)] > winEnd) {
+		ends[length(ends)] = winEnd
+	}
+	
+	#write to file
+	write(paste(chromName, starts, ends, signal, sep = "	"), file = paste0(out, "/", chromName, plz, ".bedSignal"), ncolumns = 1)
+
+	
+return(NULL)
+}
+#=======================> DONE!
+
+
+
+
+
+# ========================== 
+# Parse-in System Variables
+# ==========================
+args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
+
+#Parsing arguments and storing values
+for (each.arg in args) {
+	#bed file names
+	if (grepl('^-bednames=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				bednames <- arg.split[2]
+		} else {
+			stop('No bed file names')
+		} 
+	}
+	#bed file names
+	if (grepl('^-chromo=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				chromName <- arg.split[2]
+		} else {
+			stop('No bed file names')
+		} 
+	}
+	#bed files directory
+	if (grepl('^-frag=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				frag <- arg.split[2]
+		} else {
+			stop('No fragment length given')
+		} 
+	}	
+	#bakcground files directory
+	if (grepl('^-bkgd=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				bkgd <- arg.split[2]
+		} else {
+			stop('No background file')
+		} 
+	}	
+	#bakcground files directory
+	if (grepl('^-out=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				out <- arg.split[2]
+		} else {
+			stop('No output directory given')
+		} 
+	}
+	#Cluster number
+	if (grepl('^-clustnummer=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				clustnummer <- as.numeric(arg.split[2])
+		} 
+	}
+	#regions
+	if (grepl('^-regions=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				regions <- arg.split[2]
+		} 
+	}
+	#processor cores
+	if (grepl('^-p=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				cornum <- as.numeric(arg.split[2])
+		} 
+	}
+	#chromSize
+	if (grepl('^-chromoS=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				chromSize <- as.numeric(arg.split[2])
+		} 
+	}
+	#stepsize
+	if (grepl('^-sig=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				sigstep <- as.numeric(arg.split[2])
+		} 
+	}
+	#chrcount
+	if (grepl('^-chrcount=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
+		if (! is.na(arg.split[2]) ) {
+				chrcount <- as.numeric(arg.split[2])
+		} 
+	}
+}
+
+##Parse in variables
+readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
+numdup = length(readsFiles) #number of replicates
+if (bkgd != "None") {
+	readsFiles[[numdup+1]] = bkgd
+}
+frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
+rm(bednames)
+
+options(stringsAsFactors = FALSE)
+#=======================> DONE!
+
+
+
+
+# ======================= 
+# Some preliminary stuff
+# =======================
+#import regions
+regions = read.table(regions, header = FALSE)
+regions = regions[regions[[1]] == chromName, , drop = FALSE]
+regions = cbind(regions[[2]] + 1, regions[[3]])
+
+#import read data
+if (cornum > 1) {
+	datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
+} else {
+	datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
+}
+#=======================> DONE!
+
+
+
+# =============== 
+# Counting Reads
+# ===============
+if (cornum > 1) {
+	counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE)
+} else {
+	counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles)
+}
+rm(datain)
+#=======================> DONE!
+
+
+
+# ============== 
+# Getting signal
+# ==============
+if (cornum > 1) {
+	sig = mclapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName, mc.cores = cornum, mc.preschedule = TRUE)
+} else {
+	sig = lapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName)
+}
+message("Done!")
+#=======================> DONE!