view signalmaker.r @ 1:243f75d0ed6e draft default tip

Uploaded. Includes new release 1.0.7 with fixed optional controls.
author messersc
date Thu, 19 Feb 2015 05:39:45 -0500
parents d42f4d78c85e
children
line wrap: on
line source

########################################################################
# JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
# Copyright (C) 2014-2015  Mahmoud Ibrahim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# Contact: mahmoud.ibrahim@mdc-berlin.de
########################################################################


# ======================= 
# User-defined variables
# ======================= 
options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) 
#=======================> DONE! 




# ================================ 
# Required Libraries check & load
# ================================ 
if ((is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) {
	stop("R packages 'signal'and 'parallel' are required. Please install them!")
}
suppressPackageStartupMessages(library("signal"))
suppressPackageStartupMessages(library("parallel"))
#=======================> DONE! 





# ================= 
# Custom Functions
# =================
#Get per-row Geometric mean (takes list, returns vectors, not lists!)
geomeanL <- function(mat){
	n = length(mat)
	if (n > 1) {
		mult = (mat[[1]])*(mat[[2]])
		if (n > 2) {
			for (i in 3:n) {
				mult = mult*(mat[[i]])
			}
		}
		mult = mult^(1/n)
		mat = mult
		return(mat)
	} else {
		return(mat[[1]])
	}
}



#Get per-row Geometric mean (takes matrix, returns vectors, not lists!)
geomean <- function(mat){
	n = NCOL(mat)
	if (n > 1) {
		mult = (mat[,1])*(mat[,2])
		if (n > 2) {
			for (i in 3:n) {
				mult = mult*(mat[,i])
			}
		}
	mult = mult^(1/n)
	mat = mult
	}
return(mat)
}



#Read in bed(st2) file
parsein = function(bedfile) {
	reads = read.table(bedfile, header = FALSE)[[1]]
	return(reads)
}

#Read in bedpe(st2)
parseinpe = function(bedfile) {
	l = read.table(bedfile, header = FALSE)
	l = cbind((l[[1]] + 1), l[[2]])
	return(l)
}

#Produces normalized extended read counts (takes output of parsein(), return a vector of floats)
countreads = function(bedfile, reads, frag, chromsize, filelist, chrcount) {
	
	o = which(filelist == bedfile)
	
	counts = vector(mode = "numeric", length = chromsize)

	for (j in 1:length(reads[[o]])) {
		if ((reads[[o]][j]+frag[o]-1) <= chromsize) {
			counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] =  counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1
		}
	}
	
	mCount = mean(counts)
	
	if (chrcount == 1) {
		counts = counts/mCount
		write(paste(mCount), file = paste0(out, "/norma.", o, ".info"))
	} else {
		meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]]))
		if ((mCount >  (5*meanCounts)) || (mCount <  (0.2*meanCounts))) {
			mCount = meanCounts
		} else {
			write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
		}
		counts = counts/mCount
	}
		
	return(counts)
}




#Produces normalized extended read counts (takes output of parsein(), return a vector of floats / PAIRED END)
countreadspe = function(bedfile, reads, chromsize, filelist, chrcount) {
	
	o = which(filelist == bedfile)
	
	counts = vector(mode = "numeric", length = chromsize)

	for (j in 1:length(reads[[o]][,1])) {
		counts[(reads[[o]][j,1]):(reads[[o]][j,2])] = counts[(reads[[o]][j,1]):(reads[[o]][j,2])] + 1
	}
	
	mCount = mean(counts)
	
	if (chrcount == 1) {
		counts = counts/mCount
		write(paste(mCount), file = paste0(out, "/norma.", o, ".info"))
	} else {
		meanCounts = mean(as.numeric(read.table(paste0(out, "/norma.", o, ".info"))[[1]]))
		if ((mCount >  (5*meanCounts)) || (mCount <  (0.2*meanCounts))) {
			mCount = meanCounts
		} else {
			write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
		}
		counts = counts/mCount
	}
	
	return(counts)
}


#generate signal
getsig = function(winStart, winEnds, startList, stepSize, chromName) {
	
	
	plz = which(startList == winStart)
	winEnd = winEnds[plz]
	rWinSizeTemp = winEnd - winStart + 1
	
	#extract subset of the IP
	Rs = matrix(nrow = rWinSizeTemp, ncol = numdup)
	for (j in 1:numdup) {
		Rs[,j] = counts[[j]][winStart:winEnd]
		Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
	}

	#extract subset of the background and initial signal
	if (bkgd != "None") {
		Cs = counts[[numdup+1]][winStart:winEnd]
		Cs = filtfilt(rep(1,80)/80,1,Cs)
		signal = (geomean(Rs)) - (Cs)
	} else {
		signal = geomean(Rs)
	}
			

	#get binned signal
	signal = round(colMeans(matrix(c(signal, rep(0, stepSize - length(signal) %% stepSize)), stepSize)), digits = 2)
	steps = seq(winStart, winEnd, by = stepSize)
	starts = steps - 1
	ends = steps + stepSize - 1
	if (ends[length(ends)] > winEnd) {
		ends[length(ends)] = winEnd
	}
	
	#write to file
	write(paste(chromName, starts, ends, signal, sep = "	"), file = paste0(out, "/", chromName, plz, ".bedSignal"), ncolumns = 1)

	
return(NULL)
}
#=======================> DONE!





# ========================== 
# Parse-in System Variables
# ==========================
args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line

#Parsing arguments and storing values
for (each.arg in args) {
	#bed file names
	if (grepl('^-bednames=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				bednames <- arg.split[2]
		} else {
			stop('No bed file names')
		} 
	}
	#bed file names
	if (grepl('^-chromo=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				chromName <- arg.split[2]
		} else {
			stop('No bed file names')
		} 
	}
	#bed files directory
	if (grepl('^-frag=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
			frag <- arg.split[2]
		}
	}	
	#bakcground files directory
	if (grepl('^-bkgd=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				bkgd <- arg.split[2]
		} else {
			stop('No background file')
		} 
	}	
	#bakcground files directory
	if (grepl('^-out=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				out <- arg.split[2]
		} else {
			stop('No output directory given')
		} 
	}
	#Cluster number
	if (grepl('^-clustnummer=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				clustnummer <- as.numeric(arg.split[2])
		} 
	}
	#regions
	if (grepl('^-regions=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				regions <- arg.split[2]
		} 
	}
	#processor cores
	if (grepl('^-p=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				cornum <- as.numeric(arg.split[2])
		} 
	}
	#chromSize
	if (grepl('^-chromoS=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				chromSize <- as.numeric(arg.split[2])
		} 
	}
	#stepsize
	if (grepl('^-sig=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				sigstep <- as.numeric(arg.split[2])
		} 
	}
	#chrcount
	if (grepl('^-chrcount=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				chrcount <- as.numeric(arg.split[2])
		} 
	}
	#alignment type
	if (grepl('^-t=',each.arg)) {
		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] 
		if (! is.na(arg.split[2]) ) {
				type <- arg.split[2]
		} 
	}
}

##Parse in variables
readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
numdup = length(readsFiles) #number of replicates
if (bkgd != "None") {
	readsFiles[[numdup+1]] = bkgd
}
if (type == "single") {
	frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]])
}
rm(bednames)

options(stringsAsFactors = FALSE)
#=======================> DONE!




# ======================= 
# Some preliminary stuff
# =======================
#import regions
regions = read.table(regions, header = FALSE)
regions = regions[regions[[1]] == chromName, , drop = FALSE]
regions = cbind(regions[[2]] + 1, regions[[3]])

#import read data
if (type == "single") {
	if (cornum > 1) {
		datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
	} else {
		datain = lapply(readsFiles, parsein) #read in all bed files (samples and control)
	}
}
#import read data
if (type == "paired") {
	if (cornum > 1) {
		datain = mclapply(readsFiles, parseinpe, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control)
	} else {
		datain = lapply(readsFiles, parseinpe) #read in all bed files (samples and control)
	}
}
#=======================> DONE!



# =============== 
# Counting Reads
# ===============
if (type == "single") {
	if (cornum > 1) {
		counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount, mc.cores = cornum, mc.preschedule = TRUE)
	} else {
		counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount)
	}
}

if (type == "paired") {
	if (cornum > 1) {
		counts = mclapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount, mc.cores = cornum, mc.preschedule = TRUE)
	} else {
		counts = lapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, chrcount = chrcount)
	}
}
rm(datain)
#=======================> DONE!



# ============== 
# Getting signal
# ==============
if (cornum > 1) {
	sig = mclapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName, mc.cores = cornum, mc.preschedule = TRUE)
} else {
	sig = lapply(regions[,1], getsig, winEnds = regions[,2], startList = regions[,1], stepSize = sigstep, chromName)
}
message("Done!")
#=======================> DONE!