Mercurial > repos > messersc > jamm
diff countwins.r @ 0:d42f4d78c85e draft
Uploaded
author | messersc |
---|---|
date | Wed, 17 Dec 2014 10:40:23 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/countwins.r Wed Dec 17 10:40:23 2014 -0500 @@ -0,0 +1,680 @@ +############################### +#### Peak finder for NGS Data +#### R script +############################### + + + + +# ======================= +# User-defined variables +# ======================= +cutoff = NA #To enforce an SNR cutoff ratio, delete NA and enter the number you want +strict = 1 #to make peak calling more / less strict, increase / decrease this number - Only use this if you are getting an unusually large or small number of peaks. +options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000) +#=======================> DONE! + + + + +# ================================ +# Required Libraries check & load +# ================================ +if ((is.element('mclust', installed.packages()[,1]) == FALSE) || (is.element('signal', installed.packages()[,1]) == FALSE) || (is.element('sqldf', installed.packages()[,1]) == FALSE) || (is.element('parallel', installed.packages()[,1]) == FALSE)) { + stop("R package 'Mclust', 'signal', 'sqldf' and 'parallel' are required. Please install them!") +} +suppressPackageStartupMessages(library("mclust")) +suppressPackageStartupMessages(library("signal")) +suppressPackageStartupMessages(library("sqldf")) +suppressPackageStartupMessages(library("tcltk")) +suppressPackageStartupMessages(library("parallel")) +#=======================> DONE! + + + + + +# ================= +# Custom Functions +# ================= +#Get per-row Geometric mean (takes list, returns vectors, not lists!) +geomeanL <- function(mat){ + n = length(mat) + if (n > 1) { + mult = (mat[[1]])*(mat[[2]]) + if (n > 2) { + for (i in 3:n) { + mult = mult*(mat[[i]]) + } + } + mult = mult^(1/n) + mat = mult + return(mat) + } else { + return(mat[[1]]) + } +} + + + +#Get per-row Geometric mean (takes matrix, returns vectors, not lists!) +geomean <- function(mat){ + n = NCOL(mat) + if (n > 1) { + mult = (mat[,1])*(mat[,2]) + if (n > 2) { + for (i in 3:n) { + mult = mult*(mat[,i]) + } + } + mult = mult^(1/n) + mat = mult + } +return(mat) +} + + + +#Read in bed(st2) file +parsein = function(bedfile) { + l = sqldf() + readsFile = file(bedfile) + return(sqldf("SELECT V1 + 1 As 'start' FROM readsFile", file.format=list(header = FALSE, sep = "\t"))$start) + l = sqldf() + close(readsFile) +} + + +#Read in bedpe(st2) file +parseinpe = function(bedfile) { + l = sqldf() + readsFile = file(bedfile) + sebastian = sqldf("SELECT * FROM readsFile", file.format=list(header = FALSE, sep = "\t")) + return(cbind(sebastian[[1]]+1, sebastian[[2]])) + l = sqldf() + close(readsFile) +} + + +#Produces normalized extended read counts (takes output of parsein(), return a vector of floats) +countreads = function(bedfile, reads, frag, chromsize, filelist) { + + o = which(filelist == bedfile) + + counts = vector(mode = "numeric", length = chromsize) + + for (j in 1:length(reads[[o]])) { + if ((reads[[o]][j]+frag[o]-1) <= chromsize) { + counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] = counts[(reads[[o]][j]):(reads[[o]][j]+frag[o]-1)] + 1 + } + } + counts = counts/(mean(counts)) + return(counts) +} + + + +#Produces normalized extended read counts (takes output of parsein(), return a vector of floats) +countreadspe = function(bedfile, reads, chromsize, filelist) { + + o = which(filelist == bedfile) + + counts = vector(mode = "numeric", length = chromsize) + + for (j in 1:length(reads[[o]][,1])) { + counts[(reads[[o]][j,1]):(reads[[o]][j,2])] = counts[(reads[[o]][j,1]):(reads[[o]][j,2])] + 1 + } + counts = counts/(mean(counts)) + return(counts) +} + + + +#find enriched bins +pickbins = function(winStart, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd) { + + if ((winStart + binSize) <= chromSize) { + winEnd = winStart + binSize + } else { + winEnd = chromSize + } + binSizeTemp = winEnd - winStart + tempend = winEnd - 1 + + #extract subset of the background + if (bkgd != "None") { + Cs = counts[[numdup+1]][winStart:tempend] + mCs = mean(Cs) + dCs = sd(Cs) + } else { + Cs = sample(C, binSizeTemp, replace = TRUE) + } + + + #find out whether it's enriched + go = rep(0, numdup) + for (g in 1:numdup) { + mS = (mean(counts[[g]][winStart:tempend])) + ratio = mS/dCs + if ((mS > (mCs * strict)) && (ratio > cutoff)) { + go[g] = 1 + } + } + veep = sum(go) + + #get counts when enriched + if (veep == numdup) { + mS = rep(0, numdup) + for (g in 1:numdup) { + mS[g] = (mean(counts[[g]][winStart:winEnd])) + } + cairo = mean(mS) + } else { + mS = rep(0, numdup) + for (g in 1:numdup) { + mS[g] = (mean(counts[[g]][winStart:winEnd])) + } + cairo = -(mean(mS)) + } + return(cairo) +} + + + +#find enriched wins +pickwins = function(winStart, coffeeshopSud, counts, numdup, startlist, winSize) { + + plz = which(startlist == winStart) + winEnd = coffeeshopSud[plz] + rWinSize = winEnd - winStart + 1 + + + if(rWinSize >= winSize) { + mS = rep(0, numdup) + for (g in 1:numdup) { + mS[g] = (sum(counts[[g]][winStart:winEnd])) + } + veep = sum(mS) + } else { + veep = FALSE + } + + return(veep) +} + + + +#Initialize MClust clustering parameters +smoothcounts = function(winStart, coffeeshopSud, numdup, counts, startlist) { #helper function1 + + plz = which(startlist == winStart) + winEnd = coffeeshopSud[plz] + + #extract subset of the IP + Rs = matrix(0, nrow = (winEnd - winStart + 1), ncol = numdup) + for (j in 1:numdup) { + Rs[,j] = counts[[j]][winStart:winEnd] + } + #smooth extended read counts + for (j in 1:numdup) { + Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j]) + } + return(Rs) +} +cluster = function(model, sig, init, clustnummer, noise) { #helper function2 + noisy = sample(noise, length(sig[,1]), replace = TRUE) + clust = me(model, sig+noisy, init) + bicc = bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer) + out = list(bicc = bicc, param = clust$parameters) + return(out) +} +initparam = function(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise) { #main function + + n = length(coffeeshopNord) + #smooth extended read counts + if (cornum > 1) { + sig = mclapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord, mc.cores = cornum, mc.preschedule = TRUE) + } else { + sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord) + } + sig = do.call(rbind, sig) + + #kmeans initialization + init = kmeans(sig, clustnummer) + init = unmap(init$cluster) + + + if (cornum > 1) { + param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE) + } else { + param = lapply(modelnames, cluster, sig, init, clustnummer, noise) + } + + bicc = vector(mode = "numeric", length = length(modelnames)) + for (i in 1:length(modelnames)) { + bicc[i] = as.numeric(param[[i]]$bicc) + } + bicc = which.max(bicc) + + out = list(initparam = param[[bicc]]$param, modelname = modelnames[bicc]) + return(out) +} + + + + +#find peaks +findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist) { + + + plz = which(startlist == winStart) + winEnd = coffeeshopSud[plz] + + + #will store peak information + writethis = list() + + + rWinSizeTemp = winEnd - winStart + 1 + + #extract subset of the IP + Rs = matrix(nrow = rWinSizeTemp, ncol = numdup) + Rsr = Rs + for (j in 1:numdup) { + Rsr[,j] = counts[[j]][winStart:winEnd] + Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j]) + } + + + ####CLUSTERING START HERE##### + noisy = sample(noise, rWinSizeTemp, replace = TRUE) + clust = em(param$modelname, Rs+noisy, param$initparam) + clust$classification = map(clust$z) + ####CLUSTERING DONE#### + + #check whether clutering succeeded + if((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { + + + #check whether all components replicates agreed on clustering assignments + ccx = 1 + if (numdup > 1) { + cc = vector(mode = "numeric", length = numdup) + for (g in 1:numdup) { + cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!) + } + ccx = sum(diff(cc)) + cluster = cc[1] + rm(cc) + } else { + cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!) + ccx = 0 + } + if (ccx == 0) { + + #extract subset of the background + if (bkgd != "None") { + Cs = counts[[numdup+1]][winStart:winEnd] + Cs = filtfilt(rep(1,80)/80,1,Cs) + } else { + Cs = sample(C, rWinSizeTemp, replace = TRUE) + Cs = filtfilt(rep(1,80)/80,1,Cs) + } + + #find region boundaries + loc = 1:length(clust$classification) + gmclass = cbind(loc, clust$classification) + locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE] + rStart = locPeak[1] #start position of the region + rEnd = locPeak[length(locPeak[,1]),1] #end position of the region + + #peak resolution check + if (resol == "region") { + pSize = rEnd - rStart + signal = (geomean(Rs[rStart:rEnd,])) + signal2 = (signal) - (Cs[rStart:rEnd]) + gm = mean(signal2) + summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1 + will2k = wilcox.test(signal, Cs[rStart:rEnd]) + + #Is there signal in the region above background + if (gm > 0) { + writethis[[1]] = rStart + winStart - 1 + writethis[[2]] = rEnd + winStart + writethis[[3]] = paste0(chromName, ".", rStart+winStart -1) + writethis[[4]] = "1000" + writethis[[5]] = "." + writethis[[6]] = gm + writethis[[7]] = will2k$p.value + writethis[[8]] = "-1" + writethis[[9]] = summit + } + } else if (resol == "peak") { + #find out where separate peaks are + d = diff(locPeak[,1]) + d[length(d)+1] = 0 + locPeak = cbind(locPeak, d) + bound1 = which(locPeak[,3] > 1, arr.in=TRUE) + bound2 = bound1 + 1 + bound = locPeak[sort(c(bound1,bound2))] + bound = c(rStart, bound, rEnd) + w = 1 + warum = 0 + while (w < length(bound)) { + pStart = bound[w] + winStart - 1 + pEnd = bound[w+1] + winStart + pSize = pEnd - pStart + signal = (geomean(Rs[(bound[w]):(bound[w+1]),])) + signal2 = (signal) - (Cs[bound[w]:bound[w+1]]) + gm = mean(signal2) + summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1 + will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])]) + + weil = warum * 9 + #Is there signal in the region above background + if (gm > 0) { + writethis[[1+weil]] = pStart + writethis[[2+weil]] = pEnd + writethis[[3+weil]] = paste0(chromName, ".", pStart) + writethis[[4+weil]] = "1000" + writethis[[5+weil]] = "." + writethis[[6+weil]] = gm + writethis[[7+weil]] = will2k$p.value + writethis[[8+weil]] = "-1" + writethis[[9+weil]] = summit + } + w = w + 2 + warum = warum + 1 + } + + } #peak resolution check + } #all replicates agree on clustering assignments? + } #clustering worked? +return(writethis) +} + + +#filter return value of findpeak() +processPeaks = function(peaks) { + peaks = matrix(unlist(peaks), ncol=9, byrow=TRUE) + peaks = peaks[peaks[,1] != FALSE,,drop=FALSE] + peaks = data.frame(peaks) + return(peaks) +} +#=======================> DONE! + + + + + + + + + + + + + + + + + + +# ========================== +# Parse-in System Variables +# ========================== +args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line + +#Parsing arguments and storing values +for (each.arg in args) { + #chormosome size file + if (grepl('^-sfile=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + size.file <- arg.split[2] + } else { + stop('No genome size file') + } + } + #bed file names + if (grepl('^-bednames=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + bednames <- arg.split[2] + } else { + stop('No bed file names') + } + } + #bed files directory + if (grepl('^-frag=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + frag <- arg.split[2] + } else { + stop('No fragment length given') + } + } + #bakcground files directory + if (grepl('^-bkgd=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + bkgd <- arg.split[2] + } else { + stop('No background file') + } + } + #bakcground files directory + if (grepl('^-out=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + out <- arg.split[2] + } else { + stop('No output directory given') + } + } + #Cluster number + if (grepl('^-clustnummer=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + clustnummer <- as.numeric(arg.split[2]) + } + } + #resolution + if (grepl('^-resolution=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + resol <- arg.split[2] + } + } + #processor cores + if (grepl('^-p=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + cornum <- as.numeric(arg.split[2]) + } + } + #minimum window size + if (grepl('^-window=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + winSize <- arg.split[2] + } + } + #window size + if (grepl('^-bin=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + binsize <- arg.split[2] + } + } + #type (paired / single) + if (grepl('^-type=',each.arg)) { + arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]] # split on = + if (! is.na(arg.split[2]) ) { + type <- arg.split[2] + } + } +} + +##Parse in variables +chromosomes = read.table(size.file, header=FALSE) +chromName = chromosomes$V1; #which chromosome +chromSize = chromosomes$V2; #chromosomes size +rm(chromosomes) + +readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]]) +numdup = length(readsFiles) #number of replicates +if (bkgd != "None") { + readsFiles[[numdup+1]] = bkgd +} +winSize = as.numeric(winSize) +binSize = as.numeric(binsize) +winSize = binSize * winSize + +if (type == "single") { + frags = as.numeric(strsplit(frag, ",", fixed = TRUE)[[1]]) +} +rm(bednames) + +if (numdup > 1) { + modelnames = c("VVV","VEV") +} else { + modelnames = "V" +} + +options(stringsAsFactors = FALSE) +#=======================> DONE! + + + + +# ======================= +# Some preliminary stuff +# ======================= +if (type == "single") { + if (cornum > 1) { + datain = mclapply(readsFiles, parsein, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control) + } else { + datain = lapply(readsFiles, parsein) #read in all bed files (samples and control) + } +} + +if (type == "paired") { + if (cornum > 1) { + datain = mclapply(readsFiles, parseinpe, mc.cores = cornum, mc.preschedule = TRUE) #read in all bed files (samples and control) + } else { + datain = lapply(readsFiles, parseinpe) #read in all bed files (samples and control) + } +} + +#minimum peak size (only a recommendation) +minpeak = floor(binSize / 4) + +#make bins vector +bins = seq(from = 1, to = (chromSize - 1), by = binSize) +#=======================> DONE! + + + + +# =============== +# Counting Reads +# =============== +if (type == "single") { + if (cornum > 1) { + counts = mclapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE) + } else { + counts = lapply(readsFiles, countreads, reads = datain, frag = frags, chromsize = chromSize, filelist = readsFiles) + } +} + +if (type == "paired") { + if (cornum > 1) { + counts = mclapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles, mc.cores = cornum, mc.preschedule = TRUE) + } else { + counts = lapply(readsFiles, countreadspe, reads = datain, chromsize = chromSize, filelist = readsFiles) + } +} + +rm(datain) + +#get total counts +mS = vector(mode = "numeric", length = numdup) +for (g in 1:numdup) { + mS[g] = sum(counts[[g]]) +} +totalCounts = sum(mS) +rm(mS) +#=======================> DONE! + + + +# =================== +# Estimating Cutoff +# =================== +if (is.na(cutoff)) { + if (bkgd != "None") { + cutoff = vector(length = numdup) + sdC = sd(counts[[numdup+1]]) + for (x in 1:numdup) { + cutoff[x] = (mean(counts[[x]]))/(sdC) + } + cutoff = max(cutoff) + C = NULL + mCs = NULL + } else { + cutoff = vector(length = numdup) + mmV = var(geomeanL(counts)) + mmM = mean(geomeanL(counts)) + sigma = log(1+((mmV) / ((mmM)^2))) + mu = (log(mmM)) - (0.5 * (sigma)) + C = rlnorm(100000, mu, sqrt(sigma)) + for (x in 1:numdup) { + cutoff[x] = (mean(counts[[x]]))/(sd(C)) + } + cutoff = max(cutoff) + mCs = (mean(sample(C, binSize*5, replace = TRUE))) + dCs = (sd(sample(C, binSize*5, replace = TRUE))) + } +} +#=======================> DONE! + + + + +# ======================== +# Picking Enriched Windows +# ======================== +binNum = vector(mode = "numeric", length = 41) +binTotalSize = vector(mode = "numeric", length = 41) +binTotalSizeRatio = vector(mode = "numeric", length = 41) +binTotalCount = vector(mode = "numeric", length = 41) +binTotalCountNO = vector(mode = "numeric", length = 41) +binTotalCountRatio = vector(mode = "numeric", length = 41) +maxiFun = vector(mode = "numeric", length = 41) +ggg = seq(1,5, by = 0.1) +for (i in 1:41) { + if (cornum > 1) { + coffeeshop = mclapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd, mc.cores = cornum, mc.preschedule = TRUE) + } else { + coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, ggg[i], mCs, dCs, bkgd) + } + coffeeshop = as.numeric(unlist(coffeeshop)) + coffeeshopYES = coffeeshop[coffeeshop > 0] + coffeeshopNO = -(coffeeshop[coffeeshop <= 0]) + + binNum[i] = length(coffeeshopYES) + binTotalSize[i] = binNum[i] * binSize + binTotalSizeRatio[i] = binTotalSize[i] / chromSize + + binTotalCount[i] = sum(coffeeshopYES) + binTotalCountNO[i] = sum(coffeeshopNO) + binTotalCountRatio[i] = binTotalCount[i] / binTotalCountNO[i] + + maxiFun[i] = binTotalCountRatio[i] / binTotalSizeRatio[i] +} +rm(bins) +#=======================> DONE! + + +write(paste(ggg, binNum, binTotalSize, binTotalSizeRatio, binTotalCount, binTotalCountNO, binTotalCountRatio, maxiFun, sep = " "), file = paste0(file = paste0("/home/mibrahim/Documents/", chromName, ".GOYEAH"), ncolumns = 1))