Mercurial > repos > modencode-dcc > idr_package

--- a/batch-consistency-analysis.r	Fri Jan 18 17:30:47 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,182 +0,0 @@
-##############################################################################
-
-# Modified 06/29/12: Kar Ming Chu
-# Modified to work with Galaxy
-
-# Usage: 	Rscript batch-consistency-analysis.r peakfile1 peakfile2 half.width overlap.ratio  is.broadpeak sig.value gtable r.output overlap.output npeaks.output em.sav.output uri.sav.output
-
-# Changes:
-# 	- Appended parameter for input gnome table called gtable
-#	- Appended parameter for specifying Rout output file name (required by Galaxy)
-#	- Appended parameter for specifying Peak overlap output file name (required by Galaxy)
-#	- Appended parameter for specifying Npeak above IDR output file name (required by Galaxy)
-#	- Removed parameter outfile.prefix since main output files are replaced with strict naming
-#	- Appended parameter for specifying em.sav output file (for use with batch-consistency-plot.r)
-#	- Appended parameter for specifying uri.sav output file (for use with batch-consistency-plot.r)
-
-##############################################################################
-
-# modified 3-29-10: Qunhua Li
-# add 2 columns in the output of "-overlapped-peaks.txt": local.idr and IDR
-
-# 01-20-2010 Qunhua Li
-#
-# This program performs consistency analysis for a pair of peak calling outputs
-# It takes narrowPeak or broadPeak formats.
-#
-# usage: Rscript batch-consistency-analysis2.r peakfile1 peakfile2 half.width outfile.prefix overlap.ratio  is.broadpeak sig.value
-#
-# peakfile1 and peakfile2 : the output from peak callers in narrowPeak or broadPeak format
-# half.width: -1 if using the reported peak width,
-#             a numerical value to truncate the peaks to
-# outfile.prefix: prefix of output file
-# overlap.ratio: a value between 0 and 1. It controls how much overlaps two peaks need to have to be called as calling the same region. It is the ratio of overlap / short peak of the two. When setting at 0, it means as long as overlapped width >=1bp, two peaks are deemed as calling the same region.
-# is.broadpeak: a logical value. If broadpeak is used, set as T; if narrowpeak is used, set as F
-# sig.value: type of significant values, "q.value", "p.value" or "signal.value" (default, i.e. fold of enrichment)
-
-args <- commandArgs(trailingOnly=T)
-
-# consistency between peakfile1 and peakfile2
-#input1.dir <- args[1]
-#input2.dir <- args[2] # directories of the two input files
-peakfile1 <- args[1]
-peakfile2 <- args[2]
-
-if(as.numeric(args[3])==-1){ # enter -1 when using the reported length
-  half.width <- NULL
-}else{
-  half.width <- as.numeric(args[3])
-}
-
-overlap.ratio <- args[4]
-
-if(args[5] == "T"){
-  is.broadpeak <- T
-}else{
-  is.broadpeak <- F
-}
-
-sig.value <- args[6]
-
-
-#dir1 <- "~/ENCODE/anshul/data/"
-#dir2 <- dir1
-#peakfile1 <- "../data/SPP.YaleRep1Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
-#peakfile2 <- "../data/SPP.YaleRep3Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
-#half.width <- NULL
-#overlap.ratio <- 0.1
-#sig.value <- "signal.value"
-
-
-source("/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/functions-all-clayton-12-13.r")
-
-# read the length of the chromosomes, which will be used to concatenate chr's
-# chr.file <- "genome_table.txt"
-# args[7] is the gtable
-chr.file <- args[7]
-
-chr.size <- read.table(chr.file)
-
-# setting output files
-r.output <- args[8]
-overlap.output <- args[9]
-npeaks.output <- args[10]
-em.sav.output <- args[11]
-uri.sav.output <- args[12]
-
-# sink(paste(output.prefix, "-Rout.txt", sep=""))
-sink(r.output)
-
-############# process the data
-cat("is.broadpeak", is.broadpeak, "\n")
-# process data, summit: the representation of the location of summit
-rep1 <- process.narrowpeak(paste(peakfile1, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
-rep2 <- process.narrowpeak(paste(peakfile2, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
-
-cat(paste("read", peakfile1, ": ", nrow(rep1$data.ori), "peaks\n", nrow(rep1$data.cleaned), "peaks are left after cleaning\n", peakfile2, ": ", nrow(rep2$data.ori), "peaks\n", nrow(rep2$data.cleaned), " peaks are left after cleaning"))
-
-if(args[3]==-1){
-  cat(paste("half.width=", "reported", "\n"))
-}else{
-  cat(paste("half.width=", half.width, "\n"))
-}
-cat(paste("significant measure=", sig.value, "\n"))
-
-# compute correspondence profile (URI)
-uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned, sig.value1=sig.value, sig.value2=sig.value, overlap.ratio=overlap.ratio)
-
-#uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned)
-
-cat(paste("URI is done\n"))
-
-# save output
-# save(uri.output, file=paste(output.prefix, "-uri.sav", sep=""))
-save(uri.output, file=uri.sav.output)
-cat(paste("URI is saved at: ", uri.sav.output))
-
-
-# EM procedure for inference
-em.output <- fit.em(uri.output$data12.enrich, fix.rho2=T)
-
-#em.output <- fit.2copula.em(uri.output$data12.enrich, fix.rho2=T, "gaussian")
-
-cat(paste("EM is done\n\n"))
-
-save(em.output, file=em.sav.output)
-cat(paste("EM is saved at: ", em.sav.output))
-
-
-# write em output into a file
-
-cat(paste("EM estimation for the following files\n", peakfile1, "\n", peakfile2, "\n", sep=""))
-
-print(em.output$em.fit$para)
-
-# add on 3-29-10
-# output both local idr and IDR
-idr.local <- 1-em.output$em.fit$e.z
-IDR <- c()
-o <- order(idr.local)
-IDR[o] <- cumsum(idr.local[o])/c(1:length(o))
-
-
-write.out.data <- data.frame(chr1=em.output$data.pruned$sample1[, "chr"],
-                    start1=em.output$data.pruned$sample1[, "start.ori"],
-                    stop1=em.output$data.pruned$sample1[, "stop.ori"],
-                    sig.value1=em.output$data.pruned$sample1[, "sig.value"],
-                    chr2=em.output$data.pruned$sample2[, "chr"],
-                    start2=em.output$data.pruned$sample2[, "start.ori"],
-                    stop2=em.output$data.pruned$sample2[, "stop.ori"],
-                    sig.value2=em.output$data.pruned$sample2[, "sig.value"],
-                    idr.local=1-em.output$em.fit$e.z, IDR=IDR)
-
-# write.table(write.out.data, file=paste(output.prefix, "-overlapped-peaks.txt", sep=""))
-write.table(write.out.data, file=overlap.output)
-cat(paste("Write overlapped peaks and local idr to: ", overlap.output, sep=""))
-
-# number of peaks passing IDR range (0.01-0.25)
-IDR.cutoff <- seq(0.01, 0.25, by=0.01)
-idr.o <- order(write.out.data$idr.local)
-idr.ordered <- write.out.data$idr.local[idr.o]
-IDR.sum <- cumsum(idr.ordered)/c(1:length(idr.ordered))
-
-IDR.count <- c()
-n.cutoff <- length(IDR.cutoff)
-for(i in 1:n.cutoff){
-  IDR.count[i] <- sum(IDR.sum <= IDR.cutoff[i])
-}
-
-
-# write the number of peaks passing various IDR range into a file
-idr.cut <- data.frame(peakfile1, peakfile2, IDR.cutoff=IDR.cutoff, IDR.count=IDR.count)
-write.table(idr.cut, file=npeaks.output, append=T, quote=F, row.names=F, col.names=F)
-cat(paste("Write number of peaks above IDR cutoff [0.01, 0.25]: ","npeaks-aboveIDR.txt\n", sep=""))
-
-mar.mean <- get.mar.mean(em.output$em.fit)
-
-cat(paste("Marginal mean of two components:\n"))
-print(mar.mean)
-
-sink()
-
-
--- a/batch-consistency-plot.r	Fri Jan 18 17:30:47 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,68 +0,0 @@
-# 1-20-10 Qunhua Li
-#
-# This program first plots correspondence curve and IDR threshold plot
-# (i.e. number of selected peaks vs IDR) for each pair of sample
-#
-# usage:
-# Rscript batch-consistency-plot-merged.r [npairs] [output.dir] [input.file.prefix 1, 2, 3 ...]
-# [npairs]: integer, number of consistency analyses
-#          (e.g. if 2 replicates, npairs=1, if 3 replicates, npairs=3
-# [output.prefix]: output directory and file name prefix for plot eg. /plots/idrPlot
-# [input.file.prefix 1, 2, 3]: prefix for the output from batch-consistency-analysis2. They are the input files for merged analysis see below for examples (i.e. saved.file.prefix). It can be multiple files
-#
-
-args <- commandArgs(trailingOnly=T)
-npair <- args[1] # number of curves to plot on the same figure
-output.file.prefix <- args[2] # file name for plot, generated from script at the outer level
-df.txt <- 10
-ntemp <- as.numeric(npair)
-saved.file.prefix <- list() # identifier of filenames that contain the em and URI results
-source("/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/functions-all-clayton-12-13.r")
-
-uri.list <- list()
-uri.list.match <- list()
-ez.list <- list()
-legend.txt <- c()
-em.output.list <- list()
-uri.output.list <- list()
-
-for(i in 1:npair){
-  saved.file.prefix[i] <- args[2+i]
-
-  load(paste(saved.file.prefix[i], "-uri.sav", sep=""))
-  load(paste(saved.file.prefix[i], "-em.sav", sep=""))
-
-  uri.output.list[[i]] <- uri.output
-  em.output.list[[i]] <- em.output
-
-  ez.list[[i]] <- get.ez.tt.all(em.output, uri.output.list[[i]]$data12.enrich$merge1,
-                                uri.output.list[[i]]$data12.enrich$merge2) # reverse =T for error rate
-
-  # URI for all peaks
-  uri.list[[i]] <- uri.output$uri.n
-  # URI for matched peaks
-  uri.match <- get.uri.matched(em.output$data.pruned, df=df.txt)
-  uri.list.match[[i]] <- uri.match$uri.n
-
-  file.name <- unlist(strsplit(as.character(saved.file.prefix[i]), "/"))
-
-  legend.txt[i] <- paste(i, "=", file.name[length(file.name)])
-
-}
-
-plot.uri.file <- paste(output.file.prefix, "-plot.ps", sep="")
-
-############# plot and report output
-# plot correspondence curve for each pair,
-# plot number of selected peaks vs IDR
-# plot all into 1 file
-postscript(paste(output.file.prefix, "-plot.ps", sep=""))
-par(mfcol=c(2,3), mar=c(5,6,4,2)+0.1)
-plot.uri.group(uri.list, NULL, file.name=NULL, c(1:npair), title.txt="all peaks")
-plot.uri.group(uri.list.match, NULL, file.name=NULL, c(1:npair), title.txt="matched peaks")
-plot.ez.group(ez.list, plot.dir=NULL, file.name=NULL, legend.txt=c(1:npair), y.lim=c(0, 0.6))
-plot(0, 1, type="n", xlim=c(0,1), ylim=c(0,1), xlab="", ylab="", xaxt="n", yaxt="n") # legends
-legend(0, 1, legend.txt, cex=0.6)
-
-dev.off()
-
--- a/idrPlotDef.xml	Fri Jan 18 17:30:47 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-<!--
-Usage of THIS SCRIPT: ./idrPlotWrapper.sh em uri outputfile
--->
-
-<tool id="batch_consistency_plot" name="IDR-Plot">
-  <description>Plot Consistency Analysis on IDR output files</description>
-  <command interpreter="bash">idrPlotWrapper.sh $em $uri $outputfile</command>
-  <inputs>
-    <param name="uri" type="data" label="uri.sav file (output from IDR consistency analysis)"/>
-    <param name="em" type="data" label="em.sav file (output from IDR consistency analysis)"/>
-  </inputs>
-  <outputs>
-    <data format="pdf" name="outputfile" label="IDR-plot.pdf"/>
-  </outputs>
-
-  <tests>
-    <test>
-<!--
-      <param name="input" value="fa_gc_content_input.fa"/>
-      <output name="out_file1" file="fa_gc_content_output.txt"/>
--->
-    </test>
-  </tests>
-
-  <help>
-Plots the correspondence curve and IDR threshold (i.e. number of selected peaks vs IDR) for each pair of samples.
-Uses the output of IDR consistency analysis and produces a downloadable PDF file containing the graphical analysis.
-This is a part of the IDR package.  For more information about IDR, see https://sites.google.com/site/anshulkundaje/projects/idr.
-  </help>
-
-</tool>
--- a/idrPlotWrapper.sh	Fri Jan 18 17:30:47 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-#!/bin/bash
-
-# idrPlotWrapper.sh
-# OICR: Kar Ming Chu
-# July 2012
-
-# BASH wrapper for batch-consistency-plot.r (part of the IDR package)
-# For use with Galaxy
-
-# Usage of batch-consistency-plot.r: Rscript batch-consistency-plot-merged.r [npairs] [output.dir] [input.file.prefix 1, 2, 3 ...]
-#	npairs - will be a constant, since Galaxy requires explicit control over input and output files
-
-# Usage of THIS SCRIPT: ./idrPlotWrapper.sh em uri outputfile
-#	em - em.sav file provided by Galaxy
-#	uri - uri.sav file provided by Galaxy
-#	outputfile - output file name specified by Galaxy
-
-main() {
-	EM="${1}" 		# absolute file path to em.sav file, provided by Galaxy
-	URI="${2}"		# absolute file parth to uri.sav file, provided by Galaxy
-	OUTFILE="${3}"		# name of desired output file
-
-	cp "${EM}" ./idrPlot-em.sav	# cp to this directory and rename so they can be found by idrPlot
-	cp "${URI}" ./idrPlot-uri.sav
-
-	Rscript /mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/batch-consistency-plot.r 1 ./idrPlot idrPlot
-
-	# convert post script to pdf file
- 	ps2pdf ./idrPlot-plot.ps ./idrPlot-plot.pdf
-
-	# rename to output file name
-	mv ./idrPlot-plot.pdf "${OUTFILE}"
-
-	# clean up
-	rm idrPlot-em.sav
-	rm idrPlot-uri.sav
-}
-
-main "${@}"
--- a/idrToolDef.xml	Fri Jan 18 17:30:47 2013 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,61 +0,0 @@
-<!--
-
-Script Usage:
-Rscript batch-consistency-analysis.r
-../3066_rep1_VS_input0.macs14.out.regionPeak
-../3066_rep2_VS_input0.macs14.out.regionPeak
-1000
-3066_rep1_VS_rep2
-0
-F
-p.value
-genome_table.txt [ drop down to select ]
--->
-
-<tool id="batch_consistency_analysis_2" name="IDR">
-  <description>Consistency Analysis on a pair of narrowPeak files</description>
-  <command interpreter="Rscript">batch-consistency-analysis.r $input1 $input2 $halfwidth $overlap $option $sigvalue $gtable $rout $aboveIDR $ratio $emSav $uriSav</command>
-  <inputs>
-    <param format="narrowPeak" name="input1" type="data" label="First NarrowPeak File"/>
-    <param format="narrowPeak" name="input2" type="data" label="Second NarrowPeak File"/>
-    <param name="halfwidth" size="4" type="integer" value="1000" label="Half-Width" help="-1 if using reported peak width"/>
-<!--    <param name="outputprefix" type="text" size="50" label="Output Prefix" value="3066_rep1_VS_rep2"/> -->
-    <param name="option" type="select" label="File Type" value="F">
-      <option value="F">Narrow Peak</option>
-      <option value="T">Broad Peak</option>
-    </param>
-    <param name="overlap" size="4" type="float" value="0" label="Over-Lap Ratio" help="Between 0 and 1, inclusively" min="0" max="1"/>
-    <param name="sigvalue" type="select" label="Significant Value" value="p.value" help="Select p-value if the input peak files are generated by MAC. Select q-value if the input peak files are generated by SPP.">
-      <option value="p.value">p-value</option>
-      <option value="q.value">q-value</option>
-      <option value="signal.value">Significant Value</option>
-    </param>
-    <param name="gtable" type="select" label="Genome Table" value="/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/genome_tables/genome_table.worm.ws220.txt">
-      <option value="/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/genome_tables/genome_table.human.hg19.txt">human hg19</option>
-      <option value="/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/genome_tables/genome_table.mm9.txt">mouse mm9</option>
-      <option value="/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/genome_tables/genome_table.worm.ws220.txt">worm ws220</option>
-      <option value="/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/genome_tables/genome_table.dmel.r5.32.txt">dmel r5.32</option>
-    </param>
-  </inputs>
-  <outputs>
-    <data format="txt" name="rout" label="IDR.Rout.txt"/>
-    <data format="txt" name="aboveIDR" label="IDR.npeaks-aboveIDR.txt"/>
-    <data format="txt" name="ratio" label="IDR.overlapped-peaks.txt"/>
-    <data format="txt" name="emSav" label="IDR.em.sav"/>
-    <data format="txt" name="uriSav" label="IDR.uri.sav"/>
-  </outputs>
-
-  <tests>
-    <test>
-<!--
-      <param name="input" value="fa_gc_content_input.fa"/>
-      <output name="out_file1" file="fa_gc_content_output.txt"/>
--->
-    </test>
-  </tests>
-
-  <help>
-Reproducibility is essential to reliable scientiﬁc discovery in high-throughput experiments. The IDR (Irreproducible Discovery Rate) framework is a uniﬁed approach to measure the reproducibility of ﬁndings identiﬁed from replicate experiments and provide highly stable thresholds based on reproducibility. Unlike the usual scalar measures of reproducibility, the IDR approach creates a curve, which quantitatively assesses when the ﬁndings are no longer consistent across replicates. In layman's terms, the IDR method compares a pair of ranked lists of identifications (such as ChIP-seq peaks). These ranked lists should not be pre-thresholded i.e. they should provide identifications across the entire spectrum of high confidence/enrichment (signal) and low confidence/enrichment (noise). The IDR method then fits the bivariate rank distributions over the replicates in order to separate signal from noise based on a defined confidence of rank consistency and reproducibility of identifications i.e the IDR threshold. For more information on IDR, see https://sites.google.com/site/anshulkundaje/projects/idr
-  </help>
-
-</tool>