# Name: region_motif_compare.r
# Description: Reads in two count files and determines enriched and depleted
# motifs (or any location-based feature) based on Poisson tests and GC
# corrections. All enrichment ratios are relative to the overall count / GC
# ratios.
# Author: Jeremy Liu
# Email: jeremy.liu@yale.edu
# Date: 15/02/11
# Note: This script is meant to be invoked with the following command
# R --slave --vanilla -f ./region_motif_compare.r --args <workingdir> <pwm_file>
#    <intab1> <intab2> <enriched_tab> <depleted_tab> <plots_png>
#    <workingdir> is working directory of galaxy installation
# Dependencies: region_motif_data_manager, plotting.r

# Auxiliary function to concatenate multiple strings.
#
# Args:
#   ...: Values to join. Each argument is treated as one piece and is
#        converted to character before joining.
# Returns:
#   A single character string made of all pieces joined with no separator
#   (an empty string when called with no arguments).
concat <- function(...) {
  pieces <- list(...)
  paste0(pieces, collapse = "")
}

# Suppress all warning messages to prevent Galaxy treating warnings as errors
options(warn = -1)

# Collect command line arguments. `args` keeps the full vector, while
# `userArgs` holds only what follows --args, so the positions used below do
# not depend on how many interpreter flags precede them.
args <- commandArgs()
userArgs <- commandArgs(trailingOnly = TRUE)
if (length(userArgs) < 7) {
  stop("Expected 7 arguments after --args: <workingdir> <pwm_file> <intab1> ",
       "<intab2> <enriched_tab> <depleted_tab> <plots_png>", call. = FALSE)
}

# Set common and data directories
workingDir <- userArgs[1]
# The pwm argument may be a comma-separated list; if there are duplicate
# entries, take the first one.
pwmFile <- strsplit(userArgs[2], ",", fixed = TRUE)[[1]][1]

# Set input and reference files
inTab1 <- userArgs[3]
inTab2 <- userArgs[4]
enrichTab <- userArgs[5]
depleteTab <- userArgs[6]
plotsPng <- userArgs[7]

# Load dependencies
source(concat(workingDir, "/plotting.r"))

# Auxiliary function to read in tab file and prepare the data.
#
# Args:
#   file: Path to a tab-separated file, read with read.table defaults apart
#         from the separator and stringsAsFactors = FALSE.
# Returns:
#   A data frame in which the default column names V1 and V2 assigned by
#   read.table are renamed to "motif" and "counts"; any other columns keep
#   their names.
read_tsv <- function(file) {
  data <- read.table(file, sep = "\t", stringsAsFactors = FALSE)
  renames <- c(V1 = "motif", V2 = "counts")
  hits <- names(data) %in% names(renames)
  names(data)[hits] <- unname(renames[names(data)[hits]])
  data
}

startTime <- Sys.time()
cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n")

# Loading motif position weight matrix (pwm) file and input tab file
cat("Loading and reading input region motif count files...\n")
load(pwmFile) # pwms data structure
# The rest of the script reads `pwms`; fail here with a clear message if the
# loaded file did not define it.
if (!exists("pwms")) {
  stop("File '", pwmFile, "' did not define an object named 'pwms'",
       call. = FALSE)
}
#if (dbCode == "c") { # Remaining implementation of dbCode "c" combined
#  temp = pwms
#  load(pwmFile2)
#  pwms = append(temp, pwms)
#}

# Read both count tables and turn each into a count vector named by motif
region1DF <- read_tsv(inTab1)
region2DF <- read_tsv(inTab2)
region1Counts <- setNames(region1DF$counts, region1DF$motif)
region2Counts <- setNames(region2DF$counts, region2DF$motif)

# Processing count vectors to account for missing 0 count motifs, then sorting
cat("Performing 0 count correction and sorting...\n")
allNames <- union(names(region1Counts), names(region2Counts))

# Motifs seen in only one of the two inputs get an explicit 0 in the other
region1Diff <- setdiff(allNames, names(region1Counts))
region2Diff <- setdiff(allNames, names(region2Counts))
addCounts1 <- setNames(rep(0, length(region1Diff)), region1Diff)
addCounts2 <- setNames(rep(0, length(region2Diff)), region2Diff)
newCounts1 <- c(region1Counts, addCounts1)
newCounts2 <- c(region2Counts, addCounts2)

# Sort both vectors by motif name so that position i refers to the same motif
region1Counts <- newCounts1[order(names(newCounts1))]
region2Counts <- newCounts2[order(names(newCounts2))]

# The element-wise tests that follow pair the vectors by position. Warnings
# are suppressed in this script, so a mismatch (e.g. a motif listed twice in
# only one input) would otherwise be recycled silently.
if (!identical(names(region1Counts), names(region2Counts))) {
  stop("Motif names of the two inputs do not line up after zero-count ",
       "correction; check the inputs for duplicated motif entries",
       call. = FALSE)
}

# Generate gc content vector: one value per pwm, the mean of rows 2:3 over
# columns 3:18 of each matrix.
# NOTE(review): presumably rows 2:3 are the C and G rows -- confirm against
# the pwm data layout.
gc <- vapply(pwms, function(pwm) mean(pwm[2:3, 3:18]), numeric(1))

# Apply poisson test, calculate p and q values, and filter significant results
cat("Applying poisson test...\n")
# Overall count ratio; under the null, region2 count = rValue * region1 count
rValue <- sum(region2Counts) / sum(region1Counts)
pValue <- vapply(seq_along(region1Counts), function(i) {
  poisson.test(c(region1Counts[i], region2Counts[i]), r = 1 / rValue)$p.value
}, numeric(1))
names(pValue) <- names(region1Counts)
qValue <- p.adjust(pValue, "fdr")
# Keep motifs with q < 0.1 whose count ratio deviates more than 1.5-fold from
# the null. The null ratio region1/region2 is 1/rValue, so the observed ratio
# is multiplied by rValue to centre it on 1 (dividing would centre it on
# 1/rValue^2 and select the wrong points).
indices <- which(qValue < 0.1 &
                 abs(log2(region1Counts / region2Counts * rValue)) > log2(1.5))

# Setting up output diagnostic plots, 4 in 1 png image
png(plotsPng, width = 800, height = 800)
xlab <- "region1_count"
ylab <- "region2_count"
lim <- c(0.5, 5000)
layout(matrix(1:4, ncol = 2))
par(mar = c(5, 5, 5, 1))

# Plot all motif counts against the overall count ratio. Counts are shifted
# by 0.5 so that zero counts can be shown on the log axes. The solid line is
# region2 = rValue * region1; the dashed lines are twice and half that ratio.
# plot.scatter is defined in plotting.r.
plot.scatter(region1Counts + 0.5, region2Counts + 0.5, log = "xy",
             xlab = xlab, ylab = ylab, cex.lab = 2.2, cex.axis = 1.8,
             xlim = lim, ylim = lim * rValue)
abline(0, rValue, untf = TRUE)
abline(0, rValue * 2, untf = TRUE, lty = 2)
abline(0, rValue / 2, untf = TRUE, lty = 2)

# Plot enriched and depleted motifs in red, housed in second plot
plot.scatter(region1Counts + 0.5, region2Counts + 0.5, log = "xy",
             xlab = xlab, ylab = ylab, cex.lab = 2.2, cex.axis = 1.8,
             xlim = lim, ylim = lim * rValue)
points(region1Counts[indices] + 0.5, region2Counts[indices] + 0.5, col = "red")
abline(0, rValue, untf = TRUE)
abline(0, rValue * 2, untf = TRUE, lty = 2)
abline(0, rValue / 2, untf = TRUE, lty = 2)

# Apply and plot gc correction and loess curve
cat("Applying gc correction, rerunning poisson test...\n")
ind <- which(region1Counts > 5)
gc <- gc[names(region2Counts)] # Reorder the indices of pwms to match input data

# A motif without a gc value would give an NA correction factor, which
# poisson.test rejects; report the affected motifs by name instead.
missingGC <- names(region2Counts)[is.na(gc)]
if (length(missingGC) > 0) {
  stop("No gc content available for motif(s): ",
       paste(missingGC, collapse = ", "), call. = FALSE)
}

# plot.scatter is defined in plotting.r; its return value is read here as
# lo$loess and interpolated at each motif's gc content.
lo <- plot.scatter(gc, log2(region2Counts / region1Counts), draw.loess = TRUE,
                   xlab = "gc content of motif",
                   ylab = paste("log2(", ylab, "/", xlab, ")"),
                   cex.lab = 2.2, cex.axis = 1.8, ind = ind)
# The curve is on the log2(region2/region1) scale, so 2^ turns it back into
# an expected count ratio per motif.
gcCorrection <- 2^approx(lo$loess, xout = gc, rule = 2)$y

# Recalculate p and q values, and filter for significant entries
pValueGC <- vapply(seq_along(region1Counts), function(i) {
  poisson.test(c(region1Counts[i], region2Counts[i]),
               r = 1 / gcCorrection[i])$p.value
}, numeric(1))
# Name the p-values by motif explicitly so later code can rely on the names
names(pValueGC) <- names(region1Counts)
qValueGC <- p.adjust(pValueGC, "fdr")
indicesGC <- which(qValueGC < 0.1 &
                   abs(log2(region1Counts / region2Counts * gcCorrection)) >
                     log2(1.5))

# Plot gc corrected motif counts: region2 counts are divided by the per-motif
# correction factor, so the null expectation becomes the line y = x. Motifs
# that passed the gc-corrected filter are overlaid in red, and the dashed
# lines mark a two-fold difference in either direction.
plot.scatter(region1Counts + 0.5, (region2Counts + 0.5) / gcCorrection,
             log = "xy", xlab = xlab, ylab = paste(ylab, "(normalized)"),
             cex.lab = 2.2, cex.axis = 1.8, xlim = lim, ylim = lim)
points(region1Counts[indicesGC] + 0.5,
       (region2Counts[indicesGC] + 0.5) / gcCorrection[indicesGC],
       col = "red")
abline(0, 1)
abline(0, 1 * 2, untf = TRUE, lty = 2)
abline(0, 1 / 2, untf = TRUE, lty = 2)

# Trim results, compile statistics and output to file
# Only does so if significant results are computed
if (length(indicesGC) > 0) {
  # Calculate expected counts and enrichment ratios
  cat("Calculating statistics...\n")
  nullExpect <- region1Counts * gcCorrection
  enrichment <- region2Counts / nullExpect

  # Reorder selected indices in ascending pvalue
  cat("Reordering by ascending pvalue...\n")
  indicesReorder <- indicesGC[order(pValueGC[indicesGC])]

  # Combine data into one data frame and output to two files
  cat("Splitting and outputting data...\n")
  # Motif names are taken from the count vector: data.frame() silently drops
  # a NULL column, so relying on names carried by the p-values could lose
  # the motif column altogether.
  outDF <- data.frame(motif = names(region1Counts),
                      p = as.numeric(pValueGC),
                      q = qValueGC,
                      stringsAsFactors = FALSE,
                      region_1_count = region1Counts,
                      null_expectation = round(nullExpect, 2),
                      region_2_count = region2Counts,
                      enrichment = enrichment)[indicesReorder, ]
  names(outDF)[names(outDF) == "region_1_count"] <- xlab
  names(outDF)[names(outDF) == "region_2_count"] <- ylab

  # Split on the numeric ratio before it is turned into display text below
  indicesEnrich <- which(outDF$enrichment > 1)
  indicesDeplete <- which(outDF$enrichment < 1)

  # Ratios above 1 are shown rounded; the others are shown as "1/ x".
  # NOTE(review): paste() inserts a space after "1/"; the format is kept
  # unchanged in case downstream consumers depend on it.
  outDF$enrichment <- ifelse(outDF$enrichment > 1,
                             round(outDF$enrichment, 3),
                             paste("1/", round(1 / outDF$enrichment, 3)))
  write.table(outDF[indicesEnrich, ], file = enrichTab, quote = FALSE,
              sep = "\t", append = FALSE, row.names = FALSE, col.names = TRUE)
  write.table(outDF[indicesDeplete, ], file = depleteTab, quote = FALSE,
              sep = "\t", append = FALSE, row.names = FALSE, col.names = TRUE)
}

# Close the png device; assigning the return value of dev.off() keeps it from
# being printed. Then output timing information.
catchMessage <- dev.off()
cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."),
    "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n")