Mercurial > repos > jeremyjliu > region_motif_enrichment
comparison region_motif_compare.r @ 0:19d2cffb8db3 draft
Initial upload
author | jeremyjliu |
---|---|
date | Wed, 06 Aug 2014 15:36:46 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:19d2cffb8db3 |
---|---|
1 # Name: region_motif_compare.r | |
2 # Description: Reads in two count files and determines enriched and depleted | |
3 # motifs (or any location based feature) based on poisson tests and gc | |
4 # corrections. All enrichment ratios relative to overall count / gc ratios. | |
5 # Author: Jeremy liu | |
6 # Email: jeremy.liu@yale.edu | |
7 # Date: 14/07/03 | |
8 # Note: This script is meant to be invoked with the following command | |
9 # R --slave --vanilla -f ./region_motif_compare.r --args <workingdir> <db> <intab1> <intab2> | |
10 # <enriched_tab> <depleted_tab> <plots_png> | |
11 # <workingdir> is working directory of galaxy installation | |
12 # <db> types: "t" test, "p" pouya, "j" jaspar jolma, "m" mouse, "c" combined | |
13 # Dependencies: plotting.r | |
14 | |
15 # Auxiliary function to concatenate multiple strings | |
16 concat <- function(...) { | |
17 input_list <- list(...) | |
18 return(paste(input_list, sep="", collapse="")) | |
19 } | |
20 | |
21 # Supress all warning messages to prevent Galaxy treating warnings as errors | |
22 options(warn=-1) | |
23 | |
24 # Set common and data directories | |
25 args <- commandArgs() | |
26 workingDir = args[7] | |
27 commonDir = concat(workingDir, "/tools/my_tools") | |
28 dbCode = args[8] | |
29 # dbCode "c" implemented when pwmFile is loaded | |
30 if (dbCode == "t" | dbCode == "p") { | |
31 pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData") | |
32 } else if (dbCode == "j") { | |
33 pwmFile = concat(commonDir, "/region_motif_db/pwms/jaspar.jolma.pwms.from.seq.RData") | |
34 } else if (dbCode == "m") { | |
35 pwmFile = concat(commonDir, "/region_motif_db/pwms/mm9.pwms.from.seq.RData") | |
36 } else if (dbCode == "c") { # rest of dbCode "c" implemeted when pwmFile loaded | |
37 pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData") | |
38 pwmFile2 = concat(commonDir, "/region_motif_db/pwms/jaspar.jolma.pwms.from.seq.RData") | |
39 } else { | |
40 pwmFile = concat(commonDir, "/region_motif_db/pwms/pouya.pwms.from.seq.RData") | |
41 } | |
42 | |
43 # Set input and reference files | |
44 inTab1 = args[9] | |
45 inTab2 = args[10] | |
46 enrichTab = args[11] | |
47 depleteTab = args[12] | |
48 plotsPng = args[13] | |
49 | |
50 # Load dependencies | |
51 source(concat(commonDir, "/region_motif_lib/plotting.r")) | |
52 | |
53 # Auxiliary function to read in tab file and prepare the data | |
54 read_tsv <- function(file) { | |
55 data = read.table(file, sep="\t", stringsAsFactors=FALSE) | |
56 names(data)[names(data) == "V1"] = "motif" | |
57 names(data)[names(data) == "V2"] = "counts" | |
58 return(data) | |
59 } | |
60 | |
61 startTime = Sys.time() | |
62 cat("Running ... Started at:", format(startTime, "%a %b %d %X %Y"), "...\n") | |
63 | |
64 # Loading motif position weight matrix (pwm) file and input tab file | |
65 #cat("Loading and reading input region motif count files...\n") | |
66 load(pwmFile) # pwms data structure | |
67 if (dbCode == "c") { # Remaining implementation of dbCode "c" combined | |
68 temp = pwms | |
69 load(pwmFile2) | |
70 pwms = append(temp, pwms) | |
71 } | |
72 region1DF = read_tsv(inTab1) | |
73 region2DF = read_tsv(inTab2) | |
74 region1Counts = region1DF$counts | |
75 region2Counts = region2DF$counts | |
76 names(region1Counts) = region1DF$motif | |
77 names(region2Counts) = region2DF$motif | |
78 | |
79 # Processing count vectors to account for missing 0 count motifs, then sorting | |
80 #cat("Performing 0 count correction and sorting...\n") | |
81 allNames = union(names(region1Counts), names(region2Counts)) | |
82 region1Diff = setdiff(allNames, names(region1Counts)) | |
83 region2Diff = setdiff(allNames, names(region2Counts)) | |
84 addCounts1 = rep(0, length(region1Diff)) | |
85 addCounts2 = rep(0, length(region2Diff)) | |
86 names(addCounts1) = region1Diff | |
87 names(addCounts2) = region2Diff | |
88 newCounts1 = append(region1Counts, addCounts1) | |
89 newCounts2 = append(region2Counts, addCounts2) | |
90 region1Counts = newCounts1[sort.int(names(newCounts1), index.return=TRUE)$ix] | |
91 region2Counts = newCounts2[sort.int(names(newCounts2), index.return=TRUE)$ix] | |
92 | |
93 # Generate gc content matrix | |
94 gc = sapply(pwms, function(i) mean(i[2:3,3:18])) | |
95 | |
96 # Apply poisson test, calculate p and q values, and filter significant results | |
97 #cat("Applying poisson test...\n") | |
98 rValue = sum(region2Counts) / sum(region1Counts) | |
99 pValue = sapply(seq(along=region1Counts), function(i) { | |
100 poisson.test(c(region1Counts[i], region2Counts[i]), r=1/rValue)$p.value | |
101 }) | |
102 qValue = p.adjust(pValue, "fdr") | |
103 indices = which(qValue<0.1 & abs(log2(region1Counts/region2Counts/rValue))>log2(1.5)) | |
104 | |
105 # Setting up output diagnostic plots, 4 in 1 png image | |
106 png(plotsPng, width=800, height=800) | |
107 xlab = "region1_count" | |
108 ylab = "region2_count" | |
109 lim = c(0.5, 5000) | |
110 layout(matrix(1:4, ncol=2)) | |
111 par(mar=c(5, 5, 5, 1)) | |
112 | |
113 # Plot all motif counts along the linear correlation coefficient | |
114 plot.scatter(region1Counts+0.5, region2Counts+0.5, log="xy", xlab=xlab, ylab=ylab, | |
115 cex.lab=2.2, cex.axis=1.8, xlim=lim, ylim=lim*rValue) | |
116 abline(0, rValue, untf=T) | |
117 abline(0, rValue*2, untf=T, lty=2) | |
118 abline(0, rValue/2, untf=T, lty=2) | |
119 | |
120 # Plot enriched and depleted motifs in red, housed in second plot | |
121 plot.scatter(region1Counts+0.5, region2Counts+0.5, log="xy", xlab=xlab, ylab=ylab, | |
122 cex.lab=2.2, cex.axis=1.8, xlim=lim, ylim=lim*rValue) | |
123 points(region1Counts[indices]+0.5, region2Counts[indices]+0.5, col="red") | |
124 abline(0, rValue, untf=T) | |
125 abline(0, rValue*2, untf=T, lty=2) | |
126 abline(0, rValue/2, untf=T, lty=2) | |
127 | |
128 # Apply and plot gc correction and loess curve | |
129 #cat("Applying gc correction, rerunning poisson test...\n") | |
130 ind = which(region1Counts>5) | |
131 gc = gc[names(region2Counts)] # Reorder the indices of pwms to match input data | |
132 lo = plot.scatter(gc,log2(region2Counts/region1Counts),draw.loess=T, | |
133 xlab="gc content of motif",ylab=paste("log2(",ylab,"/",xlab,")"), | |
134 cex.lab=2.2,cex.axis=1.8,ind=ind) # This function is in plotting.r | |
135 gcCorrection = 2^approx(lo$loess,xout=gc,rule=2)$y | |
136 save(gc, file="gc.RData") | |
137 | |
138 # Recalculate p and q values, and filter for significant entries | |
139 pValueGC = sapply(seq(along=region1Counts),function(i) { | |
140 poisson.test(c(region1Counts[i],region2Counts[i]),r=1/gcCorrection[i])$p.value | |
141 }) | |
142 qValueGC=p.adjust(pValueGC,"fdr") | |
143 indicesGC = which(qValueGC<0.1 & abs(log2(region1Counts/region2Counts*gcCorrection))>log2(1.5)) | |
144 | |
145 # Plot gc corrected motif counts | |
146 plot.scatter(region1Counts+0.5, (region2Counts+0.5)/gcCorrection, log="xy", | |
147 xlab=xlab, ylab=paste(ylab,"(normalized)"), cex.lab=2.2, cex.axis=1.8, | |
148 xlim=lim, ylim=lim) | |
149 points(region1Counts[indicesGC]+0.5, | |
150 (region2Counts[indicesGC]+0.5)/gcCorrection[indicesGC], col="red") | |
151 abline(0,1) | |
152 abline(0,1*2,untf=T,lty=2) | |
153 abline(0,1/2,untf=T,lty=2) | |
154 | |
155 # Trim results, compile statistics and output to file | |
156 # Only does so if significant results are computed | |
157 if(length(indicesGC) > 0) { | |
158 # Calculate expected counts and enrichment ratios | |
159 #cat("Calculating statistics...\n") | |
160 nullExpect = region1Counts * gcCorrection | |
161 enrichment = region2Counts / nullExpect | |
162 | |
163 # Reorder selected indices in ascending pvalue | |
164 #cat("Reordering by ascending pvalue...\n") | |
165 indicesReorder = indicesGC[order(pValueGC[indicesGC])] | |
166 | |
167 # Combine data into one data frame and output to two files | |
168 #cat("Splitting and outputting data...\n") | |
169 outDF = data.frame(motif=names(pValueGC), p=as.numeric(pValueGC), q=qValueGC, | |
170 stringsAsFactors=F, region_1_count=region1Counts, | |
171 null_expectation=round(nullExpect,2), region_2_count=region2Counts, | |
172 enrichment=enrichment)[indicesReorder,] | |
173 names(outDF)[which(names(outDF)=="region_1_count")]=xlab | |
174 names(outDF)[which(names(outDF)=="region_2_count")]=ylab | |
175 indicesEnrich = which(outDF$enrichment>1) | |
176 indicesDeplete = which(outDF$enrichment<1) | |
177 outDF$enrichment = ifelse(outDF$enrichment>1, | |
178 round(outDF$enrichment,3), | |
179 paste("1/",round(1/outDF$enrichment,3))) | |
180 write.table(outDF[indicesEnrich,], file=enrichTab, quote=FALSE, | |
181 sep="\t", append=FALSE, row.names=FALSE, col.names=TRUE) | |
182 write.table(outDF[indicesDeplete,], file=depleteTab, quote=FALSE, | |
183 sep="\t", append=FALSE, row.names=FALSE, col.names=TRUE) | |
184 } | |
185 | |
186 # Catch display messages and output timing information | |
187 catchMessage = dev.off() | |
188 cat("Done. Job started at:", format(startTime, "%a %b %d %X %Y."), | |
189 "Job ended at:", format(Sys.time(), "%a %b %d %X %Y."), "\n") |