Mercurial > repos > zzhou > spp_phantompeak
comparison spp/man/spp-package.Rd @ 6:ce08b0efa3fd draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:11:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:608a8e0eac56 | 6:ce08b0efa3fd |
---|---|
1 \name{spp-package} | |
2 \alias{spp-package} | |
3 \alias{spp} | |
4 \docType{package} | |
5 \title{ | |
6 ChIP-seq (Solexa) Processing Pipeline | |
7 } | |
8 \description{ | |
9 A set of routines for reading short sequence alignments, calculating tag | |
10 density, estimates of statistically significant enrichment/depletion | |
11 along the chromosome, identifying point binding positions (peaks), and | |
12 characterizing saturation properties related to sequencing depth. | |
13 } | |
14 \details{ | |
15 \tabular{ll}{ | |
16 Package: \tab spp\cr | |
17 Type: \tab Package\cr | |
18 Version: \tab 1.8\cr | |
19 Date: \tab 2008-11-14\cr | |
20 License: \tab What license is it under?\cr | |
21 LazyLoad: \tab yes\cr | |
22 } | |
23 See example below for typical processing sequence. | |
24 } | |
25 \author{Peter Kharchenko <peter.kharchenko@post.harvard.edu>} | |
26 \references{ | |
27 Kharchenko P., Tolstorukov M., Park P. "Design and analysis of ChIP-seq | |
28 experiments for DNA-binding proteins." Nature Biotech. doi:10.1038/nbt.1508 | |
29 } | |
30 | |
31 \examples{ | |
32 | |
33 # load the library | |
34 library(spp); | |
35 | |
36 ## The following section shows how to initialize a cluster of 2 nodes for parallel processing | |
37 ## To enable parallel processing, uncomment the next three lines, and comment out "cluster<-NULL"; | |
38 ## see "snow" package manual for details. | |
39 #library(snow) | |
40 #cluster <- makeCluster(2); | |
41 #invisible(clusterCall(cluster,source,"routines.r")); | |
42 cluster <- NULL; | |
43 | |
44 | |
45 | |
46 # read in tag alignments | |
47 chip.data <- read.eland.tags("chip.eland.alignment"); | |
48 input.data <- read.eland.tags("input.eland.alignment"); | |
49 | |
50 # get binding info from cross-correlation profile | |
51 # srange gives the possible range for the size of the protected region; | |
52 # srange should be higher than tag length; making the upper boundary too high will increase calculation time | |
53 # | |
54 # bin - bin tags within the specified number of basepairs to speed up calculation; | |
55 # increasing bin size decreases the accuracy of the determined parameters | |
56 binding.characteristics <- get.binding.characteristics(chip.data,srange=c(50,500),bin=5,cluster=cluster); | |
57 | |
58 | |
59 # plot cross-correlation profile | |
60 pdf(file="example.crosscorrelation.pdf",width=5,height=5) | |
61 par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8); | |
62 plot(binding.characteristics$cross.correlation,type='l',xlab="strand shift",ylab="cross-correlation"); | |
63 abline(v=binding.characteristics$peak$x,lty=2,col=2) | |
64 dev.off(); | |
65 | |
66 # select informative tags based on the binding characteristics | |
67 chip.data <- select.informative.tags(chip.data,binding.characteristics); | |
68 input.data <- select.informative.tags(input.data,binding.characteristics); | |
69 | |
70 # restrict or remove positions with anomalous number of tags relative | |
71 # to the local density | |
72 chip.data <- remove.local.tag.anomalies(chip.data); | |
73 input.data <- remove.local.tag.anomalies(input.data); | |
74 | |
75 | |
76 # output smoothed tag density (subtracting re-scaled input) into a WIG file | |
77 # note that the tags are shifted by half of the peak separation distance | |
78 smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2)); | |
79 writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density"); | |
80 rm(smoothed.density); | |
81 | |
82 # output conservative enrichment estimates | |
83 # alpha specifies significance level at which confidence intervals will be estimated | |
84 enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01); | |
85 writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale"); | |
86 rm(enrichment.estimates); | |
87 | |
88 | |
89 # binding detection parameters | |
90 # desired FDR. Alternatively, an E-value can be supplied to the method calls below instead of the fdr parameter | |
91 fdr <- 1e-2; | |
92 # the binding.characteristics contains the optimized half-size for binding detection window | |
93 detection.window.halfsize <- binding.characteristics$whs; | |
94 | |
95 # determine binding positions using wtd method | |
96 bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize,cluster=cluster) | |
97 | |
98 # alternatively, determine binding positions using lwcc method (note: this takes longer than wtd) | |
99 # bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.lwcc,whs=detection.window.halfsize,cluster=cluster) | |
100 | |
101 print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks")); | |
102 | |
103 # output detected binding positions | |
104 output.binding.results(bp,"example.binding.positions.txt"); | |
105 | |
106 | |
107 # ------------------------------------------------------------------------------------------- | |
108 # the set of commands in the following section illustrates methods for saturation analysis | |
109 # these are separated from the previous section, since they are highly CPU intensive | |
110 # ------------------------------------------------------------------------------------------- | |
111 | |
112 # determine MSER | |
113 # note: this will take approximately 10-15x the amount of time the initial binding detection did | |
114 # The saturation criterion here is 0.99 consistency in the set of binding positions when adding 1e5 tags. | |
115 # To ensure convergence, the number of subsampled chains (n.chains) should be higher (e.g. 80) | |
116 mser <- get.mser(chip.data,input.data,step.size=1e5,test.agreement=0.99,n.chains=8,cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize) | |
117 | |
118 print(paste("MSER at a current depth is",mser)); | |
119 | |
120 # note: an MSER value of 1 or very near one implies that the set of detected binding positions satisfies saturation criteria without | |
121 # additional selection by fold-enrichment ratios. In other words, the dataset has reached saturation in a traditional sense (absolute saturation). | |
122 | |
123 # interpolate MSER dependency on tag count | |
124 # note: this requires considerably more calculations than the previous steps (~ 3x more than the first MSER calculation) | |
125 # Here we interpolate MSER dependency to determine a point at which MSER of 2 is reached | |
126 # The interpolation will be based on the difference in MSER at the current depth, and a depth at 5e5 fewer tags (n.steps=6); | |
127 # evaluation of the intermediate points is omitted here to speed up the calculation (excluded.steps parameter) | |
128 # A total of 7 chains is used here to speed up calculation, whereas a higher number of chains (50) would give good convergence | |
129 msers <- get.mser.interpolation(chip.data,input.data,step.size=1e5,test.agreement=0.99, target.fold.enrichment=2, n.chains=7,n.steps=6,excluded.steps=c(2:4),cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize) | |
130 | |
131 print(paste("predicted sequencing depth =",round(unlist(lapply(msers,function(x) x$prediction))/1e6,5)," million tags")) | |
132 | |
133 | |
134 # note: the interpolation will return NA prediction if the dataset has reached absolute saturation at the current depth. | |
135 # note: use return.chains=TRUE to also return the calculated random chains (returned under msers$chains field) - these can be passed back as | |
136 # "get.mser.interpolation( ..., chains=msers$chains)" to calculate predictions for another target.fold.enrichment value | |
137 # without having to recalculate the random chain predictions. | |
138 | |
139 ## stop cluster if it was initialized | |
140 #stopCluster(cluster); | |
141 | |
142 | |
143 | |
144 } |