0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
29
|
57 #remove duplicate V+J+CDR3, add together numerical values
|
34
|
58 if(mergeOn != "Clone_Sequence"){
|
|
59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
|
|
60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
|
|
61 Cell_Count=unique(.SD$Cell_Count),
|
|
62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
63 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
66 Frequency=sum(.SD$Frequency),
|
|
67 locus_V=unique(.SD$locus_V),
|
|
68 locus_J=unique(.SD$locus_J),
|
|
69 min_cell_count=unique(.SD$min_cell_count),
|
|
70 normalized_read_count=sum(.SD$normalized_read_count),
|
|
71 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
72 Clone_Sequence=.SD$Clone_Sequence[1],
|
|
73 min_cell_paste=.SD$min_cell_paste[1],
|
|
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
75 }
|
29
|
76
|
22
|
77 patients = split(dat, dat$Patient, drop=T)
|
9
|
78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
81 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
82 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
83 Titles = factor(Titles, levels=Titles)
|
|
84 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
85
|
29
|
86 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
87
|
0
|
88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
89 if (!is.data.frame(x) & is.list(x)){
|
|
90 x = x[[1]]
|
|
91 }
|
34
|
92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
93 x = data.frame(x,stringsAsFactors=F)
|
0
|
94 onShort = "reads"
|
|
95 if(on == "Frequency"){
|
|
96 onShort = "freq"
|
|
97 }
|
18
|
98 onx = paste(on, ".x", sep="")
|
|
99 ony = paste(on, ".y", sep="")
|
0
|
100 splt = split(x, x$Sample, drop=T)
|
4
|
101 type="pair"
|
2
|
102 if(length(splt) == 1){
|
3
|
103 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
104 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
105 type="single"
|
2
|
106 }
|
0
|
107 patient1 = splt[[1]]
|
|
108 patient2 = splt[[2]]
|
|
109
|
|
110 threshholdIndex = which(colnames(product) == "interval")
|
|
111 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
112 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
113 titleIndex = which(colnames(product) == "Titles")
|
|
114 sampleIndex = which(colnames(x) == "Sample")
|
|
115 patientIndex = which(colnames(x) == "Patient")
|
|
116 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
117 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
118 patient = paste(x[1,patientIndex])
|
3
|
119
|
0
|
120 switched = F
|
|
121 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
122 tmp = twoSample
|
|
123 twoSample = oneSample
|
|
124 oneSample = tmp
|
|
125 tmp = patient1
|
|
126 patient1 = patient2
|
|
127 patient2 = tmp
|
|
128 switched = T
|
|
129 }
|
|
130 if(appendtxt){
|
4
|
131 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
132 }
|
3
|
133 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
134
|
29
|
135 if(mergeOn == "Clone_Sequence"){
|
|
136 patient1$merge = paste(patient1$Clone_Sequence)
|
|
137 patient2$merge = paste(patient2$Clone_Sequence)
|
|
138 } else {
|
|
139 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
140 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
141 }
|
9
|
142
|
30
|
143 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
144 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
147 scatterplot_data$on = onShort
|
|
148
|
9
|
149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
|
34
|
150
|
|
151
|
|
152 #fuzzy matching here...
|
|
153 if(mergeOn == "Clone_Sequence"){
|
|
154 merge.list = patientMerge$merge
|
|
155
|
|
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
158
|
36
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
161
|
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
34
|
164
|
|
165 merge.freq.table = data.frame(table(c(patient1.fuzzy$merge, patient2.fuzzy$merge)))
|
|
166 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
167
|
|
168 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170
|
|
171 while(nrow(patient1.fuzzy) > 0 & nrow(patient2.fuzzy) > 0){
|
|
172 current.merge.1 = patient1.fuzzy[1,"merge"]
|
|
173 current.clone.seq.1 = patient1.fuzzy[1,"Clone_Sequence"]
|
|
174 current.merge.in.2 = patient2.fuzzy[patient2.fuzzy$merge == current.merge.1,]
|
|
175
|
|
176 #agrep/adist the two samples
|
36
|
177 agrep.match = agrep(current.clone.seq.1, current.merge.in.2$Clone_Sequence, max.distance = 9, costs=list(insertions=1, deletions=1, substitutions=10))
|
34
|
178
|
|
179
|
|
180 if(length(agrep.match) == 1){
|
|
181 current.clone.seq.2 = patient2.fuzzy[agrep.match,"Clone_Sequence"]
|
|
182 patientMerge.new.row = data.frame(merge=current.clone.seq.1,
|
|
183 min_cell_paste.x=patient1.fuzzy[1,"min_cell_paste"],
|
|
184 Patient.x=patient1.fuzzy[1,"Patient"],
|
|
185 Receptor.x=patient1.fuzzy[1,"Receptor"],
|
|
186 Sample.x=patient1.fuzzy[1,"Sample"],
|
|
187 Cell_Count.x=patient1.fuzzy[1,"Cell_Count"],
|
|
188 Clone_Molecule_Count_From_Spikes.x=patient1.fuzzy[1,"Clone_Molecule_Count_From_Spikes"],
|
|
189 Log10_Frequency.x=patient1.fuzzy[1,"Log10_Frequency"],
|
|
190 Total_Read_Count.x=patient1.fuzzy[1,"Total_Read_Count"],
|
|
191 dsPerM.x=patient1.fuzzy[1,"dsPerM"],
|
|
192 J_Segment_Major_Gene.x=patient1.fuzzy[1,"J_Segment_Major_Gene"],
|
|
193 V_Segment_Major_Gene.x=patient1.fuzzy[1,"V_Segment_Major_Gene"],
|
|
194 Clone_Sequence.x=patient1.fuzzy[1,"Clone_Sequence"],
|
|
195 CDR3_Sense_Sequence.x=patient1.fuzzy[1,"CDR3_Sense_Sequence"],
|
|
196 Related_to_leukemia_clone.x=patient1.fuzzy[1,"Related_to_leukemia_clone"],
|
|
197 Frequency.x=patient1.fuzzy[1,"Frequency"],
|
|
198 locus_V.x=patient1.fuzzy[1,"locus_V"],
|
|
199 locus_J.x=patient1.fuzzy[1,"locus_J"],
|
|
200 min_cell_count.x=patient1.fuzzy[1,"min_cell_count"],
|
|
201 normalized_read_count.x=patient1.fuzzy[1,"normalized_read_count"],
|
|
202 paste.x=patient1.fuzzy[1,"paste"],
|
|
203 min_cell_paste.y=patient2.fuzzy[agrep.match,"min_cell_paste"],
|
|
204 Patient.y=patient2.fuzzy[agrep.match,"Patient"],
|
|
205 Receptor.y=patient2.fuzzy[agrep.match,"Receptor"],
|
|
206 Sample.y=patient2.fuzzy[agrep.match,"Sample"],
|
|
207 Cell_Count.y=patient2.fuzzy[agrep.match,"Cell_Count"],
|
|
208 Clone_Molecule_Count_From_Spikes.y=patient2.fuzzy[agrep.match,"Clone_Molecule_Count_From_Spikes"],
|
|
209 Log10_Frequency.y=patient2.fuzzy[agrep.match,"Log10_Frequency"],
|
|
210 Total_Read_Count.y=patient2.fuzzy[agrep.match,"Total_Read_Count"],
|
|
211 dsPerM.y=patient2.fuzzy[agrep.match,"dsPerM"],
|
|
212 J_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"J_Segment_Major_Gene"],
|
|
213 V_Segment_Major_Gene.y=patient2.fuzzy[agrep.match,"V_Segment_Major_Gene"],
|
|
214 Clone_Sequence.y=patient2.fuzzy[agrep.match,"Clone_Sequence"],
|
|
215 CDR3_Sense_Sequence.y=patient2.fuzzy[agrep.match,"CDR3_Sense_Sequence"],
|
|
216 Related_to_leukemia_clone.y=patient2.fuzzy[agrep.match,"Related_to_leukemia_clone"],
|
|
217 Frequency.y=patient2.fuzzy[agrep.match,"Frequency"],
|
|
218 locus_V.y=patient2.fuzzy[agrep.match,"locus_V"],
|
|
219 locus_J.y=patient2.fuzzy[agrep.match,"locus_J"],
|
|
220 min_cell_count.y=patient2.fuzzy[agrep.match,"min_cell_count"],
|
|
221 normalized_read_count.y=patient2.fuzzy[agrep.match,"normalized_read_count"],
|
|
222 paste.y=patient2.fuzzy[agrep.match,"paste"])
|
|
223 #add to patientMerge
|
|
224 patientMerge = rbind(patientMerge, patientMerge.new.row)
|
|
225 #remove from patient*.fuzzy
|
|
226
|
|
227
|
|
228 #remove the fuzzy merged clone sequences from the original datasets
|
35
|
229 patient1 = patient1[patient1$Clone_Sequence != patient1.fuzzy[1,"Clone_Sequence"],]
|
|
230 patient2 = patient2[patient2$Clone_Sequence != patient2.fuzzy[agrep.match,"Clone_Sequence"],]
|
34
|
231
|
|
232 scatterplot_data = scatterplot_data[scatterplot_data$merge != current.clone.seq.2,]
|
|
233
|
|
234 patient2.fuzzy <<- patient2.fuzzy[-c(agrep.match),]
|
|
235
|
|
236
|
|
237 } else if (length(agrep.match) > 1){
|
|
238 #multiple matches, whatdo?
|
36
|
239 cat(paste("<tr><td>", "Multiple matches found for ", current.merge.1, ", ", current.clone.seq.1, " in ", patient, ", ", oneSample, "</td></tr>", sep=""), file=logfile, append=T)
|
34
|
240 }
|
|
241 patient1.fuzzy = patient1.fuzzy[-1,]
|
|
242 }
|
|
243
|
|
244 #adist(patient1.fuzzy$Clone_Sequence, patient2.fuzzy$Clone_Sequence, list(insertions=0.1, deletions=0.1, substitutions=1))
|
|
245 }
|
|
246
|
18
|
247 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
248 res1 = vector()
|
|
249 res2 = vector()
|
|
250 resBoth = vector()
|
|
251 read1Count = vector()
|
|
252 read2Count = vector()
|
2
|
253 locussum1 = vector()
|
|
254 locussum2 = vector()
|
9
|
255
|
0
|
256 #for(iter in 1){
|
|
257 for(iter in 1:length(product[,1])){
|
|
258 threshhold = product[iter,threshholdIndex]
|
|
259 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
260 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
261 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
262 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
263 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
264 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
265 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
266 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
267 res1 = append(res1, sum(one))
|
2
|
268 res2 = append(res2, sum(two))
|
0
|
269 resBoth = append(resBoth, sum(both))
|
2
|
270 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
271 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
272 #threshhold = 0
|
|
273 if(threshhold != 0){
|
|
274 if(sum(one) > 0){
|
15
|
275 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
276 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
277 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
278 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
279 }
|
|
280 if(sum(two) > 0){
|
15
|
281 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
282 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
283 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
284 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
285 }
|
29
|
286 } else {
|
|
287 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
288 if(nrow(scatterplot_locus_data) > 0){
|
|
289 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
290 }
|
34
|
291 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
292 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
293 not_in_one = !in_one
|
|
294 if(any(in_two)){
|
|
295 scatterplot_locus_data[not_in_one,]$type = twoSample
|
|
296 }
|
30
|
297 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
|
|
298 if(any(in_both)){
|
|
299 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
300 }
|
29
|
301 if(type == "single"){
|
|
302 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
303 }
|
|
304 p = NULL
|
|
305 if(nrow(scatterplot_locus_data) != 0){
|
|
306 if(on == "normalized_read_count"){
|
30
|
307 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
34
|
308 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales)
|
29
|
309 } else {
|
30
|
310 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
29
|
311 }
|
|
312 p = p + geom_point(aes(colour=type), position="jitter")
|
|
313 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
314 } else {
|
|
315 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
316 }
|
|
317 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
318 print(p)
|
|
319 dev.off()
|
0
|
320 }
|
|
321 if(sum(both) > 0){
|
15
|
322 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
323 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
324 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
325 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
326 }
|
0
|
327 }
|
2
|
328 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
329 if(sum(is.na(patientResult$percentage)) > 0){
|
|
330 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
331 }
|
|
332 colnames(patientResult)[6] = oneSample
|
|
333 colnames(patientResult)[8] = twoSample
|
|
334 colnamesBak = colnames(patientResult)
|
2
|
335 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
336 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
337 colnames(patientResult) = colnamesBak
|
|
338
|
|
339 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
340 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
341
|
|
342 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
343 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
344 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
345 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
346 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
347 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
348 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
349 print(plt)
|
|
350 dev.off()
|
|
351 #(t,r,b,l)
|
|
352 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
353 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
354 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
355 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
356 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
357 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
358 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
359 print(plt)
|
|
360 dev.off()
|
|
361
|
|
362 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
363 patientResult$relativeValue = patientResult$value * 10
|
|
364 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
365 plt = ggplot(patientResult)
|
|
366 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
367 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
368 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
369 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
370 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
371 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
372 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
373 print(plt)
|
|
374 dev.off()
|
|
375 }
|
|
376
|
3
|
377 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
378
|
0
|
379 interval = intervalFreq
|
|
380 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
381 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
382 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
383
|
3
|
384 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
385
|
0
|
386 interval = intervalReads
|
|
387 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
388 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
389 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
390
|
3
|
391 cat("</table></html>", file=logfile, append=T)
|
|
392
|
33
|
393
|
|
394 if(nrow(single_patients) > 0){
|
|
395 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
396 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
397 p = p + geom_point(aes(colour=type), position="jitter")
|
|
398 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
399 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
400 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
401 print(p)
|
|
402 dev.off()
|
7
|
403
|
33
|
404 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
405 p = p + geom_point(aes(colour=type), position="jitter")
|
|
406 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
407 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
408 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
409 print(p)
|
|
410 dev.off()
|
|
411 } else {
|
|
412 empty <- data.frame()
|
|
413 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
414
|
|
415 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
416 print(p)
|
|
417 dev.off()
|
|
418
|
|
419 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
420 print(p)
|
|
421 dev.off()
|
|
422 }
|
7
|
423 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
424 onShort = "reads"
|
|
425 if(on == "Frequency"){
|
|
426 onShort = "freq"
|
|
427 }
|
18
|
428 onx = paste(on, ".x", sep="")
|
|
429 ony = paste(on, ".y", sep="")
|
|
430 onz = paste(on, ".z", sep="")
|
7
|
431 type="triplet"
|
|
432
|
|
433 threshholdIndex = which(colnames(product) == "interval")
|
|
434 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
435 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
436 titleIndex = which(colnames(product) == "Titles")
|
|
437 sampleIndex = which(colnames(patient1) == "Sample")
|
|
438 patientIndex = which(colnames(patient1) == "Patient")
|
|
439 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
440 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
441 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
442
|
29
|
443 if(mergeOn == "Clone_Sequence"){
|
|
444 patient1$merge = paste(patient1$Clone_Sequence)
|
|
445 patient2$merge = paste(patient2$Clone_Sequence)
|
|
446 patient3$merge = paste(patient3$Clone_Sequence)
|
|
447
|
|
448 } else {
|
|
449 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
450 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
451 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
452 }
|
9
|
453
|
|
454 patientMerge = merge(patient1, patient2, by="merge")
|
|
455 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
456 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
457 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
458 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
459 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
460 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
461 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
462 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
463 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
464
|
28
|
465
|
30
|
466 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
467 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
468 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
469 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
470
|
7
|
471 res1 = vector()
|
|
472 res2 = vector()
|
|
473 res3 = vector()
|
20
|
474 res12 = vector()
|
|
475 res13 = vector()
|
|
476 res23 = vector()
|
7
|
477 resAll = vector()
|
|
478 read1Count = vector()
|
|
479 read2Count = vector()
|
|
480 read3Count = vector()
|
|
481
|
|
482 if(appendTriplets){
|
9
|
483 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
484 }
|
|
485 for(iter in 1:length(product[,1])){
|
|
486 threshhold = product[iter,threshholdIndex]
|
|
487 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
488 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
489 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
490 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
491
|
30
|
492 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
493 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
494 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
495
|
30
|
496 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
497 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
498 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
499
|
18
|
500 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
501 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
502 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
503 res1 = append(res1, sum(one))
|
|
504 res2 = append(res2, sum(two))
|
|
505 res3 = append(res3, sum(three))
|
|
506 resAll = append(resAll, sum(all))
|
20
|
507 res12 = append(res12, sum(one_two))
|
|
508 res13 = append(res13, sum(one_three))
|
|
509 res23 = append(res23, sum(two_three))
|
7
|
510 #threshhold = 0
|
|
511 if(threshhold != 0){
|
|
512 if(sum(one) > 0){
|
20
|
513 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
514 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
515 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
516 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
517 }
|
|
518 if(sum(two) > 0){
|
20
|
519 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
520 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
521 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
522 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
523 }
|
|
524 if(sum(three) > 0){
|
20
|
525 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
526 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
527 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
528 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
529 }
|
20
|
530 if(sum(one_two) > 0){
|
|
531 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
532 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
533 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
534 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
535 }
|
|
536 if(sum(one_three) > 0){
|
|
537 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
538 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
539 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
540 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
541 }
|
|
542 if(sum(two_three) > 0){
|
|
543 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
544 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
545 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
546 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
547 }
|
|
548 } else { #scatterplot data
|
24
|
549 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
550 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
551 if(sum(in_two) > 0){
|
|
552 scatterplot_locus_data[in_two,]$type = "In two"
|
|
553 }
|
30
|
554 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
555 if(sum(in_three)> 0){
|
|
556 scatterplot_locus_data[in_three,]$type = "In three"
|
|
557 }
|
|
558 not_in_one = scatterplot_locus_data$type != "In one"
|
|
559 if(sum(not_in_one) > 0){
|
|
560 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
561 }
|
20
|
562 p = NULL
|
|
563 if(nrow(scatterplot_locus_data) != 0){
|
|
564 if(on == "normalized_read_count"){
|
31
|
565 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
566 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
567 } else {
|
32
|
568 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
569 }
|
|
570 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
571 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
572 } else {
|
|
573 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
574 }
|
|
575 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
576 print(p)
|
|
577 dev.off()
|
|
578 }
|
7
|
579 if(sum(all) > 0){
|
20
|
580 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
581 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
582 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
583 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
584 }
|
|
585 }
|
20
|
586 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
587 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
588 colnames(patientResult)[6] = oneSample
|
20
|
589 colnames(patientResult)[7] = twoSample
|
|
590 colnames(patientResult)[8] = threeSample
|
|
591 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
592 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
593 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
594
|
|
595 colnamesBak = colnames(patientResult)
|
20
|
596 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
597 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
598 colnames(patientResult) = colnamesBak
|
|
599
|
|
600 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
601 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
602
|
|
603 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
604 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
605 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
606 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
607 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
608 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
609 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
610 print(plt)
|
|
611 dev.off()
|
|
612
|
|
613 fontSize = 4
|
|
614
|
|
615 bak = patientResult
|
|
616 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
617 patientResult$relativeValue = patientResult$value * 10
|
|
618 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
619 plt = ggplot(patientResult)
|
|
620 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
621 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
622 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
623 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
624 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
625 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
626 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
627 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
628 print(plt)
|
|
629 dev.off()
|
|
630 }
|
|
631
|
33
|
632 if(nrow(triplets) != 0){
|
|
633 triplets$uniqueID = "ID"
|
|
634
|
|
635 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
636 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
637 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
638
|
|
639 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
640 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
641 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
642
|
|
643 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
644
|
|
645 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
646 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
647 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
648
|
|
649 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
650 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
651
|
|
652 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
653
|
|
654 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
655
|
|
656 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
657
|
|
658 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
659
|
|
660 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
661
|
|
662 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
663
|
|
664 #remove duplicate V+J+CDR3, add together numerical values
|
|
665 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
666 Cell_Count=unique(.SD$Cell_Count),
|
|
667 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
668 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
669 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
670 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
671 Frequency=sum(.SD$Frequency),
|
|
672 normalized_read_count=sum(.SD$normalized_read_count),
|
|
673 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
674 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
675
|
|
676
|
|
677 interval = intervalReads
|
|
678 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
679 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
680
|
|
681 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
682 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
683 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
684 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
685
|
|
686 one = triplets[triplets$Sample == "16278_Left",]
|
|
687 two = triplets[triplets$Sample == "26402_Left",]
|
|
688 three = triplets[triplets$Sample == "26759_Left",]
|
|
689 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
690
|
|
691 one = triplets[triplets$Sample == "16278_Right",]
|
|
692 two = triplets[triplets$Sample == "26402_Right",]
|
|
693 three = triplets[triplets$Sample == "26759_Right",]
|
|
694 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
695
|
|
696
|
|
697 interval = intervalFreq
|
|
698 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
699 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
700
|
|
701 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
702 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
703 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
704 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
705
|
|
706 one = triplets[triplets$Sample == "16278_Left",]
|
|
707 two = triplets[triplets$Sample == "26402_Left",]
|
|
708 three = triplets[triplets$Sample == "26759_Left",]
|
|
709 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
710
|
|
711 one = triplets[triplets$Sample == "16278_Right",]
|
|
712 two = triplets[triplets$Sample == "26402_Right",]
|
|
713 three = triplets[triplets$Sample == "26759_Right",]
|
|
714 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
715 } else {
|
|
716 cat("", file="triplets.txt")
|
|
717 }
|