0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
68
|
2 options(scipen=999)
|
0
|
3
|
|
4 inFile = args[1]
|
|
5 outDir = args[2]
|
3
|
6 logfile = args[3]
|
|
7 min_freq = as.numeric(args[4])
|
|
8 min_cells = as.numeric(args[5])
|
29
|
9 mergeOn = args[6]
|
3
|
10
|
|
11 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
12
|
4
|
13 library(ggplot2)
|
|
14 library(reshape2)
|
|
15 library(data.table)
|
|
16 library(grid)
|
|
17 library(parallel)
|
0
|
18 #require(xtable)
|
3
|
19 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
20 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
66
|
21 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Clone_Sequence")]
|
34
|
22 dat$dsPerM = 0
|
9
|
23 dat = dat[!is.na(dat$Patient),]
|
13
|
24 dat$Related_to_leukemia_clone = F
|
9
|
25
|
0
|
26 setwd(outDir)
|
3
|
27 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
28 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
30
|
3
|
31 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
32
|
13
|
33 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
34
|
3
|
35 dat = dat[dat$Frequency >= min_freq,]
|
|
36
|
13
|
37 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
38
|
|
39 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
40
|
|
41 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
42 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
43 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
44
|
|
45 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
46 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
47
|
|
48 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
49 print(paste("rows:", nrow(dat)))
|
13
|
50 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
51 print(paste("rows:", nrow(dat)))
|
13
|
52 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
53
|
3
|
54 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
55
|
|
56 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
57
|
22
|
58 patients = split(dat, dat$Patient, drop=T)
|
9
|
59 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
60 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
61 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
62 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
63 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
64 Titles = factor(Titles, levels=Titles)
|
|
65 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
66
|
29
|
67 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
68
|
51
|
69 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
70 patient.merge.list.second = list()
|
68
|
71 scatter_locus_data_list = list()
|
56
|
72 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="multiple_matches.html", append=T)
|
|
73 cat(paste("<table border='0' style='font-family:courier;'>", sep=""), file="single_matches.html", append=T)
|
0
|
74 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
75 if (!is.data.frame(x) & is.list(x)){
|
|
76 x = x[[1]]
|
|
77 }
|
34
|
78 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
79 x = data.frame(x,stringsAsFactors=F)
|
0
|
80 onShort = "reads"
|
|
81 if(on == "Frequency"){
|
|
82 onShort = "freq"
|
|
83 }
|
18
|
84 onx = paste(on, ".x", sep="")
|
|
85 ony = paste(on, ".y", sep="")
|
0
|
86 splt = split(x, x$Sample, drop=T)
|
4
|
87 type="pair"
|
2
|
88 if(length(splt) == 1){
|
3
|
89 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
90 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
91 type="single"
|
2
|
92 }
|
0
|
93 patient1 = splt[[1]]
|
|
94 patient2 = splt[[2]]
|
|
95
|
|
96 threshholdIndex = which(colnames(product) == "interval")
|
|
97 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
98 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
99 titleIndex = which(colnames(product) == "Titles")
|
|
100 sampleIndex = which(colnames(x) == "Sample")
|
|
101 patientIndex = which(colnames(x) == "Patient")
|
|
102 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
103 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
104 patient = paste(x[1,patientIndex])
|
3
|
105
|
0
|
106 switched = F
|
|
107 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
108 tmp = twoSample
|
|
109 twoSample = oneSample
|
|
110 oneSample = tmp
|
|
111 tmp = patient1
|
|
112 patient1 = patient2
|
|
113 patient2 = tmp
|
|
114 switched = T
|
|
115 }
|
|
116 if(appendtxt){
|
4
|
117 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
118 }
|
51
|
119 cat(paste("<tr><td>", patient, "</td>", sep=""), file=logfile, append=T)
|
9
|
120
|
29
|
121 if(mergeOn == "Clone_Sequence"){
|
|
122 patient1$merge = paste(patient1$Clone_Sequence)
|
|
123 patient2$merge = paste(patient2$Clone_Sequence)
|
|
124 } else {
|
|
125 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
126 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
127 }
|
9
|
128
|
30
|
129 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
68
|
130 #scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
|
131 scatterplot_data = patient1[NULL,scatterplot_data_columns]
|
|
132 #scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
133 #scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
|
134 scatterplot.data.type.factor = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both"))
|
|
135 #scatterplot_data$type = factor(x=NULL, levels=scatterplot.data.type.factor)
|
|
136 scatterplot_data$type = character(0)
|
|
137 scatterplot_data$link = numeric(0)
|
|
138 scatterplot_data$on = character(0)
|
29
|
139
|
49
|
140 #patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge") #merge alles 'fuzzy'
|
|
141 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")[NULL,] #blegh
|
|
142
|
|
143 cs.exact.matches = patient1[patient1$Clone_Sequence %in% patient2$Clone_Sequence,]$Clone_Sequence
|
|
144
|
51
|
145 start.time = proc.time()
|
|
146 merge.list = c()
|
|
147
|
|
148 if(patient %in% names(patient.merge.list)){
|
|
149 patientMerge = patient.merge.list[[patient]]
|
|
150 merge.list[["second"]] = patient.merge.list.second[[patient]]
|
68
|
151 scatterplot_data = scatter_locus_data_list[[patient]]
|
51
|
152 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
153
|
|
154 print(names(patient.merge.list))
|
|
155 } else {
|
|
156 #fuzzy matching here...
|
49
|
157 #merge.list = patientMerge$merge
|
51
|
158
|
49
|
159 #patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
160 #patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
161
|
|
162 patient1.fuzzy = patient1
|
|
163 patient2.fuzzy = patient2
|
|
164
|
36
|
165 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
166 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
167
|
48
|
168 #patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
169 #patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
51
|
170
|
48
|
171 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
172 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
51
|
173
|
49
|
174 #merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"]))) #also remove?
|
|
175 #merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
51
|
176
|
49
|
177 #patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
178 #patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
51
|
179
|
37
|
180 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
181 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
49
|
182
|
|
183 merge.list = list()
|
|
184
|
|
185 merge.list[["second"]] = vector()
|
68
|
186
|
|
187 link.count = 1
|
|
188
|
37
|
189 while(nrow(patient.fuzzy) > 1){
|
|
190 first.merge = patient.fuzzy[1,"merge"]
|
|
191 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
49
|
192 first.sample = patient.fuzzy[1,"Sample"]
|
37
|
193 merge.filter = first.merge == patient.fuzzy$merge
|
51
|
194
|
49
|
195 #length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
51
|
196
|
49
|
197 first.sample.filter = first.sample == patient.fuzzy$Sample
|
|
198 second.sample.filter = first.sample != patient.fuzzy$Sample
|
|
199
|
|
200 #first match same sample, sum to a single row, same for other sample
|
|
201 #then merge rows like 'normal'
|
51
|
202
|
48
|
203 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
49
|
204
|
|
205
|
|
206
|
44
|
207 #match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
49
|
208 first.match.filter = merge.filter & sequence.filter & first.sample.filter
|
|
209 second.match.filter = merge.filter & sequence.filter & second.sample.filter
|
|
210
|
|
211 first.rows = patient.fuzzy[first.match.filter,]
|
|
212 second.rows = patient.fuzzy[second.match.filter,]
|
|
213
|
50
|
214 first.rows.v = table(first.rows$V_Segment_Major_Gene)
|
|
215 first.rows.v = names(first.rows.v[which.max(first.rows.v)])
|
|
216 first.rows.j = table(first.rows$J_Segment_Major_Gene)
|
|
217 first.rows.j = names(first.rows.j[which.max(first.rows.j)])
|
|
218
|
49
|
219 first.sum = data.frame(merge = first.clone.sequence,
|
|
220 Patient = patient,
|
|
221 Receptor = first.rows[1,"Receptor"],
|
|
222 Sample = first.rows[1,"Sample"],
|
|
223 Cell_Count = first.rows[1,"Cell_Count"],
|
|
224 Clone_Molecule_Count_From_Spikes = sum(first.rows$Clone_Molecule_Count_From_Spikes),
|
|
225 Log10_Frequency = log10(sum(first.rows$Frequency)),
|
|
226 Total_Read_Count = sum(first.rows$Total_Read_Count),
|
|
227 dsPerM = sum(first.rows$dsPerM),
|
50
|
228 J_Segment_Major_Gene = first.rows.j,
|
|
229 V_Segment_Major_Gene = first.rows.v,
|
49
|
230 Clone_Sequence = first.clone.sequence,
|
|
231 CDR3_Sense_Sequence = first.rows[1,"CDR3_Sense_Sequence"],
|
|
232 Related_to_leukemia_clone = F,
|
|
233 Frequency = sum(first.rows$Frequency),
|
|
234 locus_V = first.rows[1,"locus_V"],
|
|
235 locus_J = first.rows[1,"locus_J"],
|
|
236 min_cell_count = first.rows[1,"min_cell_count"],
|
|
237 normalized_read_count = sum(first.rows$normalized_read_count),
|
|
238 paste = first.rows[1,"paste"],
|
|
239 min_cell_paste = first.rows[1,"min_cell_paste"])
|
|
240
|
|
241 if(nrow(second.rows) > 0){
|
50
|
242 second.rows.v = table(second.rows$V_Segment_Major_Gene)
|
|
243 second.rows.v = names(second.rows.v[which.max(second.rows.v)])
|
|
244 second.rows.j = table(second.rows$J_Segment_Major_Gene)
|
|
245 second.rows.j = names(second.rows.j[which.max(second.rows.j)])
|
|
246
|
55
|
247 second.sum = data.frame(merge = first.clone.sequence,
|
49
|
248 Patient = patient,
|
|
249 Receptor = second.rows[1,"Receptor"],
|
|
250 Sample = second.rows[1,"Sample"],
|
|
251 Cell_Count = second.rows[1,"Cell_Count"],
|
|
252 Clone_Molecule_Count_From_Spikes = sum(second.rows$Clone_Molecule_Count_From_Spikes),
|
|
253 Log10_Frequency = log10(sum(second.rows$Frequency)),
|
|
254 Total_Read_Count = sum(second.rows$Total_Read_Count),
|
|
255 dsPerM = sum(second.rows$dsPerM),
|
50
|
256 J_Segment_Major_Gene = second.rows.j,
|
|
257 V_Segment_Major_Gene = second.rows.v,
|
49
|
258 Clone_Sequence = first.clone.sequence,
|
|
259 CDR3_Sense_Sequence = second.rows[1,"CDR3_Sense_Sequence"],
|
|
260 Related_to_leukemia_clone = F,
|
|
261 Frequency = sum(second.rows$Frequency),
|
|
262 locus_V = second.rows[1,"locus_V"],
|
|
263 locus_J = second.rows[1,"locus_J"],
|
|
264 min_cell_count = second.rows[1,"min_cell_count"],
|
|
265 normalized_read_count = sum(second.rows$normalized_read_count),
|
|
266 paste = second.rows[1,"paste"],
|
|
267 min_cell_paste = second.rows[1,"min_cell_paste"])
|
|
268
|
|
269 patientMerge = rbind(patientMerge, merge(first.sum, second.sum, by="merge"))
|
|
270 patient.fuzzy = patient.fuzzy[!(first.match.filter | second.match.filter),]
|
|
271
|
50
|
272 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"], second.rows[second.rows$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
273 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
49
|
274
|
56
|
275 tmp.rows = rbind(first.rows, second.rows)
|
|
276 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
68
|
277
|
|
278
|
|
279 #add to the scatterplot data
|
|
280 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
281 scatterplot.row$type = paste(first.sum[,"Sample"], "In Both")
|
|
282 scatterplot.row$link = link.count
|
|
283 scatterplot.row$on = onShort
|
|
284
|
|
285 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
286
|
|
287 scatterplot.row = second.sum[,scatterplot_data_columns]
|
|
288 scatterplot.row$type = paste(second.sum[,"Sample"], "In Both")
|
|
289 scatterplot.row$link = link.count
|
|
290 scatterplot.row$on = onShort
|
|
291
|
|
292 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
|
293
|
|
294 #write some information about the match to a log file
|
56
|
295 if (nrow(first.rows) > 1 | nrow(second.rows) > 1) {
|
|
296 cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
297 } else {
|
|
298 second.clone.sequence = second.rows[1,"Clone_Sequence"]
|
|
299 if(nchar(first.clone.sequence) != nchar(second.clone.sequence)){
|
|
300 cat(paste("<tr bgcolor='#DDD'><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
|
301 } else {
|
58
|
302 #cat(paste("<tr><td>", patient, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
49
|
303 }
|
|
304 }
|
|
305
|
59
|
306 } else if(nrow(first.rows) > 1) {
|
|
307 if(patient1[1,"Sample"] == first.sample){
|
|
308 patient1 = patient1[!(patient1$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
309 patient1 = rbind(patient1, first.sum)
|
|
310 } else {
|
|
311 patient2 = patient2[!(patient2$Clone_Sequence %in% first.rows$Clone_Sequence),]
|
|
312 patient2 = rbind(patient2, first.sum)
|
|
313 }
|
|
314
|
|
315 hidden.clone.sequences = c(first.rows[-1,"Clone_Sequence"])
|
|
316 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
317
|
|
318 patient.fuzzy = patient.fuzzy[-first.match.filter,]
|
68
|
319
|
|
320 #add to the scatterplot data
|
|
321 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
322 scatterplot.row$type = first.sum[,"Sample"]
|
|
323 scatterplot.row$link = link.count
|
|
324 scatterplot.row$on = onShort
|
|
325
|
|
326 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
59
|
327
|
|
328 cat(paste("<tr bgcolor='#DDF'><td>", patient, " row ", 1:nrow(first.rows), "</td><td>", first.rows$Sample, ":</td><td>", first.rows$Clone_Sequence, "</td><td>", first.rows$normalized_read_count, "</td></tr>", sep=""), file="single_matches.html", append=T)
|
37
|
329 } else {
|
|
330 patient.fuzzy = patient.fuzzy[-1,]
|
68
|
331
|
|
332 #add to the scatterplot data
|
|
333 scatterplot.row = first.sum[,scatterplot_data_columns]
|
|
334 scatterplot.row$type = first.sum[,"Sample"]
|
|
335 scatterplot.row$link = link.count
|
|
336 scatterplot.row$on = onShort
|
|
337
|
|
338 scatterplot_data = rbind(scatterplot_data, scatterplot.row)
|
34
|
339 }
|
68
|
340 link.count = link.count + 1
|
34
|
341 }
|
51
|
342 patient.merge.list[[patient]] <<- patientMerge
|
|
343 patient.merge.list.second[[patient]] <<- merge.list[["second"]]
|
69
|
344
|
|
345 sample.order = data.frame(type = c(oneSample, twoSample, paste(c(oneSample, twoSample), "In Both")),type.order = 1:4)
|
|
346 scatterplot_data = merge(scatterplot_data, sample.order, by="type")
|
|
347
|
68
|
348 scatter_locus_data_list[[patient]] <<- scatterplot_data
|
51
|
349 cat(paste("<td>", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
34
|
350 }
|
51
|
351
|
52
|
352 patient1 = patient1[!(patient1$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
|
353 patient2 = patient2[!(patient2$Clone_Sequence %in% patient.merge.list.second[[patient]]),]
|
51
|
354
|
37
|
355
|
18
|
356 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
68
|
357 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony])
|
0
|
358 res1 = vector()
|
|
359 res2 = vector()
|
|
360 resBoth = vector()
|
|
361 read1Count = vector()
|
|
362 read2Count = vector()
|
2
|
363 locussum1 = vector()
|
|
364 locussum2 = vector()
|
9
|
365
|
0
|
366 #for(iter in 1){
|
|
367 for(iter in 1:length(product[,1])){
|
|
368 threshhold = product[iter,threshholdIndex]
|
|
369 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
370 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
371 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
372 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
373 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
374 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
375 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
376 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
377 res1 = append(res1, sum(one))
|
2
|
378 res2 = append(res2, sum(two))
|
0
|
379 resBoth = append(resBoth, sum(both))
|
2
|
380 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
381 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
382 #threshhold = 0
|
|
383 if(threshhold != 0){
|
|
384 if(sum(one) > 0){
|
15
|
385 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
386 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
387 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
388 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
389 }
|
|
390 if(sum(two) > 0){
|
15
|
391 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
392 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
393 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
394 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
395 }
|
29
|
396 } else {
|
|
397 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
398 if(nrow(scatterplot_locus_data) > 0){
|
|
399 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
400 }
|
68
|
401
|
|
402
|
69
|
403
|
29
|
404 p = NULL
|
68
|
405 print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data)))
|
29
|
406 if(nrow(scatterplot_locus_data) != 0){
|
|
407 if(on == "normalized_read_count"){
|
30
|
408 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
70
|
409 #p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,10^6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
|
410 print(paste("scatterplot_locus_data:"))
|
|
411 print(head(scatterplot_locus_data))
|
|
412 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales, limits=c(0,10^6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
413 } else {
|
69
|
414 p = ggplot(scatterplot_locus_data, aes(factor(reorder(type, type.order)), Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE)
|
29
|
415 }
|
68
|
416 p = p + geom_point(aes(colour=type), position="dodge")
|
29
|
417 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
418 } else {
|
|
419 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
420 }
|
|
421 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
422 print(p)
|
|
423 dev.off()
|
0
|
424 }
|
|
425 if(sum(both) > 0){
|
15
|
426 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
427 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
428 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
429 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
430 }
|
0
|
431 }
|
2
|
432 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
433 if(sum(is.na(patientResult$percentage)) > 0){
|
|
434 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
435 }
|
|
436 colnames(patientResult)[6] = oneSample
|
|
437 colnames(patientResult)[8] = twoSample
|
|
438 colnamesBak = colnames(patientResult)
|
2
|
439 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
440 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
441 colnames(patientResult) = colnamesBak
|
|
442
|
|
443 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
444 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
445
|
|
446 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
447 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
448 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
449 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
450 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
451 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
452 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
453 print(plt)
|
|
454 dev.off()
|
|
455 #(t,r,b,l)
|
|
456 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
457 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
458 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
459 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
460 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
461 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
462 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
463 print(plt)
|
|
464 dev.off()
|
|
465
|
|
466 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
467 patientResult$relativeValue = patientResult$value * 10
|
|
468 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
469 plt = ggplot(patientResult)
|
|
470 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
471 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
472 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
473 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
474 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
475 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
476 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
477 print(plt)
|
|
478 dev.off()
|
|
479 }
|
|
480
|
3
|
481 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
482
|
0
|
483 interval = intervalFreq
|
|
484 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
485 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
486 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
487
|
3
|
488 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
489
|
0
|
490 interval = intervalReads
|
|
491 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
492 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
61
|
493 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
33
|
494
|
|
495 if(nrow(single_patients) > 0){
|
|
496 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
68
|
497 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000))
|
33
|
498 p = p + geom_point(aes(colour=type), position="jitter")
|
|
499 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
500 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
501 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
502 print(p)
|
|
503 dev.off()
|
7
|
504
|
68
|
505 #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
506 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
33
|
507 p = p + geom_point(aes(colour=type), position="jitter")
|
|
508 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
509 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
510 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
511 print(p)
|
|
512 dev.off()
|
|
513 } else {
|
|
514 empty <- data.frame()
|
|
515 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
516
|
|
517 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
518 print(p)
|
|
519 dev.off()
|
|
520
|
|
521 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
522 print(p)
|
|
523 dev.off()
|
|
524 }
|
60
|
525
|
|
526 patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory...
|
|
527 patient.merge.list.second = list()
|
|
528
|
7
|
529 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
530 onShort = "reads"
|
|
531 if(on == "Frequency"){
|
|
532 onShort = "freq"
|
|
533 }
|
18
|
534 onx = paste(on, ".x", sep="")
|
|
535 ony = paste(on, ".y", sep="")
|
|
536 onz = paste(on, ".z", sep="")
|
7
|
537 type="triplet"
|
|
538
|
|
539 threshholdIndex = which(colnames(product) == "interval")
|
|
540 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
541 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
542 titleIndex = which(colnames(product) == "Titles")
|
|
543 sampleIndex = which(colnames(patient1) == "Sample")
|
|
544 patientIndex = which(colnames(patient1) == "Patient")
|
|
545 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
546 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
547 threeSample = paste(patient3[1,sampleIndex], sep="")
|
60
|
548
|
29
|
549 if(mergeOn == "Clone_Sequence"){
|
|
550 patient1$merge = paste(patient1$Clone_Sequence)
|
|
551 patient2$merge = paste(patient2$Clone_Sequence)
|
|
552 patient3$merge = paste(patient3$Clone_Sequence)
|
|
553
|
|
554 } else {
|
|
555 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
556 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
557 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
558 }
|
60
|
559
|
|
560 #patientMerge = merge(patient1, patient2, by="merge")[NULL,]
|
|
561 patient1.fuzzy = patient1
|
|
562 patient2.fuzzy = patient2
|
|
563 patient3.fuzzy = patient3
|
|
564
|
|
565 cat(paste("<tr><td>", label1, "</td>", sep=""), file=logfile, append=T)
|
|
566
|
|
567 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J)
|
|
568 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J)
|
|
569 patient3.fuzzy$merge = paste(patient3.fuzzy$locus_V, patient3.fuzzy$locus_J)
|
|
570
|
|
571 patient.fuzzy = rbind(patient1.fuzzy ,patient2.fuzzy, patient3.fuzzy)
|
65
|
572 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
60
|
573
|
|
574 other.sample.list = list()
|
|
575 other.sample.list[[oneSample]] = c(twoSample, threeSample)
|
|
576 other.sample.list[[twoSample]] = c(oneSample, threeSample)
|
|
577 other.sample.list[[threeSample]] = c(oneSample, twoSample)
|
|
578
|
9
|
579 patientMerge = merge(patient1, patient2, by="merge")
|
|
580 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
581 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
60
|
582 #patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
583 patientMerge = patientMerge[NULL,]
|
|
584
|
|
585 duo.merge.list = list()
|
|
586
|
20
|
587 patientMerge12 = merge(patient1, patient2, by="merge")
|
60
|
588 #patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
589 patientMerge12 = patientMerge12[NULL,]
|
|
590 duo.merge.list[[paste(oneSample, twoSample)]] = patientMerge12
|
|
591 duo.merge.list[[paste(twoSample, oneSample)]] = patientMerge12
|
|
592
|
20
|
593 patientMerge13 = merge(patient1, patient3, by="merge")
|
60
|
594 #patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
595 patientMerge13 = patientMerge13[NULL,]
|
|
596 duo.merge.list[[paste(oneSample, threeSample)]] = patientMerge13
|
|
597 duo.merge.list[[paste(threeSample, oneSample)]] = patientMerge13
|
|
598
|
20
|
599 patientMerge23 = merge(patient2, patient3, by="merge")
|
60
|
600 #patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
601 patientMerge23 = patientMerge23[NULL,]
|
|
602 duo.merge.list[[paste(twoSample, threeSample)]] = patientMerge23
|
|
603 duo.merge.list[[paste(threeSample, twoSample)]] = patientMerge23
|
|
604
|
|
605 merge.list = list()
|
|
606 merge.list[["second"]] = vector()
|
|
607
|
|
608 start.time = proc.time()
|
|
609 if(paste(label1, "123") %in% names(patient.merge.list)){
|
|
610 patientMerge = patient.merge.list[[paste(label1, "123")]]
|
|
611 patientMerge12 = patient.merge.list[[paste(label1, "12")]]
|
|
612 patientMerge13 = patient.merge.list[[paste(label1, "13")]]
|
|
613 patientMerge23 = patient.merge.list[[paste(label1, "23")]]
|
|
614
|
|
615 merge.list[["second"]] = patient.merge.list.second[[label1]]
|
|
616
|
|
617 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (fetched from cache)</td></tr>", sep=""), file=logfile, append=T)
|
|
618 } else {
|
|
619 while(nrow(patient.fuzzy) > 0){
|
|
620 first.merge = patient.fuzzy[1,"merge"]
|
|
621 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
622 first.sample = patient.fuzzy[1,"Sample"]
|
|
623
|
|
624 merge.filter = first.merge == patient.fuzzy$merge
|
|
625
|
|
626 second.sample = other.sample.list[[first.sample]][1]
|
|
627 third.sample = other.sample.list[[first.sample]][2]
|
|
628
|
|
629 sample.filter.1 = first.sample == patient.fuzzy$Sample
|
|
630 sample.filter.2 = second.sample == patient.fuzzy$Sample
|
|
631 sample.filter.3 = third.sample == patient.fuzzy$Sample
|
|
632
|
|
633 sequence.filter = grepl(paste("^", first.clone.sequence, sep=""), patient.fuzzy$Clone_Sequence)
|
|
634
|
|
635 match.filter.1 = sample.filter.1 & sequence.filter & merge.filter
|
|
636 match.filter.2 = sample.filter.2 & sequence.filter & merge.filter
|
|
637 match.filter.3 = sample.filter.3 & sequence.filter & merge.filter
|
|
638
|
|
639 matches.in.1 = any(match.filter.1)
|
|
640 matches.in.2 = any(match.filter.2)
|
|
641 matches.in.3 = any(match.filter.3)
|
|
642
|
|
643
|
|
644
|
|
645 rows.1 = patient.fuzzy[match.filter.1,]
|
|
646
|
|
647 sum.1 = data.frame(merge = first.clone.sequence,
|
|
648 Patient = label1,
|
|
649 Receptor = rows.1[1,"Receptor"],
|
|
650 Sample = rows.1[1,"Sample"],
|
|
651 Cell_Count = rows.1[1,"Cell_Count"],
|
|
652 Clone_Molecule_Count_From_Spikes = sum(rows.1$Clone_Molecule_Count_From_Spikes),
|
|
653 Log10_Frequency = log10(sum(rows.1$Frequency)),
|
|
654 Total_Read_Count = sum(rows.1$Total_Read_Count),
|
|
655 dsPerM = sum(rows.1$dsPerM),
|
|
656 J_Segment_Major_Gene = rows.1[1,"J_Segment_Major_Gene"],
|
|
657 V_Segment_Major_Gene = rows.1[1,"V_Segment_Major_Gene"],
|
|
658 Clone_Sequence = first.clone.sequence,
|
|
659 CDR3_Sense_Sequence = rows.1[1,"CDR3_Sense_Sequence"],
|
|
660 Related_to_leukemia_clone = F,
|
|
661 Frequency = sum(rows.1$Frequency),
|
|
662 locus_V = rows.1[1,"locus_V"],
|
|
663 locus_J = rows.1[1,"locus_J"],
|
64
|
664 uniqueID = rows.1[1,"uniqueID"],
|
60
|
665 normalized_read_count = sum(rows.1$normalized_read_count))
|
|
666 sum.2 = sum.1[NULL,]
|
|
667 rows.2 = patient.fuzzy[match.filter.2,]
|
|
668 if(matches.in.2){
|
|
669 sum.2 = data.frame(merge = first.clone.sequence,
|
|
670 Patient = label1,
|
|
671 Receptor = rows.2[1,"Receptor"],
|
|
672 Sample = rows.2[1,"Sample"],
|
|
673 Cell_Count = rows.2[1,"Cell_Count"],
|
|
674 Clone_Molecule_Count_From_Spikes = sum(rows.2$Clone_Molecule_Count_From_Spikes),
|
|
675 Log10_Frequency = log10(sum(rows.2$Frequency)),
|
|
676 Total_Read_Count = sum(rows.2$Total_Read_Count),
|
|
677 dsPerM = sum(rows.2$dsPerM),
|
|
678 J_Segment_Major_Gene = rows.2[1,"J_Segment_Major_Gene"],
|
|
679 V_Segment_Major_Gene = rows.2[1,"V_Segment_Major_Gene"],
|
|
680 Clone_Sequence = first.clone.sequence,
|
|
681 CDR3_Sense_Sequence = rows.2[1,"CDR3_Sense_Sequence"],
|
|
682 Related_to_leukemia_clone = F,
|
|
683 Frequency = sum(rows.2$Frequency),
|
|
684 locus_V = rows.2[1,"locus_V"],
|
|
685 locus_J = rows.2[1,"locus_J"],
|
64
|
686 uniqueID = rows.2[1,"uniqueID"],
|
60
|
687 normalized_read_count = sum(rows.2$normalized_read_count))
|
|
688 }
|
|
689
|
|
690 sum.3 = sum.1[NULL,]
|
|
691 rows.3 = patient.fuzzy[match.filter.3,]
|
|
692 if(matches.in.3){
|
|
693 sum.3 = data.frame(merge = first.clone.sequence,
|
|
694 Patient = label1,
|
|
695 Receptor = rows.3[1,"Receptor"],
|
|
696 Sample = rows.3[1,"Sample"],
|
|
697 Cell_Count = rows.3[1,"Cell_Count"],
|
|
698 Clone_Molecule_Count_From_Spikes = sum(rows.3$Clone_Molecule_Count_From_Spikes),
|
|
699 Log10_Frequency = log10(sum(rows.3$Frequency)),
|
|
700 Total_Read_Count = sum(rows.3$Total_Read_Count),
|
|
701 dsPerM = sum(rows.3$dsPerM),
|
|
702 J_Segment_Major_Gene = rows.3[1,"J_Segment_Major_Gene"],
|
|
703 V_Segment_Major_Gene = rows.3[1,"V_Segment_Major_Gene"],
|
|
704 Clone_Sequence = first.clone.sequence,
|
|
705 CDR3_Sense_Sequence = rows.3[1,"CDR3_Sense_Sequence"],
|
|
706 Related_to_leukemia_clone = F,
|
|
707 Frequency = sum(rows.3$Frequency),
|
|
708 locus_V = rows.3[1,"locus_V"],
|
|
709 locus_J = rows.3[1,"locus_J"],
|
64
|
710 uniqueID = rows.3[1,"uniqueID"],
|
60
|
711 normalized_read_count = sum(rows.3$normalized_read_count))
|
|
712 }
|
|
713
|
|
714 if(matches.in.2 & matches.in.3){
|
|
715 merge.123 = merge(sum.1, sum.2, by="merge")
|
|
716 merge.123 = merge(merge.123, sum.3, by="merge")
|
|
717 colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123)))] = paste(colnames(merge.123)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(merge.123), perl=T))], ".z", sep="")
|
|
718 #merge.123$thresholdValue = pmax(merge.123[,onx], merge.123[,ony], merge.123[,onz])
|
|
719
|
|
720 patientMerge = rbind(patientMerge, merge.123)
|
|
721 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2 | match.filter.3),]
|
|
722
|
|
723 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
724 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
725
|
|
726 } else if (matches.in.2) {
|
|
727 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
728 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
729
|
|
730 second.sample = sum.2[,"Sample"]
|
|
731
|
|
732 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
733
|
|
734 merge.12 = merge(sum.1, sum.2, by="merge")
|
|
735
|
|
736 current.merge.list = rbind(current.merge.list, merge.12)
|
|
737 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
738
|
|
739 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.2),]
|
|
740
|
|
741 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.2[rows.2$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
742 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
743
|
|
744 } else if (matches.in.3) {
|
|
745
|
|
746 #other.sample1 = other.sample.list[[first.sample]][1]
|
|
747 #other.sample2 = other.sample.list[[first.sample]][2]
|
|
748
|
|
749 second.sample = sum.3[,"Sample"]
|
|
750
|
|
751 current.merge.list = duo.merge.list[[paste(first.sample, second.sample)]]
|
|
752
|
|
753 merge.13 = merge(sum.1, sum.3, by="merge")
|
|
754
|
|
755 current.merge.list = rbind(current.merge.list, merge.13)
|
|
756 duo.merge.list[[paste(first.sample, second.sample)]] = current.merge.list
|
|
757
|
|
758 patient.fuzzy = patient.fuzzy[!(match.filter.1 | match.filter.3),]
|
|
759
|
|
760 hidden.clone.sequences = c(rows.1[-1,"Clone_Sequence"], rows.3[rows.3$Clone_Sequence != first.clone.sequence,"Clone_Sequence"])
|
|
761 merge.list[["second"]] = append(merge.list[["second"]], hidden.clone.sequences)
|
|
762
|
63
|
763 } else if(nrow(rows.1) > 1){
|
|
764 patient1 = patient1[!(patient1$Clone_Sequence %in% rows.1$Clone_Sequence),]
|
64
|
765 print(names(patient1)[names(patient1) %in% sum.1])
|
|
766 print(names(patient1)[!(names(patient1) %in% sum.1)])
|
|
767 print(names(patient1))
|
|
768 print(names(sum.1))
|
|
769 print(summary(sum.1))
|
|
770 print(summary(patient1))
|
|
771 print(dim(sum.1))
|
|
772 print(dim(patient1))
|
|
773 print(head(sum.1[,names(patient1)]))
|
|
774 patient1 = rbind(patient1, sum.1[,names(patient1)])
|
63
|
775 patient.fuzzy = patient.fuzzy[-match.filter.1,]
|
60
|
776 } else {
|
|
777 patient.fuzzy = patient.fuzzy[-1,]
|
|
778 }
|
|
779
|
|
780 tmp.rows = rbind(rows.1, rows.2, rows.3)
|
|
781 tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),]
|
|
782
|
|
783 if (sum(match.filter.1) > 1 | sum(match.filter.2) > 1 | sum(match.filter.1) > 1) {
|
|
784 cat(paste("<tr><td>", label1, " row ", 1:nrow(tmp.rows), "</td><td>", tmp.rows$Sample, ":</td><td>", tmp.rows$Clone_Sequence, "</td><td>", tmp.rows$normalized_read_count, "</td></tr>", sep=""), file="multiple_matches.html", append=T)
|
|
785 } else {
|
|
786 }
|
|
787
|
|
788 }
|
|
789 patient.merge.list[[paste(label1, "123")]] = patientMerge
|
|
790
|
|
791 patientMerge12 = duo.merge.list[[paste(oneSample, twoSample)]]
|
|
792 patientMerge13 = duo.merge.list[[paste(oneSample, threeSample)]]
|
|
793 patientMerge23 = duo.merge.list[[paste(twoSample, threeSample)]]
|
|
794
|
|
795 patient.merge.list[[paste(label1, "12")]] = patientMerge12
|
|
796 patient.merge.list[[paste(label1, "13")]] = patientMerge13
|
|
797 patient.merge.list[[paste(label1, "23")]] = patientMerge23
|
|
798
|
|
799 patient.merge.list.second[[label1]] = merge.list[["second"]]
|
|
800 }
|
|
801 cat(paste("<td>", nrow(patient1), " in ", label1, " and ", nrow(patient2), " in ", label2, nrow(patient3), " in ", label3, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)</td></tr>", sep=""), file=logfile, append=T)
|
|
802 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
803 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
804 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
20
|
805 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
60
|
806
|
68
|
807 #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
808 #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony])
|
|
809 #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony])
|
|
810 #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony])
|
|
811
|
63
|
812 patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),]
|
|
813 patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),]
|
|
814 patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),]
|
62
|
815
|
60
|
816 if(F){
|
|
817 patientMerge = merge(patient1, patient2, by="merge")
|
|
818 patientMerge = merge(patientMerge, patient3, by="merge")
|
|
819 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
|
820 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
|
821 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
822 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
823 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
824 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
825 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
826 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
827 }
|
28
|
828
|
30
|
829 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
830 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
831 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
832 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
833
|
7
|
834 res1 = vector()
|
|
835 res2 = vector()
|
|
836 res3 = vector()
|
20
|
837 res12 = vector()
|
|
838 res13 = vector()
|
|
839 res23 = vector()
|
7
|
840 resAll = vector()
|
|
841 read1Count = vector()
|
|
842 read2Count = vector()
|
|
843 read3Count = vector()
|
|
844
|
|
845 if(appendTriplets){
|
9
|
846 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
847 }
|
|
848 for(iter in 1:length(product[,1])){
|
|
849 threshhold = product[iter,threshholdIndex]
|
|
850 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
851 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
852 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
853 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
854
|
30
|
855 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
856 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
857 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
858
|
30
|
859 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
860 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
861 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
862
|
18
|
863 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
864 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
865 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
866 res1 = append(res1, sum(one))
|
|
867 res2 = append(res2, sum(two))
|
|
868 res3 = append(res3, sum(three))
|
|
869 resAll = append(resAll, sum(all))
|
20
|
870 res12 = append(res12, sum(one_two))
|
|
871 res13 = append(res13, sum(one_three))
|
|
872 res23 = append(res23, sum(two_three))
|
7
|
873 #threshhold = 0
|
|
874 if(threshhold != 0){
|
|
875 if(sum(one) > 0){
|
20
|
876 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
877 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
878 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
879 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
880 }
|
|
881 if(sum(two) > 0){
|
20
|
882 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
883 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
884 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
885 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
886 }
|
|
887 if(sum(three) > 0){
|
20
|
888 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
889 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
890 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
891 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
892 }
|
20
|
893 if(sum(one_two) > 0){
|
|
894 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
895 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
896 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
897 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
898 }
|
|
899 if(sum(one_three) > 0){
|
|
900 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
901 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
902 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
903 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
904 }
|
|
905 if(sum(two_three) > 0){
|
|
906 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
907 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
908 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
909 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
910 }
|
|
911 } else { #scatterplot data
|
24
|
912 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
60
|
913 scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),]
|
30
|
914 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
915 if(sum(in_two) > 0){
|
|
916 scatterplot_locus_data[in_two,]$type = "In two"
|
|
917 }
|
30
|
918 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
919 if(sum(in_three)> 0){
|
|
920 scatterplot_locus_data[in_three,]$type = "In three"
|
|
921 }
|
|
922 not_in_one = scatterplot_locus_data$type != "In one"
|
|
923 if(sum(not_in_one) > 0){
|
70
|
924 #scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
27
|
925 }
|
20
|
926 p = NULL
|
|
927 if(nrow(scatterplot_locus_data) != 0){
|
|
928 if(on == "normalized_read_count"){
|
70
|
929 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
930 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
931 } else {
|
68
|
932 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100))
|
|
933 #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
934 }
|
|
935 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
936 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
937 } else {
|
|
938 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
939 }
|
|
940 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
941 print(p)
|
|
942 dev.off()
|
|
943 }
|
7
|
944 if(sum(all) > 0){
|
20
|
945 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
946 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
947 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
948 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
949 }
|
|
950 }
|
20
|
951 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
952 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
953 colnames(patientResult)[6] = oneSample
|
20
|
954 colnames(patientResult)[7] = twoSample
|
|
955 colnames(patientResult)[8] = threeSample
|
|
956 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
957 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
958 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
959
|
|
960 colnamesBak = colnames(patientResult)
|
20
|
961 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
962 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
963 colnames(patientResult) = colnamesBak
|
|
964
|
|
965 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
966 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
967
|
|
968 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
969 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
970 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
971 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
972 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
973 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
974 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
975 print(plt)
|
|
976 dev.off()
|
|
977
|
|
978 fontSize = 4
|
|
979
|
|
980 bak = patientResult
|
|
981 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
982 patientResult$relativeValue = patientResult$value * 10
|
|
983 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
984 plt = ggplot(patientResult)
|
|
985 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
986 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
987 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
988 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
989 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
990 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
991 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
992 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
993 print(plt)
|
|
994 dev.off()
|
|
995 }
|
|
996
|
70
|
997 if(nrow(triplets) != 0){
|
60
|
998
|
|
999 cat("<tr><td>Starting triplet analysis</td></tr>", file=logfile, append=T)
|
|
1000
|
33
|
1001 triplets$uniqueID = "ID"
|
|
1002
|
|
1003 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1004 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1005 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
1006
|
|
1007 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1008 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1009 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
1010
|
|
1011 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
60
|
1012
|
|
1013 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
1014
|
33
|
1015 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
1016 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
1017 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
1018
|
|
1019 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
1020 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
1021
|
|
1022 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
1023
|
|
1024 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
1025
|
|
1026 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
1027
|
|
1028 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
1029
|
60
|
1030 column_drops = c("min_cell_count", "min_cell_paste")
|
33
|
1031
|
|
1032 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
60
|
1033
|
|
1034 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
1035
|
33
|
1036 interval = intervalReads
|
|
1037 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1038 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1039
|
|
1040 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1041 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1042 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1043 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1044
|
|
1045 one = triplets[triplets$Sample == "16278_Left",]
|
|
1046 two = triplets[triplets$Sample == "26402_Left",]
|
|
1047 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1048 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1049
|
|
1050 one = triplets[triplets$Sample == "16278_Right",]
|
|
1051 two = triplets[triplets$Sample == "26402_Right",]
|
|
1052 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1053 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="normalized_read_count", T)
|
33
|
1054
|
60
|
1055 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
1056
|
33
|
1057 interval = intervalFreq
|
|
1058 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
1059 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
1060
|
|
1061 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
1062 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
1063 three = triplets[triplets$Sample == "24062_reg_BM",]
|
61
|
1064 tripletAnalysis(one, "14696_1_Trio", two, "14696_2_Trio", three, "14696_3_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1065
|
|
1066 one = triplets[triplets$Sample == "16278_Left",]
|
|
1067 two = triplets[triplets$Sample == "26402_Left",]
|
|
1068 three = triplets[triplets$Sample == "26759_Left",]
|
61
|
1069 tripletAnalysis(one, "16278_Left_Trio", two, "26402_Left_Trio", three, "26759_Left_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1070
|
|
1071 one = triplets[triplets$Sample == "16278_Right",]
|
|
1072 two = triplets[triplets$Sample == "26402_Right",]
|
|
1073 three = triplets[triplets$Sample == "26759_Right",]
|
53
|
1074 tripletAnalysis(one, "16278_Right_Trio", two, "26402_Right_Trio", three, "26759_Right_Trio", product=product, interval=interval, on="Frequency", F)
|
33
|
1075 } else {
|
|
1076 cat("", file="triplets.txt")
|
|
1077 }
|
68
|
1078 cat("</table></html>", file=logfile, append=T)
|