0
|
1 args <- commandArgs(trailingOnly = TRUE)
|
|
2
|
|
3 inFile = args[1]
|
|
4 outDir = args[2]
|
3
|
5 logfile = args[3]
|
|
6 min_freq = as.numeric(args[4])
|
|
7 min_cells = as.numeric(args[5])
|
29
|
8 mergeOn = args[6]
|
3
|
9
|
|
10 cat("<html><table><tr><td>Starting analysis</td></tr>", file=logfile, append=F)
|
0
|
11
|
4
|
12 library(ggplot2)
|
|
13 library(reshape2)
|
|
14 library(data.table)
|
|
15 library(grid)
|
|
16 library(parallel)
|
0
|
17 #require(xtable)
|
3
|
18 cat("<tr><td>Reading input</td></tr>", file=logfile, append=T)
|
13
|
19 dat = read.table(inFile, header=T, sep="\t", dec=".", fill=T, stringsAsFactors=F)
|
34
|
20 dat = dat[,c("Patient", "Receptor", "Sample", "Cell_Count", "Clone_Molecule_Count_From_Spikes", "Log10_Frequency", "Total_Read_Count", "J_Segment_Major_Gene", "V_Segment_Major_Gene", "CDR3_Sense_Sequence", "Related_to_leukemia_clone", "Clone_Sequence")]
|
|
21 dat$dsPerM = 0
|
9
|
22 dat = dat[!is.na(dat$Patient),]
|
13
|
23 dat$Related_to_leukemia_clone = F
|
9
|
24
|
0
|
25 setwd(outDir)
|
3
|
26 cat("<tr><td>Selecting first V/J Genes</td></tr>", file=logfile, append=T)
|
2
|
27 dat$V_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$V_Segment_Major_Gene), "; "), "[[", 1)))
|
|
28 dat$J_Segment_Major_Gene = as.factor(as.character(lapply(strsplit(as.character(dat$J_Segment_Major_Gene), "; "), "[[", 1)))
|
|
29
|
3
|
30 cat("<tr><td>Calculating Frequency</td></tr>", file=logfile, append=T)
|
12
|
31
|
13
|
32 dat$Frequency = ((10^dat$Log10_Frequency)*100)
|
2
|
33
|
3
|
34 dat = dat[dat$Frequency >= min_freq,]
|
|
35
|
13
|
36 triplets = dat[grepl("VanDongen_cALL_14696", dat$Patient) | grepl("(16278)|(26402)|(26759)", dat$Sample),]
|
|
37
|
|
38 cat("<tr><td>Normalizing to lowest cell count within locus</td></tr>", file=logfile, append=T)
|
|
39
|
|
40 dat$locus_V = substring(dat$V_Segment_Major_Gene, 0, 4)
|
|
41 dat$locus_J = substring(dat$J_Segment_Major_Gene, 0, 4)
|
|
42 min_cell_count = data.frame(data.table(dat)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("Patient", "locus_V", "locus_J")])
|
|
43
|
|
44 dat$min_cell_paste = paste(dat$Patient, dat$locus_V, dat$locus_J)
|
|
45 min_cell_count$min_cell_paste = paste(min_cell_count$Patient, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
46
|
|
47 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
34
|
48 print(paste("rows:", nrow(dat)))
|
13
|
49 dat = merge(dat, min_cell_count, by="min_cell_paste")
|
34
|
50 print(paste("rows:", nrow(dat)))
|
13
|
51 dat$normalized_read_count = round(dat$Clone_Molecule_Count_From_Spikes / dat$Cell_Count * dat$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
52
|
3
|
53 dat = dat[dat$normalized_read_count >= min_cells,]
|
13
|
54
|
|
55 dat$paste = paste(dat$Sample, dat$Clone_Sequence)
|
9
|
56
|
29
|
57 #remove duplicate V+J+CDR3, add together numerical values
|
34
|
58 if(mergeOn != "Clone_Sequence"){
|
|
59 cat("<tr><td>Adding duplicate V+J+CDR3 sequences</td></tr>", file=logfile, append=T)
|
|
60 dat= data.frame(data.table(dat)[, list(Receptor=unique(.SD$Receptor),
|
|
61 Cell_Count=unique(.SD$Cell_Count),
|
|
62 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
63 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
64 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
65 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
66 Frequency=sum(.SD$Frequency),
|
|
67 locus_V=unique(.SD$locus_V),
|
|
68 locus_J=unique(.SD$locus_J),
|
|
69 min_cell_count=unique(.SD$min_cell_count),
|
|
70 normalized_read_count=sum(.SD$normalized_read_count),
|
|
71 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
72 Clone_Sequence=.SD$Clone_Sequence[1],
|
|
73 min_cell_paste=.SD$min_cell_paste[1],
|
|
74 paste=unique(.SD$paste)), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
75 }
|
29
|
76
|
22
|
77 patients = split(dat, dat$Patient, drop=T)
|
9
|
78 intervalReads = rev(c(0,10,25,50,100,250,500,750,1000,10000))
|
6
|
79 intervalFreq = rev(c(0,0.01,0.05,0.1,0.5,1,5))
|
0
|
80 V_Segments = c(".*", "IGHV", "IGHD", "IGKV", "IGKV", "IgKINTR", "TRGV", "TRDV", "TRDD" , "TRBV")
|
|
81 J_Segments = c(".*", ".*", ".*", "IGKJ", "KDE", ".*", ".*", ".*", ".*", ".*")
|
|
82 Titles = c("Total", "IGH-Vh-Jh", "IGH-Dh-Jh", "Vk-Jk", "Vk-Kde" , "Intron-Kde", "TCRG", "TCRD-Vd-Dd", "TCRD-Dd-Dd", "TCRB-Vb-Jb")
|
|
83 Titles = factor(Titles, levels=Titles)
|
|
84 TitlesOrder = data.frame("Title"=Titles, "TitlesOrder"=1:length(Titles))
|
|
85
|
29
|
86 single_patients = data.frame("Patient" = character(0),"Sample" = character(0), "on" = character(0), "Clone_Sequence" = character(0), "Frequency" = numeric(0), "normalized_read_count" = numeric(0), "V_Segment_Major_Gene" = character(0), "J_Segment_Major_Gene" = character(0), "Rearrangement" = character(0))
|
|
87
|
0
|
88 patientCountOnColumn <- function(x, product, interval, on, appendtxt=F){
|
28
|
89 if (!is.data.frame(x) & is.list(x)){
|
|
90 x = x[[1]]
|
|
91 }
|
34
|
92 #x$Sample = factor(x$Sample, levels=unique(x$Sample))
|
|
93 x = data.frame(x,stringsAsFactors=F)
|
0
|
94 onShort = "reads"
|
|
95 if(on == "Frequency"){
|
|
96 onShort = "freq"
|
|
97 }
|
18
|
98 onx = paste(on, ".x", sep="")
|
|
99 ony = paste(on, ".y", sep="")
|
0
|
100 splt = split(x, x$Sample, drop=T)
|
4
|
101 type="pair"
|
2
|
102 if(length(splt) == 1){
|
3
|
103 print(paste(paste(x[1,which(colnames(x) == "Patient")]), "has one sample"))
|
4
|
104 splt[[2]] = data.frame("Patient" = character(0), "Receptor" = character(0), "Sample" = character(0), "Cell_Count" = numeric(0), "Clone_Molecule_Count_From_Spikes" = numeric(0), "Log10_Frequency" = numeric(0), "Total_Read_Count" = numeric(0), "dsMol_per_1e6_cells" = numeric(0), "J_Segment_Major_Gene" = character(0), "V_Segment_Major_Gene" = character(0), "Clone_Sequence" = character(0), "CDR3_Sense_Sequence" = character(0), "Related_to_leukemia_clone" = logical(0), "Frequency"= numeric(0), "normalized_read_count" = numeric(0), "paste" = character(0))
|
|
105 type="single"
|
2
|
106 }
|
0
|
107 patient1 = splt[[1]]
|
|
108 patient2 = splt[[2]]
|
|
109
|
|
110 threshholdIndex = which(colnames(product) == "interval")
|
|
111 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
112 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
113 titleIndex = which(colnames(product) == "Titles")
|
|
114 sampleIndex = which(colnames(x) == "Sample")
|
|
115 patientIndex = which(colnames(x) == "Patient")
|
|
116 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
117 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
118 patient = paste(x[1,patientIndex])
|
3
|
119
|
0
|
120 switched = F
|
|
121 if(length(grep(".*_Right$", twoSample)) == 1 || length(grep(".*_Dx_BM$", twoSample)) == 1 || length(grep(".*_Dx$", twoSample)) == 1 ){
|
|
122 tmp = twoSample
|
|
123 twoSample = oneSample
|
|
124 oneSample = tmp
|
|
125 tmp = patient1
|
|
126 patient1 = patient2
|
|
127 patient2 = tmp
|
|
128 switched = T
|
|
129 }
|
|
130 if(appendtxt){
|
4
|
131 cat(paste(patient, oneSample, twoSample, type, sep="\t"), file="patients.txt", append=T, sep="", fill=3)
|
0
|
132 }
|
3
|
133 cat(paste("<tr><td>", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
9
|
134
|
29
|
135 if(mergeOn == "Clone_Sequence"){
|
|
136 patient1$merge = paste(patient1$Clone_Sequence)
|
|
137 patient2$merge = paste(patient2$Clone_Sequence)
|
|
138 } else {
|
|
139 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
140 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
141 }
|
9
|
142
|
30
|
143 scatterplot_data_columns = c("Patient", "Sample", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
29
|
144 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns])
|
30
|
145 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
|
146 scatterplot_data$type = factor(x=oneSample, levels=c(oneSample, twoSample, "In Both"))
|
29
|
147 scatterplot_data$on = onShort
|
|
148
|
9
|
149 patientMerge = merge(patient1, patient2, by.x="merge", by.y="merge")
|
34
|
150
|
|
151
|
|
152 #fuzzy matching here...
|
|
153 if(mergeOn == "Clone_Sequence"){
|
|
154 merge.list = patientMerge$merge
|
|
155
|
|
156 patient1.fuzzy = patient1[!(patient1$merge %in% merge.list),]
|
|
157 patient2.fuzzy = patient2[!(patient2$merge %in% merge.list),]
|
|
158
|
36
|
159 #patient1.fuzzy$merge = paste(patient1.fuzzy$V_Segment_Major_Gene, patient1.fuzzy$J_Segment_Major_Gene, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
160 #patient2.fuzzy$merge = paste(patient2.fuzzy$V_Segment_Major_Gene, patient2.fuzzy$J_Segment_Major_Gene, patient2.fuzzy$CDR3_Sense_Sequence)
|
|
161
|
|
162 patient1.fuzzy$merge = paste(patient1.fuzzy$locus_V, patient1.fuzzy$locus_J, patient1.fuzzy$CDR3_Sense_Sequence)
|
|
163 patient2.fuzzy$merge = paste(patient2.fuzzy$locus_V, patient2.fuzzy$locus_J, patient2.fuzzy$CDR3_Sense_Sequence)
|
34
|
164
|
37
|
165 merge.freq.table = data.frame(table(c(patient1.fuzzy[!duplicated(patient1.fuzzy$merge),"merge"], patient2.fuzzy[!duplicated(patient2.fuzzy$merge),"merge"])))
|
34
|
166 merge.freq.table.gt.1 = merge.freq.table[merge.freq.table$Freq > 1,]
|
|
167
|
|
168 patient1.fuzzy = patient1.fuzzy[patient1.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
169 patient2.fuzzy = patient2.fuzzy[patient2.fuzzy$merge %in% merge.freq.table.gt.1$Var1,]
|
|
170
|
37
|
171 patient.fuzzy = rbind(patient1.fuzzy, patient2.fuzzy)
|
|
172 patient.fuzzy = patient.fuzzy[order(nchar(patient.fuzzy$Clone_Sequence)),]
|
|
173
|
|
174 while(nrow(patient.fuzzy) > 1){
|
|
175 first.merge = patient.fuzzy[1,"merge"]
|
|
176 first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"]
|
|
177
|
|
178 merge.filter = first.merge == patient.fuzzy$merge
|
|
179
|
42
|
180 length.filter = nchar(patient.fuzzy$Clone_Sequence) - nchar(first.clone.sequence) <= 9
|
|
181
|
43
|
182 sample.filter = patient.fuzzy[1,"Sample"] != patient.fuzzy$Sample
|
|
183
|
|
184 match.filter = merge.filter & grepl(first.clone.sequence, patient.fuzzy$Clone_Sequence) & length.filter & sample.filter
|
34
|
185
|
37
|
186 if(sum(match.filter) == 2){
|
|
187 second.match = which(match.filter)[2]
|
|
188 second.clone.sequence = patient.fuzzy[second.match,"Clone_Sequence"]
|
|
189 first.sample = patient.fuzzy[1,"Sample"]
|
|
190 second.sample = patient.fuzzy[second.match,"Sample"]
|
34
|
191
|
37
|
192 if(((nchar(second.clone.sequence) - nchar(first.clone.sequence)) <= 9) & (first.sample != second.sample)){
|
|
193 first.match.row = patient.fuzzy[1,]
|
|
194 second.match.row = patient.fuzzy[second.match,]
|
|
195 print(paste(first.merge, first.match.row$normalized_read_count, second.match.row$normalized_read_count, first.clone.sequence, second.clone.sequence))
|
|
196 patientMerge.new.row = data.frame(merge=first.clone.sequence,
|
|
197 min_cell_paste.x=first.match.row[1,"min_cell_paste"],
|
|
198 Patient.x=first.match.row[1,"Patient"],
|
|
199 Receptor.x=first.match.row[1,"Receptor"],
|
|
200 Sample.x=first.match.row[1,"Sample"],
|
|
201 Cell_Count.x=first.match.row[1,"Cell_Count"],
|
|
202 Clone_Molecule_Count_From_Spikes.x=first.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
203 Log10_Frequency.x=first.match.row[1,"Log10_Frequency"],
|
|
204 Total_Read_Count.x=first.match.row[1,"Total_Read_Count"],
|
|
205 dsPerM.x=first.match.row[1,"dsPerM"],
|
|
206 J_Segment_Major_Gene.x=first.match.row[1,"J_Segment_Major_Gene"],
|
|
207 V_Segment_Major_Gene.x=first.match.row[1,"V_Segment_Major_Gene"],
|
|
208 Clone_Sequence.x=first.match.row[1,"Clone_Sequence"],
|
|
209 CDR3_Sense_Sequence.x=first.match.row[1,"CDR3_Sense_Sequence"],
|
|
210 Related_to_leukemia_clone.x=first.match.row[1,"Related_to_leukemia_clone"],
|
|
211 Frequency.x=first.match.row[1,"Frequency"],
|
|
212 locus_V.x=first.match.row[1,"locus_V"],
|
|
213 locus_J.x=first.match.row[1,"locus_J"],
|
|
214 min_cell_count.x=first.match.row[1,"min_cell_count"],
|
|
215 normalized_read_count.x=first.match.row[1,"normalized_read_count"],
|
|
216 paste.x=first.match.row[1,"paste"],
|
|
217 min_cell_paste.y=second.match.row[1,"min_cell_paste"],
|
|
218 Patient.y=second.match.row[1,"Patient"],
|
|
219 Receptor.y=second.match.row[1,"Receptor"],
|
|
220 Sample.y=second.match.row[1,"Sample"],
|
|
221 Cell_Count.y=second.match.row[1,"Cell_Count"],
|
|
222 Clone_Molecule_Count_From_Spikes.y=second.match.row[1,"Clone_Molecule_Count_From_Spikes"],
|
|
223 Log10_Frequency.y=second.match.row[1,"Log10_Frequency"],
|
|
224 Total_Read_Count.y=second.match.row[1,"Total_Read_Count"],
|
|
225 dsPerM.y=second.match.row[1,"dsPerM"],
|
|
226 J_Segment_Major_Gene.y=second.match.row[1,"J_Segment_Major_Gene"],
|
|
227 V_Segment_Major_Gene.y=second.match.row[1,"V_Segment_Major_Gene"],
|
|
228 Clone_Sequence.y=second.match.row[1,"Clone_Sequence"],
|
|
229 CDR3_Sense_Sequence.y=second.match.row[1,"CDR3_Sense_Sequence"],
|
|
230 Related_to_leukemia_clone.y=second.match.row[1,"Related_to_leukemia_clone"],
|
|
231 Frequency.y=second.match.row[1,"Frequency"],
|
|
232 locus_V.y=second.match.row[1,"locus_V"],
|
|
233 locus_J.y=second.match.row[1,"locus_J"],
|
|
234 min_cell_count.y=second.match.row[1,"min_cell_count"],
|
|
235 normalized_read_count.y=second.match.row[1,"normalized_read_count"],
|
|
236 paste.y=first.match.row[1,"paste"])
|
|
237
|
|
238
|
|
239 patientMerge = rbind(patientMerge, patientMerge.new.row)
|
|
240 patient.fuzzy = patient.fuzzy[-match.filter,]
|
|
241
|
|
242 patient1 = patient1[!(patient1$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
243 patient2 = patient2[!(patient2$Clone_Sequence %in% c(first.clone.sequence, second.clone.sequence)),]
|
|
244
|
|
245 scatterplot_data = scatterplot_data[scatterplot_data$merge != second.clone.sequence,]
|
|
246
|
|
247 } else {
|
|
248 patient.fuzzy = patient.fuzzy[-1,]
|
|
249 }
|
|
250
|
|
251 } else if (sum(match.filter) > 2){
|
41
|
252 cat(paste("<tr><td>", "Multiple matches found for", first.merge, "in", patient, "</td></tr>", sep=""), file=logfile, append=T)
|
39
|
253 patient.fuzzy = patient.fuzzy[-1,]
|
37
|
254 } else {
|
|
255 patient.fuzzy = patient.fuzzy[-1,]
|
34
|
256 }
|
37
|
257
|
|
258
|
34
|
259 }
|
|
260
|
|
261 }
|
|
262
|
37
|
263
|
18
|
264 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony])
|
0
|
265 res1 = vector()
|
|
266 res2 = vector()
|
|
267 resBoth = vector()
|
|
268 read1Count = vector()
|
|
269 read2Count = vector()
|
2
|
270 locussum1 = vector()
|
|
271 locussum2 = vector()
|
9
|
272
|
0
|
273 #for(iter in 1){
|
|
274 for(iter in 1:length(product[,1])){
|
|
275 threshhold = product[iter,threshholdIndex]
|
|
276 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
277 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
278 #both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold) #both higher than threshold
|
29
|
279 both = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold) #highest of both is higher than threshold
|
30
|
280 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[both,]$merge))
|
|
281 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[both,]$merge))
|
14
|
282 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count))
|
|
283 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count))
|
0
|
284 res1 = append(res1, sum(one))
|
2
|
285 res2 = append(res2, sum(two))
|
0
|
286 resBoth = append(resBoth, sum(both))
|
2
|
287 locussum1 = append(locussum1, sum(patient1[(grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene)),]$normalized_read_count))
|
|
288 locussum2 = append(locussum2, sum(patient2[(grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene)),]$normalized_read_count))
|
0
|
289 #threshhold = 0
|
|
290 if(threshhold != 0){
|
|
291 if(sum(one) > 0){
|
15
|
292 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
293 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
294 filenameOne = paste(oneSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
295 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
296 }
|
|
297 if(sum(two) > 0){
|
15
|
298 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
299 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone Sequence", "Related_to_leukemia_clone")
|
0
|
300 filenameTwo = paste(twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
301 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
302 }
|
29
|
303 } else {
|
|
304 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
|
305 if(nrow(scatterplot_locus_data) > 0){
|
|
306 scatterplot_locus_data$Rearrangement = product[iter, titleIndex]
|
|
307 }
|
34
|
308 in_one = (scatterplot_locus_data$merge %in% patient1$merge)
|
|
309 in_two = (scatterplot_locus_data$merge %in% patient2$merge)
|
|
310 not_in_one = !in_one
|
|
311 if(any(in_two)){
|
|
312 scatterplot_locus_data[not_in_one,]$type = twoSample
|
|
313 }
|
30
|
314 in_both = (scatterplot_locus_data$merge %in% patientMerge[both,]$merge)
|
|
315 if(any(in_both)){
|
|
316 scatterplot_locus_data[in_both,]$type = "In Both"
|
|
317 }
|
29
|
318 if(type == "single"){
|
|
319 single_patients <<- rbind(single_patients, scatterplot_locus_data)
|
|
320 }
|
|
321 p = NULL
|
|
322 if(nrow(scatterplot_locus_data) != 0){
|
|
323 if(on == "normalized_read_count"){
|
30
|
324 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
38
|
325 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6)
|
29
|
326 } else {
|
30
|
327 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
29
|
328 }
|
|
329 p = p + geom_point(aes(colour=type), position="jitter")
|
|
330 p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
331 } else {
|
|
332 p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex]))
|
|
333 }
|
|
334 png(paste(patient1[1,patientIndex], "_", patient1[1,sampleIndex], "_", patient2[1,sampleIndex], "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
335 print(p)
|
|
336 dev.off()
|
0
|
337 }
|
|
338 if(sum(both) > 0){
|
15
|
339 dfBoth = patientMerge[both,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
340 colnames(dfBoth) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
0
|
341 filenameBoth = paste(oneSample, "_", twoSample, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
342 write.table(dfBoth, file=paste(filenameBoth, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
29
|
343 }
|
0
|
344 }
|
2
|
345 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "Both"=resBoth, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "Sum"=res1 + res2 + resBoth, "percentage" = round((resBoth/(res1 + res2 + resBoth)) * 100, digits=2), "Locus_sum1"=locussum1, "Locus_sum2"=locussum2)
|
0
|
346 if(sum(is.na(patientResult$percentage)) > 0){
|
|
347 patientResult[is.na(patientResult$percentage),]$percentage = 0
|
|
348 }
|
|
349 colnames(patientResult)[6] = oneSample
|
|
350 colnames(patientResult)[8] = twoSample
|
|
351 colnamesBak = colnames(patientResult)
|
2
|
352 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", paste("Number of sequences ", patient, "_Both", sep=""), paste("Number of sequences", oneSample, sep=""), paste("Normalized Read Count", oneSample), paste("Number of sequences", twoSample, sep=""), paste("Normalized Read Count", twoSample), paste("Sum number of sequences", patient), paste("Percentage of sequences ", patient, "_Both", sep=""), paste("Locus Sum", oneSample), paste("Locus Sum", twoSample))
|
0
|
353 write.table(patientResult, file=paste(patient, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
354 colnames(patientResult) = colnamesBak
|
|
355
|
|
356 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
357 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
358
|
|
359 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "Both")])
|
|
360 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=Both), stat='identity', position="dodge", fill="#79c36a")
|
|
361 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
362 plt = plt + geom_text(aes(ymax=max(Both), x=cut_off_value,y=Both,label=Both), angle=90, hjust=0)
|
|
363 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in both")
|
|
364 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
365 png(paste(patient, "_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
366 print(plt)
|
|
367 dev.off()
|
|
368 #(t,r,b,l)
|
|
369 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "percentage")])
|
|
370 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=percentage), stat='identity', position="dodge", fill="#79c36a")
|
|
371 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
372 plt = plt + geom_text(aes(ymax=max(percentage), x=cut_off_value,y=percentage,label=percentage), angle=90, hjust=0)
|
|
373 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("% clones in both left and right")
|
|
374 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
375 png(paste(patient, "_percent_", onShort, ".png", sep=""), width=1920, height=1080)
|
|
376 print(plt)
|
|
377 dev.off()
|
|
378
|
|
379 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample)] ,id.vars=1:2)
|
|
380 patientResult$relativeValue = patientResult$value * 10
|
|
381 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
382 plt = ggplot(patientResult)
|
|
383 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
384 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
385 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
386 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.2)
|
|
387 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.8)
|
|
388 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle(paste("Number of clones in only ", oneSample, " and only ", twoSample, sep=""))
|
|
389 png(paste(patient, "_", onShort, "_both.png", sep=""), width=1920, height=1080)
|
|
390 print(plt)
|
|
391 dev.off()
|
|
392 }
|
|
393
|
3
|
394 cat("<tr><td>Starting Frequency analysis</td></tr>", file=logfile, append=T)
|
|
395
|
0
|
396 interval = intervalFreq
|
|
397 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
398 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
399 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="Frequency", appendtxt=T)
|
0
|
400
|
3
|
401 cat("<tr><td>Starting Cell Count analysis</td></tr>", file=logfile, append=T)
|
|
402
|
0
|
403 interval = intervalReads
|
|
404 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
4
|
405 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
29
|
406 lapply(patients, FUN=patientCountOnColumn, product = product, interval=interval, on="normalized_read_count")
|
0
|
407
|
3
|
408 cat("</table></html>", file=logfile, append=T)
|
|
409
|
33
|
410
|
|
411 if(nrow(single_patients) > 0){
|
|
412 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
|
413 p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
|
414 p = p + geom_point(aes(colour=type), position="jitter")
|
|
415 p = p + xlab("In one or both samples") + ylab("Reads")
|
|
416 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample")
|
|
417 png("singles_reads_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
418 print(p)
|
|
419 dev.off()
|
7
|
420
|
33
|
421 p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
|
422 p = p + geom_point(aes(colour=type), position="jitter")
|
|
423 p = p + xlab("In one or both samples") + ylab("Frequency")
|
|
424 p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
425 png("singles_freq_scatterplot.png", width=640 * length(unique(single_patients$Patient)) + 100, height=1080)
|
|
426 print(p)
|
|
427 dev.off()
|
|
428 } else {
|
|
429 empty <- data.frame()
|
|
430 p = ggplot(empty) + geom_point() + xlim(0, 10) + ylim(0, 100) + xlab("In one or both samples") + ylab("Frequency") + ggtitle("Scatterplot of the frequency of the patients with a single sample")
|
|
431
|
|
432 png("singles_reads_scatterplot.png", width=400, height=300)
|
|
433 print(p)
|
|
434 dev.off()
|
|
435
|
|
436 png("singles_freq_scatterplot.png", width=400, height=300)
|
|
437 print(p)
|
|
438 dev.off()
|
|
439 }
|
7
|
440 tripletAnalysis <- function(patient1, label1, patient2, label2, patient3, label3, product, interval, on, appendTriplets= FALSE){
|
|
441 onShort = "reads"
|
|
442 if(on == "Frequency"){
|
|
443 onShort = "freq"
|
|
444 }
|
18
|
445 onx = paste(on, ".x", sep="")
|
|
446 ony = paste(on, ".y", sep="")
|
|
447 onz = paste(on, ".z", sep="")
|
7
|
448 type="triplet"
|
|
449
|
|
450 threshholdIndex = which(colnames(product) == "interval")
|
|
451 V_SegmentIndex = which(colnames(product) == "V_Segments")
|
|
452 J_SegmentIndex = which(colnames(product) == "J_Segments")
|
|
453 titleIndex = which(colnames(product) == "Titles")
|
|
454 sampleIndex = which(colnames(patient1) == "Sample")
|
|
455 patientIndex = which(colnames(patient1) == "Patient")
|
|
456 oneSample = paste(patient1[1,sampleIndex], sep="")
|
|
457 twoSample = paste(patient2[1,sampleIndex], sep="")
|
|
458 threeSample = paste(patient3[1,sampleIndex], sep="")
|
|
459
|
29
|
460 if(mergeOn == "Clone_Sequence"){
|
|
461 patient1$merge = paste(patient1$Clone_Sequence)
|
|
462 patient2$merge = paste(patient2$Clone_Sequence)
|
|
463 patient3$merge = paste(patient3$Clone_Sequence)
|
|
464
|
|
465 } else {
|
|
466 patient1$merge = paste(patient1$V_Segment_Major_Gene, patient1$J_Segment_Major_Gene, patient1$CDR3_Sense_Sequence)
|
|
467 patient2$merge = paste(patient2$V_Segment_Major_Gene, patient2$J_Segment_Major_Gene, patient2$CDR3_Sense_Sequence)
|
|
468 patient3$merge = paste(patient3$V_Segment_Major_Gene, patient3$J_Segment_Major_Gene, patient3$CDR3_Sense_Sequence)
|
|
469 }
|
9
|
470
|
|
471 patientMerge = merge(patient1, patient2, by="merge")
|
|
472 patientMerge = merge(patientMerge, patient3, by="merge")
|
28
|
473 colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge)))] = paste(colnames(patientMerge)[which(!grepl("(\\.x$)|(\\.y$)|(merge)", names(patientMerge), perl=T))], ".z", sep="")
|
18
|
474 patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz])
|
20
|
475 patientMerge12 = merge(patient1, patient2, by="merge")
|
|
476 patientMerge12$thresholdValue = pmax(patientMerge12[,onx], patientMerge12[,ony])
|
|
477 patientMerge13 = merge(patient1, patient3, by="merge")
|
|
478 patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony])
|
|
479 patientMerge23 = merge(patient2, patient3, by="merge")
|
|
480 patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony])
|
|
481
|
28
|
482
|
30
|
483 scatterplot_data_columns = c("Clone_Sequence", "Frequency", "normalized_read_count", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "merge")
|
20
|
484 scatterplot_data = rbind(patient1[,scatterplot_data_columns], patient2[,scatterplot_data_columns], patient3[,scatterplot_data_columns])
|
30
|
485 scatterplot_data = scatterplot_data[!duplicated(scatterplot_data$merge),]
|
27
|
486 scatterplot_data$type = factor(x="In one", levels=c("In one", "In two", "In three", "In multiple"))
|
20
|
487
|
7
|
488 res1 = vector()
|
|
489 res2 = vector()
|
|
490 res3 = vector()
|
20
|
491 res12 = vector()
|
|
492 res13 = vector()
|
|
493 res23 = vector()
|
7
|
494 resAll = vector()
|
|
495 read1Count = vector()
|
|
496 read2Count = vector()
|
|
497 read3Count = vector()
|
|
498
|
|
499 if(appendTriplets){
|
9
|
500 cat(paste(label1, label2, label3, sep="\t"), file="triplets.txt", append=T, sep="", fill=3)
|
7
|
501 }
|
|
502 for(iter in 1:length(product[,1])){
|
|
503 threshhold = product[iter,threshholdIndex]
|
|
504 V_Segment = paste(".*", as.character(product[iter,V_SegmentIndex]), ".*", sep="")
|
|
505 J_Segment = paste(".*", as.character(product[iter,J_SegmentIndex]), ".*", sep="")
|
18
|
506 #all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge[,onx] > threshhold & patientMerge[,ony] > threshhold & patientMerge[,onz] > threshhold)
|
|
507 all = (grepl(V_Segment, patientMerge$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge$J_Segment_Major_Gene.x) & patientMerge$thresholdValue > threshhold)
|
7
|
508
|
30
|
509 one_two = (grepl(V_Segment, patientMerge12$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge12$J_Segment_Major_Gene.x) & patientMerge12$thresholdValue > threshhold & !(patientMerge12$merge %in% patientMerge[all,]$merge))
|
|
510 one_three = (grepl(V_Segment, patientMerge13$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge13$J_Segment_Major_Gene.x) & patientMerge13$thresholdValue > threshhold & !(patientMerge13$merge %in% patientMerge[all,]$merge))
|
|
511 two_three = (grepl(V_Segment, patientMerge23$V_Segment_Major_Gene.x) & grepl(J_Segment, patientMerge23$J_Segment_Major_Gene.x) & patientMerge23$thresholdValue > threshhold & !(patientMerge23$merge %in% patientMerge[all,]$merge))
|
24
|
512
|
30
|
513 one = (grepl(V_Segment, patient1$V_Segment_Major_Gene) & grepl(J_Segment, patient1$J_Segment_Major_Gene) & patient1[,on] > threshhold & !(patient1$merge %in% patientMerge[all,]$merge) & !(patient1$merge %in% patientMerge12[one_two,]$merge) & !(patient1$merge %in% patientMerge13[one_three,]$merge))
|
|
514 two = (grepl(V_Segment, patient2$V_Segment_Major_Gene) & grepl(J_Segment, patient2$J_Segment_Major_Gene) & patient2[,on] > threshhold & !(patient2$merge %in% patientMerge[all,]$merge) & !(patient2$merge %in% patientMerge12[one_two,]$merge) & !(patient2$merge %in% patientMerge23[two_three,]$merge))
|
|
515 three = (grepl(V_Segment, patient3$V_Segment_Major_Gene) & grepl(J_Segment, patient3$J_Segment_Major_Gene) & patient3[,on] > threshhold & !(patient3$merge %in% patientMerge[all,]$merge) & !(patient3$merge %in% patientMerge13[one_three,]$merge) & !(patient3$merge %in% patientMerge23[two_three,]$merge))
|
20
|
516
|
18
|
517 read1Count = append(read1Count, sum(patient1[one,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.x))
|
|
518 read2Count = append(read2Count, sum(patient2[two,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.y))
|
|
519 read3Count = append(read3Count, sum(patient3[three,]$normalized_read_count) + sum(patientMerge[all,]$normalized_read_count.z))
|
7
|
520 res1 = append(res1, sum(one))
|
|
521 res2 = append(res2, sum(two))
|
|
522 res3 = append(res3, sum(three))
|
|
523 resAll = append(resAll, sum(all))
|
20
|
524 res12 = append(res12, sum(one_two))
|
|
525 res13 = append(res13, sum(one_three))
|
|
526 res23 = append(res23, sum(two_three))
|
7
|
527 #threshhold = 0
|
|
528 if(threshhold != 0){
|
|
529 if(sum(one) > 0){
|
20
|
530 dfOne = patient1[one,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
531 colnames(dfOne) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
532 filenameOne = paste(label1, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
533 write.table(dfOne, file=paste(filenameOne, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
534 }
|
|
535 if(sum(two) > 0){
|
20
|
536 dfTwo = patient2[two,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
537 colnames(dfTwo) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
538 filenameTwo = paste(label2, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
539 write.table(dfTwo, file=paste(filenameTwo, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
540 }
|
|
541 if(sum(three) > 0){
|
20
|
542 dfThree = patient3[three,c("V_Segment_Major_Gene", "J_Segment_Major_Gene", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")]
|
|
543 colnames(dfThree) = c("Proximal segment", "Distal segment", "normalized_read_count", "Frequency", "Clone_Sequence", "Related_to_leukemia_clone")
|
7
|
544 filenameThree = paste(label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
545 write.table(dfThree, file=paste(filenameThree, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
546 }
|
20
|
547 if(sum(one_two) > 0){
|
|
548 dfOne_two = patientMerge12[one_two,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
549 colnames(dfOne_two) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample))
|
|
550 filenameOne_two = paste(label1, "_", label2, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
551 write.table(dfOne_two, file=paste(filenameOne_two, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
552 }
|
|
553 if(sum(one_three) > 0){
|
|
554 dfOne_three = patientMerge13[one_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
555 colnames(dfOne_three) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
556 filenameOne_three = paste(label1, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
557 write.table(dfOne_three, file=paste(filenameOne_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
558 }
|
|
559 if(sum(two_three) > 0){
|
|
560 dfTwo_three = patientMerge23[two_three,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y")]
|
|
561 colnames(dfTwo_three) = c(paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample),"Clone_Sequence", paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
|
562 filenameTwo_three = paste(label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, onShort, sep="")
|
|
563 write.table(dfTwo_three, file=paste(filenameTwo_three, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
564 }
|
|
565 } else { #scatterplot data
|
24
|
566 scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),]
|
30
|
567 in_two = (scatterplot_locus_data$merge %in% patientMerge12[one_two,]$merge) | (scatterplot_locus_data$merge %in% patientMerge13[one_three,]$merge) | (scatterplot_locus_data$merge %in% patientMerge23[two_three,]$merge)
|
27
|
568 if(sum(in_two) > 0){
|
|
569 scatterplot_locus_data[in_two,]$type = "In two"
|
|
570 }
|
30
|
571 in_three = (scatterplot_locus_data$merge %in% patientMerge[all,]$merge)
|
27
|
572 if(sum(in_three)> 0){
|
|
573 scatterplot_locus_data[in_three,]$type = "In three"
|
|
574 }
|
|
575 not_in_one = scatterplot_locus_data$type != "In one"
|
|
576 if(sum(not_in_one) > 0){
|
|
577 scatterplot_locus_data[not_in_one,]$type = "In multiple"
|
|
578 }
|
20
|
579 p = NULL
|
|
580 if(nrow(scatterplot_locus_data) != 0){
|
|
581 if(on == "normalized_read_count"){
|
31
|
582 scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count))))
|
32
|
583 p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000))
|
20
|
584 } else {
|
32
|
585 p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100))
|
20
|
586 }
|
|
587 p = p + geom_point(aes(colour=type), position="jitter")
|
25
|
588 p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
20
|
589 } else {
|
|
590 p = ggplot(NULL, aes(x=c("In one", "In multiple"),y=0)) + geom_blank(NULL) + xlab("In two or in three of the samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex]))
|
|
591 }
|
|
592 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_", product[iter, titleIndex],"_scatter.png", sep=""))
|
|
593 print(p)
|
|
594 dev.off()
|
|
595 }
|
7
|
596 if(sum(all) > 0){
|
20
|
597 dfAll = patientMerge[all,c("V_Segment_Major_Gene.x", "J_Segment_Major_Gene.x", "normalized_read_count.x", "Frequency.x", "Related_to_leukemia_clone.x", "Clone_Sequence.x", "V_Segment_Major_Gene.y", "J_Segment_Major_Gene.y", "normalized_read_count.y", "Frequency.y", "Related_to_leukemia_clone.y", "V_Segment_Major_Gene.z", "J_Segment_Major_Gene.z", "normalized_read_count.z", "Frequency.z", "Related_to_leukemia_clone.z")]
|
|
598 colnames(dfAll) = c(paste("Proximal segment", oneSample), paste("Distal segment", oneSample), paste("Normalized_Read_Count", oneSample), paste("Frequency", oneSample), paste("Related_to_leukemia_clone", oneSample),"Clone_Sequence", paste("Proximal segment", twoSample), paste("Distal segment", twoSample), paste("Normalized_Read_Count", twoSample), paste("Frequency", twoSample), paste("Related_to_leukemia_clone", twoSample), paste("Proximal segment", threeSample), paste("Distal segment", threeSample), paste("Normalized_Read_Count", threeSample), paste("Frequency", threeSample), paste("Related_to_leukemia_clone", threeSample))
|
7
|
599 filenameAll = paste(label1, "_", label2, "_", label3, "_", product[iter, titleIndex], "_", threshhold, sep="")
|
|
600 write.table(dfAll, file=paste(filenameAll, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
601 }
|
|
602 }
|
20
|
603 #patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "read_count1" = round(read1Count), "tmp2"=res2, "read_count2"= round(read2Count), "tmp3"=res3, "read_count3"=round(read3Count))
|
|
604 patientResult = data.frame("Locus"=product$Titles, "J_Segment"=product$J_Segments, "V_Segment"=product$V_Segments, "cut_off_value"=paste(">", product$interval, sep=""), "All"=resAll, "tmp1"=res1, "tmp2"=res2, "tmp3"=res3, "tmp12"=res12, "tmp13"=res13, "tmp23"=res23)
|
7
|
605 colnames(patientResult)[6] = oneSample
|
20
|
606 colnames(patientResult)[7] = twoSample
|
|
607 colnames(patientResult)[8] = threeSample
|
|
608 colnames(patientResult)[9] = paste(oneSample, twoSample, sep="_")
|
|
609 colnames(patientResult)[10] = paste(oneSample, twoSample, sep="_")
|
|
610 colnames(patientResult)[11] = paste(oneSample, twoSample, sep="_")
|
7
|
611
|
|
612 colnamesBak = colnames(patientResult)
|
20
|
613 colnames(patientResult) = c("Ig/TCR gene rearrangement type", "Distal Gene segment", "Proximal gene segment", "cut_off_value", "Number of sequences All", paste("Number of sequences", oneSample), paste("Number of sequences", twoSample), paste("Number of sequences", threeSample), paste("Number of sequences", oneSample, twoSample), paste("Number of sequences", oneSample, threeSample), paste("Number of sequences", twoSample, threeSample))
|
7
|
614 write.table(patientResult, file=paste(label1, "_", label2, "_", label3, "_", onShort, ".txt", sep=""), quote=F, sep="\t", dec=",", row.names=F, col.names=T)
|
|
615 colnames(patientResult) = colnamesBak
|
|
616
|
|
617 patientResult$Locus = factor(patientResult$Locus, Titles)
|
|
618 patientResult$cut_off_value = factor(patientResult$cut_off_value, paste(">", interval, sep=""))
|
|
619
|
|
620 plt = ggplot(patientResult[,c("Locus", "cut_off_value", "All")])
|
|
621 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=All), stat='identity', position="dodge", fill="#79c36a")
|
|
622 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
623 plt = plt + geom_text(aes(ymax=max(All), x=cut_off_value,y=All,label=All), angle=90, hjust=0)
|
|
624 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in All")
|
|
625 plt = plt + theme(plot.margin = unit(c(1,8.8,0.5,1.5), "lines"))
|
|
626 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_total_all.png", sep=""), width=1920, height=1080)
|
|
627 print(plt)
|
|
628 dev.off()
|
|
629
|
|
630 fontSize = 4
|
|
631
|
|
632 bak = patientResult
|
|
633 patientResult = melt(patientResult[,c('Locus','cut_off_value', oneSample, twoSample, threeSample)] ,id.vars=1:2)
|
|
634 patientResult$relativeValue = patientResult$value * 10
|
|
635 patientResult[patientResult$relativeValue == 0,]$relativeValue = 1
|
|
636 plt = ggplot(patientResult)
|
|
637 plt = plt + geom_bar( aes( x=factor(cut_off_value), y=relativeValue, fill=variable), stat='identity', position="dodge")
|
|
638 plt = plt + facet_grid(.~Locus) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
639 plt = plt + scale_y_continuous(trans="log", breaks=10^c(0:10), labels=c(0, 10^c(0:9)))
|
|
640 plt = plt + geom_text(data=patientResult[patientResult$variable == oneSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=-0.7, size=fontSize)
|
|
641 plt = plt + geom_text(data=patientResult[patientResult$variable == twoSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=0.4, size=fontSize)
|
|
642 plt = plt + geom_text(data=patientResult[patientResult$variable == threeSample,], aes(ymax=max(value), x=cut_off_value,y=relativeValue,label=value), angle=90, position=position_dodge(width=0.9), hjust=0, vjust=1.5, size=fontSize)
|
|
643 plt = plt + xlab("Reads per locus") + ylab("Count") + ggtitle("Number of clones in only one sample")
|
|
644 png(paste(label1, "_", label2, "_", label3, "_", onShort, "_indiv_all.png", sep=""), width=1920, height=1080)
|
|
645 print(plt)
|
|
646 dev.off()
|
|
647 }
|
|
648
|
33
|
649 if(nrow(triplets) != 0){
|
|
650 triplets$uniqueID = "ID"
|
|
651
|
|
652 triplets[grepl("16278_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
653 triplets[grepl("26402_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
654 triplets[grepl("26759_Left", triplets$Sample),]$uniqueID = "16278_26402_26759_Left"
|
|
655
|
|
656 triplets[grepl("16278_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
657 triplets[grepl("26402_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
658 triplets[grepl("26759_Right", triplets$Sample),]$uniqueID = "16278_26402_26759_Right"
|
|
659
|
|
660 triplets[grepl("14696", triplets$Patient),]$uniqueID = "14696"
|
|
661
|
|
662 triplets$locus_V = substring(triplets$V_Segment_Major_Gene, 0, 4)
|
|
663 triplets$locus_J = substring(triplets$J_Segment_Major_Gene, 0, 4)
|
|
664 min_cell_count = data.frame(data.table(triplets)[, list(min_cell_count=min(.SD$Cell_Count)), by=c("uniqueID", "locus_V", "locus_J")])
|
|
665
|
|
666 triplets$min_cell_paste = paste(triplets$uniqueID, triplets$locus_V, triplets$locus_J)
|
|
667 min_cell_count$min_cell_paste = paste(min_cell_count$uniqueID, min_cell_count$locus_V, min_cell_count$locus_J)
|
|
668
|
|
669 min_cell_count = min_cell_count[,c("min_cell_paste", "min_cell_count")]
|
|
670
|
|
671 triplets = merge(triplets, min_cell_count, by="min_cell_paste")
|
|
672
|
|
673 triplets$normalized_read_count = round(triplets$Clone_Molecule_Count_From_Spikes / triplets$Cell_Count * triplets$min_cell_count / 2, digits=2) #??????????????????????????????????? wel of geen / 2
|
|
674
|
|
675 triplets = triplets[triplets$normalized_read_count >= min_cells,]
|
|
676
|
|
677 column_drops = c("locus_V", "locus_J", "min_cell_count", "min_cell_paste")
|
|
678
|
|
679 triplets = triplets[,!(colnames(triplets) %in% column_drops)]
|
|
680
|
|
681 #remove duplicate V+J+CDR3, add together numerical values
|
|
682 triplets = data.frame(data.table(triplets)[, list(Receptor=unique(.SD$Receptor),
|
|
683 Cell_Count=unique(.SD$Cell_Count),
|
|
684 Clone_Molecule_Count_From_Spikes=sum(.SD$Clone_Molecule_Count_From_Spikes),
|
|
685 Total_Read_Count=sum(.SD$Total_Read_Count),
|
|
686 dsPerM=ifelse("dsPerM" %in% names(dat), sum(.SD$dsPerM), 0),
|
|
687 Related_to_leukemia_clone=all(.SD$Related_to_leukemia_clone),
|
|
688 Frequency=sum(.SD$Frequency),
|
|
689 normalized_read_count=sum(.SD$normalized_read_count),
|
|
690 Log10_Frequency=sum(.SD$Log10_Frequency),
|
|
691 Clone_Sequence=.SD$Clone_Sequence[1]), by=c("Patient", "Sample", "V_Segment_Major_Gene", "J_Segment_Major_Gene", "CDR3_Sense_Sequence")])
|
|
692
|
|
693
|
|
694 interval = intervalReads
|
|
695 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
696 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
697
|
|
698 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
699 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
700 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
701 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="normalized_read_count", T)
|
|
702
|
|
703 one = triplets[triplets$Sample == "16278_Left",]
|
|
704 two = triplets[triplets$Sample == "26402_Left",]
|
|
705 three = triplets[triplets$Sample == "26759_Left",]
|
|
706 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="normalized_read_count", T)
|
|
707
|
|
708 one = triplets[triplets$Sample == "16278_Right",]
|
|
709 two = triplets[triplets$Sample == "26402_Right",]
|
|
710 three = triplets[triplets$Sample == "26759_Right",]
|
|
711 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="normalized_read_count", T)
|
|
712
|
|
713
|
|
714 interval = intervalFreq
|
|
715 intervalOrder = data.frame("interval"=paste(">", interval, sep=""), "intervalOrder"=1:length(interval))
|
|
716 product = data.frame("Titles"=rep(Titles, each=length(interval)), "interval"=rep(interval, times=10), "V_Segments"=rep(V_Segments, each=length(interval)), "J_Segments"=rep(J_Segments, each=length(interval)))
|
|
717
|
|
718 one = triplets[triplets$Sample == "14696_reg_BM",]
|
|
719 two = triplets[triplets$Sample == "24536_reg_BM",]
|
|
720 three = triplets[triplets$Sample == "24062_reg_BM",]
|
|
721 tripletAnalysis(one, "14696_1", two, "14696_2", three, "14696_3", product=product, interval=interval, on="Frequency", F)
|
|
722
|
|
723 one = triplets[triplets$Sample == "16278_Left",]
|
|
724 two = triplets[triplets$Sample == "26402_Left",]
|
|
725 three = triplets[triplets$Sample == "26759_Left",]
|
|
726 tripletAnalysis(one, "16278_Left", two, "26402_Left", three, "26759_Left", product=product, interval=interval, on="Frequency", F)
|
|
727
|
|
728 one = triplets[triplets$Sample == "16278_Right",]
|
|
729 two = triplets[triplets$Sample == "26402_Right",]
|
|
730 three = triplets[triplets$Sample == "26759_Right",]
|
|
731 tripletAnalysis(one, "16278_Right", two, "26402_Right", three, "26759_Right", product=product, interval=interval, on="Frequency", F)
|
|
732 } else {
|
|
733 cat("", file="triplets.txt")
|
|
734 }
|