comparison report_clonality/RScript.r @ 59:11ec9edfefee draft

Uploaded
author davidvanzessen
date Thu, 31 Mar 2016 10:26:30 -0400
parents a073fa12ef98
children 14ea4c464435
comparison
equal deleted inserted replaced
58:a073fa12ef98 59:11ec9edfefee
62 #filter uniques 62 #filter uniques
63 inputdata.removed = inputdata[NULL,] 63 inputdata.removed = inputdata[NULL,]
64 64
65 inputdata$clonaltype = 1:nrow(inputdata) 65 inputdata$clonaltype = 1:nrow(inputdata)
66 66
67 #keep track of the count of sequences in samples or samples/replicates for the front page overview
68 input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")])
69 input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")])
70
67 PRODF = inputdata 71 PRODF = inputdata
68 UNPROD = inputdata 72 UNPROD = inputdata
69 if(filterproductive){ 73 if(filterproductive){
70 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column 74 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column
71 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] 75 #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
72 UNPROD = inputdata[!(inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)"), ] 76 PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),]
77
78 PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")])
79
80 UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ]
73 } else { 81 } else {
74 PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] 82 PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ]
75 UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] 83 UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ]
76 } 84 }
77 } 85 }
86
87 prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")])
88 prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")])
89
90 unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")])
91 unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")])
78 92
79 clonalityFrame = PRODF 93 clonalityFrame = PRODF
80 94
81 #remove duplicates based on the clonaltype 95 #remove duplicates based on the clonaltype
82 if(clonaltype != "none"){ 96 if(clonaltype != "none"){
83 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples 97 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples
84 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) 98 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":"))
85 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] 99 PRODF = PRODF[!duplicated(PRODF$clonaltype), ]
86 100
87 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) 101 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":"))
88 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] 102 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ]
89 103
90 #again for clonalityFrame but with sample+replicate 104 #again for clonalityFrame but with sample+replicate
91 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) 105 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":"))
92 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":")) 106 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":"))
93 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] 107 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ]
94 } 108 }
109
110 prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")])
111 prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")])
112
113 unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")])
114 unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")])
95 115
96 PRODF$freq = 1 116 PRODF$freq = 1
97 117
98 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" 118 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*"
99 PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID) 119 PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID)
120 140
121 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- 141 # ---------------------- Counting the productive/unproductive and unique sequences ----------------------
122 142
123 print("Report Clonality - counting productive/unproductive/unique") 143 print("Report Clonality - counting productive/unproductive/unique")
124 144
125 if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data 145 #create the table on the overview page with the productive/unique counts per sample/replicate
126 inputdata$Functionality = "unproductive" 146 #first for sample
127 search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND") 147 sample.count = merge(input.sample.count, prod.sample.count, by="Sample")
128 if(sum(search) > 0){ 148 sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100)
129 inputdata[search,]$Functionality = "productive" 149 sample.count = merge(sample.count, prod.unique.sample.count, by="Sample")
130 } 150 sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100)
131 } 151
132 152 sample.count = merge(sample.count , unprod.sample.count, by="Sample")
133 inputdata.dt = data.table(inputdata) #for speed 153 sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100)
134 154 sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample")
135 if(clonaltype == "none"){ 155 sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100)
136 ct = c("clonaltype") 156
137 } 157 #then sample/replicate
138 158 rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate"))
139 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") 159 rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100)
140 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) 160 rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate"))
141 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) 161 rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100)
142 162
143 163 rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate"))
144 sample_productive_count = inputdata.dt[, list(All=.N, 164 rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100)
145 Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), 165 rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate"))
146 perc_prod = 1, 166 rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100)
147 Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), 167
148 perc_prod_un = 1, 168 rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_")
149 Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]), 169 rep.count = rep.count[,names(rep.count) != "Replicate"]
150 perc_unprod = 1, 170
151 Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]), 171 count = rbind(sample.count, rep.count)
152 perc_unprod_un = 1), 172
153 by=c("Sample")] 173 write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
154
155 sample_productive_count$perc_prod = round(sample_productive_count$Productive / sample_productive_count$All * 100)
156 sample_productive_count$perc_prod_un = round(sample_productive_count$Productive_unique / sample_productive_count$All * 100)
157
158 sample_productive_count$perc_unprod = round(sample_productive_count$Unproductive / sample_productive_count$All * 100)
159 sample_productive_count$perc_unprod_un = round(sample_productive_count$Unproductive_unique / sample_productive_count$All * 100)
160
161 sample_replicate_productive_count = inputdata.dt[, list(All=.N,
162 Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]),
163 perc_prod = 1,
164 Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]),
165 perc_prod_un = 1,
166 Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]),
167 perc_unprod = 1,
168 Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]),
169 perc_unprod_un = 1),
170 by=c("samples_replicates")]
171
172 sample_replicate_productive_count$perc_prod = round(sample_replicate_productive_count$Productive / sample_replicate_productive_count$All * 100)
173 sample_replicate_productive_count$perc_prod_un = round(sample_replicate_productive_count$Productive_unique / sample_replicate_productive_count$All * 100)
174
175 sample_replicate_productive_count$perc_unprod = round(sample_replicate_productive_count$Unproductive / sample_replicate_productive_count$All * 100)
176 sample_replicate_productive_count$perc_unprod_un = round(sample_replicate_productive_count$Unproductive_unique / sample_replicate_productive_count$All * 100)
177
178 setnames(sample_replicate_productive_count, colnames(sample_productive_count))
179
180 counts = rbind(sample_replicate_productive_count, sample_productive_count)
181 counts = counts[order(counts$Sample),]
182
183 write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
184 174
185 # ---------------------- Frequency calculation for V, D and J ---------------------- 175 # ---------------------- Frequency calculation for V, D and J ----------------------
186 176
187 print("Report Clonality - frequency calculation V, D and J") 177 print("Report Clonality - frequency calculation V, D and J")
188 178
379 print("Report Clonality - Heatmaps VD") 369 print("Report Clonality - Heatmaps VD")
380 plotVD <- function(dat){ 370 plotVD <- function(dat){
381 if(length(dat[,1]) == 0){ 371 if(length(dat[,1]) == 0){
382 return() 372 return()
383 } 373 }
374
384 img = ggplot() + 375 img = ggplot() +
385 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 376 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) +
386 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 377 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
387 scale_fill_gradient(low="gold", high="blue", na.value="white") + 378 scale_fill_gradient(low="gold", high="blue", na.value="white") +
388 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + 379 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
400 VandDCount$l = log(VandDCount$Length) 391 VandDCount$l = log(VandDCount$Length)
401 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) 392 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")])
402 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) 393 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T)
403 VandDCount$relLength = VandDCount$l / VandDCount$max 394 VandDCount$relLength = VandDCount$l / VandDCount$max
404 395
405 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(inputdata$Sample)) 396 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name)
406 397
407 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) 398 completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE)
399
408 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) 400 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
401
409 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) 402 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
410 403
411 fltr = is.nan(completeVD$relLength) 404 fltr = is.nan(completeVD$relLength)
412 if(any(fltr)){ 405 if(all(fltr)){
413 completeVD[fltr,"relLength"] = 1 406 completeVD[fltr,"relLength"] = 0
414 } 407 }
415 408
416 VDList = split(completeVD, f=completeVD[,"Sample"]) 409 VDList = split(completeVD, f=completeVD[,"Sample"])
417 lapply(VDList, FUN=plotVD) 410 lapply(VDList, FUN=plotVD)
418 } 411 }
443 VandJCount$l = log(VandJCount$Length) 436 VandJCount$l = log(VandJCount$Length)
444 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) 437 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")])
445 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) 438 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T)
446 VandJCount$relLength = VandJCount$l / VandJCount$max 439 VandJCount$relLength = VandJCount$l / VandJCount$max
447 440
448 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) 441 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name)
449 442
450 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) 443 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
451 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) 444 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
452 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) 445 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
453 446
487 DandJCount$l = log(DandJCount$Length) 480 DandJCount$l = log(DandJCount$Length)
488 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) 481 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")])
489 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) 482 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T)
490 DandJCount$relLength = DandJCount$l / DandJCount$max 483 DandJCount$relLength = DandJCount$l / DandJCount$max
491 484
492 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) 485 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name)
493 486
494 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) 487 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
495 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) 488 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
496 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) 489 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
497 490