# HG changeset patch # User davidvanzessen # Date 1452008980 18000 # Node ID ef13f0a3f4d6c238c4625a608a173eddd9322738 # Parent 40c72b9ffc791fe60ecfd0e8d8f332a4eee8a64a Uploaded diff -r 40c72b9ffc79 -r ef13f0a3f4d6 RScript.r --- a/RScript.r Fri Nov 20 11:41:30 2015 -0500 +++ b/RScript.r Tue Jan 05 10:49:40 2016 -0500 @@ -1,4 +1,5 @@ args <- commandArgs(trailingOnly = TRUE) +options(scipen=999) inFile = args[1] outDir = args[2] @@ -67,6 +68,7 @@ patient.merge.list = list() #cache the 'both' table, 2x speedup for more memory... patient.merge.list.second = list() + scatter_locus_data_list = list() cat(paste("
| ", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (fetched from cache)", sep=""), file=logfile, append=T) print(names(patient.merge.list)) @@ -175,7 +183,9 @@ merge.list = list() merge.list[["second"]] = vector() - + + link.count = 1 + while(nrow(patient.fuzzy) > 1){ first.merge = patient.fuzzy[1,"merge"] first.clone.sequence = patient.fuzzy[1,"Clone_Sequence"] @@ -264,7 +274,24 @@ tmp.rows = rbind(first.rows, second.rows) tmp.rows = tmp.rows[order(nchar(tmp.rows$Clone_Sequence)),] - + + + #add to the scatterplot data + scatterplot.row = first.sum[,scatterplot_data_columns] + scatterplot.row$type = paste(first.sum[,"Sample"], "In Both") + scatterplot.row$link = link.count + scatterplot.row$on = onShort + + scatterplot_data = rbind(scatterplot_data, scatterplot.row) + + scatterplot.row = second.sum[,scatterplot_data_columns] + scatterplot.row$type = paste(second.sum[,"Sample"], "In Both") + scatterplot.row$link = link.count + scatterplot.row$on = onShort + + scatterplot_data = rbind(scatterplot_data, scatterplot.row) + + #write some information about the match to a log file if (nrow(first.rows) > 1 | nrow(second.rows) > 1) { cat(paste(" | |||
| ", patient, " row ", 1:nrow(tmp.rows), " | ", tmp.rows$Sample, ": | ", tmp.rows$Clone_Sequence, " | ", tmp.rows$normalized_read_count, " | 
| ", patient, " row ", 1:nrow(first.rows), " | ", first.rows$Sample, ": | ", first.rows$Clone_Sequence, " | ", first.rows$normalized_read_count, " | ", nrow(patient1), " in ", oneSample, " and ", nrow(patient2), " in ", twoSample, ", ", nrow(patientMerge), " in both (finding both took ", (proc.time() - start.time)[[3]], "s)", sep=""), file=logfile, append=T) } @@ -305,6 +350,7 @@ patientMerge$thresholdValue = pmax(patientMerge[,onx], patientMerge[,ony]) + #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony]) res1 = vector() res2 = vector() resBoth = vector() @@ -346,33 +392,42 @@ } else { scatterplot_locus_data = scatterplot_data[grepl(V_Segment, scatterplot_data$V_Segment_Major_Gene) & grepl(J_Segment, scatterplot_data$J_Segment_Major_Gene),] #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[[twoSample]]),] - scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),] + #scatterplot_locus_data = scatterplot_locus_data[!(scatterplot_locus_data$merge %in% merge.list[["second"]]),] if(nrow(scatterplot_locus_data) > 0){ scatterplot_locus_data$Rearrangement = product[iter, titleIndex] } - in_one = (scatterplot_locus_data$merge %in% patient1$merge) - in_two = (scatterplot_locus_data$merge %in% patient2$merge) - if(any(in_two)){ - scatterplot_locus_data[in_two,]$type = twoSample - } - in_both = (scatterplot_locus_data$merge %in% patientMerge$merge) - #merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]]) - #exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches) - if(any(in_both)){ - scatterplot_locus_data[in_both,]$type = "In Both" - } - if(type == "single"){ - single_patients <<- rbind(single_patients, scatterplot_locus_data) - } + + + + #in_one = (scatterplot_locus_data$merge %in% patient1$merge) + #in_two = (scatterplot_locus_data$merge %in% patient2$merge) + #if(any(in_two)){ + # scatterplot_locus_data[in_two,]$type = twoSample + #} + #in_both = (scatterplot_locus_data$merge %in% patientMerge$merge) + ##merge.list.filter = (scatterplot_locus_data$merge %in% merge.list[[oneSample]]) + ##exact.matches.filter = (scatterplot_locus_data$merge %in% cs.exact.matches) + #if(any(in_both)){ + # scatterplot_locus_data[in_both,]$type = "In Both" + #} + #if(type == "single" & (nrow(scatterplot_locus_data) > 0 | !any(scatterplot_locus_data$Patient %in% single_patients$Patient))){ + # single_patients <<- rbind(single_patients, scatterplot_locus_data) + #} + + p = NULL + print(paste("nrow scatterplot_locus_data", nrow(scatterplot_locus_data))) if(nrow(scatterplot_locus_data) != 0){ if(on == "normalized_read_count"){ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) - p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + #p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=10^6) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count, group=link)) + geom_line() + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,10^6)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) } else { - p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) + p = ggplot(scatterplot_locus_data, aes(type, Frequency, group=link)) + geom_line() + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + scale_x_discrete(breaks=levels(scatterplot_data$type), labels=levels(scatterplot_data$type), drop=FALSE) } - p = p + geom_point(aes(colour=type), position="jitter") + #p = p + geom_point(aes(colour=type), position="jitter") + p = p + geom_point(aes(colour=type), position="dodge") p = p + xlab("In one or both samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) } else { p = ggplot(NULL, aes(x=c("In one", "In Both"),y=0)) + geom_blank(NULL) + xlab("In one or both of the samples") + ylab(onShort) + ggtitle(paste(patient1[1,patientIndex], patient1[1,sampleIndex], patient2[1,sampleIndex], onShort, product[iter, titleIndex])) @@ -453,7 +508,7 @@ if(nrow(single_patients) > 0){ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) - p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) + p = ggplot(single_patients, aes(Rearrangement, normalized_read_count)) + scale_y_log10(breaks=scales,labels=as.character(scales)) + expand_limits(y=c(0,1000000)) p = p + geom_point(aes(colour=type), position="jitter") p = p + xlab("In one or both samples") + ylab("Reads") p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the reads of the patients with a single sample") @@ -461,7 +516,8 @@ print(p) dev.off() - p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + #p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + p = ggplot(single_patients, aes(Rearrangement, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100)) p = p + geom_point(aes(colour=type), position="jitter") p = p + xlab("In one or both samples") + ylab("Frequency") p = p + facet_grid(.~Patient) + ggtitle("Scatterplot of the frequency of the patients with a single sample") @@ -762,6 +818,11 @@ patientMerge13$thresholdValue = pmax(patientMerge13[,onx], patientMerge13[,ony]) patientMerge23$thresholdValue = pmax(patientMerge23[,onx], patientMerge23[,ony]) + #patientMerge$thresholdValue = pmin(patientMerge[,onx], patientMerge[,ony], patientMerge[,onz]) + #patientMerge12$thresholdValue = pmin(patientMerge12[,onx], patientMerge12[,ony]) + #patientMerge13$thresholdValue = pmin(patientMerge13[,onx], patientMerge13[,ony]) + #patientMerge23$thresholdValue = pmin(patientMerge23[,onx], patientMerge23[,ony]) + patient1 = patient1[!(patient1$Clone_Sequence %in% merge.list[["second"]]),] patient2 = patient2[!(patient2$Clone_Sequence %in% merge.list[["second"]]),] patient3 = patient3[!(patient3$Clone_Sequence %in% merge.list[["second"]]),] @@ -882,7 +943,8 @@ scales = 10^(0:6) #(0:ceiling(log10(max(scatterplot_locus_data$normalized_read_count)))) p = ggplot(scatterplot_locus_data, aes(type, normalized_read_count)) + scale_y_log10(breaks=scales,labels=scales) + expand_limits(y=c(0,1000000)) } else { - p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) + p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_log10(limits=c(0.0001,100), breaks=c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), labels=c("0.0001", "0.001", "0.01", "0.1", "1", "10", "100")) + expand_limits(y=c(0,100)) + #p = ggplot(scatterplot_locus_data, aes(type, Frequency)) + scale_y_continuous(limits = c(0, 100)) + expand_limits(y=c(0,100)) } p = p + geom_point(aes(colour=type), position="jitter") p = p + xlab("In one or in multiple samples") + ylab(onShort) + ggtitle(paste(label1, label2, label3, onShort, product[iter, titleIndex])) @@ -1027,4 +1089,4 @@ } else { cat("", file="triplets.txt") } -cat(" |